# Get all questions and question meta-data for a given topic ID

### About this script:
* **Input**: Topic ID
* **Output**: A list of questions with other meta-data: Question ID, Question Text, Question Date, Question URL
* **Topics and their topic ID used in the article**:
    - 2016年美国总统大选 (2016 US Election) = 20019119 
    - 美国政治 (US Politics) = 19662206 
    - 美国社会 (US Society) = 19646100 
    - 中美比较 (China-US Comparison) = 19906729 
    - 美国经济 (US Economy) = 19586038 
    
### General info:
* **Forum**: https://www.zhihu.com/ (Sign-up required)
* **References**: https://blog.csdn.net/wenxuhonghe/article/details/86515558; https://blog.csdn.net/wenxuhonghe/article/details/107122978 --I want to thank the code creator, 机灵鹤 ("Smart Crane"), for answering my questions about modifying his code for my own project!
* **Author**: Di Zhou (NYU Sociology)
* **Last Run**: Dec. 2020 
* **Disclaimer**: The forum frequently updates its security measures and its webpage information architecture. This scraping code (and the reference code it is based on) may therefore need modification before it can scrape data from the forum at the time you access it.

In [1]:
import requests
import json
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup as bs
# usage: datetime.fromtimestamp(timestamp)

In [2]:
def fetchHotel(url, cookie=None, timeout=30):
    """Download ``url`` with browser-like headers and return the body as text.

    The function name is kept unchanged because the rest of the script calls
    it under this name.

    Args:
        url: Address to request.
        cookie: Optional value for the ``cookie`` request header. Supply the
            cookie of your own logged-in session if the site rejects
            anonymous requests. Never commit a real cookie to source control:
            it is a login credential.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl forever.

    Returns:
        The response body decoded as UTF-8 text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    }
    if cookie:
        headers['cookie'] = cookie

    # Send request
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error page instead of handing it to the JSON parser.
    r.raise_for_status()
    # 'Unicode' is not a registered Python codec name; name the codec
    # explicitly rather than relying on the library's decoding fallback.
    r.encoding = 'utf-8'
    return r.text

In [3]:
def parseJson(text, results=None):
    """Parse one page of a topic feed and collect its questions and articles.

    Each entry of the page's ``data`` list is inspected through
    ``item['target']``:

    * ``answer``   -> the question the answer belongs to is recorded
    * ``question`` -> the question itself is recorded
    * ``article``  -> the column article is recorded
    * any other type is ignored

    Every recorded row has the layout ``[type, id, title, url, date]`` where
    ``date`` is the ``created`` timestamp formatted as ``YYYY-MM-DD`` in the
    local timezone (``created`` is presumably a Unix timestamp in seconds).

    Args:
        text: JSON text of one feed page.
        results: List that receives the rows. When omitted, rows are appended
            to the module-level ``q_list``, as the script has always done.

    Returns:
        The URL of the next page (``paging.next``), or ``None`` when the page
        has no entries or carries no paging information, which ends the
        caller's ``while url:`` loop.

    Raises:
        KeyError: If the payload has no ``data`` key or an entry lacks an
            expected field.
    """
    if results is None:
        results = q_list

    json_data = json.loads(text)
    items = json_data['data']

    # An empty page marks the end of the feed.
    if not items:
        return None

    for item in items:
        target = item['target']
        target_type = target['type']

        if target_type == 'answer':
            # Answer: record the question it was posted under.
            cn_type, label, entry = '问题_来自回答', "问题：", target['question']
        elif target_type == 'question':
            # Question
            cn_type, label, entry = '问题', "问题：", target
        elif target_type == 'article':
            # Column article
            cn_type, label, entry = '专栏', "专栏：", target
        else:
            continue

        entry_id = entry['id']
        title = entry['title']
        if target_type == 'article':
            # Articles carry their own URL; question URLs are built from the id.
            url = entry['url']
        else:
            url = 'https://www.zhihu.com/question/' + str(entry_id)
        created = datetime.fromtimestamp(entry['created']).strftime("%Y-%m-%d")

        print(label, entry_id, title)
        results.append([cn_type, entry_id, title, url, created])

    # Tolerate a missing paging section: treat it as "no further page".
    return (json_data.get('paging') or {}).get('next')

In [4]:
def save_data(q_list, filename='data/美国大选2016_questions.csv'):
    """Append the collected rows to a CSV file.

    Args:
        q_list: List of rows, each ``[type, id, title, url, date]``. May be
            empty, in which case only the header is written to a new file.
        filename: Destination CSV path. The default is the 2016 US election
            topic; the other topics of the project use, for example,
            'data/美国政治_questions.csv' (US politics),
            'data/美国社会_questions.csv' (US society),
            'data/中美比较_questions.csv' (China-US comparison) and
            'data/美国经济_questions.csv' (US economy).

    The file is opened in append mode so repeated runs accumulate rows. The
    header row is written only when the file is new or empty; otherwise every
    run would insert another header line in the middle of the data.
    """
    import os  # local import so this cell does not depend on the import cell

    columns = ['type', 'id', 'title', 'url', 'date']

    # Create the output folder on first use instead of failing on a fresh checkout.
    folder = os.path.dirname(filename)
    if folder:
        os.makedirs(folder, exist_ok=True)

    write_header = not os.path.exists(filename) or os.path.getsize(filename) == 0

    # Naming the columns up front keeps an empty q_list from producing a
    # zero-column frame that cannot be matched against the header.
    dataframe = pd.DataFrame(q_list, columns=columns)
    dataframe.to_csv(filename, mode='a', index=False, sep=',', header=write_header)


In [None]:
if __name__ == '__main__':

    # Topic IDs used in the article:
    #   2016 US Election    = 20019119
    #   US Politics         = 19662206
    #   US Society          = 19646100
    #   China-US Comparison = 19906729
    #   US Economy          = 19586038

    topicID = '20019119'
    # Row layout: type, id, title, url, date.
    # Must stay a module-level name: parseJson appends to it by default.
    q_list = []

    # Value of the `include` query parameter. It is the same for every feed,
    # so it is spelled out once (one ';'-terminated clause per line) instead
    # of being pasted into three separate URLs.
    include = (
        'data%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3B'
        'data%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.is_normal%2Ccomment_count%2Cvoteup_count%2Ccontent%2Crelevant_info%2Cexcerpt.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3B'
        'data%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Darticle%29%5D.target.content%2Cvoteup_count%2Ccomment_count%2Cvoting%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3B'
        'data%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Dpeople%29%5D.target.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3B'
        'data%5B%3F%28target.type%3Danswer%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Canswer_type%3B'
        'data%5B%3F%28target.type%3Danswer%29%5D.target.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3B'
        'data%5B%3F%28target.type%3Danswer%29%5D.target.paid_info%3B'
        'data%5B%3F%28target.type%3Darticle%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3B'
        'data%5B%3F%28target.type%3Dquestion%29%5D.target.annotation_detail%2Ccomment_count%3B'
    )

    # (feed name, initial paging cursor) for each tab of the topic page.
    feeds = [
        ('top_activity', 'after_id=0'),   # Discussion
        ('essence', 'offset=0'),          # Selected posts
        ('top_question', 'offset=0'),     # Awaiting answers
    ]

    for feed, cursor in feeds:
        url = ('https://www.zhihu.com/api/v4/topics/' + topicID + '/feeds/' + feed
               + '?include=' + include + '&limit=10&' + cursor)
        # Stop if the API ever hands back a page URL that was already
        # fetched; otherwise the crawl would loop on it forever.
        visited = set()
        while url and url not in visited:
            visited.add(url)
            text = fetchHotel(url)
            url = parseJson(text)

    save_data(q_list)