# Get question meta-data for a given question ID

### About this script:
* **Input**: A list of question IDs
* **Output**: A list of question meta-data by question ID: Question Text, Answer Count, Follower Count, View Count, Tags

### General info:
* **Forum**: https://www.zhihu.com/ (Sign-up required)
* **References**: https://blog.csdn.net/wenxuhonghe/article/details/86515558; https://blog.csdn.net/wenxuhonghe/article/details/107122978. I want to thank the code creator, 机灵鹤 ("Smart Crane"), for answering my questions about modifying his code for my own project!
* **Author**: Di Zhou (NYU Sociology)
* **Last Run**: Dec. 2020 
* **Disclaimer**: The forum constantly updates its security measures and the information architecture of its webpages. This scraping code and its references may need to be modified before they can scrape data from the forum at the time you access it.

In [1]:
import pandas as pd
import requests
import json
from datetime import datetime
from bs4 import BeautifulSoup as bs
import pickle

In [2]:
def get_question_data(q_id, timeout=30):
    """Fetch the metadata embedded in a Zhihu question page.

    Args:
        q_id: Question ID; it is appended to
            ``https://www.zhihu.com/question/`` to build the page URL.
        timeout: Seconds to wait for the server before the request is
            abandoned. Without a timeout a single stalled connection would
            block the whole scraping run indefinitely.

    Returns:
        A six-element list
        ``[q_id, qContent, followerCount, viewCount, answerCount, topicTag]``
        whose metadata values are the attribute strings read from the page.
        On failure the list keeps the same length: the five metadata slots
        hold the caught ``requests`` exception (HTTP or network errors,
        including timeouts) or the string ``"UnknownError"`` (any other
        error, e.g. the page no longer contains the expected elements).
    """
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
    }

    url = 'https://www.zhihu.com/question/' + str(q_id)

    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        bsobj = bs(r.text, 'lxml')

        def meta_content(itemprop):
            # Value of the first <meta itemprop=...> tag's "content" attribute.
            return bsobj.find_all('meta', attrs={'itemprop': itemprop})[0]['content']

        qContent = meta_content('name')

        # Both counters share one CSS class: the first match is read as the
        # follower count and the second as the view count. Look them up once.
        counters = bsobj.find_all('strong', attrs={'class': 'NumberBoard-itemValue'})
        followerCount = counters[0]['title']
        viewCount = counters[1]['title']

        answerCount = meta_content('answerCount')
        topicTag = meta_content('keywords')

        return [q_id, qContent, followerCount, viewCount, answerCount, topicTag]

    except requests.HTTPError as e:
        print(e)
        print("HTTPError")
        return [q_id, e, e, e, e, e]

    except requests.RequestException as e:
        print(e)
        return [q_id, e, e, e, e, e]

    except Exception:
        # Deliberately broad so one malformed page does not abort the run,
        # but not a bare ``except:`` -- KeyboardInterrupt / SystemExit must
        # still be able to stop the scraper.
        print("Unknown Error !")
        return [q_id, "UnknownError", "UnknownError", "UnknownError", "UnknownError", "UnknownError"]
    

In [4]:
def save_data(q_info_list):
    """Append question metadata rows to ``data/q_meta.csv``.

    The header row is written only when the file is new or empty, so
    repeated runs keep appending data rows without inserting extra header
    lines in the middle of the file. The ``data`` directory is created if
    it does not exist yet.

    Args:
        q_info_list: List of six-element rows in the order
            ``q_id, q_content, followerCount, viewCount, answerCount,
            topicTag``.

    Raises:
        ValueError: If the rows do not have exactly six columns.
    """
    # Local import: the file's import cell does not bring in ``os``.
    import os

    filename = 'data/q_meta.csv'
    columns = ['q_id', 'q_content', 'followerCount', 'viewCount', 'answerCount', 'topicTag']

    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # The file is opened in append mode, so only a missing or empty file
    # needs the header row.
    write_header = not os.path.exists(filename) or os.path.getsize(filename) == 0

    dataframe = pd.DataFrame(q_info_list, columns=columns)
    # dataframe.to_csv(filename, mode='a', index=False, sep=',', header=False)
    dataframe.to_csv(filename, mode='a', index=False, sep=',', header=write_header)

In [5]:
def save_list(q_info_list):
    """Pickle the collected rows to ``data/q_meta.data``.

    ``main`` calls this as the fallback when the CSV export fails, so it
    creates the ``data`` directory itself rather than failing for a missing
    folder and losing the scraped results. An existing file is overwritten.

    Args:
        q_info_list: The list of question metadata rows to serialise.
    """
    # Local import: the file's import cell does not bring in ``os``.
    import os

    target = 'data/q_meta.data'
    os.makedirs(os.path.dirname(target), exist_ok=True)

    with open(target, 'wb') as filehandle:
        pickle.dump(q_info_list, filehandle)

In [6]:
# Load the input CSV (a list of question IDs) and convert it to a list of
# rows. Each row is a list of that CSV line's column values; main() reads
# the question ID from column 0 of every row.
df = pd.read_csv('data/q_list.csv')  
q_list = df.values.tolist()

In [None]:
# A loop that runs over all questions
def main():
    """Scrape every question in ``q_list`` and save the collected rows.

    Each row's first column is used as the question ID. The results are
    written with ``save_data`` (CSV); if that raises, the error is reported
    and the rows are pickled with ``save_list`` instead so they are not lost.
    """
    q_info_list = []

    for i, row in enumerate(q_list):
        q_id = row[0]

        # Column 7 is only echoed in the progress message, so an input CSV
        # with fewer columns must not abort the run with an IndexError.
        progress = ['working on', 'question', i + 1, ': ', q_id]
        if len(row) > 7:
            progress.append(row[7])
        print(*progress)

        q_info_list.append(get_question_data(q_id))

    try:
        save_data(q_info_list)
    except Exception as e:
        # Report why the CSV export failed instead of hiding it, then fall
        # back to the pickle dump.
        print('Saving CSV failed, falling back to pickle:', e)
        save_list(q_info_list)
    
# Entry point: run the full scrape when this file is executed as a script,
# then print a completion message once main() has returned.
if __name__ == '__main__':
    main()
    print("Finish！！")
        