# Get Additional Author Information by Author ID

### About this script:
* **Input**: A list of author IDs
* **Output**: A list of additional information for each author ID: total upvotes received, total thanks received, total followers, total answers posted, total articles posted, top-writer status and other forum achievements

### General info:
* **Forum**: https://www.zhihu.com/ (Sign-up required)
* **References**: https://blog.csdn.net/wenxuhonghe/article/details/86515558; https://blog.csdn.net/wenxuhonghe/article/details/107122978 --I want to thank the code creator, 机灵鹤 ("Smart Crane"), for answering my questions about modifying his code for my own project!
* **Author**: Di Zhou (NYU Sociology)
* **Last Run**: Dec. 2020 
* **Disclaimer**: The forum frequently updates its security measures and the structure of its web pages. This scraping code and the references it is based on may need to be modified before they can scrape data from the forum at the time you access it.

In [None]:
import pandas as pd
import requests
import json
from bs4 import BeautifulSoup as bs
import re

In [None]:
# Load the user list from CSV and convert it to a list of rows,
# where each row is a plain Python list of that row's column values.
df = pd.read_csv('data/user_list.csv')  
u_list = df.values.tolist()

In [None]:
# Number of rows (users) loaded from the CSV.
len(u_list)

In [None]:
def get_achievement(user_token, timeout=10):
    '''
    Fetch a Zhihu user's profile page and extract summary statistics.

    Parameters:
        user_token: the user's URL token, i.e. the last path segment of
            https://www.zhihu.com/people/<user_token>
        timeout: seconds to wait for the server before giving up
            (default 10). Without a timeout a single stalled connection
            would block the whole scraping run indefinitely.

    Returns:
        On success, a list
            [user_token, voteupCount, thankedCount, followerCount,
             answerCount, articlesCount, achievement]
        where the counts are the raw `content` attribute values of the
        page's <meta itemprop="zhihu:..."> tags and `achievement` is the
        text left after stripping tags and inline CSS rules from the
        'css-vurnku' <div> elements.
        On any failure (HTTP error, network error, or the expected tags
        not being present) the error is printed and None is returned.
    '''

    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
    }

    url = 'https://www.zhihu.com/people/' + str(user_token)

    # Meta-tag itemprop suffixes, in the order they appear in the result:
    # total upvotes, total thanks, total followers, total answers posted,
    # total articles posted.
    count_fields = (
        'voteupCount',
        'thankedCount',
        'followerCount',
        'answerCount',
        'articlesCount',
    )

    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        bsobj = bs(r.text, 'lxml')

        # [0] raises IndexError when a tag is missing; that is handled by
        # the generic handler below so the user is skipped, not half-filled.
        counts = [
            bsobj.find_all('meta', attrs={'itemprop': 'zhihu:' + field})[0]['content']
            for field in count_fields
        ]

        # Top-writer and other achievements: stringify the matching <div>s,
        # then remove HTML tags and inline ".css...;}" style rules.
        achievement_raw = bsobj.find_all('div', attrs={'class': 'css-vurnku'})
        achievement = re.sub(r"\<.*?\>|\.css.*?\;\}", '', str(achievement_raw))

        return [user_token, *counts, achievement]

    except requests.HTTPError as e:
        print(e)
        print("HTTPError")
    except requests.RequestException as e:
        print(e)
    except Exception as e:
        # Deliberately broad so one malformed page does not abort the run,
        # but (unlike a bare `except:`) this lets KeyboardInterrupt and
        # SystemExit propagate so a long scrape can still be stopped.
        print(e)
        print("Unknown Error !")

    return None

In [None]:
def save_data(u_ach_list, filename):
    '''
    Append the scraped rows to a CSV file.

    Parameters:
        u_ach_list: a list of rows (each row a list of values)
        filename: path of the CSV file to append to

    The file is opened in append mode and neither a header row nor the
    DataFrame index is written, so repeated calls add rows to the end of
    the same file.
    '''
    pd.DataFrame(u_ach_list).to_csv(
        filename,
        mode='a',
        sep=',',
        header=False,
        index=False,
    )

In [None]:
# Scrape every user in u_list, keeping only the successful results.
u_ach_list = []

for i, user_row in enumerate(u_list):
    # NOTE(review): column 7 is used as the URL token and column 5 as the
    # label printed for progress -- confirm against the columns of
    # data/user_list.csv.
    user_token = user_row[7]
    print('working on user ', i+1, ': ', user_row[5])

    # Fetch exactly once per user: calling get_achievement() a second time
    # would double the number of HTTP requests, and the second call could
    # fail (returning None) even though the first one succeeded.
    achievement = get_achievement(user_token)
    if achievement is not None:
        u_ach_list.append(achievement)

In [None]:
# Remove any None entries (failed scrapes) from the result list.
# Report them first, then filter in a single pass: popping by index while
# looping over a precomputed range shifts the remaining elements, which
# skips adjacent None entries and can raise IndexError once the list shrinks.
for i, entry in enumerate(u_ach_list):
    if entry is None:
        print('element index', i, 'is NoneType, Deleting')

# Slice assignment keeps the same list object, as the in-place pop did.
u_ach_list[:] = [entry for entry in u_ach_list if entry is not None]

In [None]:
# Write the scraped results to disk. save_data() opens the file in append
# mode, so re-running this cell adds the rows again instead of overwriting.
save_data(u_ach_list, 'data/u_ach_list.csv')