# Grab Altmetric.com Data

## Request Functions
This part contains the functions we need to fetch the web data. Exceptions raised while fetching should also be handled here.

In [None]:
from grab_util import *
import json
import pandas as pd
import numpy as np
import math
import threading


In [None]:
# Base folder for the CSV/JSON files that the cells below read and write.
origin_folder = 'SUSTC_Journals/articles_all'

## Grab altmetric.com Ids

In [None]:
def get_altmetric_id(doi):
    """Look up the Altmetric record for *doi*.

    The DOI is appended to ``https://api.altmetric.com/v1/doi/`` and fetched
    through ``grab_from_url_json``.

    Returns:
        A ``(altmetric_id, score)`` tuple taken from the response, or
        ``('', 0)`` when the helper returns ``None``.
    """
    payload = grab_from_url_json('https://api.altmetric.com/v1/doi/' + doi)
    if payload is None:
        # No usable response: report an empty id and a zero score.
        return '', 0
    return payload['altmetric_id'], payload['score']

In [None]:
# Article list; only the 'SO' and 'DI' columns are loaded ('DI' is used as the DOI below).
all_articles_df = pd.read_csv(f'{origin_folder}/all.csv', usecols=['SO', 'DI'])
# Shared result frame: the worker threads add their fetched rows to it.
df = pd.DataFrame(columns=['DI', 'altmetric_id'])

In [None]:
def grab_altmetric_ids(start_index, end_index):
    """Fetch Altmetric ids and scores for one slice of ``all_articles_df``.

    Rows ``start_index:end_index`` of the module-level ``all_articles_df`` are
    visited in order. A DOI already present in the module-level ``df`` is
    skipped; every other DOI is resolved with ``get_altmetric_id``.

    Args:
        start_index: First row position of the slice (inclusive).
        end_index: Last row position of the slice (exclusive).

    Returns:
        A DataFrame with columns ``DI``, ``altmetric_id`` and ``score`` (one
        row per fetched DOI), or an empty column-less DataFrame when nothing
        was fetched.
    """
    rows = []
    dois = all_articles_df[start_index:end_index]['DI']
    for index, doi in enumerate(dois):
        if doi in df['DI'].values:
            continue
        if index % 500 == 0:
            # Coarse progress indicator (position inside this slice).
            print(index)
        altmetric_id, score = get_altmetric_id(str(doi))
        rows.append((doi, altmetric_id, score))

    if not rows:
        return pd.DataFrame()
    # Build the frame once: growing it with DataFrame.append per row was
    # quadratic, and DataFrame.append no longer exists in pandas >= 2.0.
    return pd.DataFrame(rows, columns=['DI', 'altmetric_id', 'score'])

In [None]:
class grabIdThread(threading.Thread):
    """Worker thread that fetches Altmetric ids for one slice of the article list.

    ``run`` calls ``grab_altmetric_ids(start_index, end_index)`` and adds the
    returned rows to the module-level ``df`` while holding ``threadLock``.
    """

    def __init__(self, threadID, name, start_index, end_index):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.start_index = start_index
        self.end_index = end_index

    def run(self):
        global df
        dfy = grab_altmetric_ids(self.start_index, self.end_index)
        # `with` releases the lock even if the merge raises, so one failing
        # worker can no longer block every other worker forever.
        with threadLock:
            # pd.concat replaces DataFrame.append (removed in pandas 2.0).
            df = pd.concat([df, dfy])


# Serialises updates of the shared result frame.
threadLock = threading.Lock()


In [None]:
%%time
step = 1000
threads = []

prior_end = 0
for i in range(0, 10):
    for j in range(0, (int)(len(all_articles_df) / (step * 10))):
        start_index = prior_end
        end_index = start_index + step
        if end_index + step > len(all_articles_df):
            end_index = len(all_articles_df)

        prior_end = end_index
        
        if start_index >= end_index:
            break

        threadx = grabIdThread(j, f'thread_{start_index}_{end_index}', start_index, end_index)
        print(threadx)
        threadx.start()
        threads.append(threadx)
    for t in threads:
        t.join()

df = df.sort_index()
df = df.drop_duplicates()
df.to_csv(f'{origin_folder}/all_altmetric_id.csv', index=False)


## Grab altmetric.com Details

In [None]:
def _normalize_altmetric_id(altmetric_id):
    """Return *altmetric_id* as an int, or None when it is missing (None, '', NaN)."""
    if altmetric_id is None or altmetric_id == '':
        return None
    if isinstance(altmetric_id, str):
        # Ids read back as text (e.g. '123' or '123.0'); raises ValueError if non-numeric.
        altmetric_id = float(altmetric_id)
    if math.isnan(altmetric_id):
        return None
    return int(altmetric_id)


def _extract_count(page, anchor, end_anchor='</strong>'):
    """Return the integer that directly follows *anchor* in *page*, or 0 if absent."""
    start = page.find(anchor)
    if start < 0:
        return 0
    start += len(anchor)
    # The number is expected right behind the anchor; look at most 100 chars ahead.
    end = page.find(end_anchor, start, start + 100)
    if end < 0:
        # Without this guard the slice below would run to the end of the page.
        return 0
    text = page[start:end]
    return int(text) if text != '' else 0


def grab_detail_altmetric(doi, altmetric_id):
    """Scrape the per-source mention counts from an Altmetric details page.

    Args:
        doi: DOI of the article; stored unchanged in the ``DI`` column.
        altmetric_id: Altmetric id of the article. ``None``, ``''`` and NaN
            mean "no id" and yield all-zero counts without any request.

    Returns:
        A one-row DataFrame with the columns ``DI`` and ``altmetric_id``
        followed by one integer count column per source. A count is 0 when
        the page could not be fetched (``grab_from_url_content`` returned
        ``None``) or when the source's anchor is not found in the page.

    Raises:
        ValueError: if ``altmetric_id`` is a non-numeric string, or if the
            text found behind an anchor is not an integer.
    """
    # (output column, label in the page markup, tail of the link target).
    # A tail of None marks the source whose count is not wrapped in a link.
    sources = [
        ('news outlets', 'news', '/news'),
        ('blogs', 'blogs', '/blogs'),
        ('policy', 'policy', '/policy-documents'),
        ('tweeters', 'twitter', '/twitter'),
        ('weibo', 'weibo', '/weibo'),
        ('facebook pages', 'facebook', '/facebook'),
        ('wikipedia', 'wikipedia', '/wikipedia'),
        ('redditors', 'reddit', '/reddit'),
        ('f1000', 'f1000', '/f1000'),
        ('video uploader', 'video', '/video'),
        ('dimensions_citation', 'dimensions_citation', '/citations'),
        ('mendeley', 'mendeley', '#mendeley-demographics'),
        ('citeulike', 'citeulike', None),
    ]

    df = pd.DataFrame()
    df['DI'] = [doi]
    df['altmetric_id'] = [altmetric_id]

    page = None
    numeric_id = _normalize_altmetric_id(altmetric_id)
    if numeric_id is not None:
        page = grab_from_url_content('https://www.altmetric.com/details/' + str(numeric_id))

    for column, label, tail in sources:
        if page is None:
            df[column] = 0
            continue
        if tail is None:
            anchor = f'{label}</dt><dd><strong>'
        else:
            anchor = f'{label}</dt><dd><a href="/details/{numeric_id}{tail}"><strong>'
        df[column] = _extract_count(page, anchor)

    return df

In [None]:
def grab_altmetric_detail_slice(start_index, end_index):
    """Scrape Altmetric detail counts for one slice of ``all_articles_df``.

    Rows ``start_index:end_index`` of the module-level ``all_articles_df`` are
    visited in order. A row whose ``DI`` is already present in the
    module-level ``dfg`` is skipped; every other row is passed to
    ``grab_detail_altmetric``.

    Args:
        start_index: First row position of the slice (inclusive).
        end_index: Last row position of the slice (exclusive).

    Returns:
        The concatenated one-row frames (fresh 0..n-1 index), or an empty
        column-less DataFrame when nothing was fetched.
    """
    frames = []
    for index, row in all_articles_df[start_index:end_index].iterrows():
        if row['DI'] in dfg['DI'].values:
            continue
        if index % 100 == 0:
            # Coarse progress indicator (row label in the article list).
            print(index)
        frames.append(grab_detail_altmetric(row['DI'], row['altmetric_id']))

    if not frames:
        return pd.DataFrame()
    # One concat instead of one DataFrame.append per row: append was quadratic
    # and no longer exists in pandas >= 2.0.
    return pd.concat(frames, ignore_index=True)

In [None]:
class grabDetailThread(threading.Thread):
    """Worker thread that scrapes Altmetric detail counts for one slice.

    ``run`` calls ``grab_altmetric_detail_slice(start_index, end_index)`` and
    adds the returned rows to the module-level ``dfg`` while holding
    ``threadLock``.
    """

    def __init__(self, threadID, name, start_index, end_index):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.start_index = start_index
        self.end_index = end_index

    def run(self):
        global dfg
        dfy = grab_altmetric_detail_slice(self.start_index, self.end_index)
        # `with` releases the lock even if the merge raises, so one failing
        # worker can no longer block every other worker forever.
        with threadLock:
            # pd.concat replaces DataFrame.append (removed in pandas 2.0).
            dfg = pd.concat([dfg, dfy])


# Serialises updates of the shared detail frame.
threadLock = threading.Lock()

In [None]:
%%time
step = 1000
threads = []
all_articles_df = pd.read_csv(f'{origin_folder}/all_altmetric_id.csv')

prior_end = 0
for i in range(0, 20 + 1):
    print(i)
    dfg = pd.DataFrame(columns=['DI'])
    for j in range(0, (int)(len(all_articles_df) / (step * 20))):
        start_index = prior_end
        end_index = start_index + step
        if end_index + step > len(all_articles_df):
            end_index = len(all_articles_df)
        prior_end = end_index

        if(start_index >= end_index):
            break
        
        threadx = grabDetailThread(j, f'thread_{start_index}_{end_index}', start_index, end_index)
        print(threadx)
        threadx.start()
        threads.append(threadx)
    for t in threads:
        t.join()

    dfg = dfg.sort_index()
    dfg = dfg.drop_duplicates()
    dfg.to_csv(f'{origin_folder}/all_altmetric_detail_{i}.csv', index=False)

## Grab altmetric.com's Tweets Detail

### Parser
The HTML content returned by the URL is not structured as the list we need, so a parser is required to convert it into list data in JSON form.

In [None]:
from html.parser import HTMLParser

class AltmetricHTMLParser(HTMLParser):
    """Collect reply-link targets found inside ``<article>`` elements.

    For every ``<a>`` start tag seen inside an ``<article>``, a ``class`` of
    ``reply`` switches the parser into "reply" mode; while in that mode the
    part of the tag's ``href`` between the first and second ``=`` is appended
    to ``tweets``. Any end tag leaves "reply" mode.

    Attributes (all per instance):
        tweets: values extracted from reply links, in document order.
        retweets: never filled by this class; kept for the ``articles`` layout.
        articles: ``{'tweets': tweets, 'retweets': retweets}``.
        in_article: True while inside an ``<article>`` element.
        is_reply: True between a reply link's start tag and the next end tag.
        has_article: True once an ``<article>`` was seen; reset by ``<body>``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # State lives on the instance: as class attributes the lists were
        # shared, so every parser kept appending to the same ``tweets``.
        self.tweets = []
        self.retweets = []
        self.articles = {'tweets': self.tweets, 'retweets': self.retweets}
        self.in_article = False
        self.is_reply = False
        self.has_article = False

    def handle_starttag(self, tag, attrs):
        if tag == 'body':
            self.has_article = False
        if tag == 'article':
            self.has_article = True
            self.in_article = True

        if not (self.in_article and tag == 'a'):
            return

        # First occurrence of each attribute wins; looking attributes up by
        # name makes the result independent of their order in the tag.
        attr_map = {}
        for key, value in attrs:
            attr_map.setdefault(key, value)

        if attr_map.get('class') == 'reply':
            self.is_reply = True

        href = attr_map.get('href')
        # href is None for a valueless attribute; skip it instead of crashing.
        if self.is_reply and href:
            parts = href.split('=')
            # A link without '=' carries no id; skip it instead of raising IndexError.
            if len(parts) > 1:
                self.tweets.append(parts[1])

    def handle_endtag(self, tag):
        if tag == 'article':
            self.in_article = False
        self.is_reply = False

    # Data, comments, entity/char references and declarations are ignored;
    # the inherited HTMLParser handlers already do nothing for them.


parser = AltmetricHTMLParser()

### Grab Data

In [None]:
def get_mention(source, dfx):
    """Download all Altmetric Explorer mentions of one type for the rows of *dfx*.

    Args:
        source: Mention type inserted into the query (``type:<source>``), e.g.
            ``'tweet'``; also used to name the result keys ``<source>_num``
            and ``<source>s``.
        dfx: DataFrame with the columns ``DI`` and ``altmetric_id``.

    Returns:
        A dict keyed by ``DI``. Each value holds ``altmetric_id``,
        ``<source>_num`` (number of mentions) and ``<source>s`` (the mention
        records). Rows with an empty/NaN ``DI`` or ``altmetric_id`` get an
        empty record without any request. Rows whose ``DI`` is already a key
        of the module-level ``articles_dict`` are skipped.

    Paging stops at the first page for which ``grab_from_url_json`` returns
    ``None`` or whose ``lastPage`` flag is set.
    """
    # NOTE(security): the cookie below is a hard-coded session credential.
    # It should be loaded from configuration/environment and rotated rather
    # than kept in the notebook; it is left in place here to preserve behavior.
    headers = {
        'cookie': "_ga=GA1.2.501259425.1572141776; weibo_license_acknowledgement=false; explorer_user=WWs2eDNOOUVjVHJTcHRjSm9Oc0ZuZEh0eDRkTmlPd0tCWFMzODNyNlIweFBvdjllM3RkcmF0RVdtRWdvRDN4Ny0tQlNpeTBsWU5NK0ZNTnFpK1FpVFUzdz09--8bac413ca9fd623b9918b311729130cfbb4e66e1; _altmetric-explorer_session=eExKU3g1UmJhWXpXbmVtS05YZmxVS3YyaVNDSGdPL2JyaWVYaE9mRTdPT3RlYlRLaTdrRmRWcnZpeFMrMFJLb0FIVDBvNHI3N09MWS9HZDhDRDdIbTZPZElvMWNPNzMvTm5JVkovWThiQVplZVdTRlZheDBWYWdvd2lIMXo2NHJUZkFJd2MxcWpZSmpCaXVXRW9CYlZnZ1l4L1lXMGhIa1k1d0o1bllvS09uWDBQb3JXMWUrazZ2YWVwaUwzei93cDdUZkhwanFtSy9lSkxLUmdRdEIrZz09LS1DckIxZFJZUVBWNmhsTEJXcWtkR2x3PT0%3D--884ad24217914b545556a68678c2d803c65ce9cf; intercom-session-9dnltu6y=alcwSlZxZHMzZlY3Wm8yaFpvWFg1OGxEYVZ4MXRRUDMrbkk1Q01Bdm5XNXJkQXNZcVAyeEZYVHFYS3RUbWJSbi0tclFPUDlZcUVpMFZKQVlxdlMwRllVQT09--46bb4ebcf64063d6a3712dd82a1d2bff57eed87f, _ga=GA1.2.501259425.1572141776; weibo_license_acknowledgement=false; explorer_user=WWs2eDNOOUVjVHJTcHRjSm9Oc0ZuZEh0eDRkTmlPd0tCWFMzODNyNlIweFBvdjllM3RkcmF0RVdtRWdvRDN4Ny0tQlNpeTBsWU5NK0ZNTnFpK1FpVFUzdz09--8bac413ca9fd623b9918b311729130cfbb4e66e1; _altmetric-explorer_session=eExKU3g1UmJhWXpXbmVtS05YZmxVS3YyaVNDSGdPL2JyaWVYaE9mRTdPT3RlYlRLaTdrRmRWcnZpeFMrMFJLb0FIVDBvNHI3N09MWS9HZDhDRDdIbTZPZElvMWNPNzMvTm5JVkovWThiQVplZVdTRlZheDBWYWdvd2lIMXo2NHJUZkFJd2MxcWpZSmpCaXVXRW9CYlZnZ1l4L1lXMGhIa1k1d0o1bllvS09uWDBQb3JXMWUrazZ2YWVwaUwzei93cDdUZkhwanFtSy9lSkxLUmdRdEIrZz09LS1DckIxZFJZUVBWNmhsTEJXcWtkR2x3PT0%3D--884ad24217914b545556a68678c2d803c65ce9cf; intercom-session-9dnltu6y=alcwSlZxZHMzZlY3Wm8yaFpvWFg1OGxEYVZ4MXRRUDMrbkk1Q01Bdm5XNXJkQXNZcVAyeEZYVHFYS3RUbWJSbi0tclFPUDlZcUVpMFZKQVlxdlMwRllVQT09--46bb4ebcf64063d6a3712dd82a1d2bff57eed87f; Cookie_1=value",
        'User-Agent': "PostmanRuntime/7.19.0",
        'Accept': "*/*",
        'Cache-Control': "no-cache",
        'Postman-Token': "30d13c93-aa9a-44e1-84e0-8876f98ae1ba,b45c8eb6-7f77-4c48-b961-921c89280079",
        'Host': "www.altmetric.com",
        'Accept-Encoding': "gzip, deflate",
        'Connection': "keep-alive",
        'cache-control': "no-cache"
    }

    articles_dictx = {}

    for index, row in dfx.iterrows():
        if index % 100 == 0:
            print(str(index))
        # Already collected (the shared dict is filled by the worker threads).
        if row['DI'] in articles_dict:
            continue
        if row['DI'] == '' or str(row['DI']).lower() == 'nan' or row['altmetric_id'] == '' or str(row['altmetric_id']).lower() == 'nan':
            # Store the placeholder in the *local* result like every other
            # record: writing it straight into the shared articles_dict
            # bypassed the lock the callers use when merging results.
            articles_dictx[row['DI']] = {'altmetric_id': '', f'{source}_num': 0, f'{source}s': []}
            continue

        articles = []
        grab_url = f'https://www.altmetric.com/explorer/json_data/mentions?identifier={row["DI"]}&mention_sources%5B%5D=type%3A{source}&scope=all&page='
        # Pages are 1-based; the upper bound is only a safety net.
        for i in range(1, 100000):
            articles_json = grab_from_url_json(grab_url + str(i), headers=headers)
            if articles_json is None:
                break

            articles.extend(articles_json['data'])
            if articles_json['lastPage']:
                break

        articles_dictx[row['DI']] = {'altmetric_id': row['altmetric_id'], f'{source}_num': len(articles), f'{source}s': articles}
    return articles_dictx

In [None]:
class grabTweetsThread(threading.Thread):
    """Worker thread that downloads the tweet mentions for one frame of articles.

    ``run`` calls ``get_mention('tweet', dfx)`` and merges the returned dict
    into the module-level ``articles_dict`` while holding ``threadLock``.
    """

    def __init__(self, threadID, name, dfx):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.dfx = dfx

    def run(self):
        global articles_dict
        articles_dictx = get_mention('tweet', self.dfx)
        # `with` guarantees the lock is released even if the merge raises.
        with threadLock:
            articles_dict.update(articles_dictx)


# Serialises updates of the shared mention dictionary.
threadLock = threading.Lock()

In [None]:
%%time
df = pd.read_csv(f'{origin_folder}/all_altmetric_id.csv', usecols=['DI', 'altmetric_id'], dtype={'altmetric_id':str})
split1 = 4
split2 = 500
step = (int)(len(df) / (split1*split2))

threads = []

for i in range(0, split1):
    articles_dict = {}
    for j in range(0, split2):
        start_index = i * split2 * step + j * step
        end_index = start_index + step
        if end_index + step > len(df):
            end_index = len(df)
        
        dfx = df[start_index:end_index]
        threadx = grabTweetsThread(j, f'thread_{start_index}_{end_index}', dfx)
        print(threadx)
        threadx.start()
        threads.append(threadx)

    for t in threads:
        t.join()
    
    with open(f'{origin_folder}/article_tweets_altmetric_{i}.json', "w+") as dump_f:
        dump_f.write(json.dumps(articles_dict))

In [None]:
# Drop the in-memory mention dictionary (the tweet batches are dumped to JSON in the previous cell).
del articles_dict

In [None]:
def order_tweets(i):
    """Flatten one tweet JSON dump into CSV files with one row per tweet.

    Reads ``{origin_folder}/article_tweets_altmetric_{i}.json`` (a dict keyed
    by DOI whose values hold ``altmetric_id`` and ``tweets``) and writes the
    rows to ``{origin_folder}/article_tweets_altmetric_{i}_{n}.csv``. A new
    file ``n`` is started whenever more than 200000 rows are buffered after a
    DOI has been processed; the remaining rows always go to a final file.

    A tweet URL that is already in the current buffer (for any DOI) is
    skipped. The previous ``url in df_tweets['tweet_url']`` test compared
    against the Series *index*, so it never detected a duplicate.

    Args:
        i: Number of the JSON batch to convert.
    """
    columns = ['doi', 'altmetric_id', 'tweet_id', 'tweet_url', 'postedAt', 'postType', 'originalPost']

    print(i)
    with open(f'{origin_folder}/article_tweets_altmetric_{i}.json', "r") as f:
        article_tweets = json.load(f)

    # Rows are buffered in a list and turned into a frame only when written:
    # per-row DataFrame.append was quadratic and is gone in pandas >= 2.0.
    rows = []
    seen_urls = set()
    saved_count = 0

    for doi_count, (doi, record) in enumerate(article_tweets.items(), start=1):
        if doi_count % 500 == 0:
            print(doi_count)
        for tweet_chain in record['tweets']:
            # assumes element 1 of each chain is the list of tweet dicts — TODO confirm
            for tweet in tweet_chain[1]:
                url = tweet['url']
                if url in seen_urls:
                    continue
                seen_urls.add(url)
                rows.append((
                    doi,
                    record['altmetric_id'],
                    str(url).split('/')[-1],  # last path segment is used as the tweet id
                    url,
                    tweet['postedAt'],
                    tweet['postType'],
                    str(tweet['originalPost']),
                ))

        if len(rows) > 200000:
            pd.DataFrame(rows, columns=columns).to_csv(f'{origin_folder}/article_tweets_altmetric_{i}_{saved_count}.csv', index=False)
            # Start a fresh buffer; de-duplication is scoped to the buffer.
            rows = []
            seen_urls = set()
            saved_count += 1

    pd.DataFrame(rows, columns=columns).to_csv(f'{origin_folder}/article_tweets_altmetric_{i}_{saved_count}.csv', index=False)

In [None]:
%%time

# Convert the tweet JSON batches 0..3 to CSV.
# NOTE(review): 4 presumably mirrors split1 in the download cell — keep them in sync.
for i in range(0, 4):
    order_tweets(i)

In [None]:
# NOTE(review): these two files are not written under these exact names by the cells
# shown here (they write *_{i}.csv / *_{i}_{n}.csv); presumably the parts are merged elsewhere — confirm.
df_all = pd.read_csv(f'{origin_folder}/all_altmetric_detail.csv')
df_tweets = pd.read_csv(f'{origin_folder}/article_tweets_altmetric.csv')

In [None]:
# Tweet rows per DOI (count() ignores rows whose altmetric_id is missing).
df_tweets_count = df_tweets.groupby(by=['doi'])['altmetric_id'].count()
# Left join on DI == doi. The count column is named 'altmetric_id', so it only receives the
# suffix used below if df_all already has an 'altmetric_id' column.
df_all = df_all.merge(df_tweets_count, left_on=['DI'], right_on=['doi'], how='left', suffixes=['', '_tweets_count'])
# Articles without any tweet row get 0 instead of NaN.
df_all['altmetric_id_tweets_count'] = df_all['altmetric_id_tweets_count'].fillna(0)
df_all = df_all.rename({'altmetric_id_tweets_count': 'tweets_count'}, axis=1)
# Same again, restricted to rows flagged as original posts.
df_tweets_count = df_tweets[df_tweets['originalPost'] == True].groupby(by=['doi'])['altmetric_id'].count()
df_all = df_all.merge(df_tweets_count, left_on=['DI'], right_on=['doi'], how='left', suffixes=['', '_tweets_origin_count'])
df_all['altmetric_id_tweets_origin_count'] = df_all['altmetric_id_tweets_origin_count'].fillna(0)
df_all = df_all.rename({'altmetric_id_tweets_origin_count': 'tweets_origin_count'}, axis=1)
# Display the result in the notebook.
df_all


In [None]:
# NOTE(review): df_all is re-read from disk here, so columns added to the in-memory
# df_all by earlier cells are not carried over unless they were saved first — confirm intent.
df_all = pd.read_csv(f'{origin_folder}/all_altmetric_detail.csv')
df_score = pd.read_csv(f'{origin_folder}/all_altmetric_id_score.csv', usecols=['DI', 'score'])
# Attach the score to every article (articles without a score row keep NaN).
df_all = df_all.merge(df_score, on='DI', how='left')
# has_alt_id: 1 when altmetric_id is present, 0 when it is null.
df_all['has_alt_id'] = df_all['altmetric_id'].isnull()
df_all['has_alt_id'] = df_all['has_alt_id'].apply(lambda x: 0 if x else 1)
# Display the result in the notebook.
df_all

In [None]:
# Attach the 'SO' column of the original article list to every row, matched on DI.
# NOTE(review): if df_all already contains 'SO', this merge yields SO_x / SO_y — confirm.
all_info = pd.read_csv(f'{origin_folder}/all.csv', usecols=['DI', 'SO'])
df_all = df_all.merge(all_info, on='DI', how='left')

In [None]:
# Overwrites the same detail CSV that the cells above read df_all from.
df_all.to_csv(f'{origin_folder}/all_altmetric_detail.csv', index=False)

In [None]:
%%time
articles_dict = {}
get_mention('fbwall')

In [None]:
# Persist the collected Facebook-wall mentions as a single JSON document.
with open(f'{origin_folder}/article_fbwalls_altmetric.json', "w+") as dump_f:
    serialized = json.dumps(articles_dict)
    dump_f.write(serialized)