# Grab Altmetric.com Data

## Request Functions
This section contains the functions needed to fetch the web data; exceptions raised while fetching should also be handled here.

In [ ]:
from grab_util import *
import json
import pandas as pd
import numpy as np
import math


In [ ]:
# Folder that holds the input article list and receives every CSV/JSON file
# written by the cells below.
origin_folder = 'data/outputs/OR'

## Grab altmetric.com Ids

In [ ]:
def get_altmetric_id(doi):
    """Look up the Altmetric id of a DOI through the Altmetric v1 API.

    Args:
        doi: DOI as a string; it is appended verbatim to the API URL.

    Returns:
        The ``altmetric_id`` field of the JSON response, or ``''`` when
        ``grab_from_url_json`` returns ``None``.
    """
    response = grab_from_url_json('https://api.altmetric.com/v1/doi/' + doi)
    if response is None:
        return ''
    return response['altmetric_id']

In [ ]:
# Load the article list; only the 'SO' and 'DI' columns are needed here
# ('DI' holds the DOI that is sent to the Altmetric API).
# all_articles_df = pd.read_excel(f'{origin_folder}/all.xlsx', usecols=['SO', 'DI'])
all_articles_df = pd.read_csv(f'{origin_folder}/all.csv', usecols=['SO', 'DI'])
# Accumulator for the DOI -> Altmetric id pairs collected by the next cell.
df = pd.DataFrame(columns=['DI', 'altmetric_id'])

In [ ]:
%%time
# Fetch the Altmetric id of every DOI that is not already present in `df`.
# Rows are collected in a plain list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0, and appending row by row copied
# the whole frame on every iteration.
id_columns = ['DI', 'altmetric_id']
# DOIs already looked up. Missing DOIs (NaN) are never added, so every
# NaN row still gets its own output row, as before.
known_dois = set(df['DI'].dropna())
id_rows = []
for index, doi in enumerate(all_articles_df['DI']):
    if doi in known_dois:
        continue
    if index % 100 == 0:
        print(index)  # progress indicator
    id_rows.append({'DI': doi, 'altmetric_id': get_altmetric_id(str(doi))})
    if pd.notna(doi):
        known_dois.add(doi)

if id_rows:
    fetched_ids = pd.DataFrame(id_rows, columns=id_columns)
    # Skip the concat when `df` is still empty to avoid mixing an empty,
    # dtype-less frame into the result.
    df = fetched_ids if df.empty else pd.concat([df, fetched_ids], ignore_index=True)

df.to_csv(f'{origin_folder}/all_altmetric_id.csv', index=False)

## Grab altmetric.com Details

In [ ]:
# One entry per metric: (result column, label that precedes the count in the
# details-page markup, suffix of the link wrapped around the count).
# A suffix of None means the count is not wrapped in a link.
_ALTMETRIC_SOURCES = (
    ('news outlets', 'news', '/news'),
    ('blogs', 'blogs', '/blogs'),
    ('policy', 'policy', '/policy-documents'),
    ('tweeters', 'twitter', '/twitter'),
    ('weibo', 'weibo', '/weibo'),
    ('facebook pages', 'facebook', '/facebook'),
    ('wikipedia', 'wikipedia', '/wikipedia'),
    ('redditors', 'reddit', '/reddit'),
    ('f1000', 'f1000', '/f1000'),
    ('video uploader', 'video', '/video'),
    ('dimensions_citation', 'dimensions_citation', '/citations'),
    ('mendeley', 'mendeley', '#mendeley-demographics'),
    ('citeulike', 'citeulike', None),
)
_COUNT_END_ANCHOR = '</strong>'
# The closing tag is only searched for within this many characters.
_COUNT_SEARCH_WINDOW = 100


def _coerce_altmetric_id(altmetric_id):
    """Return ``altmetric_id`` as an int, or None if it is blank, NaN or non-numeric."""
    try:
        numeric = float(altmetric_id)
    except (TypeError, ValueError):
        return None
    if math.isnan(numeric) or math.isinf(numeric):
        return None
    return int(numeric)


def _extract_count(page, anchor):
    """Return the integer that follows ``anchor`` in ``page``, or 0 if it cannot be read."""
    start = page.find(anchor)
    if start == -1:
        return 0
    start += len(anchor)
    end = page.find(_COUNT_END_ANCHOR, start, start + _COUNT_SEARCH_WINDOW)
    if end == -1:
        # No closing tag nearby: the markup is not what we expect.
        return 0
    digits = page[start:end].strip().replace(',', '')
    try:
        return int(digits)
    except ValueError:
        return 0


def grab_detail_altmetric(doi, altmetric_id, so=None):
    """Scrape the per-source mention counts from an Altmetric details page.

    Args:
        doi: DOI of the article; copied into the ``DI`` column unchanged.
        altmetric_id: Altmetric id of the article. A blank string, ``None``,
            NaN or a non-numeric value means "no Altmetric record"; no
            request is made and every count is 0.
        so: Unused. Kept, now optional, so that both two- and three-argument
            calls work.

    Returns:
        A single-row DataFrame with the columns ``DI``, ``altmetric_id`` (the
        value as passed in) and one integer column per metric source. A count
        is 0 when the page could not be fetched or the count was not found.
    """
    counts = {column: 0 for column, _, _ in _ALTMETRIC_SOURCES}

    numeric_id = _coerce_altmetric_id(altmetric_id)
    if numeric_id is not None:
        page = grab_from_url_content('https://www.altmetric.com/details/' + str(numeric_id))
        if page is not None:
            for column, label, link_suffix in _ALTMETRIC_SOURCES:
                if link_suffix is None:
                    anchor = f'{label}</dt><dd><strong>'
                else:
                    anchor = f'{label}</dt><dd><a href="/details/{numeric_id}{link_suffix}"><strong>'
                counts[column] = _extract_count(page, anchor)

    return pd.DataFrame([{'DI': doi, 'altmetric_id': altmetric_id, **counts}])

In [ ]:
%%time
# Scrape the details page of every DOI listed in all_altmetric_id.csv and
# store one row of per-source counts per DOI.
all_articles_df = pd.read_csv(f'{origin_folder}/all_altmetric_id.csv')
detail_frames = []
# DOIs already processed. Missing DOIs (NaN) are never added, so every NaN
# row still produces its own output row.
done_dois = set()
for index, row in all_articles_df.iterrows():
    doi = row['DI']
    if doi in done_dois:
        continue
    if index % 100 == 0:
        print(index)  # progress indicator
    # The third parameter (`so`) is not used by grab_detail_altmetric but is
    # part of its signature, so it is passed explicitly.
    detail_frames.append(grab_detail_altmetric(doi, row['altmetric_id'], None))
    if pd.notna(doi):
        done_dois.add(doi)

# Build the result once; DataFrame.append was removed in pandas 2.0.
if detail_frames:
    df = pd.concat(detail_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=['DI'])

df.to_csv(f'{origin_folder}/all_altmetric_detail.csv', index=False)

## Grab altmetric.com's Tweets Detail

### Parser
The HTML returned by the URL is not structured the way we need, so we use a parser to turn it into list-structured data that can be serialized as JSON.

In [ ]:
from html.parser import HTMLParser

class AltmetricHTMLParser(HTMLParser):
    """Collect reply-link targets from the ``<article>`` elements of a page.

    For every ``<a class="reply" href="...">`` found inside an ``<article>``,
    the text between the first and second ``=`` of the href is appended to
    ``tweets``.

    Attributes:
        tweets: Values extracted from the reply links, in document order.
        retweets: Never filled by this parser; kept alongside ``tweets``.
        articles: ``{'tweets': tweets, 'retweets': retweets}``.
        in_article: True while the parser is inside an ``<article>``.
        is_reply: True while the most recent ``<a>`` start tag inside an
            article was a reply link; cleared by any end tag.
        has_article: True once an ``<article>`` has been seen since the last
            ``<body>`` start tag.

    All state is per instance, so two parsers never share results. The
    handlers for data, comments, entity/char references and declarations
    are inherited from ``HTMLParser``, where they do nothing.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.tweets = []
        self.retweets = []
        self.articles = {'tweets': self.tweets, 'retweets': self.retweets}
        self.in_article = False
        self.is_reply = False
        self.has_article = False

    def handle_starttag(self, tag, attrs):
        if tag == 'body':
            # A new document body starts: forget articles seen before it.
            self.has_article = False
        elif tag == 'article':
            self.has_article = True
            self.in_article = True

        if not (self.in_article and tag == 'a'):
            return

        # Look the attributes up by name so the result does not depend on
        # whether `class` is written before or after `href`.
        attributes = dict(attrs)
        self.is_reply = attributes.get('class') == 'reply'
        href = attributes.get('href')
        if self.is_reply and href:
            _, separator, remainder = href.partition('=')
            if separator:  # ignore hrefs that carry no '=' at all
                self.tweets.append(remainder.split('=')[0])

    def handle_endtag(self, tag):
        if tag == 'article':
            self.in_article = False
        self.is_reply = False

# Notebook-level parser instance; in the visible cells it is only referred to
# by the commented-out crawler further down.
parser = AltmetricHTMLParser()

In [ ]:
# Sample of a redirect stub page (a body with a single link and no <article>).
# NOTE(review): not referenced by any other visible cell; presumably kept as a
# reference for recognising redirects. Confirm before removing.
redirect_str = '<html><body>You are being <a href="https://www.altmetric.com/details/4236878">redirected</a>.</body></html>'

### Grab Data

In [ ]:
import pandas as pd
import numpy as np

# Reload the DOI -> Altmetric id table. 'altmetric_id' is read as text, so
# missing ids show up as NaN rather than as floats mixed with integers.
df = pd.read_csv(f'{origin_folder}/all_altmetric_id.csv', usecols=['DI', 'altmetric_id'], dtype={'altmetric_id':str})
# Display the table in the notebook output.
df

In [ ]:
# %%time
# articles_dict = {}
# for index, row in df.iterrows():
#     if row['altmetric_id'] == '' or str(row['altmetric_id']).lower() == 'nan':
#         articles_dict[row['DI']] = {'altmetric_id': '', 'twitter_num': 0, 'tweets': []}
#         continue
#     # print(row['DI'], row['altmetric_id'])
#     grab_url = f'https://www.altmetric.com/details/{row["altmetric_id"]}/twitter/page:'
#     print(str(index), grab_url)
#     parser.articles = []
#     for i in range(1, 100000):
#         parser.feed(grab_from_url_content(grab_url + str(i), headers=headers))
#         print('page: ', str(i))
#         if not parser.has_article:
#             break
#     articles_dict[row['DI']] = {'altmetric_id': row['altmetric_id'], 'twitter_num': len(parser.articles), 'tweets': parser.articles}

In [ ]:
# articles_dict = {}
def get_mention(source):
    """Download all Altmetric Explorer mentions of one source type for each DOI.

    Reads the notebook-level ``df`` (columns ``DI`` and ``altmetric_id``) and
    fills the notebook-level ``articles_dict`` in place, keyed by DOI. Each
    value has the form
    ``{'altmetric_id': ..., '<source>_num': <count>, '<source>s': [<mentions>]}``.
    DOIs already present in ``articles_dict`` are skipped, so an interrupted
    run can be resumed by calling the function again.

    A DOI for which a page request fails (``grab_from_url_json`` returns
    ``None``) is reported and left out of ``articles_dict``, so a later call
    retries it. Previously such a failure raised ``TypeError`` and aborted
    the whole crawl.

    Args:
        source: Mention-source type inserted into the
            ``mention_sources[]=type:<source>`` query parameter; this notebook
            uses ``'tweet'`` and ``'fbwall'``.

    Returns:
        None. Results are written to ``articles_dict``.
    """
    # NOTE(security): the cookie below is a hard-coded Altmetric Explorer
    # session credential. It should be moved out of the source (for example
    # into an environment variable) and the session rotated. TODO(review).
    headers = {
        'cookie': "_ga=GA1.2.501259425.1572141776; weibo_license_acknowledgement=false; explorer_user=WWs2eDNOOUVjVHJTcHRjSm9Oc0ZuZEh0eDRkTmlPd0tCWFMzODNyNlIweFBvdjllM3RkcmF0RVdtRWdvRDN4Ny0tQlNpeTBsWU5NK0ZNTnFpK1FpVFUzdz09--8bac413ca9fd623b9918b311729130cfbb4e66e1; _altmetric-explorer_session=eExKU3g1UmJhWXpXbmVtS05YZmxVS3YyaVNDSGdPL2JyaWVYaE9mRTdPT3RlYlRLaTdrRmRWcnZpeFMrMFJLb0FIVDBvNHI3N09MWS9HZDhDRDdIbTZPZElvMWNPNzMvTm5JVkovWThiQVplZVdTRlZheDBWYWdvd2lIMXo2NHJUZkFJd2MxcWpZSmpCaXVXRW9CYlZnZ1l4L1lXMGhIa1k1d0o1bllvS09uWDBQb3JXMWUrazZ2YWVwaUwzei93cDdUZkhwanFtSy9lSkxLUmdRdEIrZz09LS1DckIxZFJZUVBWNmhsTEJXcWtkR2x3PT0%3D--884ad24217914b545556a68678c2d803c65ce9cf; intercom-session-9dnltu6y=alcwSlZxZHMzZlY3Wm8yaFpvWFg1OGxEYVZ4MXRRUDMrbkk1Q01Bdm5XNXJkQXNZcVAyeEZYVHFYS3RUbWJSbi0tclFPUDlZcUVpMFZKQVlxdlMwRllVQT09--46bb4ebcf64063d6a3712dd82a1d2bff57eed87f, _ga=GA1.2.501259425.1572141776; weibo_license_acknowledgement=false; explorer_user=WWs2eDNOOUVjVHJTcHRjSm9Oc0ZuZEh0eDRkTmlPd0tCWFMzODNyNlIweFBvdjllM3RkcmF0RVdtRWdvRDN4Ny0tQlNpeTBsWU5NK0ZNTnFpK1FpVFUzdz09--8bac413ca9fd623b9918b311729130cfbb4e66e1; _altmetric-explorer_session=eExKU3g1UmJhWXpXbmVtS05YZmxVS3YyaVNDSGdPL2JyaWVYaE9mRTdPT3RlYlRLaTdrRmRWcnZpeFMrMFJLb0FIVDBvNHI3N09MWS9HZDhDRDdIbTZPZElvMWNPNzMvTm5JVkovWThiQVplZVdTRlZheDBWYWdvd2lIMXo2NHJUZkFJd2MxcWpZSmpCaXVXRW9CYlZnZ1l4L1lXMGhIa1k1d0o1bllvS09uWDBQb3JXMWUrazZ2YWVwaUwzei93cDdUZkhwanFtSy9lSkxLUmdRdEIrZz09LS1DckIxZFJZUVBWNmhsTEJXcWtkR2x3PT0%3D--884ad24217914b545556a68678c2d803c65ce9cf; intercom-session-9dnltu6y=alcwSlZxZHMzZlY3Wm8yaFpvWFg1OGxEYVZ4MXRRUDMrbkk1Q01Bdm5XNXJkQXNZcVAyeEZYVHFYS3RUbWJSbi0tclFPUDlZcUVpMFZKQVlxdlMwRllVQT09--46bb4ebcf64063d6a3712dd82a1d2bff57eed87f; Cookie_1=value",
        'User-Agent': "PostmanRuntime/7.19.0",
        'Accept': "*/*",
        'Cache-Control': "no-cache",
        'Postman-Token': "30d13c93-aa9a-44e1-84e0-8876f98ae1ba,b45c8eb6-7f77-4c48-b961-921c89280079",
        'Host': "www.altmetric.com",
        'Accept-Encoding': "gzip, deflate",
        'Connection': "keep-alive",
        'cache-control': "no-cache"
    }

    def is_missing(value):
        # Empty string, or anything whose text form is 'nan' (e.g. a NaN float).
        return value == '' or str(value).lower() == 'nan'

    for index, row in df.iterrows():
        doi = row['DI']
        altmetric_id = row['altmetric_id']
        if doi in articles_dict:
            continue
        if index % 100 == 0:
            print(str(index))  # progress indicator
        if is_missing(doi) or is_missing(altmetric_id):
            # Nothing to look up: record an empty result so the row counts as done.
            articles_dict[doi] = {'altmetric_id': '', f'{source}_num': 0, f'{source}s': []}
            continue

        articles = []
        request_failed = False
        # NOTE(review): the DOI is inserted into the query string without
        # URL-encoding; DOIs containing '&' or '#' may be cut short. Confirm.
        grab_url = f'https://www.altmetric.com/explorer/json_data/mentions?identifier={doi}&mention_sources%5B%5D=type%3A{source}&scope=all&page='
        # The upper bound only guards against an endless loop; paging
        # normally stops when the response reports 'lastPage'.
        for page_number in range(1, 100000):
            articles_json = grab_from_url_json(grab_url + str(page_number), headers=headers)
            if articles_json is None:
                request_failed = True
                break
            articles.extend(articles_json['data'])
            if articles_json['lastPage']:
                break

        if request_failed:
            print(f'request failed for {doi}; skipped so that a later call can retry it')
            continue

        articles_dict[doi] = {'altmetric_id': altmetric_id, f'{source}_num': len(articles), f'{source}s': articles}
    return

In [ ]:
# Result store filled in place by get_mention(), keyed by DOI.
articles_dict = {}

In [ ]:
%%time
# Download the mentions of source type 'tweet' for every DOI into articles_dict.
get_mention('tweet')

In [ ]:
# Persist the downloaded tweet mentions as a single JSON document.
tweets_json_path = f'{origin_folder}/article_tweets_altmetric.json'
with open(tweets_json_path, "w+") as json_file:
    json.dump(articles_dict, json_file)

In [ ]:
# Number of DOIs that have an entry in articles_dict (shown as cell output).
len(articles_dict)

In [ ]:
# Read the tweet mentions back from disk.
tweets_json_path = f'{origin_folder}/article_tweets_altmetric.json'
with open(tweets_json_path, "r") as json_file:
    article_tweets = json.loads(json_file.read())

# Empty table that the next cell fills with one row per tweet.
df_tweets = pd.DataFrame()

In [ ]:
%%time
# Flatten the nested tweet mentions into one table row per tweet.
# Each entry of article_tweets[doi]['tweets'] is indexed with [1] to get the
# list of tweet dicts of that chain.
tweet_columns = ['doi', 'altmetric_id', 'tweet_id', 'tweet_url', 'postedAt', 'postType', 'originalPost']

# URLs already in the table. The column does not exist yet on a fresh, empty
# df_tweets, and `x in series` would test index labels rather than values,
# so membership is tracked in a set instead.
# NOTE(review): a tweet that mentions several DOIs is kept only under the
# first DOI it is seen with, which affects the per-DOI counts computed later.
# Confirm that this is intended.
if 'tweet_url' in df_tweets.columns:
    seen_tweet_urls = set(df_tweets['tweet_url'])
else:
    seen_tweet_urls = set()

tweet_rows = []
for doi, article in article_tweets.items():
    for tweet_chain in article['tweets']:
        for tweet in tweet_chain[1]:
            tweet_url = tweet['url']
            if tweet_url in seen_tweet_urls:
                continue
            seen_tweet_urls.add(tweet_url)
            tweet_rows.append({
                'doi': doi,
                'altmetric_id': article['altmetric_id'],
                # Last path segment of the tweet URL.
                'tweet_id': str(tweet_url).split('/')[-1],
                'tweet_url': tweet_url,
                'postedAt': tweet['postedAt'],
                'postType': tweet['postType'],
                'originalPost': str(tweet['originalPost']),
            })

# Build the table once; DataFrame.append was removed in pandas 2.0.
new_tweets = pd.DataFrame(tweet_rows, columns=tweet_columns)
df_tweets = new_tweets if df_tweets.empty else pd.concat([df_tweets, new_tweets], ignore_index=True)

In [ ]:
# Save the flattened tweet table.
df_tweets.to_csv(f'{origin_folder}/article_tweets_altmetric.csv', index=False)

In [ ]:
# Reload the per-article detail counts and the flattened tweet table from disk.
df_all = pd.read_csv(f'{origin_folder}/all_altmetric_detail.csv')
df_tweets = pd.read_csv(f'{origin_folder}/article_tweets_altmetric.csv')

In [ ]:
# Tweets per DOI: count of non-null 'altmetric_id' values in each 'doi' group.
df_tweets_count = df_tweets.groupby(by=['doi'])['altmetric_id'].count()
# Left-merge the counts onto the article table. The count is named
# 'altmetric_id', which df_all already has, so the suffix renames it.
df_all = df_all.merge(df_tweets_count, left_on=['DI'], right_on=['doi'], how='left', suffixes=['', '_tweets_count'])
# Articles without any collected tweet get 0 instead of NaN.
df_all['altmetric_id_tweets_count'] = df_all['altmetric_id_tweets_count'].fillna(0)
df_all = df_all.rename({'altmetric_id_tweets_count': 'tweets_count'}, axis=1)
# Same steps again, restricted to rows flagged as original posts.
# NOTE(review): 'originalPost' was written to the CSV through str(...); this
# comparison only matches if read_csv parsed the column back as booleans.
# Confirm against the data.
df_tweets_count = df_tweets[df_tweets['originalPost'] == True].groupby(by=['doi'])['altmetric_id'].count()
df_all = df_all.merge(df_tweets_count, left_on=['DI'], right_on=['doi'], how='left', suffixes=['', '_tweets_origin_count'])
df_all['altmetric_id_tweets_origin_count'] = df_all['altmetric_id_tweets_origin_count'].fillna(0)
df_all = df_all.rename({'altmetric_id_tweets_origin_count': 'tweets_origin_count'}, axis=1)
# Display the merged table in the notebook output.
df_all


In [7]:
# Attach the 'SO' and 'TC' columns of the original article list, matched on 'DI'.
all_info = pd.read_csv(f'{origin_folder}/all.csv', usecols=['DI', 'SO', 'TC'])
df_all = df_all.merge(all_info, on='DI', how='left')

In [8]:
# Overwrite the detail CSV with the enriched table.
# NOTE(review): the file read at the start of this section is replaced here,
# so re-running those cells would merge the count columns a second time.
# Confirm before re-running.
df_all.to_csv(f'{origin_folder}/all_altmetric_detail.csv', index=False)

In [ ]:
%%time
# Start from an empty result store so the tweet entries collected earlier are
# not treated as already done, then download the 'fbwall' mentions.
articles_dict = {}
get_mention('fbwall')

In [ ]:
# Persist the downloaded 'fbwall' mentions as a single JSON document.
fbwalls_json_path = f'{origin_folder}/article_fbwalls_altmetric.json'
with open(fbwalls_json_path, "w+") as json_file:
    json.dump(articles_dict, json_file)