# Grab Dimensions Data

https://app.dimensions.ai/discover/publication

This notebook downloads publication data from Dimensions, converts the raw results to CSV, and extracts researcher and affiliation records.


In [1]:
import pandas as pd
import numpy as np
import json

## Load Journals

In [14]:
# Root folder: the journal list is read from here and the download loop writes its JSON batches here.
origin_folder = 'SUSTC_Journals'

In [15]:
# Read the journal list, keeping only the title and Dimensions search-id columns,
# and discard every row in which either of the two values is missing.
journals_file = f'{origin_folder}/WOS期刊汇总_DimentionsSearch.csv'
ori_journals = pd.read_csv(
    journals_file,
    usecols=['search title', 'search id'],
).dropna()

In [16]:
# Number of journals that have both a title and a search id.
len(ori_journals)

3890

In [17]:
# Batching parameters used by the download and conversion loops below.
start_index = 0  # first batch to process (zero-based)
end_index = 38  # batch at which to stop; used as the exclusive upper bound of range()
sep_len = 100  # number of journals per batch

In [18]:
# Preview the cleaned journal list.
ori_journals

Unnamed: 0,search title,search id
0,Abacus,jour.1327791
2,Academic Psychiatry,jour.1086415
3,Academy of Management Annals,jour.1042145
4,Academy of Management Journal,jour.1086344
5,Academy of Management Learning and Education,jour.1046551
...,...,...
5367,Journal of Product Innovation Management,jour.1053471
5368,Management Science,jour.1123547
5369,Ocean & Coastal Management,jour.1042461
5370,Omega,jour.1120987


## Request Functions

This section defines the functions used to fetch pages from the web. Network errors raised while fetching are handled here as well.

In [7]:
import requests
import json

from requests import ConnectionError, ReadTimeout

def grab_from_url_content(url, max_retries=None, retry_delay=0):
    """Fetch ``url`` with browser-like headers and return the response body as text.

    Connection errors and read timeouts are printed and the request is retried.
    Retrying happens in a loop rather than by recursion, so a long outage cannot
    exhaust the interpreter's recursion limit.

    Args:
        url: Address to request.
        max_retries: Maximum number of retries after a failed attempt. ``None``
            (the default) keeps retrying until the request succeeds.
        retry_delay: Seconds to wait before each retry; ``0`` retries immediately.

    Returns:
        The response body (``res.text``). The HTTP status code is not checked,
        so an error page is returned like any other body.

    Raises:
        ConnectionError, ReadTimeout: the last error, once ``max_retries`` is
            exhausted. Never raised when ``max_retries`` is ``None``.
    """
    headers = {'Accept': '* / *',
               'Accept-Language': 'zh-TW, zh; q=0.9, en-US; q=0.8, en; q=0.7, zh-CN; q=0.6',
               'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3610.2 Safari/537.36'
               }
    failed_attempts = 0
    while True:
        try:
            res = requests.get(url, headers=headers, timeout=10)
            return res.text
        except ConnectionError as ce:
            print('ConnectionError: ' + str(ce))
            last_error = ce
        except ReadTimeout as rte:
            print('ReadTimeout: ' + str(rte))
            last_error = rte

        # Only reached after a failed attempt: a successful request returns above.
        failed_attempts += 1
        if max_retries is not None and failed_attempts > max_retries:
            raise last_error
        if retry_delay:
            import time
            time.sleep(retry_delay)



## Parser

The URL returns HTML rather than a structured list of records, so we need a parser that extracts the article data embedded in the page and turns it into a list of JSON objects.

In [ ]:
from html.parser import HTMLParser

class DimensionsHTMLParser(HTMLParser):
    """Collect article records and the next-page link from a Dimensions results page.

    Feed the HTML of a results page to :meth:`feed`. Afterwards:

    * ``articles`` holds one decoded JSON object per ``data-doc`` attribute found
      on an ``<article>`` tag or on any tag nested inside one.
    * ``next_trigger_url`` holds the absolute URL built from the last
      ``<a class="nextPage-trigger" href="...">`` seen, or keeps its previous
      value if the page contained no such link.

    State is kept per instance, so separate parsers never share an ``articles``
    list. Results accumulate across ``feed`` calls until the caller resets the
    attributes.

    Args:
        search_params: Query string appended to every next-page URL. When left
            as ``None`` the module-level variable ``search_params`` is read at
            the moment a next-page link is encountered.
        **kwargs: Forwarded unchanged to ``HTMLParser.__init__``.
    """

    BASE_URL = 'https://app.dimensions.ai'

    def __init__(self, search_params=None, **kwargs):
        super().__init__(**kwargs)
        self.articles = []
        self.in_article = False
        self.is_next_trigger = False
        self.next_trigger_url = ''
        self.search_params = search_params

    @staticmethod
    def _first_attr(attrs, name):
        """Return the value of the first attribute called ``name``, or ``None``."""
        for key, value in attrs:
            if key == name:
                return value
        return None

    def _current_search_params(self):
        """Return the query string to append to next-page URLs."""
        if self.search_params is not None:
            return self.search_params
        # Fallback for callers that set a module-level ``search_params`` variable.
        return search_params

    def handle_starttag(self, tag, attrs):
        """Record article payloads and the next-page link carried by a start tag."""
        if tag == 'article':
            self.in_article = True

        if self.in_article:
            doc = self._first_attr(attrs, 'data-doc')
            # A valueless ``data-doc`` attribute carries nothing to decode.
            if doc is not None:
                self.articles.append(json.loads(doc))

        if tag == 'a' and self._first_attr(attrs, 'class') == 'nextPage-trigger':
            self.is_next_trigger = True
            # Looked up by name so the link is found whatever order the
            # ``class`` and ``href`` attributes appear in.
            href = self._first_attr(attrs, 'href')
            if href is not None:
                self.next_trigger_url = f'{self.BASE_URL}{href}&{self._current_search_params()}'
        return

    def handle_endtag(self, tag):
        """Leave article context on ``</article>`` and clear the next-page flag."""
        if tag == 'article':
            self.in_article = False
        self.is_next_trigger = False
        return


parser = DimensionsHTMLParser()

## Grab Data

Data fetching starts here.

In [ ]:
%%time

if end_index > int(len(ori_journals) / sep_len):
    end_index = int(len(ori_journals) / sep_len)
for l in range(start_index, end_index):
    parser.articles = []

    journals_index = str(1 + l * sep_len) + '-' + str((l + 1) * sep_len)
    start = int(journals_index.split('-')[0])
    end = int(journals_index.split('-')[1])
    if end < len(ori_journals):
        journals = ori_journals[start - 1:end]
    else:
        journals = ori_journals[start - 1:]
    print(journals_index)

    for index, row in journals.iterrows():
        journal_search = row['search id']
        # This is very important for data fetching, generally you get url in the form as 'https://app.dimensions.ai/discover/publication?and_facet_for=2209&and_facet_for=2202&or_facet_source_title=jour.1082997', and search_params is just the substring after '?'.
        # Here is where we need to change for each search.
        search_params = f'or_facet_year=2014&or_facet_year=2015&or_facet_year=2016&or_facet_year=2017&or_facet_year=2018&or_facet_source_title={journal_search}'
        grab_url = f'https://app.dimensions.ai/discover/publication.contents.html?{search_params}'
        print(grab_url)

        parser.feed(grab_from_url_content(grab_url))
        next_trigger_anchor = ''
        i = 0
        while parser.next_trigger_url != next_trigger_anchor:
            if i % 10 == 0:
                print(str(i))
            i+=1
            next_trigger_anchor = parser.next_trigger_url
            parser.feed(grab_from_url_content(parser.next_trigger_url))
    
    print(len(parser.articles))
    with open(f"{origin_folder}/articles_wos_{journals_index}.json", "w") as dump_f:
        dump_f.write(json.dumps(parser.articles))

In [19]:
# Point origin_folder at the sub-folder from which the WOS article batches are read below.
# Guarded so that re-running this cell does not append the suffix a second time.
if not origin_folder.endswith('/articles_wos'):
    origin_folder = origin_folder + '/articles_wos'

In [24]:
%%time
# for l in range(2, int(len(ori_journals) / sep_len)):
for journals_index in ['1-200', '3801-3900']:
    # journals_index = str(1 + l * sep_len) + '-' + str((l + 1) * sep_len)
    with open(f'{origin_folder}/articles_wos_{journals_index}.json') as f:
        articles = json.load(f)

    columns_extract = ['SO', 'SO_id', 'doi', 'dimensions_id', 'title', 'abstract', 'authors', 'references', 'pub_date', 'affiliations_json', 'dimensions_cited']
    df_articles = pd.DataFrame(columns=columns_extract)

    for article in articles:
        if 'doi' not in article:
            continue
        if article['doi'] in df_articles['doi']:
            continue
        df_article = pd.DataFrame(columns=columns_extract)
        df_article['SO'] = [article['source_title']]
        df_article['SO_id'] = [article['source_title_id']]
        df_article['doi'] = [article['doi']]
        df_article['dimensions_id'] = [article['id']]
        df_article['title'] = [article['title']]
        df_article['abstract'] = [article['abstract']]
        if 'author_list' in article:
            df_article['authors'] = [article['author_list']]
        if 'cited_dimensions_ids' in article:
            df_article['references'] = [len(article['cited_dimensions_ids'])]
        if 'pub_date' in article:
            df_article['pub_date'] = [article['pub_date']]
        if 'affiliations_json' in article:
            df_article['affiliations_json'] = [article['affiliations_json']]
        df_article['dimensions_cited'] = [article['times_cited']]
        df_articles = df_articles.append(df_article, ignore_index=True)

    df_articles.to_csv(f'{origin_folder}/articles_wos_{journals_index}.csv', index=False)

Wall time: 2h 28min 19s


In [8]:
# Point origin_folder at the 'articles_abs' sub-folder read by the conversion loop below.
# Guarded so that re-running this cell does not append the suffix a second time.
# NOTE(review): this builds on whatever origin_folder currently holds; if a cell that
# appends another sub-folder has already run, the result is nested under it —
# confirm the intended base folder.
if not origin_folder.endswith('/articles_abs'):
    origin_folder = origin_folder + '/articles_abs'

In [11]:
%%time
for l in range(start_index, int(len(ori_journals) / sep_len)):
    journals_index = str(1 + l * sep_len) + '-' + str((l + 1) * sep_len)
    if journals_index == '801-900':
        journals_index = '801-1100'
    with open(f'{origin_folder}/articles_{journals_index}.json') as f:
        articles = json.load(f)

    columns_extract = ['SO', 'SO_id', 'doi', 'dimensions_id', 'title', 'abstract', 'authors', 'references', 'pub_date', 'affiliations_json', 'dimensions_cited']
    df_articles = pd.DataFrame(columns=columns_extract)

    for article in articles:
        if 'doi' not in article:
            continue
        if article['doi'] in df_articles['doi']:
            continue
        df_article = pd.DataFrame(columns=columns_extract)
        df_article['SO'] = [article['source_title']]
        df_article['SO_id'] = [article['source_title_id']]
        df_article['doi'] = [article['doi']]
        df_article['dimensions_id'] = [article['id']]
        df_article['title'] = [article['title']]
        df_article['abstract'] = [article['abstract']]
        if 'author_list' in article:
            df_article['authors'] = [article['author_list']]
        if 'cited_dimensions_ids' in article:
            df_article['references'] = [len(article['cited_dimensions_ids'])]
        if 'pub_date' in article:
            df_article['pub_date'] = [article['pub_date']]
        if 'affiliations_json' in article:
            df_article['affiliations_json'] = [article['affiliations_json']]
        df_article['dimensions_cited'] = [article['times_cited']]
        df_articles = df_articles.append(df_article, ignore_index=True)

    df_articles.to_csv(f'{origin_folder}/articles_{journals_index}.csv', index=False)

Wall time: 3h 7min 11s


In [12]:
# Inspect one raw record: `article` is the loop variable left over from the conversion
# loop above, i.e. the last article it processed. This cell fails on a fresh kernel
# unless that loop has been run first.
article

{'title': 'Editorial data',
 'source_title_id': 'jour.1138469',
 'journal_title': 'Journal of Accounting and Economics',
 'for_v2': ['3292', '2214', '3326', '2215'],
 'language': 'it',
 'open_access': False,
 'publisher_source': 'Elsevier',
 'publisher_place': '230 Park Avenue Suite 800 Shantae McGee New York NY 10169-0935 United States',
 'publisher': 'Elsevier',
 'pub_class': 'Article',
 'abstract': '',
 'acknowledgements': '',
 'aff_country_count': 0,
 'aff_org_count': 0,
 'created_in_dimensions': '2018-11-29T03:46:13Z',
 'doi': '10.1016/s0165-4101(18)30119-8',
 'html_escaped_abstract': '',
 'html_escaped_title': 'Editorial data',
 'id': 'pub.1110223808',
 'issue': '2-3',
 'pages': 'iii',
 'pub_date': '2018-11',
 'pub_year': 2018,
 'volume': '66',
 'sn_downloads_customer_1': 0,
 'times_cited': 0,
 'sn_denials_customer_1': 0,
 'sn_denials': 0,
 'sn_downloads': 0,
 'altmetric_id': 0,
 'score': 2.0,
 'affiliations_details': [],
 'authors_full': [],
 'source_title': 'Journal of Accounti

# Arrange Affiliations

In [2]:
# Load only the columns needed for affiliation extraction, capped at the first 600,000 rows.
df_articles = pd.read_csv(
    'SUSTC_Journals/articles_all/article_dimensions.csv',
    usecols=['doi', 'dimensions_id', 'affiliations_json'],
    nrows=600000,
)

In [3]:
# Preview the loaded article table.
df_articles

Unnamed: 0,affiliations_json,dimensions_id,doi
0,,pub.1110223808,10.1016/s0165-4101(18)30119-8
1,"[{""first_name"": ""Novia X."", ""last_name"": ""Chen...",pub.1106857635,10.1016/j.jacceco.2018.08.013
2,"[{""first_name"": ""Thomas W."", ""last_name"": ""Bat...",pub.1106857634,10.1016/j.jacceco.2018.08.002
3,"[{""first_name"": ""Christopher S."", ""last_name"":...",pub.1106286706,10.1016/j.jacceco.2018.08.012
4,"[{""first_name"": ""Jesse"", ""last_name"": ""Chan"", ...",pub.1106173025,10.1016/j.jacceco.2018.08.010
...,...,...,...
599995,"[{""first_name"": ""Darrell L."", ""last_name"": ""Hu...",pub.1029688769,10.1007/s10943-014-9850-2
599996,"[{""first_name"": ""Patryk"", ""last_name"": ""Stecz""...",pub.1029197656,10.1007/s10943-014-9842-2
599997,"[{""first_name"": ""Lisa M."", ""last_name"": ""Tussi...",pub.1023282823,10.1007/s10943-014-9823-5
599998,"[{""first_name"": ""K."", ""last_name"": ""Laios"", ""c...",pub.1021848519,10.1007/s10943-013-9811-1


In [24]:
%%time

df_researchers = pd.DataFrame()
df_affiliations = pd.DataFrame()

for index, item in df_articles.iterrows():
    if index % 5000 == 0:
        print(index)

    doi = item['doi']
    json_str = item['affiliations_json']
    if pd.isna(json_str):
        continue
    for sub in json.loads(json_str):
        df_researcherx = pd.DataFrame()
        df_affiliationx = pd.DataFrame()

        if 'first_name' in sub:
            df_researcherx['first_name'] = [sub['first_name']]
        if 'last_name' in sub:
            df_researcherx['last_name'] = [sub['last_name']]
        if 'orcid' in sub:
            df_researcherx['orcid'] = [sub['orcid']]
        if 'current_organization_id' in sub:
            df_researcherx['current_organization_id'] = [sub['current_organization_id']]
        if 'researcher_id' in sub:
            df_researcherx['researcher_id'] = [sub['researcher_id']]
        if 'affiliations' in sub:
            df_researcherx['affiliations_num'] = [len(sub['affiliations'])]
        if 'raw_affiliation' in sub:
            df_researcherx['raw_affiliation'] = [json.dumps(sub['raw_affiliation'])]

        df_researchers = df_researchers.append(df_researcherx, ignore_index=True)

        for subsub in sub['affiliations']:
            if 'id' in subsub:
                df_affiliationx['affiliation_id'] = [subsub['id']]
            if 'name' in subsub:
                df_affiliationx['name'] = [subsub['name']]
            if 'city' in subsub:
                df_affiliationx['city'] = [subsub['city']]
            if 'city_id' in subsub:
                df_affiliationx['city_id'] = [subsub['city_id']]
            if 'country' in subsub:
                df_affiliationx['country'] = [subsub['country']]
            if 'country_code' in subsub:
                df_affiliationx['country_code'] = [subsub['country_code']]
            if 'state' in subsub:
                df_affiliationx['state'] = [subsub['state']]
            if 'state_code' in subsub:
                df_affiliationx['state_code'] = [subsub['state_code']]

            df_affiliations = df_affiliations.append(df_affiliationx, ignore_index=True)

df_researchers.to_csv('SUSTC_Journals/articles_all/article_researchers_1.csv', index=False)
df_affiliations.to_csv('SUSTC_Journals/articles_all/article_affiliations_1.csv', index=False)


0
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


KeyboardInterrupt: 