# Scopus Title Data

In [1]:
import gzip
import os
import re

import pandas

In [2]:
# Load dataset
url = 'https://www.elsevier.com/__data/assets/excel_doc/0015/91122/title_list.xlsx'
source_df = pandas.read_excel(url, sheetname='Scopus Sources November 2015')
conf_96_df = pandas.read_excel(url, sheetname='Conf. Proceedings post-1995')
conf_95_df = pandas.read_excel(url, sheetname='Conf. Proceedings pre-1996')
code_df = pandas.read_excel(url, sheetname='ASJC Code list ')

In [3]:
# Rename columns on dataframes
renamer = {
    'Sourcerecord id ': 'scopus_id',
    'All Science Classification Codes (ASJC)': 'asjc_codes',
    'ASJC code': 'asjc_codes',
}

for df in source_df, conf_96_df, conf_95_df:
    df.rename(columns=renamer, inplace=True)

## Titles

In [24]:
# Extract title names
rows = list()
for df in source_df, conf_96_df, conf_95_df:
    rows.extend(zip(df.scopus_id, df.ix[:, 1]))

rows.sort()
title_df = pandas.DataFrame(rows, columns=['scopus_id', 'title_name'])

In [25]:
path = os.path.join('data', 'titles.tsv')
title_df.to_csv(path, index=False, sep='\t')

In [26]:
title_df.head(2)

Unnamed: 0,scopus_id,title_name
0,12000,Journal of Technology in Counseling
1,12001,Journal of the Experimental Analysis of Behavior


## ASJC Codes

All Science Journal Classification (ASJC) Codes

In [27]:
# Extract ASJC code descriptions
code_df = code_df.rename(columns={'Code': 'asjc_code', 'Description': 'asjc_description'})
code_df = code_df[['asjc_code', 'asjc_description']]

In [28]:
path = os.path.join('data', 'asjc-codes.tsv')
code_df.to_csv(path, index=False, sep='\t')

In [29]:
code_df.head(2)

Unnamed: 0,asjc_code,asjc_description
0,1000,General
1,1100,Agricultural and Biological Sciences(all)


In [30]:
# Extract title to code mapping
rows = list()
for df in source_df, conf_96_df, conf_95_df:
    for i, (scopus_id, codes) in df[['scopus_id', 'asjc_codes']].iterrows():
        for code in re.split(r'[,;] *', str(codes)):
            code = code.strip()
            if code in {'nan', ''}:
                continue
            code = int(code)
            rows.append((scopus_id, code))

rows.sort()
title_codes_df = pandas.DataFrame(rows, columns=['scopus_id', 'asjc_code'])
title_codes_df = title_codes_df.drop_duplicates()

In [31]:
path = os.path.join('data', 'titles-asjc-codes.tsv')
title_codes_df.to_csv(path, index=False, sep='\t')

In [32]:
title_codes_df.head(2)

Unnamed: 0,scopus_id,asjc_code
0,12000,1705
1,12000,3304


In [33]:
# Extract title to subject area mapping
subject_df = title_codes_df.copy()
subject_df.asjc_code = 100 * (subject_df.asjc_code // 100)
subject_df = subject_df.drop_duplicates()
subject_df = subject_df.merge(code_df)
subject_df.asjc_description = subject_df.asjc_description.map(lambda x: re.sub(r'\(all\)$', '', x))
subject_df = subject_df.sort_values(['scopus_id', 'asjc_code'])

In [34]:
path = os.path.join('data', 'subject-areas.tsv')
subject_df.to_csv(path, index=False, sep='\t')

In [35]:
subject_df.head(2)

Unnamed: 0,scopus_id,asjc_code,asjc_description
0,12000,1700,Computer Science
12965,12000,3300,Social Sciences


## Title attributes

In [36]:
# Extract title attributes for non-conference-proceedings
renamer = {
    'Open Acces status, i.e., registered in DOAJ and/or ROAD. Status September 2015\n': 'open_access',
    'Active or Inactive': 'active',
    'Source Type': 'source_type',
    'Publisher imprints grouped to main Publisher': 'main_publisher',
    "Publisher's Country ": 'publisher_country',
}
attribute_df = source_df.copy()
attribute_df = attribute_df.rename(columns=renamer)
attribute_df = attribute_df[['scopus_id'] + list(renamer.values())]
attribute_df.active = (attribute_df.active == 'Active').astype(int)
attribute_df.open_access = (pandas.isnull(attribute_df.open_access)).astype(int)
attribute_df = attribute_df.sort_values('scopus_id')

In [37]:
path = os.path.join('data', 'title-attributes.tsv')
attribute_df.to_csv(path, index=False, sep='\t')

In [38]:
attribute_df.head(2)

Unnamed: 0,scopus_id,active,open_access,main_publisher,source_type,publisher_country
20542,12000,0,1,Columbus State University,Journal,United States
20755,12001,1,1,Society for the Experimental Analysis of Behav...,Journal,United States


In [39]:
# Top level subject areas
rows = []
tl_cols = list(source_df.columns[source_df.columns.str.startswith('Top level:')])
for i, series in source_df[['scopus_id'] + tl_cols].iterrows():
    scopus_id = series.scopus_id
    for value in series.ix[1:].dropna():
        rows.append((scopus_id, value))
rows.sort()
top_df = pandas.DataFrame(rows, columns=['scopus_id', 'top_level_subject'])

In [40]:
path = os.path.join('data', 'title-top-levels.tsv')
top_df.to_csv(path, index=False, sep='\t')

In [41]:
top_df.head(2)

Unnamed: 0,scopus_id,top_level_subject
0,12000,Physical Sciences
1,12000,Social Sciences
