# Process [journalmetrics](https://journalmetrics.scopus.com/) data into tidy TSVs

In [1]:
import os
import gzip

import pandas

## Prepare a dataframe of metrics and values

In [2]:
path = os.path.join('download', 'CiteScore_Metrics_2011-2105_Download_19_Jan2017.xlsx')
xlsx = pandas.ExcelFile(path)
sheets = [sheet for sheet in xlsx.sheet_names if ' All' in sheet]
sheets

['2015 All', '2014 All', '2013 All', '2012 All', '2011 All']

In [3]:
# Latest year to use for ISSN extraction
latest = 2015

In [4]:
renamer = {
    'Scopus SourceID': 'scopus_id',
    'Print-ISSN': 'print',
    'EISSN': 'electronic',
}

metrics = ['CiteScore', 'SNIP', 'SJR']

dfs = list()
for sheet in sheets:
    year, _ = sheet.split()
    year = int(year)

    df = (
        xlsx.parse('2015 All', skiprows=1)
        .rename(columns=renamer)
        .drop_duplicates(['scopus_id'] + metrics)
        .assign(year=year)
    )

    if year == latest:
        latest_df = df

    df = pandas.melt(df, id_vars=['scopus_id', 'year'], value_vars=metrics, var_name='metric').dropna()
    dfs.append(df)

metric_df = pandas.concat(dfs).sort_values(['scopus_id', 'year', 'metric'])

In [5]:
len(metric_df)

329110

In [6]:
metric_df.head(2)

Unnamed: 0,scopus_id,year,metric,value
3301,12001,2011,CiteScore,2.34
47813,12001,2011,SJR,0.998


In [7]:
metric_df.metric.value_counts()

CiteScore    111280
SJR          110220
SNIP         107610
Name: metric, dtype: int64

In [8]:
metric_df.year.value_counts()

2015    65822
2014    65822
2013    65822
2012    65822
2011    65822
Name: year, dtype: int64

In [9]:
with gzip.open('data/metrics.tsv.gz', 'wt') as write_file:
    metric_df.to_csv(write_file, sep='\t', index=False, float_format='%.4g')

## Prepare a dataframe of ISSNs

In [10]:
latest_df.print = latest_df.print.astype(str).str.zfill(8)
issn_df = (
    pandas.melt(latest_df, id_vars='scopus_id', value_vars=['print', 'electronic'], var_name='issn_type', value_name='issn')
    .dropna()
    .drop_duplicates()
    .sort_values(['scopus_id', 'issn_type', 'issn'])
)
len(issn_df), issn_df.scopus_id.nunique()

(30236, 22256)

In [11]:
issn_df.issn_type.value_counts()

print         22256
electronic     7980
Name: issn_type, dtype: int64

In [12]:
issn_df.head(2)

Unnamed: 0,scopus_id,issn_type,issn
3301,12001,print,225002
31977,12002,electronic,15206696


In [13]:
issn_df.to_csv('data/issn.tsv', sep='\t', index=False)