# Process [journalmetrics](https://journalmetrics.scopus.com/) data into tidy TSVs

In [1]:
import pathlib

import pandas

## Prepare a dataframe of metrics and values

In [2]:
# Open the downloaded CiteScore workbook.
path = pathlib.Path('download/CiteScore_Metrics_2011-2016_Download_21Jun2017.xlsx')
xlsx = pandas.ExcelFile(path)

# Keep only the worksheets whose name contains " All".
sheets = list()
for sheet_name in xlsx.sheet_names:
    if ' All' in sheet_name:
        sheets.append(sheet_name)
sheets

['2016 All', '2015 All', '2014 All', '2013 All', '2012 All', '2011 All']

In [3]:
# Spreadsheet column header -> tidy column name.
renamer = {'Scopus SourceID': 'scopus_id'}

# Metric columns that get unpivoted into long form.
metrics = ['CiteScore', 'SNIP', 'SJR']

# Identifier columns carried through the unpivot.
id_columns = ['scopus_id', 'year']

dfs = list()
for sheet in sheets:
    # Sheet names are two whitespace-separated tokens; the first is the year.
    year, _ = sheet.split()
    year = int(year)

    # skiprows=1 skips the first spreadsheet row before the header is read
    # (presumably a title row above the real column names -- confirm against the workbook).
    wide_df = xlsx.parse(sheet, skiprows=1)
    wide_df = wide_df.rename(columns=renamer)
    # Drop rows that repeat the same source ID with identical metric values.
    wide_df = wide_df.drop_duplicates(['scopus_id'] + metrics)
    wide_df = wide_df.assign(year=year)

    # Unpivot: one row per (scopus_id, year, metric) value.
    df = pandas.melt(
        wide_df,
        id_vars=id_columns,
        value_vars=metrics,
        var_name='metric',
    )
    # Discard rows that contain a missing entry.
    df = df.dropna()
    dfs.append(df)

# Stack the per-year frames and order them for stable, readable output.
stacked_df = pandas.concat(dfs)
metric_df = stacked_df.sort_values(['scopus_id', 'year', 'metric'])

In [4]:
# Total number of (source, year, metric) rows.
metric_df.shape[0]

372198

In [5]:
# Preview the first two rows of the tidy table.
metric_df.iloc[:2]

Unnamed: 0,scopus_id,year,metric,value
7926,12001,2011,CiteScore,0.82
44942,12001,2011,SJR,0.556


In [6]:
# Count rows per year (rows) and metric (columns), with totals in the "All" margins.
pandas.crosstab(
    index=metric_df['year'],
    columns=metric_df['metric'],
    margins=True,
)

metric,CiteScore,SJR,SNIP,All
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011,18508,18449,17779,54736
2012,19816,19755,19103,58674
2013,20840,20778,20153,61771
2014,21549,21487,20853,63889
2015,22256,22193,21535,65984
2016,22618,22570,21956,67144
All,125587,125232,121379,372198


In [7]:
# Write the tidy metrics table as a gzip-compressed, tab-separated file.
path = pathlib.Path('data/metrics.tsv.gz')
# Create the output directory when it is missing so a fresh checkout can
# Restart & Run All without failing on a non-existent directory.
path.parent.mkdir(parents=True, exist_ok=True)
# '%.4g' writes floats with four significant digits; index=False omits the row index.
metric_df.to_csv(path, sep='\t', index=False, float_format='%.4g', compression='gzip')