# Process [journalmetrics](https://journalmetrics.scopus.com/) data into tidy TSVs

In [1]:
import pathlib

import pandas

## Prepare a dataframe of metrics and values

In [2]:
# Open the downloaded CiteScore workbook; the ExcelFile handle is reused
# below to parse the individual sheets.
path = pathlib.Path('download/CiteScore 2011-2020 new methodology - May 2021.xlsb')
xlsx = pandas.ExcelFile(path)

# Keep only the sheets whose name contains 'CiteScore ' (note the trailing
# space), i.e. the per-year sheets listed in the output below.
sheets = [
    sheet_name
    for sheet_name in xlsx.sheet_names
    if 'CiteScore ' in sheet_name
]
sheets

['CiteScore 2020',
 'CiteScore 2019',
 'CiteScore 2018',
 'CiteScore 2017',
 'CiteScore 2016',
 'CiteScore 2015',
 'CiteScore 2014',
 'CiteScore 2013',
 'CiteScore 2012',
 'CiteScore 2011']

In [3]:
# Column renames shared by every sheet.
renamer = {
    'Scopus Source ID': 'scopus_id',
    # 2020 CiteScore includes year in column name unlike other years
    "CiteScore 2020": "CiteScore"
}

# Metric columns that get melted into long-format (metric, value) rows.
metrics = ['CiteScore', 'SNIP', 'SJR']

dfs = list()
for sheet in sheets:
    print(sheet)
    # Sheet names are expected to look like "CiteScore <year>".
    _, year = sheet.split()
    year = int(year)

    raw_df = xlsx.parse(sheet)

    # Generalize the 2020 special case: if this sheet has no plain
    # 'CiteScore' column, also accept a year-suffixed 'CiteScore <year>'
    # column. The guard avoids creating a duplicate 'CiteScore' column on
    # sheets that already use the plain name. Labels missing from the sheet
    # are ignored by DataFrame.rename.
    sheet_renamer = dict(renamer)
    if 'CiteScore' not in raw_df.columns:
        sheet_renamer[f'CiteScore {year}'] = 'CiteScore'

    df = (
        raw_df
        .rename(columns=sheet_renamer)
        # Collapse rows that repeat the same source with identical metric values.
        .drop_duplicates(['scopus_id'] + metrics)
        .assign(year=year)
    )

    # Wide -> long: one row per (scopus_id, year, metric); rows with a missing
    # value are dropped.
    df = pandas.melt(df, id_vars=['scopus_id', 'year'], value_vars=metrics, var_name='metric').dropna()
    dfs.append(df)

metric_df = pandas.concat(dfs).sort_values(['scopus_id', 'year', 'metric'])

CiteScore 2020
CiteScore 2019
CiteScore 2018
CiteScore 2017
CiteScore 2016
CiteScore 2015
CiteScore 2014
CiteScore 2013
CiteScore 2012
CiteScore 2011


In [4]:
# Total number of (scopus_id, year, metric) rows.
metric_df.shape[0]

678652

In [5]:
# Preview the first two rows of the tidy table.
metric_df.iloc[:2]

Unnamed: 0,scopus_id,year,metric,value
11535,12001,2011,CiteScore,1.7
51467,12001,2011,SJR,0.565


In [6]:
# Count of values per year and metric, with row/column totals ("All").
pandas.crosstab(
    index=metric_df['year'],
    columns=metric_df['metric'],
    margins=True,
)

metric,CiteScore,SJR,SNIP,All
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011,19966,19406,18758,58130
2012,20946,20500,19855,61301
2013,21748,21336,20717,63801
2014,22246,21968,21327,65541
2015,22762,22387,21579,66728
2016,23284,22951,22360,68595
2017,24296,23317,23189,70802
2018,24754,23504,23653,71911
2019,25300,24456,24117,73873
2020,25990,25990,25990,77970


In [7]:
# Write the tidy metrics table as a gzipped TSV.
path = pathlib.Path('data/metrics.tsv.gz')
# Make sure the output directory exists so a fresh checkout can run this cell.
path.parent.mkdir(parents=True, exist_ok=True)
# float_format='%.3g' writes float values with three significant digits.
# mtime=0 is forwarded to the gzip writer so the compressed file does not
# embed the current timestamp (keeps re-runs byte-stable).
metric_df.to_csv(path, sep='\t', index=False, float_format='%.3g', compression={"method": 'gzip', "mtime": 0})