# Process [journalmetrics](http://www.journalmetrics.com/values.php) data into tidy TSVs

In [1]:
import gzip

import pandas

In [2]:
# Read dataset
url = 'http://www.journalmetrics.com/documents/SNIP_IPP_SJR_complete_1999_2014.xlsx'
jbs_df = pandas.read_excel(url, sheetname='Journals and Book Series')
prc_df = pandas.read_excel(url, sheetname='Proceedings')

for df in jbs_df, prc_df:
    df.rename(columns={'Sourcerecord id ': 'scopus_id', 'Print ISSN': 'print', 'E-ISSN': 'electronic'}, inplace=True)

## Prepare a dataframe of metrics and values

In [3]:
metrics = ['SNIP', 'IPP', 'SJR']

dfs = list()
for df in jbs_df, prc_df:
    metric_columns = list(df.columns[df.columns.map(lambda x: any(s in x for s in metrics))])
    value_df = pandas.melt(df, id_vars='scopus_id', value_vars=metric_columns, var_name='year_metric')
    value_df.dropna(inplace=True)
    value_df['year'] = value_df.year_metric.map(lambda x: int(x.split(' ')[0]))
    value_df['metric'] = value_df.year_metric.map(lambda x: x.split(' ')[1])
    value_df.drop('year_metric', axis=1, inplace=True)
    dfs.append(value_df)

metric_df = pandas.concat(dfs)
columns = ['scopus_id', 'year', 'metric', 'value']
metric_df = metric_df[columns]
metric_df.sort_values(columns[:3], inplace=True)

In [4]:
metric_df.head(2)

Unnamed: 0,scopus_id,year,metric,value
157311,12000,2000,IPP,0.5
191595,12000,2000,SJR,0.26


In [5]:
metric_df.metric.value_counts()

SNIP    305610
IPP     305610
SJR     288662
Name: metric, dtype: int64

In [6]:
with gzip.open('data/metrics.tsv.gz', 'wt') as write_file:
    metric_df.to_csv(write_file, sep='\t', index=False, float_format='%.4g')

## Prepare a dataframe of ISSNs

In [7]:
dfs = list()
for df in jbs_df, prc_df:
    df = pandas.melt(df, id_vars='scopus_id', value_vars=['print', 'electronic'], var_name='issn_type', value_name='issn')
    df.dropna(inplace=True)
    dfs.append(df)
issn_df = pandas.concat(dfs)

In [8]:
issn_df.issn_type.value_counts()

print         34400
electronic     8366
Name: issn_type, dtype: int64

In [9]:
issn_df.head(2)

Unnamed: 0,scopus_id,issn_type,issn
0,18500162600,print,15343219
2,19700200922,print,1285157


In [10]:
issn_df.to_csv('data/issn.tsv', sep='\t', index=False)