In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before code is executed and library edits are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import wosis
import pandas as pd
import metaknowledge as mk
import wosis.analysis.plotting as wos_plot

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Cosmetic only: use seaborn's dark grid theme and the 'paper' context with
# font elements scaled by a factor of two.
sns.set_style(style='darkgrid')
sns.set_context(context='paper', font_scale=2.0)

The Web of Science licence agreement states that certain information/metadata must not be made public.
To conform to this agreement, we hide the affected columns when printing or exporting DataFrames.

In [3]:
# Column names that are excluded from DataFrames before they are shown or
# written out, to honour the Web of Science licence agreement.
hide_columns = [
    'DE',
    'abstract',
    'keywords',
    'DOI',
    'id',
    'kws',
]

In [4]:
# Identify the cached query result and the number of records it should hold
query_id = "756d39801152fe5f5f4ad3a3df9b6a30"
expected_num_records = 15754

# Load the records from the temporary cache file for this query
RC = mk.RecordCollection("tmp/{}.txt".format(query_id))
num_records = len(RC)

# Stop early if the cache does not contain the expected number of records
assert num_records == expected_num_records, (
    "Mismatch in number of records - expected {}, got {}".format(expected_num_records, num_records)
)

print("Corpora consists of", num_records, "Publications")

# DataFrame view of the record collection, used by the filtering steps
corpora_df = wosis.rc_to_df(RC)

Corpora consists of 15754 Publications


Records must have a DOI to be included in the corpora, so those without one are removed

In [5]:
# Replace corpora_df with the result of dropping records that have no DOI.
# NOTE(review): removed_pubs presumably holds the dropped records - confirm
# against wosis.constrain.remove_empty_DOIs; it is not used in the cells shown here.
corpora_df, removed_pubs = wosis.constrain.remove_empty_DOIs(corpora_df)

Removing 1799 records with no DOIs


Then we remove conference, workshop, proceedings and symposium publications

In [6]:
# Drop records associated with each of the listed keywords.
# NOTE(review): the exact matching rule (presumably a match against the
# journal/source title) is defined by wosis.remove_by_journals - confirm there.
corpora_df = wosis.remove_by_journals(
    corpora_df,
    ["CONFERENCE", "WORKSHOP", "PROCEEDINGS", "SYMPOSIUM"],
)

CONFERENCE: 292
WORKSHOP: 13
PROCEEDINGS: 23
SYMPOSIUM: 29


Now we filter out journals that have 3 or fewer papers

In [7]:
def _has_more_than_three_papers(journal_group):
    """Return True when the group has more than 3 non-null DOI entries."""
    return journal_group['DOI'].count() > 3


# Keep only records whose journal (the 'SO' column) passes the size check;
# groups with 3 or fewer DOI entries are dropped in their entirety.
corpora_df = corpora_df.groupby('SO').filter(_has_more_than_three_papers)

print("Corpora consists of", len(corpora_df), "Publications")

Corpora consists of 13069 Publications


The initially filtered corpora is written out for further analysis

In [8]:
# Pull the records whose ids survived the filtering out of the original
# record collection.
filtered_corpora = wosis.extract_recs(corpora_df['id'], RC)

# Write the filtered collection to disk so that later analyses can reload it
wosis.export_ris_file(filtered_corpora, 'tmp/filtered_corpora.txt')

This can then be read in for later analysis with:
    
```python
import metaknowledge as mk
RC = mk.RecordCollection("tmp/filtered_corpora.txt")
```

For transparency and reproducibility reasons, the papers removed are listed in the indicated file below

In [9]:
# Rebuild the DataFrame for the complete, unfiltered record collection
tmp_df = wosis.rc_to_df(RC)

# A record counts as removed when its id is absent from the filtered corpora
is_removed = ~tmp_df['id'].isin(corpora_df['id'])
all_removed = tmp_df.loc[is_removed]

# Leave out the licence-restricted columns before writing the list to CSV
is_public_column = ~all_removed.columns.isin(hide_columns)
all_removed.loc[:, is_public_column].to_csv('../data/initially_removed.csv')