In [1]:
# Reload imported modules automatically before each cell runs, so that edits
# to local packages are picked up without restarting the kernel
%reload_ext autoreload
%autoreload 2

In [2]:
import wosis
import pandas as pd
import metaknowledge as mk
import wosis.analysis.plotting as wos_plot

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Cosmetic only: seaborn 'darkgrid' style with the 'paper' context, fonts scaled by 2.0
sns.set_style('darkgrid')
sns.set_context('paper', font_scale=2.0)



The Web of Science licence agreement states that certain information/metadata must not be made public.
To conform to this agreement, we hide the affected columns whenever DataFrames are printed or written out.

In [3]:
# Column names that must be excluded from any DataFrame that is displayed or written to disk
hide_columns = ['DE', 'abstract', 'keywords', 'DOI', 'id', 'kws']

In [4]:
# Load the requested corpus and check that it holds the expected number of records
RC = mk.RecordCollection("tmp/rest_requested_corpora.txt")

expected_num_records = 13692
num_records = len(RC)

# Stop here if the file on disk is not the corpus this analysis was built on
mismatch_msg = "Mismatch in number of records - expected {}, got {}".format(
    expected_num_records, num_records
)
assert num_records == expected_num_records, mismatch_msg

print("Corpora consists of {} Publications".format(num_records))

# DataFrame view of the record collection; the filtering cells operate on this
corpora_df = wosis.rc_to_df(RC)

Corpora consists of 13692 Publications


Every publication must have a DOI to be included in the corpora, so records without one are removed

In [5]:
# Drop records that have no DOI (the count is printed by the call).
# `removed_pubs` presumably holds the dropped records - TODO confirm against wosis
corpora_df, removed_pubs = wosis.constrain.remove_empty_DOIs(corpora_df)

Removing 429 records with no DOIs


Then we remove publications whose journal name indicates a conference, workshop, proceedings, symposium or congress

In [6]:
# Drop records published in venues matching any of these terms; the number
# removed per term is printed (presumably a partial match on the journal name - TODO confirm)
corpora_df = wosis.remove_by_journals(corpora_df, ["CONFERENCE", "WORKSHOP", "PROCEEDINGS", "SYMPOSIUM", "CONGRESS"])

CONFERENCE: 0
WORKSHOP: 0
PROCEEDINGS: 16
SYMPOSIUM: 0
CONGRESS: 0


In [7]:
# Drop life-cycle related records; the spelling variants are listed explicitly.
# NOTE(review): exactly which keyword fields wosis matches against is not visible here - confirm
corpora_df = wosis.remove_by_keywords(corpora_df, ['lifecycle', 'life-cycle', 'life cycle', 'product life cycle'])

lifecycle: 0
life-cycle: 0
life cycle: 0
product life cycle: 0


In [8]:
# Drop records associated with the 'image processing' keyword; the number removed is printed
# NOTE(review): exactly which keyword fields wosis matches against is not visible here - confirm
corpora_df = wosis.remove_by_keywords(corpora_df, ['image processing'])

image processing: 0


Filtering unrelated journals

In [11]:
from wosis.analysis import remove_by_journals, remove_by_title

# Journals judged unrelated to the topic of interest.
# Each entry is matched as a partial string against the journal name.
unrelated_jo = [
    'PSYCHOL',
    'BIOINFO',
    'BUSINESS INFORMATION',
    'MANUFACTURING',
    'BIOLOGICALLY INSPIRED COGNITIVE ARCHITECTURES',
    'COMPLEXITY',
    'INDUSTRIAL ECOLOGY',
    'QUANTITATIVE FINANCE',
    'VIRTUAL REALITY',
    'COMMUNICATION NETWORKS',
    'COMPUTER METHODS AND PROGRAMS IN BIOMEDICINE',
    'ARTIFICIAL INTELLIGENCE IN EDUCATION',
    'INSURANCE MATHEMATICS & ECONOMICS',
    'ASTIN BULLETIN',
    'COMMUNICATION TECHNOLOGY',
    'COMPUTERS & STRUCTURES',
    'CHEMOSPHERE',
    'VISUAL COMPUTING',
    'WASTE MANAGEMENT',
    'OPTIMIZATION AND ENGINEERING',
    'COMPUTERS & OPERATIONS RESEARCH',
    'INTERNATIONAL JOURNAL OF LIFE CYCLE ASSESSMENT',
    'COMPUTATIONAL MECHANICS',
    'JOURNAL OF CLEANER PRODUCTION',
    'JOURNAL OF ORGANIZATIONAL COMPUTING AND ELECTRONIC COMMERCE',
]

# The number of records removed for each entry is printed by the call
corpora_df = wosis.remove_by_journals(corpora_df, unrelated_jo)

PSYCHOL: 0
BIOINFO: 0
BUSINESS INFORMATION: 0
MANUFACTURING: 0
BIOLOGICALLY INSPIRED COGNITIVE ARCHITECTURES: 0
COMPLEXITY: 0
INDUSTRIAL ECOLOGY: 0
QUANTITATIVE FINANCE: 0
VIRTUAL REALITY: 0
COMMUNICATION NETWORKS: 0
COMPUTER METHODS AND PROGRAMS IN BIOMEDICINE: 0
ARTIFICIAL INTELLIGENCE IN EDUCATION: 0
INSURANCE MATHEMATICS & ECONOMICS: 0
ASTIN BULLETIN: 0
COMMUNICATION TECHNOLOGY: 0
COMPUTERS & STRUCTURES: 0
CHEMOSPHERE: 0
VISUAL COMPUTING: 0
WASTE MANAGEMENT: 0
OPTIMIZATION AND ENGINEERING: 0
COMPUTERS & OPERATIONS RESEARCH: 0
INTERNATIONAL JOURNAL OF LIFE CYCLE ASSESSMENT: 209
COMPUTATIONAL MECHANICS: 0
JOURNAL OF CLEANER PRODUCTION: 0
JOURNAL OF ORGANIZATIONAL COMPUTING AND ELECTRONIC COMMERCE: 0


Now we filter out journals that have 3 or fewer papers in the corpora

In [12]:
# A journal (grouped on 'SO') is retained only when it contributes
# more than 3 records with a non-null DOI
def _has_enough_papers(journal_group):
    return journal_group['DOI'].count() > 3

corpora_df = corpora_df.groupby('SO').filter(_has_enough_papers)

print("Corpora consists of {} Publications".format(len(corpora_df)))

Corpora consists of 11718 Publications


The initially filtered corpora is written out for further analysis

In [13]:
# Pull the records that survived filtering (selected via their `id`) out of
# the full collection and export them.
# NOTE(review): the export path is the same file that was loaded into `RC`
# earlier in this notebook, so the source corpus is overwritten in place. A
# second run would then presumably fail the record-count assertion (which
# expects 13692). Confirm this is intended, or export to a separate file.
filtered_corpora = wosis.extract_recs(corpora_df.id, RC)
wosis.export_ris_file(filtered_corpora, 'tmp/rest_requested_corpora.txt')

This can then be read in for later analysis with:
    
```python
import metaknowledge as mk
RC = mk.RecordCollection("tmp/rest_requested_corpora.txt")
```

For transparency and reproducibility, the removed papers are written to the CSV file named in the cell below

In [14]:
# Rebuild the full table from the in-memory collection, then keep only the
# records whose `id` is absent from the filtered corpora
tmp_rest_df = wosis.rc_to_df(RC)

was_kept = tmp_rest_df.id.isin(corpora_df.id)
all_removed = tmp_rest_df.loc[~was_kept]

# Columns listed in `hide_columns` are left out of the file written to disk
is_public = ~all_removed.columns.isin(hide_columns)
all_removed.loc[:, is_public].to_csv('../data/initially_removed.csv')