In [None]:
# Execute the shared setup notebook before anything else.
# NOTE(review): the cells below use mk, wosis, TMP_DIR, DATA_DIR and HIDE_COLUMNS
# without defining them - presumably they are provided by this setup notebook; confirm.
%run "Common setup.ipynb"

  wos_config = yaml.load(config)


The Web of Science licence agreement states that certain information/metadata must not be made public.
To conform to this agreement, we have to hide certain columns when printing out DataFrames.

In [None]:
# Location of the previously requested corpora inside the temporary directory
requested_corpora_fn = "{}/rest_requested_corpora.txt".format(TMP_DIR)

# Read the records into a metaknowledge RecordCollection
RC = mk.RecordCollection(requested_corpora_fn)

# Sanity check: the collection must hold exactly the number of records we expect
expected_num_records = 11718
num_records = len(RC)
mismatch_msg = "Mismatch in number of records - expected {}, got {}".format(
    expected_num_records, num_records
)
assert num_records == expected_num_records, mismatch_msg

print("Corpora consists of", num_records, "Publications")

# Convert the collection to a DataFrame; the filtering cells below operate on it
corpora_df = wosis.rc_to_df(RC)

Publications must have a DOI to be included in the corpora, so records with an empty DOI are removed.

In [None]:
# Replace corpora_df with the result of the empty-DOI filter.
# NOTE(review): removed_pubs is not referenced again in the cells visible here;
# presumably it holds the dropped records - confirm against wosis.constrain.
corpora_df, removed_pubs = wosis.constrain.remove_empty_DOIs(corpora_df)

Then we remove conference, workshop, proceedings, symposium and congress publications, filtering first by journal name and then by title

In [None]:
# Terms identifying conference-style venues, handed to the journal-based filter
venue_terms_for_journals = [
    "CONFERENCE",
    "WORKSHOP",
    "PROCEEDINGS",
    "SYMPOSIUM",
    "CONGRESS",
]
corpora_df = wosis.remove_by_journals(corpora_df, venue_terms_for_journals)

In [None]:
# Terms identifying conference-style venues, handed to the title-based filter
venue_terms_for_titles = [
    "CONFERENCE",
    "WORKSHOP",
    "PROCEEDINGS",
    "SYMPOSIUM",
    "CONGRESS",
]
corpora_df = wosis.remove_by_title(corpora_df, venue_terms_for_titles)

In [None]:
# Spelling variants of "life cycle" passed to the keyword-based filter
lifecycle_terms = [
    'lifecycle',
    'life-cycle',
    'life cycle',
    'product life cycle',
]
corpora_df = wosis.remove_by_keywords(corpora_df, lifecycle_terms)

In [None]:
# Keyword passed to the keyword-based filter to drop image processing publications
image_processing_terms = ['image processing']
corpora_df = wosis.remove_by_keywords(corpora_df, image_processing_terms)

Filtering unrelated journals

In [None]:
# Journals considered unrelated to the study.
# Entries are matched by partial string, so short fragments such as 'PSYCHOL'
# are intended to cover several journal names at once.
unrelated_jo = [
    'PSYCHOL',
    'BIOINFO',
    'BUSINESS INFORMATION',
    'MANUFACTURING',
    'BIOLOGICALLY INSPIRED COGNITIVE ARCHITECTURES',
    'COMPLEXITY',
    'INDUSTRIAL ECOLOGY',
    'QUANTITATIVE FINANCE',
    'VIRTUAL REALITY',
    'COMMUNICATION NETWORKS',
    'COMPUTER METHODS AND PROGRAMS IN BIOMEDICINE',
    'ARTIFICIAL INTELLIGENCE IN EDUCATION',
    'INSURANCE MATHEMATICS & ECONOMICS',
    'ASTIN BULLETIN',
    'COMMUNICATION TECHNOLOGY',
    'COMPUTERS & STRUCTURES',
    'CHEMOSPHERE',
    'VISUAL COMPUTING',
    'WASTE MANAGEMENT',
    'OPTIMIZATION AND ENGINEERING',
    'COMPUTERS & OPERATIONS RESEARCH',
    'INTERNATIONAL JOURNAL OF LIFE CYCLE ASSESSMENT',
    'COMPUTATIONAL MECHANICS',
    'JOURNAL OF CLEANER PRODUCTION',
    'JOURNAL OF ORGANIZATIONAL COMPUTING AND ELECTRONIC COMMERCE',
]

corpora_df = wosis.remove_by_journals(corpora_df, unrelated_jo)

Now we filter out journals that have 3 or fewer papers

In [None]:
# Group publications by journal ('SO') and keep only the groups holding
# more than 3 papers; count() tallies the non-null 'DOI' entries per group.
papers_by_journal = corpora_df.groupby('SO')
corpora_df = papers_by_journal.filter(
    lambda journal_pubs: journal_pubs['DOI'].count() > 3
)

print("Corpora consists of", len(corpora_df), "Publications")

The initially filtered corpora is written out for further analysis

In [None]:
# Destination of the filtered corpora inside the temporary directory
filtered_corpora_fn = "{}/rest_filtered_corpora.txt".format(TMP_DIR)

# Pull the records whose ids survived filtering out of the original collection
retained_ids = corpora_df.id
filtered_corpora = wosis.extract_recs(retained_ids, RC)

# Write the filtered records out as a RIS file
wosis.export_ris_file(filtered_corpora, filtered_corpora_fn)

This can then be read in for later analysis with:
    
```python
import metaknowledge as mk
RC = mk.RecordCollection(filtered_corpora_fn)
```

For transparency and reproducibility reasons, the papers removed are listed in the indicated file below

In [None]:
# Rebuild the unfiltered table from RC, since corpora_df now only holds
# the publications that passed every filter above
tmp_rest_df = wosis.rc_to_df(RC)

# A publication counts as removed when its id is absent from the filtered table
is_retained = tmp_rest_df.id.isin(corpora_df.id)
all_removed = tmp_rest_df.loc[~is_retained]

removed_pubs_fn = "{}/initially_removed.txt".format(DATA_DIR)

# Leave out the columns listed in HIDE_COLUMNS before writing the CSV
publishable_columns = ~all_removed.columns.isin(HIDE_COLUMNS)
all_removed.loc[:, publishable_columns].to_csv(removed_pubs_fn)