# Systematic Review Dataset Pre-Processing and Basic Stats

Delvin So & Chantal Shaib

Current iteration:
- normalizes the datasets so each one has a unique identifier and requires no more additional pre-processing, at least for input into our current experiment scripts. 
- This also makes the dataset almost immediately SPECTER compliant (i.e. only minimal pre-processing remains and none of it requires altering rows; even if rows were altered, there is now a unique ID to trace them back).
- fixes a mistake where 'empty' abstracts were not actually empty and kept in the dataset(s)

Previously there was no unique identifier and all other information was dropped, making it difficult to keep track of the data whenever any additional pre-processing was performed (which was required internally within our model and in SPECTER), as well as for any ad-hoc analyses. 

* requires some changes in `utils.py` and `AbstractDataset.py` - TODO: generalize the column names


In [1]:
import os
import warnings
import pandas as pd
import numpy as np
import re 
import csv
from glob import glob

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, regexp, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

warnings.filterwarnings('ignore')

### NOTE: please run your `jupyter-lab / notebook` in the *root* (`sra`) directory!!!

In [4]:
print(os.getcwd())

/Users/delvin/Downloads/sra_sample/data_preprocessing/notebooks


In [5]:
# Move up one directory so the relative paths below resolve from the parent
# of the folder the notebook was started in.
os.chdir('..')
print(os.getcwd())

output_dir = os.path.join('..', 'cleaned_data')
data_dir = os.path.join('data', 'datasets_complete')
# exist_ok=True makes this a no-op when the folder is already there and
# avoids the race between checking for the folder and creating it.
os.makedirs(output_dir, exist_ok=True)
    

/Users/delvin/Downloads/sra_sample/data_preprocessing


In [6]:
# the below needs to be applied to the datasets prior to parsing through specter
def preprocess_df(dataset: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``dataset`` with id and combined text columns.

    Steps, in order:
      1. store the incoming index as ``unq_id``;
      2. coerce ``Inclusion`` to numeric (unparseable values become NaN);
      3. drop rows whose ``Abstract`` or ``Inclusion`` is missing
         (missing titles are kept);
      4. fill every remaining missing value with a single space;
      5. replace the literal two-character sequences backslash-r and
         backslash-t in string cells with a space;
      6. build ``All_Text`` (Title + Abstract) and ``Metadata``
         (Authors + Published.Year + Journal + Notes), space separated.

    The number of rows before and after filtering is printed.

    Args:
        dataset: frame containing at least the columns ``Title``,
            ``Abstract``, ``Inclusion``, ``Authors``, ``Published.Year``,
            ``Journal`` and ``Notes``.

    Returns:
        A new DataFrame; the caller's frame is left untouched.

    Raises:
        KeyError: if one of the required columns is absent.
    """
    print('\tBefore ' + str(dataset.shape[0]))

    # Work on a copy: the caller's frame (frequently a column slice of a
    # larger one) must not be mutated by the column assignments below.
    dataset = dataset.copy()

    dataset['unq_id'] = dataset.index.values.tolist()
    dataset['Inclusion'] = pd.to_numeric(dataset['Inclusion'], errors='coerce')

    # we want to drop on empty abstracts (and unusable labels), but not titles
    has_required = dataset['Abstract'].notnull() & dataset['Inclusion'].notnull()
    dataset = dataset[has_required]

    # fill with white space so joining doesn't go haywire
    dataset = dataset.fillna(' ')

    # the patterns match a literal backslash followed by 'r' / 't'
    dataset = dataset.replace(r'\\r', ' ', regex=True)
    dataset = dataset.replace(r'\\t', ' ', regex=True)

    dataset['All_Text'] = _join_columns(dataset, ['Title', 'Abstract'])
    dataset['Metadata'] = _join_columns(
        dataset, ['Authors', 'Published.Year', 'Journal', 'Notes'])

    print('\tAfter ' + str(dataset.shape[0]))

    return dataset


def _join_columns(frame: pd.DataFrame, columns) -> pd.Series:
    """Concatenate ``columns`` of ``frame`` row by row, separated by one space.

    Vectorised, so it also works when ``frame`` has no rows.
    """
    joined = frame[columns[0]].astype(str)
    for name in columns[1:]:
        joined = joined + ' ' + frame[name].astype(str)
    return joined


# TODO: Do we want to remove numbers and special characters (e.g., other languages??)
# Credit to Chantal
def clean_text(s):
    """Turn a Series of strings into a Series of cleaned token lists.

    Each string is lower-cased, underscores and every non-word character are
    replaced by a space, and the text is split on whitespace. English stop
    words are dropped, the remaining words are lemmatized as verbs, and
    tokens that are a single character or purely numeric are removed.

    Args:
        s: pandas Series of strings (missing values must be filled first).

    Returns:
        A Series, aligned with ``s``, whose values are lists of tokens.
    """
    s = s.str.lower()                         # put to lowercase for homogeneity
    # Remove underscores (from the notes) and punctuation in one pass.
    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave punctuation in place.
    s = s.str.replace(r'[\W_]', ' ', regex=True)
    stop = set(stopwords.words('english'))    # define stop words
    lemmatizer = WordNetLemmatizer()          # lemmatize - a lot of repeat words

    def to_tokens(text):
        # stop words are filtered before lemmatizing; length and numeric
        # filters are applied to the lemmatized form
        lemmas = (lemmatizer.lemmatize(word, 'v')
                  for word in text.split()
                  if word not in stop)
        return [word for word in lemmas
                if len(word) > 1 and not word.isnumeric()]

    return s.apply(to_tokens)

In [7]:
import ssl

# Swap the default HTTPS context factory for the unverified one, when this
# Python build provides it.
# NOTE(review): this turns off certificate verification for every HTTPS
# request made through the default context in this process - presumably a
# workaround for certificate errors during the NLTK download; confirm it is
# still needed.
unverified_context_factory = getattr(ssl, '_create_unverified_context', None)
if unverified_context_factory is not None:
    ssl._create_default_https_context = unverified_context_factory

# the corpora only need to be downloaded once
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/delvin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/delvin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Collecting the Datasets

In [8]:
# loaded datasets, keyed by the part of each input file name before the first underscore
reviews = {}
# set to 1 to skip input files whose output file already exists; with 0 every file is read
check = 0 

In [9]:


# assuming naming follows 'type' + '_complete.csv' structure

# sorted() makes the load order (and therefore which file wins when two share
# a key) reproducible; glob gives no ordering guarantee
for f in sorted(glob(os.path.join(data_dir, '*'))):

    print(f)

    # dataset key = file name up to the first underscore
    key = re.split(r'_', os.path.basename(f))[0]

    out_fn = os.path.join(output_dir, key + '_oct.tsv')

    if check == 1 and os.path.exists(out_fn):
        print(f'Output file already exists for {key}, skipping!')
        continue

    if f.endswith('.csv'):
        print(f'Reading in {f}....')
        reviews[key] = pd.read_csv(f, encoding='latin1')
    elif f.endswith('.xlsx'):
        print(f'Reading in {f}....')
        # read_excel takes no `encoding` argument in current pandas releases;
        # passing one raises TypeError
        reviews[key] = pd.read_excel(f)
    else:
        print(f'Skipping {f}: not a .csv or .xlsx file')

        
# reviews

data/datasets_complete/sample_wash_data.csv
Reading in data/datasets_complete/sample_wash_data.csv....


In [11]:
# show the raw column names of every loaded dataset
for dataset in reviews.values():
    print(list(dataset.columns))

['Covidence..', 'Title', 'Authors', 'Abstract', 'Published.Year', 'Published.Month', 'Journal', 'Volume', 'Issue', 'Pages', 'Accession.Number', 'DOI', 'Ref', 'Study', 'Notes', 'Tags', 'Inclusion', 'FullText_Inclusion']


Normalizing column names where needed.

In [12]:
# columns carried forward into the cleaned datasets
to_keep = ['Title', 'Abstract', 'Notes', 'Published.Year', 'Covidence..',
           'Inclusion', 'FullText_Inclusion', 'Authors', 'Journal']

# alternative header spellings mapped to the names used in `to_keep`
column_aliases = {'Published Year': 'Published.Year',
                  'Covidence #': 'Covidence..'}

for key, dataset in reviews.items():

    # rename() ignores aliases that are not present, so no membership
    # check is needed before calling it
    dataset.rename(columns=column_aliases, inplace=True)

    # keep the original column order, restricted to the wanted columns
    filter_col = [col for col in dataset.columns if col in to_keep]

    reviews[key] = dataset[filter_col]
    print(reviews[key].columns.tolist())

['Covidence..', 'Title', 'Authors', 'Abstract', 'Published.Year', 'Journal', 'Notes', 'Inclusion', 'FullText_Inclusion']


## Pre-Processing the Data

Purely data pre-processing here, ie. not NLP pre-processing

Here, along with the other pre-processing, we add a unique identifier column (`unq_id`); it is simply the row index of the original data, so it can always be recreated from the source file.


In [13]:
for key, dataset in reviews.items():
    print(key)
    # rows with empty abstracts are dropped before the text columns are concatenated
    reviews[key] = preprocess_df(dataset)

sample
	Before 200
	After 200



From Chantal: 
Clean up and preprocess text: remove special characters, punctuation, tokenize, lemmatize, remove any repeated information (e.g., headings), replace NaNs with 0s

In [14]:
for key, dataset in reviews.items():
    print(key)
    # clean_text returns a list of tokens per row; join each list back into
    # one space-separated string and store it next to the source column
    for source_col in ('All_Text', 'Metadata'):
        dataset[source_col + '_Clean'] = clean_text(dataset[source_col]).str.join(' ')

sample


## Save down the resulting data

In [15]:
# save relevant columns
for key, dataset in reviews.items():
    out_fn = os.path.join(output_dir, key + '_oct.tsv')

    # FullText_Inclusion is written right after Inclusion when the dataset has it
    out_cols = ['unq_id', 'All_Text_Clean', 'Metadata_Clean', 'Inclusion']
    if 'FullText_Inclusion' in dataset.columns:
        out_cols.append('FullText_Inclusion')
    out_cols += ['Covidence..', 'Title', 'Abstract', 'All_Text', 'Metadata']

    dataset[out_cols].to_csv(out_fn, index=False, sep='\t',
                             quoting=csv.QUOTE_NONNUMERIC)

    print('{} successfully saved!'.format(out_fn))

../cleaned_data/sample_oct.tsv successfully saved!
