In [51]:
import os
import warnings
import pandas as pd
import numpy as np
import re 
import math 

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, regexp, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# NOTE(review): with no category or module given, this filter silences every
# warning for the rest of the session, so deprecation and data-handling
# warnings from the cells below will not be shown. Consider narrowing it.
warnings.filterwarnings('ignore')

In [8]:
import ssl

# Work around certificate errors when fetching NLTK data by swapping the
# default HTTPS context factory for the unverified one.
# NOTE(review): the assignment below is process-wide and is never undone, so
# it presumably disables certificate verification for any later HTTPS request
# made through the default context -- confirm this is acceptable.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This Python build has no unverified-context factory; keep the default.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# The corpora only need to be downloaded once; on later runs NLTK just
# reports that the packages are already up to date.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chantal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/chantal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Stats

Collect all the datasets

In [9]:
reviews = {}

# Map each review type to its CSV file. File names are assumed to follow the
# '<type>_complete.csv' convention, so the text before the first underscore
# becomes the key. Entries are sorted so the mapping does not depend on the
# arbitrary order in which os.listdir returns directory entries.
for f in sorted(os.listdir('data/')):
    # Skip hidden files (names starting with '.') and anything that is not a
    # CSV, so that only readable datasets end up in the mapping.
    if f.startswith('.') or not f.lower().endswith('.csv'):
        continue
    reviews[f.split('_', 1)[0]] = f

reviews

{'Scaling': 'Scaling_complete.csv',
 'Rehab': 'Rehab_complete.csv',
 'WASH': 'WASH_complete.csv',
 'ADIPP': 'ADIPP_complete.csv',
 'NCDS': 'NCDS_complete.csv',
 'VitaminD': 'VitaminD_complete.csv'}

In [10]:
PATH = os.path.abspath('data')

# Swap every file name in the mapping for the DataFrame read from that file.
reviews = {
    review_type: pd.read_csv(os.path.join(PATH, file_name), encoding='latin1')
    for review_type, file_name in reviews.items()
}

In [11]:
# Preview the first rows of one dataset to check what was loaded.
reviews['Scaling'].head()

Unnamed: 0,Title,Authors,Abstract,Published Year,Published Month,Journal,Volume,Issue,Pages,Accession Number,DOI,Ref,Covidence #,Study,Notes,Tags,Inclusion
0,Is this scaling nonlinear? [arXiv],"Leitao, J.C.; Miotto, J.M.; Gerlach, M.; Altma...",One of the most celebrated findings in complex...,2016,,arXiv,,,11-pp.,,,,#9879,Leitao 2016,Takhliq Amir (2019-07-19 07:24:56)(Select): I ...,,0
1,Scaling of foreign attractiveness for countrie...,"Bojic, Iva; Belyi, Alexander; Ratti, Carlo; So...","People's behavior on online social networks, w...",2016,,Applied Geography,73.0,,47-52,,10.1016/j.apgeog.2016.06.006,,#4381,Bojic 2016,Takhliq Amir (2019-07-18 13:50:05)(Select): Th...,,0
2,Coastal vs inland sensitivity to desertificati...,"Salvati, Luca; Smiraglia, Daniela; Bajocco, So...",The present study assesses the spatial distrib...,2015,,Rendiconti Lincei,26.0,,571-576,,10.1007/s12210-014-0339-4,,#6536,Salvati 2015,,,0
3,Earthquake hazard and risk assessment based on...,"Kossobokov, V.G.; Nekrasova, A.K.",We apply the general concept of seismic risk a...,2018,,Natural Hazards,93.0,3.0,1435-49,,10.1007/s11069-018-3359-z,,#9852,Kossobokov 2018,,,0
4,Dynamic evaluation of seismic hazard and risks...,"Kossobokov, Vladimir G.; Nekrasova, Anastasia",We continue applying the general concept of se...,2016,,American Geophysical Union Fall Meeting,2016.0,,,,,,#9495,Kossobokov 2016,,,0


Keep only relevant columns: Title, Abstract, Notes and Inclusion

In [12]:
to_keep = ['Title', 'Abstract', 'Notes', 'Inclusion']

# Store an explicit copy of the selected columns. Without .copy() pandas may
# treat the selection as a slice of the original frame, and assigning a new
# column to it afterwards raises SettingWithCopyWarning.
for key, dataset in reviews.items():
    reviews[key] = dataset[to_keep].copy()

In [13]:
# Preview the same dataset again after the column selection.
reviews['Scaling'].head()

Unnamed: 0,Title,Abstract,Notes,Inclusion
0,Is this scaling nonlinear? [arXiv],One of the most celebrated findings in complex...,Takhliq Amir (2019-07-19 07:24:56)(Select): I ...,0
1,Scaling of foreign attractiveness for countrie...,"People's behavior on online social networks, w...",Takhliq Amir (2019-07-18 13:50:05)(Select): Th...,0
2,Coastal vs inland sensitivity to desertificati...,The present study assesses the spatial distrib...,,0
3,Earthquake hazard and risk assessment based on...,We apply the general concept of seismic risk a...,,0
4,Dynamic evaluation of seismic hazard and risks...,We continue applying the general concept of se...,,0


Concatenate Title and Abstract fields

In [14]:
# Build the combined text column from Title and Abstract. Missing values are
# replaced with empty strings first: formatting a NaN into a string would
# otherwise inject the literal word "nan" into the text. The concatenation is
# vectorised instead of being applied row by row.
for key, dataset in reviews.items():
    title = dataset['Title'].fillna('').astype(str)
    abstract = dataset['Abstract'].fillna('').astype(str)
    # strip() removes the stray separator left when one of the fields is empty.
    dataset['All_Text'] = (title + ' ' + abstract).str.strip()

Clean up and preprocess the text: lowercase it, remove special characters and punctuation, tokenize, drop stop words, lemmatize, remove any repeated information (e.g., headings), and replace NaNs with 0s.

In [15]:
def clean_text(s, stop=None, lemmatizer=None):
    """Turn a Series of raw text into a Series of lemmatized token lists.

    Each entry is lowercased, every run of non-word characters and
    underscores is replaced by a space, the text is split on whitespace,
    stop words are dropped and the remaining words are lemmatized as verbs.

    Parameters
    ----------
    s : pandas.Series
        Text entries. Missing values are treated as empty text and yield an
        empty token list.
    stop : collection of str, optional
        Words to drop. Defaults to the NLTK English stop-word list.
    lemmatizer : object, optional
        Any object exposing ``lemmatize(word, pos)``. Defaults to a new
        ``WordNetLemmatizer``.

    Returns
    -------
    pandas.Series
        One list of tokens per input entry, on the same index as ``s``.
    """
    if stop is None:
        stop = set(stopwords.words('english'))    # default stop words
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()          # default lemmatizer

    s = s.fillna('').str.lower()                  # lowercase for homogeneity
    # regex=True must be explicit: from pandas 2.0 the default is a literal
    # match, which would leave all punctuation in place.
    s = s.str.replace(r'[\W_]+', ' ', regex=True)

    # Stop words are filtered on the raw word, before lemmatization.
    return s.apply(lambda text: [lemmatizer.lemmatize(word, 'v')
                                 for word in text.split()
                                 if word not in stop])

In [16]:
# Tokenize the combined text of every dataset, overwriting All_Text in place.
for dataset in reviews.values():
    dataset['All_Text'] = clean_text(dataset['All_Text'])

In [17]:
# Inspect the tokenized result; pandas abbreviates the display of the full frame.

reviews['Scaling']

Unnamed: 0,Title,Abstract,Notes,Inclusion,All_Text
0,Is this scaling nonlinear? [arXiv],One of the most celebrated findings in complex...,Takhliq Amir (2019-07-19 07:24:56)(Select): I ...,0,"[scale, nonlinear, arxiv, one, celebrate, find..."
1,Scaling of foreign attractiveness for countrie...,"People's behavior on online social networks, w...",Takhliq Amir (2019-07-18 13:50:05)(Select): Th...,0,"[scale, foreign, attractiveness, countries, st..."
2,Coastal vs inland sensitivity to desertificati...,The present study assesses the spatial distrib...,,0,"[coastal, vs, inland, sensitivity, desertifica..."
3,Earthquake hazard and risk assessment based on...,We apply the general concept of seismic risk a...,,0,"[earthquake, hazard, risk, assessment, base, u..."
4,Dynamic evaluation of seismic hazard and risks...,We continue applying the general concept of se...,,0,"[dynamic, evaluation, seismic, hazard, risk, b..."
...,...,...,...,...,...
10686,Scale-Adjusted Metrics for Predicting the Evol...,More than a half of world population is now li...,,1,"[scale, adjust, metrics, predict, evolution, u..."
10687,Empirical analysis on the connection between p...,We report on the existing connection between p...,,1,"[empirical, analysis, connection, power, law, ..."
10688,Positive and negative feedbacks and free-scale...,Depopulation of rural areas is a widespread ph...,Camille Byfield (2019-02-01 05:12:51)(Select):...,1,"[positive, negative, feedbacks, free, scale, p..."
10689,Growth in urban extent and allometric analysis...,Urban area expansion is happening at much fast...,Jean-Luc Kortenaar (2019-07-17 05:49:44)(Selec...,1,"[growth, urban, extent, allometric, analysis, ..."


In [52]:
# Per-dataset summary: mean number of tokens per entry, number of entries,
# number of notes and the size of each Inclusion class.
stats = {}

for key, dataset in reviews.items():
    avg = dataset['All_Text'].apply(len).mean()
    num = dataset['All_Text'].count()
    num_notes = dataset['Notes'].count()  # counts non-NaNs only
    label_counts = dataset['Inclusion'].value_counts()

    stats[key] = {
        # The mean of an empty dataset is NaN, which math.trunc cannot convert.
        'average token length': 0 if math.isnan(avg) else math.trunc(avg),
        # int() turns numpy scalars into plain ints so the dict prints cleanly.
        'total entries': int(num),
        'total number of notes': int(num_notes),
        # .get avoids a KeyError when a dataset has no rows of one class.
        'class labels [0]': int(label_counts.get(0, 0)),
        'class labels [1]': int(label_counts.get(1, 0))}

In [53]:
from pprint import pprint
# Pretty-print the nested stats dict built in the previous cell.
pprint(stats)

{'ADIPP': {'average token length': 173,
           'class labels [0]': 44402,
           'class labels [1]': 4990,
           'total entries': 49398,
           'total number of notes': 4513},
 'NCDS': {'average token length': 166,
          'class labels [0]': 17883,
          'class labels [1]': 193,
          'total entries': 18078,
          'total number of notes': 219},
 'Rehab': {'average token length': 155,
           'class labels [0]': 12819,
           'class labels [1]': 220,
           'total entries': 13042,
           'total number of notes': 141},
 'Scaling': {'average token length': 171,
             'class labels [0]': 10460,
             'class labels [1]': 231,
             'total entries': 10691,
             'total number of notes': 53},
 'VitaminD': {'average token length': 196,
              'class labels [0]': 1368,
              'class labels [1]': 80,
              'total entries': 1448,
              'total number of notes': 56},
 'WASH': {'average token len

# Keyword Extraction