In [3]:
import os
import warnings
import pandas as pd
import numpy as np
import re 
import math 

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, regexp, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


warnings.filterwarnings('ignore')

In [4]:
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# need to only download only once
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chantal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/chantal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Stats

Collect all the datasets

In [25]:
reviews = {}

# assuming naming follows 'type' + '_complete.csv' structure 
for f in os.listdir('/Users/chantal/Desktop/systematic_review/abstract_tool/data/'):
    if not f.startswith('.') and f.endswith('.csv'):
        key = re.split(r'_', f)
        reviews[key[0]] = f
        
reviews

{'Scaling': 'Scaling_complete.csv',
 'Rehab': 'Rehab_complete.csv',
 'WASH': 'WASH_complete.csv',
 'ADIPP': 'ADIPP_complete.csv',
 'NCDS': 'NCDS_complete.csv',
 'VitaminD': 'VitaminD_complete.csv'}

In [26]:
PATH = '/Users/chantal/Desktop/systematic_review/abstract_tool/data'

for key, dataset in reviews.items():
    reviews[key] = pd.read_csv(os.path.join(PATH, dataset), encoding='latin1').fillna(' ')

In [34]:
reviews['VitaminD'].columns.tolist()

['Title',
 'Authors',
 'Abstract',
 'Published.Year',
 'Published Month',
 'Journal',
 'Volume',
 'Issue',
 'Pages',
 'Accession Number',
 'DOI',
 'Ref',
 'Covidence #',
 'Study',
 'Notes',
 'Tags',
 'Inclusion']

Keep only relevant columns: Title, Abstract, Notes and Inclusion

In [35]:
to_keep = ['Title', 'Abstract', 'Notes','Published.Year', 'Inclusion', 'FullText_Inclusion',
           'Authors', 'Journal']

for key, dataset in reviews.items():
    filter_col = [col for col in dataset if col in to_keep]
    if 'Published Year' in dataset:
        dataset.rename(columns={'Published Year':'Published.Year'}, 
                 inplace=True)
    else:
        reviews[key] = dataset[filter_col]

In [37]:
reviews['VitaminD'].head()

Unnamed: 0,Title,Authors,Abstract,Published.Year,Journal,Notes,Inclusion
0,Preventive Effects of Vitamin D on Seasonal In...,"Zhou, Jian; Du, Juan; Huang, Leting; Wang, You...",OBJECTIVES: This study aimed to evaluate the c...,2018,The Pediatric infectious disease journal,Jill Korsiak (2018-04-04 00:54:01)(Screen): I'...,0
1,Feeding of premature infant formula after hosp...,"Wheeler, R. E.; Hall, R. T.","A randomized, double-blind study was conducted...",1996,Journal of perinatology : official journal of ...,,0
2,Effects of vitamin D supplementation in infanc...,"Trilok-Kumar, Geeta; Kaur, Manpreet; Rehman, A...",BACKGROUND: The long-term effects of infant vi...,2015,International journal of epidemiology,,0
3,Effect of Multivitamin-Mineral versus Multivit...,"Taghizadeh, Mohsen; Samimi, Mansooreh; Tabassi...",OBJECTIVE: Micronutrient deficiency during pre...,2014,Oman medical journal,,0
4,The relationship of vitamin D status of exerci...,"Sorvillo A, Wideman L. Lovelady C.",Vitamin D is needed for bone remolding; howeve...,2013,FASEB Journal,,0


Concatenate Title and Abstract fields

In [38]:
for key, dataset in reviews.items():
    dataset['All_Text'] = dataset.apply(lambda x: f"{x['Title']} {x['Abstract']}",
                                        axis = 1)

Clean up and preprocess text: remove special characters, punctuation, tokenize, lemmatize, remove any repeated information (e.g., headings), replace NaNs with 0s

In [39]:
# TODO: Do we want to remove numbers and special characters (e.g., other languages??)
def clean_text(s):
    s = s.str.lower()                         # put to lowercase for homogeneity    
    s = s.str.replace(r'_', ' ')              # remove underscores from the notes
    s = s.str.replace(r'\W', ' ')             # remove punctutation
    stop = set(stopwords.words('english'))    # define stop words
    lemmatizer = WordNetLemmatizer()          # lemmatize - a lot of repeat words
    s = s.apply(lambda x: [lemmatizer.lemmatize(word, 'v')
                              for word in x.split() 
                              if word not in stop]) # remove stopwords

    s = s.apply(lambda x: [word for word in x if len(word) > 1])
    s = s.apply(lambda x: [word for word in x if not word.isnumeric()])

    return s

In [40]:
for key, dataset in reviews.items():
    dataset[['All_Text']] = dataset[['All_Text']].apply(lambda x: clean_text(x))

In [41]:
for key, dataset in reviews.items():
    dataset['All_Text_Raw'] = dataset['All_Text'].str.join(' ')

In [42]:
# reviews['Scaling'][['All_Text']].apply(lambda x: clean_text(x))

reviews['Scaling'].iloc[0]['All_Text_Raw']

'scale nonlinear arxiv one celebrate find complex systems last decade different index patent scale nonlinearly population cities appear recently generality find question study use new databases different definitions city boundaries paper investigate existence nonlinear scale use probabilistic framework fluctuations account explicitly particular show allow estimate confidence intervals also quantify evidence favor test hypothesis observations compatible nonlinear scale employ framework compare different model different datasets find answer point crucially depend fluctuations contain data model fact city size heavy tail distribute'

In [43]:
for key, dataset in reviews.items():
    dataset['Metadata'] = dataset.apply(lambda x: f"{x['Authors']} {x['Published.Year']} \
                                                    {x['Journal']} {x['Notes']}",
                                        axis = 1)

In [45]:
for key, dataset in reviews.items():
    dataset[['Metadata']] = dataset[['Metadata']].apply(lambda x: clean_text(x))

In [51]:
for key, dataset in reviews.items():
    dataset['Metadata_Raw'] = dataset['Metadata'].str.join(' ')
reviews['Scaling']['Metadata_Raw']

'kossobokov vladimir nekrasova anastasia journal seismology'

In [52]:
# save relevant columns
for key,dataset in reviews.items():
    to_save = dataset[['All_Text_Raw', 'Metadata_Raw', 'Inclusion' ]].to_csv(key+'_metadata.tsv', 
                                                                               sep='\t', 
                                                                               header=None)

In [70]:
# average number of tokens, len of abstracts, num notes, label nums
stats = {}

for key, dataset in reviews.items(): 
    avg = np.mean(dataset['All_Text'].apply(lambda x: x.__len__()))
    num = dataset['All_Text'].count()
    num_notes = dataset['Notes'].count() # counts non-NaNs only
    label_counts = dataset['Inclusion'].value_counts()
    
    stats[key] = {'average token length': math.trunc(avg),
                  'total entries': num,
                  'total number of notes': num_notes,
                  'class labels [0]': label_counts[0],
                  'class labels [1]': label_counts[1]}

In [71]:
from pprint import pprint
pprint(stats)

{'ADIPP': {'average token length': 151,
           'class labels [0]': 44402,
           'class labels [1]': 4990,
           'total entries': 49398,
           'total number of notes': 49398},
 'NCDS': {'average token length': 148,
          'class labels [0]': 17883,
          'class labels [1]': 193,
          'total entries': 18078,
          'total number of notes': 18078},
 'Rehab': {'average token length': 141,
           'class labels [0]': 12819,
           'class labels [1]': 220,
           'total entries': 13042,
           'total number of notes': 13042},
 'Scaling': {'average token length': 164,
             'class labels [0]': 10460,
             'class labels [1]': 231,
             'total entries': 10691,
             'total number of notes': 10691},
 'VitaminD': {'average token length': 169,
              'class labels [0]': 1368,
              'class labels [1]': 80,
              'total entries': 1448,
              'total number of notes': 1448},
 'WASH': {'average

# Keyword Extraction

TF-IDF

In [111]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]
 
    score_vals = []
    feature_vals = []
    
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
 
    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]= score_vals[idx]
    
    return results

In [119]:
# create list of stopwords that also remove pronouns etc. 
stop = list(stopwords.words('english'))

In [124]:
#treating ea. abstract in ea. review as a document

for key, review in reviews.items():
    print('\n******** '+key+' ********')
    corpus = review['Abstract']
    
    cv = CountVectorizer(max_df=0.85, stop_words=stop, max_features=1000)
    word_count_vector = cv.fit_transform(corpus)
    
    tfidf_transformer = TfidfTransformer(smooth_idf=True,use_idf=True)
    tfidf_transformer.fit(word_count_vector)
  

    feature_names = cv.get_feature_names()
    
    # change this line to whatever document you want to extract keywords from!!
    doc = review['Abstract'].iloc[0]


    tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))
    sorted_items = sort_coo(tf_idf_vector.tocoo())

    keywords = extract_topn_from_vector(feature_names,sorted_items,10)

    # now print the results
    print("\n===== Doc =====")
    print(doc)
    print("\n=== Keywords ===")
    for k in keywords:
        print(k,keywords[k])

 


******** Scaling ********

===== Doc =====
One of the most celebrated findings in complex systems in the last decade is that different indexes y (e.g., patents) scale nonlinearly with the population x of the cities in which they appear, i.e., y ~ x,   1. More recently, the generality of this finding has been questioned in studies using new databases and different definitions of city boundaries. In this paper we investigate the existence of nonlinear scaling using a probabilistic framework in which fluctuations are accounted explicitly. In particular, we show that this allows not only to (a) estimate  and confidence intervals, but also to (b) quantify the evidence in favor of   1 and (c) test the hypothesis that the observations are compatible with the nonlinear scaling. We employ this framework to compare 5 different models to 15 different datasets and we find that the answers to points (a)-(c) crucially depend on the fluctuations contained in the data, on how they are modeled, and on

# Sentiment Analysis (Brief) 

Is there a link between sentiment and class label = 1?