# Topic Modeling - LDA 

1. Implements cleaning and pre-processing from `data_exploration.ipynb` with additional text cleaning.
2. Trains an LDA for each of the datasets, for each outcome (ie. abstracts fitting exclusion criteria and inclusion criteria separately), and prints the top words belonging to each topic.


In [1]:
import os
import warnings
import pandas as pd
import numpy as np
import re 
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, regexp, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer


In [2]:
warnings.filterwarnings('ignore')

import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# need to only download only once
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/delvin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/delvin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## 01. Reading in and Cleaning 
**(mostly from data_exploration.ipynb)**

In [3]:
# read in each dataset into a dictionary
reviews = {}

# assuming naming follows 'type' + '_complete.csv' structure 
for f in os.listdir('../data/'):
    if not f.startswith('.'):
        key = re.split(r'_', f)
        reviews[key[0]] = f
        
PATH = os.path.abspath('../data')

for key, dataset in reviews.items():
    reviews[key] = pd.read_csv(os.path.join(PATH, dataset), encoding='latin1')

In [4]:
# columns to keep
to_keep = ['Title', 'Abstract', 'Notes', 'Inclusion']

for key, dataset in reviews.items():
    reviews[key] = dataset[to_keep]

In [5]:
# join title and abstract together
for key, dataset in reviews.items():
    dataset['All_Text'] = dataset.apply(lambda x: f"{x['Title']} {x['Abstract']}",
                                        axis = 1)

In [6]:

# modified to remove 1 letter words and numbers. shouldn't be relevant
def clean_text(s):
    s = s.str.lower()                         # put to lowercase for homogeneity    
    s = s.str.replace(r'_', ' ')              # remove underscores from the notes
    s = s.str.replace(r'\W', ' ')             # remove punctutation
    stop = set(stopwords.words('english'))    # define stop words
    lemmatizer = WordNetLemmatizer()          # lemmatize - a lot of repeat words

    s = s.apply(lambda x: [lemmatizer.lemmatize(word, 'v')
                              for word in x.split() 
                              if word not in stop]) # remove stopwords

    s = s.apply(lambda x: [word for word in x if len(word) > 1])
    s = s.apply(lambda x: [word for word in x if not word.isnumeric()])

    return(s)

In [7]:
for key, dataset in reviews.items():
    dataset[['All_Text']] = dataset[['All_Text']].apply(lambda x: clean_text(x))


## 02. LDA

TODO
* how applicable is this given the nuances in exclusion/inclusion? - literature?
* play around with dictionary filtering parameters
* given a new 'All_Text' (abstract + title), return similar papers?
* topic coherence
* add n-gram
* use small n to see how granular topics are

In [8]:
import gensim
from gensim import models, corpora


In [9]:

def text2lda(txt, num_topics = 5):
    """
    Creates a dictionary, filters based on document size and frequency. 
    Converts to a bag of words and fits a simple LDA.
    """
    doc_size = txt.shape[0]
    dictionary = corpora.Dictionary(txt)
    # remove terms occuring in less 1% of documents, and those occuring in more than 30
    dictionary.filter_extremes(no_below = doc_size * 0.01, no_above = 0.30)
    corpus = [dictionary.doc2bow(text) for text in txt]
    lda_model = models.LdaModel(corpus=corpus,
                                num_topics = num_topics,
                                id2word=dictionary,
                                eval_every=None)
    return(lda_model)

def print_top_words(model,n = 10):
    for topic in range(0, model.num_topics):
        print('\t\tTopic {}: '.format(topic) + ', '.join(words[0] for words in model.show_topic(topic, n)))
        # print('topic {}: '.format(topic) + ', '.join([str(words[1]) for words in model.show_topic(topic, 10)]))


In [10]:
# fit LDA for each dataset, for each label (inclusion and exclusion),
# and print top 5 words for each topic
for key, dataset in reviews.items():
    for label in (0, 1):
        print('Dataset: {}, Inclusion: {}, # of Abstracts {}'\
              .format(key, str(label), dataset['All_Text'][dataset.Inclusion == label].count()))
        lda = text2lda(dataset['All_Text'][dataset.Inclusion == label], num_topics=5)
        print_top_words(lda, 5)
    print('\n')

Dataset: Vitamin D, Inclusion: 0, # of Abstracts 1368
		Topic 0: bone, infants, pregnancy, patients, oh
		Topic 1: oh, bone, trials, infants, pregnancy
		Topic 2: children, placebo, iu, status, high
		Topic 3: trials, patients, bone, children, status
		Topic 4: pregnancy, infants, children, trials, data
Dataset: Vitamin D, Inclusion: 1, # of Abstracts 80
		Topic 0: children, pneumonia, ml, nan, mo
		Topic 1: nmol, wk, breastfeed, mug, milk
		Topic 2: micronutrient, mo, breast, nmol, day
		Topic 3: ml, children, single, day, calcium
		Topic 4: ml, ng, breast, day, weeks


Dataset: Scaling, Inclusion: 0, # of Abstracts 10460
		Topic 0: species, health, community, plant, effect
		Topic 1: health, care, community, service, interventions
		Topic 2: hiv, systems, network, cost, water
		Topic 3: species, spatial, pattern, land, size
		Topic 4: community, sample, species, level, water
Dataset: Scaling, Inclusion: 1, # of Abstracts 231
		Topic 0: network, distributions, large, area, relationshi

Notes:
   * Remove words with two letters (abbreviatinos such as ml, ui)
   * Vitamin - Inclusion is not too informative, likely because of the small number of positive abstracts
   * Scaling is interesting, exclusion criteria is related to 


In [11]:
# for key, dataset in reviews.items():
#     print('Dataset: {}, # of Abstracts'.format(key))
#     lda = text2lda(dataset['All_Text'], num_topics=5)
#     print_top_words(lda, 10)
