# ADRs extraction using Topic modeling

In [1]:
import pandas as pd

data = pd.read_csv('Mirena_IUD.csv', encoding='cp1252',error_bad_lines=False);
data_text = data[['Reviews']]
data_text['index'] = data_text.index
documents = data_text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_text['index'] = data_text.index


In [2]:
documents.head()

Unnamed: 0,Reviews,index
0,I got the Mirena in Feb 2020. The insertion wa...,0
1,This is 2nd time I have had the Mirena. Total ...,1
2,The biggest joke known to mankind. Doctor gave...,2
3,I’ve had mirena for exactly one month now and ...,3
4,"I have a history of gushing, painful periods w...",4


In [3]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

In [4]:
import nltk
nltk.download('wordnet')
stemmer = SnowballStemmer('english')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))


#preprocess will remove the stop words & word_length<4 words append the word in it's stemmed form
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [6]:
doc_sample = documents[documents['index'] == 1008].values[0][0]

print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['Had', 'it', 'for', 'almost', 'a', 'year.', 'Bloated,', 'hair', 'thinning', 'and', 'would', 'get', 'back', 'cramps', 'all', 'the', 'time.', 'Has', 'it', 'removed', 'and', 'had', 'instant', 'relief!']


 tokenized and lemmatized document: 
['year', 'bloat', 'hair', 'thin', 'cramp', 'time', 'remov', 'instant', 'relief']


In [7]:
processed_docs = documents['Reviews'].map(preprocess)

In [8]:
processed_docs[:10]

0    [mirena, insert, wasn, pleasant, mean, like, i...
1    [time, mirena, total, year, total, issu, surf,...
2    [biggest, joke, know, mankind, doctor, give, s...
3    [mirena, exact, month, complet, nightmar, doct...
4    [histori, gush, pain, period, includ, vomit, l...
5    [mirena, insert, decemb, gyna, recommend, help...
6    [birth, control, devic, ruin, life, total, mon...
7    [mirena, month, stand, anymor, constant, cramp...
8    [facial, hair, huge, acn, pimpl, destroy, sugg...
9    [recommend, mirena, eas, endometriosi, pain, b...
Name: Reviews, dtype: object

## Bag of words on the dataset

In [9]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [10]:
len(dictionary)

3180

In [11]:
count = 0
for k, v in dictionary.iteritems(): #key & value in dictionary
    print(k, v)
    count += 1
    if count > 10:
        break

0 abl
1 appetit
2 attent
3 bodi
4 boyfriend
5 break
6 chang
7 chest
8 contract
9 control
10 couldn


In [12]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [13]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
#bow_corpus[117]
len(bow_corpus) 

1028

In [14]:
bow_doc_801 = bow_corpus[801]

for i in range(len(bow_doc_801)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_801[i][0], 
                                                     dictionary[bow_doc_801[i][0]], 
                                                     bow_doc_801[i][1]))

Word 101 ("bleed") appears 1 time.
Word 113 ("tender") appears 1 time.
Word 141 ("mood") appears 1 time.
Word 150 ("swing") appears 1 time.
Word 241 ("breast") appears 1 time.
Word 441 ("nauseous") appears 1 time.


### TF-IDF

In [15]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [16]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.24143905390274553),
 (1, 0.08054150798236333),
 (2, 0.14840155449816345),
 (3, 0.14439943615678255),
 (4, 0.20000002210824572),
 (5, 0.16542722110881483),
 (6, 0.1092239770460668),
 (7, 0.20835860919445964),
 (8, 0.10669445514597896),
 (9, 0.14702469473774815),
 (10, 0.11384794784517635),
 (11, 0.12780007229918525),
 (12, 0.11658598296119647),
 (13, 0.14439943615678255),
 (14, 0.13738475821151322),
 (15, 0.15282569107032426),
 (16, 0.050746424645124945),
 (17, 0.06980371910435576),
 (18, 0.04325053535877227),
 (19, 0.14074578836728166),
 (20, 0.12530485550847062),
 (21, 0.10259425420509913),
 (22, 0.083121938481233),
 (23, 0.11929598518446949),
 (24, 0.1675810501612803),
 (25, 0.08341869208743972),
 (26, 0.18583715226261865),
 (27, 0.09500516419468213),
 (28, 0.11593495502870105),
 (29, 0.15955855278655381),
 (30, 0.1498250962650667),
 (31, 0.1675810501612803),
 (32, 0.3495942431771688),
 (33, 0.21338891029195792),
 (34, 0.1775071238118574),
 (35, 0.10992608470469993),
 (36, 0.1

## LDA with Bag of Words

In [17]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=10, workers=10)

In [18]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.033*"year" + 0.027*"effect" + 0.023*"week" + 0.020*"experi" + 0.019*"take" + 0.018*"pill" + 0.016*"go" + 0.014*"get" + 0.013*"doctor" + 0.012*"caus"
Topic: 1 
Words: 0.028*"year" + 0.026*"felt" + 0.021*"experi" + 0.021*"like" + 0.018*"time" + 0.017*"take" + 0.013*"pill" + 0.013*"work" + 0.012*"doctor" + 0.012*"go"
Topic: 2 
Words: 0.032*"like" + 0.025*"feel" + 0.021*"experi" + 0.019*"felt" + 0.018*"want" + 0.017*"take" + 0.017*"hour" + 0.017*"go" + 0.016*"hurt" + 0.015*"birth"
Topic: 3 
Words: 0.043*"remov" + 0.024*"doctor" + 0.019*"start" + 0.017*"have" + 0.017*"bleed" + 0.016*"year" + 0.016*"hair" + 0.015*"migrain" + 0.014*"time" + 0.013*"want"
Topic: 4 
Words: 0.035*"week" + 0.033*"go" + 0.031*"bleed" + 0.028*"remov" + 0.027*"year" + 0.021*"feel" + 0.021*"day" + 0.020*"time" + 0.017*"think" + 0.015*"spot"
Topic: 5 
Words: 0.024*"feel" + 0.024*"like" + 0.020*"spot" + 0.019*"weight" + 0.018*"control" + 0.017*"birth" + 0.017*"year" + 0.016*"gain" + 0.014*"go" + 0.014

## LDA using TF-IDF

In [19]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=15, id2word=dictionary, passes=50, workers=50)
#no of workers used for parallelization

In [20]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.012*"remov" + 0.012*"gain" + 0.012*"weight" + 0.011*"bleed" + 0.010*"acn" + 0.010*"mood" + 0.009*"depress" + 0.009*"wait" + 0.009*"lose" + 0.009*"hair"
Topic: 1 Word: 0.052*"cyst" + 0.036*"ovarian" + 0.035*"attack" + 0.033*"panic" + 0.032*"lot" + 0.030*"multipl" + 0.029*"migrain" + 0.028*"anxieti" + 0.027*"ovari" + 0.017*"believ"
Topic: 2 Word: 0.068*"surgeri" + 0.036*"pregnant" + 0.028*"hospit" + 0.028*"cours" + 0.024*"extrem" + 0.023*"coil" + 0.023*"write" + 0.021*"remov" + 0.021*"appar" + 0.019*"problem"
Topic: 3 Word: 0.117*"updat" + 0.031*"appar" + 0.026*"sweat" + 0.012*"call" + 0.011*"appoint" + 0.009*"heat" + 0.009*"suck" + 0.009*"wasnt" + 0.008*"ask" + 0.008*"painless"
Topic: 4 Word: 0.009*"bleed" + 0.009*"year" + 0.009*"week" + 0.009*"spot" + 0.008*"like" + 0.008*"birth" + 0.008*"day" + 0.008*"experi" + 0.007*"control" + 0.007*"go"
Topic: 5 Word: 0.118*"bear" + 0.025*"sharp" + 0.017*"stay" + 0.014*"stab" + 0.013*"scare" + 0.012*"women" + 0.012*"child" + 0.008*

### Performance evaluation by classifying sample document using LDA Bag of Words model

In [21]:
processed_docs[801]  #750,

['chill',
 'itch',
 'mood',
 'swing',
 'abnorm',
 'bleed',
 'cramp',
 'nauseous',
 'breast',
 'tender',
 'pain']

In [22]:
for index, score in sorted(lda_model[bow_corpus[801]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 20)))


Score: 0.871407687664032	 
Topic: 0.035*"week" + 0.033*"go" + 0.031*"bleed" + 0.028*"remov" + 0.027*"year" + 0.021*"feel" + 0.021*"day" + 0.020*"time" + 0.017*"think" + 0.015*"spot" + 0.015*"stop" + 0.014*"have" + 0.013*"get" + 0.012*"know" + 0.012*"constant" + 0.010*"cycl" + 0.010*"bodi" + 0.010*"blood" + 0.010*"light" + 0.009*"recommend"

Score: 0.014289222657680511	 
Topic: 0.024*"feel" + 0.024*"like" + 0.020*"spot" + 0.019*"weight" + 0.018*"control" + 0.017*"birth" + 0.017*"year" + 0.016*"gain" + 0.014*"go" + 0.014*"week" + 0.013*"lose" + 0.013*"bleed" + 0.013*"thing" + 0.012*"have" + 0.012*"time" + 0.011*"day" + 0.011*"horribl" + 0.011*"heavi" + 0.011*"depress" + 0.011*"experi"

Score: 0.014289028011262417	 
Topic: 0.033*"year" + 0.027*"effect" + 0.023*"week" + 0.020*"experi" + 0.019*"take" + 0.018*"pill" + 0.016*"go" + 0.014*"get" + 0.013*"doctor" + 0.012*"caus" + 0.012*"remov" + 0.011*"have" + 0.011*"gain" + 0.011*"extrem" + 0.011*"felt" + 0.011*"weight" + 0.011*"work" + 0.010*

### Performance evaluation by classifying sample document using LDA TF-IDF model

In [23]:
for index, score in sorted(lda_model_tfidf[bow_corpus[801]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.7238268852233887	 
Topic: 0.012*"remov" + 0.012*"gain" + 0.012*"weight" + 0.011*"bleed" + 0.010*"acn" + 0.010*"mood" + 0.009*"depress" + 0.009*"wait" + 0.009*"lose" + 0.009*"hair"

Score: 0.15235751867294312	 
Topic: 0.023*"normal" + 0.018*"overal" + 0.017*"easi" + 0.015*"nauseous" + 0.015*"flow" + 0.015*"coupl" + 0.015*"call" + 0.014*"partner" + 0.014*"problem" + 0.014*"day"


In [24]:
#Testing

unseen_document = 'i gained weight the first month with no change to my diet i lose hair constantly i seem to irratated all day every day i have horrible headches crying spells after a few months i couldnt feel the strings anymore its just not for everyone'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 15)))

Score: 0.9307525753974915	 Topic: 0.024*"feel" + 0.024*"like" + 0.020*"spot" + 0.019*"weight" + 0.018*"control" + 0.017*"birth" + 0.017*"year" + 0.016*"gain" + 0.014*"go" + 0.014*"week" + 0.013*"lose" + 0.013*"bleed" + 0.013*"thing" + 0.012*"have" + 0.012*"time"
