# Adverse drug reaction identification and extraction from natural language text using Topic modeling 

In [8]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import string
import math

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *


In [9]:
dataset = pd.read_csv('Mirena_IUD.csv',encoding='cp1252')
dataset.head()

Unnamed: 0,Title,Time,reporttype,Reviews,Unnamed: 4
0,Rated for Birth Control Report,Posted 6 months ago (8/19/2020),Report,I got the Mirena in Feb 2020. The insertion wa...,
1,Rated for Birth Control Report,Posted 6 months ago (8/19/2020),Report,This is 2nd time I have had the Mirena. Total ...,
2,Rated for Birth Control Report,Posted 6 months ago (8/19/2020),Report,The biggest joke known to mankind. Doctor gave...,
3,Rated for Birth Control Report,Posted 6 months ago (8/14/2020),Report,I’ve had mirena for exactly one month now and ...,
4,Rated for Endometriosis Report,Posted 7 months ago (8/6/2020),Report,"I have a history of gushing, painful periods w...",


In [10]:
dataset.isnull().any()

Title         False
Time          False
reporttype    False
Reviews       False
Unnamed: 4     True
dtype: bool

In [11]:
dataset.isnull().sum()

Title           0
Time            0
reporttype      0
Reviews         0
Unnamed: 4    979
dtype: int64

In [12]:
dataset.columns

Index(['Title', 'Time', 'reporttype', 'Reviews', 'Unnamed: 4'], dtype='object')

In [14]:
dataset["Unnamed: 4"].isnull().count()

1028

## drop unwanted columns

In [15]:
dataset.drop(columns = ['Title','Time','reporttype','Unnamed: 4'],inplace = True)
dataset

Unnamed: 0,Reviews
0,I got the Mirena in Feb 2020. The insertion wa...
1,This is 2nd time I have had the Mirena. Total ...
2,The biggest joke known to mankind. Doctor gave...
3,I’ve had mirena for exactly one month now and ...
4,"I have a history of gushing, painful periods w..."
...,...
1023,I got the Mirena inserted in July of 2020. It ...
1024,"This has given me some severe cramp episodes, ..."
1025,I had the Mirena placed in October 2019 for bi...
1026,This is an update to having the Mirena REMOVED...


## convert to lower case

In [16]:
dataset["Reviews"]=dataset["Reviews"].str.lower()
dataset.head()

Unnamed: 0,Reviews
0,i got the mirena in feb 2020. the insertion wa...
1,this is 2nd time i have had the mirena. total ...
2,the biggest joke known to mankind. doctor gave...
3,i’ve had mirena for exactly one month now and ...
4,"i have a history of gushing, painful periods w..."


## removal of digits & words containing digits

In [17]:
dataset['Reviews']=dataset['Reviews'].apply(lambda x: re.sub('\w*\d\w*','', x))

In [18]:
dataset.head()

Unnamed: 0,Reviews
0,i got the mirena in feb . the insertion wasn't...
1,this is time i have had the mirena. total of ...
2,the biggest joke known to mankind. doctor gave...
3,i’ve had mirena for exactly one month now and ...
4,"i have a history of gushing, painful periods w..."


In [12]:
#dataset.to_csv('Mirena_reviews.csv')

## conversion of contraction words

In [19]:
#contraction words which are frequently used in our dataset are - i'hv, i'll, i'm, it's, don't, didn't, doesn't, wasn't, 
#couldn't, hasn't, won't, hadn't, Couldn't,didn’t, haven't

pattern = r'i[\’\']ve'
replacement = 'i have'
dataset.loc[:,"Reviews"] = dataset.Reviews.apply(lambda x:' '.join(re.sub(pattern,replacement,word) for word in x.split()))

pattern = r'i[\’\']ll'
replacement = 'i will'
dataset.loc[:,"Reviews"] = dataset.Reviews.apply(lambda x:' '.join(re.sub(pattern,replacement,word) for word in x.split()))

pattern = r'i[\’\']m'
replacement = 'i am'
dataset.loc[:,"Reviews"] = dataset.Reviews.apply(lambda x:' '.join(re.sub(pattern,replacement,word) for word in x.split()))

pattern = r'won[\’\']t'
replacement = 'will not'
dataset.loc[:,"Reviews"] = dataset.Reviews.apply(lambda x:' '.join(re.sub(pattern,replacement,word) for word in x.split()))

pattern = r'can[\’\']t'
replacement = 'can not'
dataset.loc[:,"Reviews"] = dataset.Reviews.apply(lambda x:' '.join(re.sub(pattern,replacement,word) for word in x.split()))

pattern = r'it[\’\']s'
replacement = 'it is'
dataset.loc[:,"Reviews"] = dataset.Reviews.apply(lambda x:' '.join(re.sub(pattern,replacement,word) for word in x.split()))

pattern = r'wasn[\’\']t'
replacement = 'was not'
dataset.loc[:,"Reviews"] = dataset.Reviews.apply(lambda x:' '.join(re.sub(pattern,replacement,word) for word in x.split()))

pattern = r'don[\’\']t'
replacement = 'do not'
dataset.loc[:,"Reviews"] = dataset.Reviews.apply(lambda x:' '.join(re.sub(pattern,replacement,word) for word in x.split()))



dataset.head(99)

Unnamed: 0,Reviews
0,i got the mirena in feb . the insertion was no...
1,this is time i have had the mirena. total of y...
2,the biggest joke known to mankind. doctor gave...
3,i have had mirena for exactly one month now an...
4,"i have a history of gushing, painful periods w..."
...,...
94,i had a severe heavy bleeding no pills were he...
95,i got my mirena about months ago. while gettin...
96,i am with adult children and about to get my t...
97,i am am no kids and got mirena as a form of bi...


## Stop word removal

In [20]:
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [22]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

dataset["Reviews"] = dataset["Reviews"].apply(lambda text: remove_stopwords(text))
dataset.head(99)

Unnamed: 0,Reviews
0,got mirena feb . insertion pleasant means. lik...
1,time mirena. total years total. never issues s...
2,biggest joke known mankind. doctor gave big st...
3,mirena exactly one month complete nightmare. d...
4,"history gushing, painful periods included vomi..."
...,...
94,severe heavy bleeding pills helping doctor sai...
95,got mirena months ago. getting iud unbearable ...
96,"adult children get third mirena iud, clearly f..."
97,kids got mirena form birth control. fine first...


## removal of special characters

In [23]:
#the string.punctuation in python contains the following punctuation symbols
#     !"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~`


#import spacy
import string
dataset["Reviews"] = dataset["Reviews"].astype(str)
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(Reviews):
    return Reviews.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

dataset["Reviews"]= dataset["Reviews"].apply(lambda Reviews: remove_punctuation(Reviews))
dataset.head(118)


Unnamed: 0,Reviews
0,got mirena feb insertion pleasant means like ...
1,time mirena total years total never issues sur...
2,biggest joke known mankind doctor gave big sto...
3,mirena exactly one month complete nightmare do...
4,history gushing painful periods included vomit...
...,...
113,twice years first one half years second little...
114,woah big difference one day taking out got rig...
115,reason girlfriend use mirena device even liste...
116,mirena little years worst decision ever madeal...


In [16]:
#dataset.to_csv('Mirena_reviews.csv')

In [24]:
#removing white spaces

dataset["Reviews"] = dataset["Reviews"].str.strip('Reviews')
dataset.head()

Unnamed: 0,Reviews
0,got mirena feb insertion pleasant means like ...
1,time mirena total years total never issues sur...
2,biggest joke known mankind doctor gave big sto...
3,mirena exactly one month complete nightmare do...
4,history gushing painful periods included vomit...


In [25]:
#frequently used words

from collections import Counter
cnt = Counter()
for text in dataset["Reviews"].values:
    for word in text.split():
        cnt[word] += 1
        
cnt.most_common(10)

[('mirena', 1400),
 ('pain', 698),
 ('period', 611),
 ('months', 607),
 ('insertion', 535),
 ('get', 528),
 ('would', 509),
 ('years', 509),
 ('got', 484),
 ('like', 467)]

In [26]:
#rarely used words

n_rare_words = 10
RAREWORDS = set([w for (w, wc) in cnt.most_common()[:-n_rare_words-1:-1]])
print(RAREWORDS)

{'tall', 'upped', 'painl', 'blur', 'episodes', 'rails', 'truthfully', 'psychiatrist', 'glow', 'medicated'}


## stemming & lemmatization

In [29]:
import nltk
keywords = nltk.download('wordnet')

stemmer = SnowballStemmer('english')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [30]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [31]:
processed_docs = dataset['Reviews'].map(preprocess)

### tokenization

In [32]:
processed_docs[:10]

0    [mirena, insert, pleasant, mean, like, instant...
1    [time, mirena, total, year, total, issu, surf,...
2    [biggest, joke, know, mankind, doctor, give, s...
3    [mirena, exact, month, complet, nightmar, doct...
4    [histori, gush, pain, period, includ, vomit, l...
5    [mirena, insert, decemb, gyna, recommend, help...
6    [birth, control, devic, ruin, life, total, mon...
7    [mirena, month, stand, anymor, constant, cramp...
8    [facial, hair, huge, acn, pimpl, destroy, sugg...
9    [recommend, mirena, eas, endometriosi, pain, b...
Name: Reviews, dtype: object