In [1]:
import pandas as pd
import numpy as np
import re, string, unicodedata
import inflect
from nltk.corpus import stopwords
import time
import nltk

In [2]:
train_reviews = pd.read_csv('gs://stanforddrugreviews/drugsComTrain_raw.tsv', sep='\t')
test_reviews = pd.read_csv('gs://stanforddrugreviews/drugsComTest_raw.tsv', sep='\t')
reviews = pd.concat([train_reviews, test_reviews])

In [3]:
reviews.shape

(215063, 7)

In [4]:
reviews.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37


In [5]:
reviews.isnull().sum()

Unnamed: 0        0
drugName          0
condition      1194
review            0
rating            0
date              0
usefulCount       0
dtype: int64

In [5]:
reviews['word_count'] = reviews['review'].apply(lambda x: len(str(x).split(" ")))
reviews[['review','word_count']].head()

Unnamed: 0,review,word_count
0,"""It has no side effect, I take it in combinati...",17
1,"""My son is halfway through his fourth week of ...",141
2,"""I used to take another oral contraceptive, wh...",133
3,"""This is my first time using any form of birth...",89
4,"""Suboxone has completely turned my life around...",134


In [7]:
reviews.word_count.describe()

count    215063.000000
mean         85.632029
std          45.323405
min           1.000000
25%          49.000000
50%          85.000000
75%         127.000000
max        1857.000000
Name: word_count, dtype: float64

In [9]:
freq = pd.Series(' '.join(reviews['review']).split()).value_counts()[:20]
freq

I       883057
and     563694
the     484954
to      425525
a       376651
my      330282
it      266069
for     260027
was     229373
of      226444
have    223421
on      173244
in      156598
is      147266
had     146513
but     144612
this    136890
with    134673
that    126769
me      119671
dtype: int64

In [10]:
freq1 = pd.Series(' '.join(reviews['review']).split()).value_counts()[-20:]
freq1

"Discover           1
(momentary          1
Tamsulosin),        1
Product."           1
dirt,               1
like..WOW...its     1
sinuses"            1
withdrawals),       1
reestablish         1
Gerd,               1
phernergan          1
fours.              1
136.6.              1
rags                1
Top.                1
again,&quot;        1
hey..it             1
be,sometimes        1
MScontin.           1
withdrawal...the    1
dtype: int64

In [8]:
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [6]:
# Load NLTK stop words
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
#print ("NLTK stopword list length: ",len(stop_words),'words')

# Add custom drug stopwords
#stop_words.extend(drug_stopwords)
#print ("Added custom stopwords")
#print ("New stopword list length: ",len(stop_words),'words')
nltk.download('wordnet') 
from nltk.stem.wordnet import WordNetLemmatizer


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
def preprocess_text_short(text, perform_Lemmatize = True):

    #Remove punctuations
    text = re.sub('[^a-zA-Z]', ' ', text)
    
    #Convert to lowercase
    text = text.lower()
    
    #remove tags
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    ##Convert to list from string
    text = text.split()
    

    #Lemmatisation
    if perform_Lemmatize:
        lem = WordNetLemmatizer()
        text = [lem.lemmatize(word) for word in text  if not word in stop_words] 
    text = " ".join(text)
    return text

In [8]:
raw_review_data = reviews['review'].fillna('NA').values

In [9]:
processed_review_data = []
for review in raw_review_data:
    processed_review_data.append(preprocess_text_short(review))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


cv=CountVectorizer(max_df=0.8,stop_words=stop_words, max_features=10000, ngram_range=(1,3))
X=cv.fit_transform(processed_review_data)

from sklearn.feature_extraction.text import TfidfTransformer
 
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(X)

# get feature names
feature_names=cv.get_feature_names()

In [17]:
review_num = 5000
print raw_review_data[review_num]
print processed_review_data[review_num]

"I have irritable bowl syndrome with diarrhea and this has worked great for me. I take it once or twice a day as needed with juice. The only side effect I noticed was one case of nausea which passed after about an hour. Make sure to drink plenty of fluids to avoid major constipation."
irritable bowl syndrome diarrhea worked great take twice day needed juice side effect noticed one case nausea passed hour make sure drink plenty fluid avoid major constipation


In [18]:
# fetch document for which keywords needs to be extracted
doc=processed_review_data[review_num]
 
#generate tf-idf for the given document
tf_idf_vector=tfidf_transformer.transform(cv.transform([doc]))

In [19]:
#Function for sorting tf_idf in descending order
from scipy.sparse import coo_matrix
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]
 
    score_vals = []
    feature_vals = []
    
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
 
    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

#sort the tf-idf vectors by descending order of scores

sorted_items=sort_coo(tf_idf_vector.tocoo())

#extract only the top n; n here is 10
keywords=extract_topn_from_vector(feature_names,sorted_items,5)
 
# now print the results
print("\nOriginalReview:")
print(raw_review_data[review_num])
print("\nReview:")
print(doc)
print("\nKeywords:")
for k in keywords:
    print(k,keywords[k])


OriginalReview:
"I have irritable bowl syndrome with diarrhea and this has worked great for me. I take it once or twice a day as needed with juice. The only side effect I noticed was one case of nausea which passed after about an hour. Make sure to drink plenty of fluids to avoid major constipation."

Review:
irritable bowl syndrome diarrhea worked great take twice day needed juice side effect noticed one case nausea passed hour make sure drink plenty fluid avoid major constipation

Keywords:
(u'take twice day', 0.244)
(u'drink plenty', 0.234)
(u'bowl', 0.243)
(u'great take', 0.236)
(u'take twice', 0.234)
