In [1]:
#In this notebook, we will be working with a file that contains reviews for prescription drugs, followed by a rating on a scale
#of 1 - 10. This data is very messy, so we'll be doing some deep cleaning on it before using it to try and predict customer
#drug ratings based on the review they provided.

In [2]:
#This tab will be where we import all of our modules

import pandas as pd
from sklearn.feature_extraction.text import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack

import sklearn.decomposition

In [3]:
#We can see from our preview below that there is a lot of data that isn't quite useful for our purposes. As such, we'll remove
#these columns.

drug_reviews = pd.read_excel(r"C:\Users\hecto\OneDrive\Documents\Jupyter Notebook\Practice Code\Datasets\drugsCom_raw\drugsComTrain_raw_short.xlsx")
print(drug_reviews.shape)
drug_reviews.head()

(205764, 7)


Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,2012-05-20,27.0
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",,NaT,
2,We have tried many different medications and s...,8,2010-04-27 00:00:00,192,,NaT,
3,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",,NaT,
4,The positive side is that I didn&#039;t have a...,5,2009-12-14 00:00:00,17,,NaT,


In [4]:
drug_reviews = drug_reviews.drop(["Unnamed: 0", "condition", "date", "usefulCount"], axis=1)
drug_reviews.head()

Unnamed: 0,drugName,review,rating
0,Valsartan,"""It has no side effect, I take it in combinati...",9.0
1,Guanfacine,"""My son is halfway through his fourth week of ...",
2,8,192,
3,Lybrel,"""I used to take another oral contraceptive, wh...",
4,5,17,


In [5]:
#Now we have our three variables of interest! Next, we'll dorp columns which contain NaN values as these are not helpful for 
#training the model. Please note that the original dataset has over 200,000 rows, so we won't be doing much harm by trimming 
#the data.


drug_reviews = drug_reviews.dropna()
print("The shape of the dataset after pre-processing is: {}".format(drug_reviews.shape))
drug_reviews.head()

#Any rows with NaN cells have now been dropped and we're still left with 143k rows of data. Some of these reviews feature
#punctuation we do not need, and improperly processed symbols, such as an apostrophe appearing as &#039;. For this, we'll create
#a function to try and remove the excess nose

The shape of the dataset after pre-processing is: (129453, 3)


Unnamed: 0,drugName,review,rating
0,Valsartan,"""It has no side effect, I take it in combinati...",9.0
5,Ortho Evra,"""This is my first time using any form of birth...",8.0
6,Buprenorphine / naloxone,"""Suboxone has completely turned my life around...",9.0
7,Cialis,"""2nd day on 5mg started to work with rock hard...",2.0
8,Levonorgestrel,"""He pulled out, but he cummed a bit in me. I t...",1.0


In [6]:
drug_reviews.drop(drug_reviews.index[60000:129453], inplace=True)
drug_reviews.reset_index(inplace=True)
print("The shape of the dataset after pre-processing is: {}".format(drug_reviews.shape))

The shape of the dataset after pre-processing is: (60000, 4)


In [7]:

low_rating= [1.0, 2.0, 3.0, 4.0]
moderate_rating = [5.0, 6.0, 7.0]
high_rating = [8.0, 9.0, 10.0]

# Replace matching ratings with -1
drug_reviews.loc[drug_reviews['rating'].isin(low_rating), 'rating'] = -1

# Replace matching ratings with 0
drug_reviews.loc[drug_reviews['rating'].isin(moderate_rating), 'rating'] = 0

# Replace remaining ratings with 1
drug_reviews.loc[drug_reviews['rating'].isin(high_rating), 'rating'] = 1



In [8]:
drug_reviews['rating'] = drug_reviews['rating'].astype('category')

In [None]:
print("Original Review:" + "\n" + drug_reviews["review"][5] + "\n")


def clean_data(review):
    
    no_punc = re.sub(r'[^\w\s]', '', review)
    no_digits = ''.join([i for i in no_punc if not i.isdigit()])
    no_dbl_space = re.sub("  ", " ", no_digits)
    return(no_dbl_space)

drug_reviews["review"] = drug_reviews["review"].astype(str)
drug_reviews["review"] = drug_reviews["review"].apply(clean_data)

drug_reviews["drugName"] = drug_reviews["drugName"].astype(str)
drug_reviews["drugName"] = drug_reviews["drugName"].apply(clean_data)

print("Cleaned Review:" + "\n" + drug_reviews["review"][5])

#Success! Unnecessary noise has been removed from our text! Next, lets tokenize and vectorize our reviews for training

In [10]:
print(drug_reviews["review"].shape)

(60000,)


In [11]:
tf_idf = TfidfVectorizer(strip_accents=None, 
                        preprocessor=None, 
                        max_features=None)

x = tf_idf.fit_transform(drug_reviews['review'])
y = drug_reviews["rating"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [12]:
#LSA = sklearn.decomposition.TruncatedSVD(n_components = 100)
#x_train = LSA.fit_transform(x_train)
#x_test = LSA.fit_transform(x_test)

In [13]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver = "sag")
lr.fit(x_train, y_train)
predicted = lr.predict(x_test)
accuracy_score(predicted,y_test)


0.7460833333333333

In [20]:
import sklearn.neural_network

myC = sklearn.neural_network.MLPClassifier(hidden_layer_sizes=(40,40), max_iter=400)

In [21]:
myC.fit(x_train, y_train)

In [22]:
predMLP = myC.predict(x_test)
accuracy_score(predMLP, y_test)

0.737