In [39]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_validate
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [5]:
df_train = pd.read_csv("drugsTrain_processed.csv", sep="\t")
df_test = pd.read_csv("drugsTest_processed.csv", sep="\t")

In [6]:
# df_train = pd.read_csv("DrugReviewAnalysis/drugsTrain_processed.csv", sep="\t")
# df_test = pd.read_csv("DrugReviewAnalysis/drugsTest_processed.csv", sep="\t")

In [7]:
X_train = df_train.drop(columns=["rating"])
y_train = df_train["rating"]

X_test = df_test.drop(columns=["rating"])
y_test = df_test["rating"]

In [14]:
df_train.head()

Unnamed: 0,drugName,condition,rating,date,usefulCount,timestamp,processed_review
0,valsartan,left ventricular dysfunction,9.0,"May 20, 2012",27,2012-05-20,side effect take combination bystolic 5 mg fi...
1,guanfacine,adhd,8.0,"April 27, 2010",192,2010-04-27,son halfway fourth week intuniv became concer...
2,lybrel,birth control,5.0,"December 14, 2009",17,2009-12-14,used take another oral contraceptive 21 pill ...
3,ortho evra,birth control,8.0,"November 3, 2015",10,2015-11-03,first time using form birth control m glad we...
4,buprenorphine / naloxone,opiate dependence,9.0,"November 27, 2016",37,2016-11-27,suboxone completely turned life around feel h...


In [12]:
y_train.value_counts()

rating
10.0    50504
9.0     27219
1.0     21391
8.0     18688
7.0      9338
5.0      7907
2.0      6833
3.0      6422
6.0      6254
4.0      4942
Name: count, dtype: int64

In [40]:
# Feature engineering and model 1
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(), 'processed_review'),
        ('numeric', StandardScaler(), ['usefulCount']),
        ('categorical', OneHotEncoder(handle_unknown='ignore'), ['drugName','condition'])
    ],
    remainder='drop'
)

# Choose the models
pipeline = Pipeline([
    ('preprocessor', preprocessor),
#     ('classifier', LogisticRegression(max_iter=1000, n_jobs=-1, verbose=0)),
    ('classifier', DecisionTreeClassifier()),
    # ('classifier', RandomForestClassifier(n_estimators=100)),
    # ('classifier', SVC()),
    # ('classifier', KNeighborsClassifier(n_neighbors=5))
])

print(pipeline)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('text', TfidfVectorizer(),
                                                  'processed_review'),
                                                 ('numeric', StandardScaler(),
                                                  ['usefulCount']),
                                                 ('categorical',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['drugName', 'condition'])])),
                ('classifier', DecisionTreeClassifier())])


In [41]:
# Fit the model and report the accuracy
pipe = pipeline.fit(X_train, y_train)
print(f"Train accuracy: {pipe.score(X_train, y_train)}")
print(f"Test accuracy: {pipe.score(X_test, y_test)}")
pipe

Train accuracy: 0.9999623819734417
Test accuracy: 0.6981390977443609


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('text', TfidfVectorizer(),
                                                  'processed_review'),
                                                 ('numeric', StandardScaler(),
                                                  ['usefulCount']),
                                                 ('categorical',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['drugName', 'condition'])])),
                ('classifier', DecisionTreeClassifier())])

In [42]:
# Testing classification report 
y_pred = pipe.predict(X_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

         1.0       0.72      0.69      0.71      7547
         2.0       0.64      0.68      0.66      2183
         3.0       0.63      0.68      0.65      2039
         4.0       0.63      0.67      0.65      1522
         5.0       0.62      0.67      0.64      2493
         6.0       0.60      0.66      0.63      1908
         7.0       0.60      0.63      0.61      2878
         8.0       0.65      0.65      0.65      6072
         9.0       0.67      0.67      0.67      9013
        10.0       0.79      0.75      0.77     17545

    accuracy                           0.70     53200
   macro avg       0.66      0.68      0.67     53200
weighted avg       0.70      0.70      0.70     53200



In [43]:
# Training classification report 
y_pred = pipe.predict(X_train)
print(classification_report(y_pred, y_train))

              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00     21391
         2.0       1.00      1.00      1.00      6834
         3.0       1.00      1.00      1.00      6421
         4.0       1.00      1.00      1.00      4942
         5.0       1.00      1.00      1.00      7909
         6.0       1.00      1.00      1.00      6255
         7.0       1.00      1.00      1.00      9338
         8.0       1.00      1.00      1.00     18687
         9.0       1.00      1.00      1.00     27219
        10.0       1.00      1.00      1.00     50502

    accuracy                           1.00    159498
   macro avg       1.00      1.00      1.00    159498
weighted avg       1.00      1.00      1.00    159498

