In [34]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# !git clone https://github.com/cse151a-DrugReviewAnalysis/DrugReviewAnalysis.git

In [2]:
# Get dataset for working locally
df_train = pd.read_csv("drugsTrain_processed.csv", sep="\t")
df_test = pd.read_csv("drugsTest_processed.csv", sep="\t")

In [3]:
# Get dataset when working on colab
# df_train = pd.read_csv("DrugReviewAnalysis/drugsTrain_processed.csv", sep="\t")
# df_test = pd.read_csv("DrugReviewAnalysis/drugsTest_processed.csv", sep="\t")

In [4]:
X_train = df_train.drop(columns=["rating"])
y_train = df_train["rating"]

X_test = df_test.drop(columns=["rating"])
y_test = df_test["rating"]

In [5]:
df_train.head()

Unnamed: 0,drugName,condition,rating,date,usefulCount,timestamp,processed_review
0,valsartan,left ventricular dysfunction,9.0,"May 20, 2012",27,2012-05-20,side effect take combination bystolic 5 mg fi...
1,guanfacine,adhd,8.0,"April 27, 2010",192,2010-04-27,son halfway fourth week intuniv became concer...
2,lybrel,birth control,5.0,"December 14, 2009",17,2009-12-14,used take another oral contraceptive 21 pill ...
3,ortho evra,birth control,8.0,"November 3, 2015",10,2015-11-03,first time using form birth control m glad we...
4,buprenorphine / naloxone,opiate dependence,9.0,"November 27, 2016",37,2016-11-27,suboxone completely turned life around feel h...


In [6]:
y_train.value_counts()

rating
10.0    50504
9.0     27219
1.0     21391
8.0     18688
7.0      9338
5.0      7907
2.0      6833
3.0      6422
6.0      6254
4.0      4942
Name: count, dtype: int64

In [21]:
# Feature engineering and model 1
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(), 'processed_review'),
        ('numeric', StandardScaler(), ['usefulCount']),
        ('categorical', OneHotEncoder(handle_unknown='ignore'), ['drugName','condition'])
    ],
    remainder='drop'
)

# Choose the models
pipeline = Pipeline([
    ('preprocessor', preprocessor),
#     ('classifier', LogisticRegression(max_iter=1000, n_jobs=-1, verbose=0)),
    ('regressor', DecisionTreeRegressor()),
    # ('classifier', RandomForestClassifier(n_estimators=100)),
    # ('classifier', SVC()),
    # ('classifier', KNeighborsClassifier(n_neighbors=5))
])

print(pipeline)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('text', TfidfVectorizer(),
                                                  'processed_review'),
                                                 ('numeric', StandardScaler(),
                                                  ['usefulCount']),
                                                 ('categorical',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['drugName', 'condition'])])),
                ('regressor', DecisionTreeRegressor())])


In [22]:
# Fit the model and report the accuracy
pipe = pipeline.fit(X_train, y_train)

print(f"Train accuracy: {pipe.score(X_train, y_train)}")
print(f"Test accuracy: {pipe.score(X_test, y_test)}")
pipe

Train accuracy: 0.9999885838847794
Test accuracy: 0.44764581078313603


In [38]:
# Testing classification report 
y_pred = pipe.predict(X_test)
print(mean_squared_error(y_pred, y_test))

y_pred = np.round(y_pred)
print(classification_report(y_pred, y_test))

5.965507518796993
              precision    recall  f1-score   support

         1.0       0.72      0.69      0.71      7494
         2.0       0.65      0.65      0.65      2311
         3.0       0.63      0.67      0.65      2064
         4.0       0.62      0.65      0.63      1552
         5.0       0.61      0.65      0.63      2517
         6.0       0.60      0.65      0.62      1921
         7.0       0.60      0.65      0.62      2839
         8.0       0.64      0.65      0.64      5975
         9.0       0.66      0.68      0.67      8874
        10.0       0.78      0.74      0.76     17653

    accuracy                           0.69     53200
   macro avg       0.65      0.67      0.66     53200
weighted avg       0.70      0.69      0.69     53200



In [39]:
# Training classification report 
y_pred = pipe.predict(X_train)
print(mean_squared_error(y_pred, y_train))

y_pred = np.round(y_pred)
print(classification_report(y_pred, y_train))

0.00012225858631456193
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00     21391
         2.0       1.00      1.00      1.00      6834
         3.0       1.00      1.00      1.00      6421
         4.0       1.00      1.00      1.00      4942
         5.0       1.00      1.00      1.00      7905
         6.0       1.00      1.00      1.00      6253
         7.0       1.00      1.00      1.00      9344
         8.0       1.00      1.00      1.00     18687
         9.0       1.00      1.00      1.00     27215
        10.0       1.00      1.00      1.00     50506

    accuracy                           1.00    159498
   macro avg       1.00      1.00      1.00    159498
weighted avg       1.00      1.00      1.00    159498



In [24]:
# Want to create a basic linear regression model to compare against the decision tree regressor
pipeline2 = Pipeline([
    ('preprocessor', preprocessor),
#     ('classifier', LogisticRegression(max_iter=1000, n_jobs=-1, verbose=0)),
    ('regressor', LinearRegression()),
    # ('classifier', RandomForestClassifier(n_estimators=100)),
    # ('classifier', SVC()),
    # ('classifier', KNeighborsClassifier(n_neighbors=5))
])

In [40]:
# Fit the model and report the accuracy
pipe2 = pipeline2.fit(X_train, y_train)

print(f"Train accuracy: {pipe2.score(X_train, y_train)}")
print(f"Test accuracy: {pipe2.score(X_test, y_test)}")
pipe2

Train accuracy: 0.7460011047880561
Test accuracy: 0.41443165013485983


In [42]:
y_pred = pipe2.predict(X_train)
print(f"Train error: {mean_squared_error(y_pred, y_train)}")

y_pred = pipe2.predict(X_test)
print(f"Test error: {mean_squared_error(y_pred, y_test)}")

Train error: 2.7201500032101595
Test error: 6.324225401173784
