# <font color='violet'> Modeling to Predict Ratings based on Reviews </font>
    
Using data with most features engineered here: https://github.com/fractaldatalearning/psychedelic_efficacy/blob/main/notebooks/6-kl-studies-finish-preprocess.ipynb

Other feature engineering will be completed as part of the modeling pipeline. 

In [1]:
# One-off setup: uncomment and run if xgboost is not installed in this kernel's environment.
# %pip install xgboost

In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [3]:
# Load the interim studies dataset and print a summary of its columns and dtypes.
studies_csv = '../data/interim/studies_w_vector_similarity.csv'
df = pd.read_csv(studies_csv)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31557 entries, 0 to 31556
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         31557 non-null  int64  
 1   condition          31557 non-null  object 
 2   date               31557 non-null  object 
 3   drug0              31557 non-null  object 
 4   drug1              31557 non-null  object 
 5   review_len         31557 non-null  int64  
 6   complexity         31557 non-null  float64
 7   no_stop_cap_lemm   31557 non-null  object 
 8   subjectivity       31557 non-null  float64
 9   original_polarity  31557 non-null  float64
 10  set                31557 non-null  object 
 11  rating             31557 non-null  float64
 12  similarity_w_10    31557 non-null  float64
dtypes: float64(5), int64(2), object(6)
memory usage: 3.1+ MB


In [4]:
# Keep only the review text, the target, the review-derived numeric features and the
# train/test label. Columns that are not about the review text itself are dropped first.
non_review_cols = ['Unnamed: 0', 'condition', 'date', 'drug0', 'drug1']
ordered_cols = ['review', 'rating', 'review_len', 'complexity', 'subjectivity',
                'original_polarity', 'similarity_w_10', 'set']

df = (
    df.drop(columns=non_review_cols)
      .rename(columns={'no_stop_cap_lemm': 'review'})  # clearer name for the text column
      .loc[:, ordered_cols]                            # text first, then target, then features
)
df.head()

Unnamed: 0,review,rating,review_len,complexity,subjectivity,original_polarity,similarity_w_10,set
0,good give run gas,9.0,36,-1.2,0.6,0.7,0.64005,train
1,75 mg x daily no noticeable effect 150 mg x da...,8.0,547,5.4,0.343056,0.031439,0.841263,train
2,take 145 mg 10 year fantastic insomnia really ...,8.0,390,4.8,0.591667,0.096296,0.922576,train
3,help stability mood help insomnia start experi...,7.0,156,8.2,1.0,-1.0,0.825203,train
4,crazy eat sleep sit,2.0,66,-0.4,0.9,-0.6,0.467668,train


In [5]:
# Partition on the pre-assigned 'set' label; the label itself is not a feature,
# so it is removed from both partitions.
is_train = df['set'] == 'train'
is_test = df['set'] == 'test'

train_set = df.loc[is_train].drop(columns='set').copy()
test_set = df.loc[is_test].drop(columns='set').copy()

train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22089 entries, 0 to 22088
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   review             22089 non-null  object 
 1   rating             22089 non-null  float64
 2   review_len         22089 non-null  int64  
 3   complexity         22089 non-null  float64
 4   subjectivity       22089 non-null  float64
 5   original_polarity  22089 non-null  float64
 6   similarity_w_10    22089 non-null  float64
dtypes: float64(5), int64(1), object(1)
memory usage: 1.3+ MB


In [6]:
# Separate the target column from the predictors for each partition.
target = 'rating'
X_train, y_train = train_set.drop(columns=target), train_set[target]
X_test, y_test = test_set.drop(columns=target), test_set[target]

X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22089 entries, 0 to 22088
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   review             22089 non-null  object 
 1   review_len         22089 non-null  int64  
 2   complexity         22089 non-null  float64
 3   subjectivity       22089 non-null  float64
 4   original_polarity  22089 non-null  float64
 5   similarity_w_10    22089 non-null  float64
dtypes: float64(4), int64(1), object(1)
memory usage: 1.2+ MB


Prepare to normalize numeric columns and use CountVectorizer with review column. I know that most of this data is not normally distributed, so I'll normalize with MinMaxScaler. 

Prepare to try out PCA on just the numeric columns after scaling them. 

Finally, prepare to randomly search parameters for the various classifiers I've imported. 

In [None]:
# Positional column indices into X_train: column 0 is the review text,
# columns 1-5 are the numeric features.
review_col = 0
numeric_cols = [1,2,3,4,5]


def make_preprocessor(n_components=None):
    """Build the feature-preparation step for one search candidate.

    Numeric columns are min-max scaled and then, if ``n_components`` is given,
    reduced with PCA. The review text is count-vectorized separately, so PCA
    never sees the bag-of-words columns.

    ColumnTransformer applies each of its transformers to the *original*
    columns side by side and concatenates the results; it does not chain them.
    Scaling followed by PCA therefore has to live in a nested Pipeline.

    Parameters
    ----------
    n_components : int or None
        Number of principal components to keep. ``None`` skips PCA and keeps
        the scaled numeric columns as they are.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer producing [numeric features | token counts].
    """
    numeric_steps = [('mms', MinMaxScaler())]
    if n_components is not None:
        numeric_steps.append(('pca', PCA(n_components=n_components, random_state=17)))
    return ColumnTransformer([('num', Pipeline(numeric_steps), numeric_cols),
                              ('cv', CountVectorizer(lowercase=False), review_col)])


# Candidates: 1, 2 or 3 principal components, or no reduction at all.
pca1 = make_preprocessor(1)
pca2 = make_preprocessor(2)
pca3 = make_preprocessor(3)
pca4 = make_preprocessor()
pca_params = {'pca':[pca1, pca2, pca3, pca4]}

# Compile search parameters and pipeline. probability=True is needed because the
# 'roc_auc_ovr' scorer is computed from predicted class probabilities.
pipe = Pipeline(steps=[('pca', pca1), ('clf', SVC(probability=True))])

# Only as many candidates exist as there are preprocessors, so cap n_iter at that
# number instead of relying on the default of 10.
rgs = RandomizedSearchCV(estimator=pipe, param_distributions=pca_params,
                         n_iter=len(pca_params['pca']),
                         scoring='roc_auc_ovr', random_state=17, error_score='raise')


# Run the search
rgs.fit(X_train, y_train)
print(rgs.best_params_)
print(rgs.best_score_)



In [None]:
# Full search: every preprocessing variant crossed with each classifier's hyperparameters.

# Positional column indices into X_train: column 0 is the review text,
# columns 1-5 are the numeric features.
review_col = 0
numeric_cols = [1,2,3,4,5]


def build_preprocessor(n_components=None):
    """Build one preprocessing candidate: scaled (optionally PCA-reduced) numeric
    columns next to the count-vectorized review text.

    ColumnTransformer runs its transformers in parallel on the original columns,
    so scaling followed by PCA is chained in a nested Pipeline. Text vectorization
    lives in the same ColumnTransformer because a second, downstream
    ColumnTransformer would no longer find the review text at index 0 once the
    first one has rearranged the columns.

    Parameters
    ----------
    n_components : int or None
        Number of principal components to keep; ``None`` skips PCA.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer producing [numeric features | token counts].
    """
    numeric_steps = [('mms', MinMaxScaler())]
    if n_components is not None:
        numeric_steps.append(('pca', PCA(n_components=n_components, random_state=17)))
    return ColumnTransformer([('num', Pipeline(numeric_steps), numeric_cols),
                              ('cv', CountVectorizer(lowercase=False), review_col)])


# Try PCA with a few different component values or no reduction at all.
pca1 = build_preprocessor(1)
pca2 = build_preprocessor(2)
pca3 = build_preprocessor(3)
no_pca = build_preprocessor()
pca_params = {'pca':[pca1, pca2, pca3, no_pca]}


# Set up classifier hyperparameters to tune

# 'chebyshev' is left out: the combined feature matrix is typically sparse because of
# the token counts, and KNN does not accept that metric for sparse input.
clf1 = KNeighborsClassifier()
param1 = dict(clf=(clf1,), clf__n_neighbors=list(np.arange(3,22,2)),
              clf__weights=['uniform','distance'],
              clf__leaf_size=list(np.arange(10,101,10)),
              clf__p=[1,2], clf__metric=['euclidean','minkowski'])

# probability stays True for every candidate: the ROC AUC scorer needs predict_proba.
clf2 = SVC(probability=True, random_state=43)
param2 = dict(clf=(clf2,), clf__C=list(np.arange(1,11)),
              clf__kernel=['linear', 'poly', 'rbf', 'sigmoid'],
              clf__degree=list(np.arange(1,11)), clf__gamma=['scale', 'auto'],
              clf__coef0=list(np.arange(0,4,0.5)), clf__shrinking=[True,False],
              clf__class_weight=[None,'balanced'])

# Multinomial NB requires non-negative features. PCA components can be negative, so
# this classifier is only paired with the no-PCA preprocessor (see param_grid below).
clf3 = MultinomialNB()
param3 = dict(clf=(clf3,), clf__fit_prior=[True,False])

# Only select potential hyperparameters that I've read minimize overfitting.
# colsample_bytree and subsample must be greater than 0, and eta=0 disables learning,
# so the ranges start at 0.1.
# NOTE(review): recent xgboost releases expect class labels encoded as 0..k-1; the
# ratings here are floats - confirm against the installed version.
clf4 = XGBClassifier()
param4 = dict(clf=(clf4,), clf__colsample_bytree=[0.1, 0.2, 0.3, 0.4, 0.5],
              clf__subsample=[0.1, 0.2, 0.3, 0.4, 0.5],
              clf__max_depth=list(np.arange(1,5)), clf__gamma=list(np.arange(4,11,1)),
              clf__eta=[0.1, 0.2, 0.3, 0.4, 0.5], clf__min_child_weight=[5,20,50,100,200],
              clf__alpha=[5,10,20,50,100], clf__n_estimators=[5,10,20,50])

# RidgeClassifier has no predict_proba, so it cannot be scored with 'roc_auc_ovr'.
# It is defined here for reference but left out of param_grid.
clf5 = RidgeClassifier(class_weight='balanced', random_state=17)
param5 = dict(clf=(clf5,), clf__alpha=[1e-10, 1e-5, 1e-2, 1, 5, 10, 20, 50, 100, 200, 500])

# shrinkage='auto' only works with the 'lsqr' and 'eigen' solvers, so 'svd' is dropped.
# LDA also rejects sparse input, which the token counts produce, so it is left out of
# param_grid until a dense (e.g. vocabulary-limited) representation is available.
clf6 = LinearDiscriminantAnalysis(shrinkage='auto')
param6 = dict(clf=(clf6,), clf__solver=['lsqr', 'eigen'],
              clf__store_covariance=[True,False])

# Compile search parameters and pipeline. Run through randomized search.

pipe = Pipeline(steps=[('pca', pca1), ('clf', clf1)])

# Each entry pairs the preprocessing choices with one classifier's settings, so a
# sampled candidate varies both together.
param_grid = [{**pca_params, **param1},
              {**pca_params, **param2},
              {'pca': [no_pca], **param3},
              {**pca_params, **param4}]

# The target has more than two classes, so the one-vs-rest ROC AUC scorer is required;
# plain 'roc_auc' only supports binary targets.
rgs = RandomizedSearchCV(estimator=pipe, param_distributions=param_grid,
                         scoring='roc_auc_ovr', random_state=17, error_score='raise')


# Run the search
rgs.fit(X_train, y_train)
print(rgs.best_params_)
print(rgs.best_score_)

Note: I drew some code from this resource: https://towardsdatascience.com/getting-the-most-out-of-scikit-learn-pipelines-c2afc4410f1a