## Modeling

In [2]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score, precision_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
import string
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from datetime import datetime

%matplotlib inline

In [102]:
df = pd.read_csv('clean_users.csv', engine='python')

In [137]:
df = df[df['clean_review'].notna()]
df = df[df['sent_class'].notna()]

In [105]:
df = df.drop(columns = 'Unnamed: 0')

In [104]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,artist,label,release_date,metascore,user_score,genre,summary,name,date,rating,review,clean_review,length,word_count,sentiment,negative,neutral,positive,sent_class,rating_sent,sp_lm
0,0,Continuum,John Mayer,Sony,2006-09-12,67.0,8.9,Adult Alternative,The singer-songwriter's first album in three y...,ibadukefan,2014-02-02,10.0,This is John Mayer in the zone. This is where...,john mayer zone lives kind making rest career ...,441.0,83.0,-0.3761,0.154,0.728,0.118,-1.0,1.0,john mayer zone live kind make rest career kno...
1,1,Continuum,John Mayer,Sony,2006-09-12,67.0,8.9,Adult Alternative,The singer-songwriter's first album in three y...,ToddW,2006-09-27,1.0,"I give Little, Good John kudos for at least t...",give little good john kudos least turning ligh...,575.0,102.0,-0.3651,0.222,0.576,0.202,-1.0,-1.0,give little good john kudos least turn light s...
2,2,Continuum,John Mayer,Sony,2006-09-12,67.0,8.9,Adult Alternative,The singer-songwriter's first album in three y...,ChristopherG.,2007-08-01,3.0,John Mayer... oh John Mayer. A talented blues...,john mayer oh john mayer talented bluesguitari...,653.0,117.0,0.9371,0.057,0.678,0.266,1.0,-1.0,john mayer oh john mayer talented bluesguitari...
3,3,Continuum,John Mayer,Sony,2006-09-12,67.0,8.9,Adult Alternative,The singer-songwriter's first album in three y...,jfrotylpe532,2012-12-21,8.0,John Mayer brings a great sounding album as a ...,john mayer brings great sounding matter fact w...,108.0,20.0,0.7964,0.0,0.497,0.503,1.0,1.0,john mayer bring great sounding matter fact wa...
4,4,Continuum,John Mayer,Sony,2006-09-12,67.0,8.9,Adult Alternative,The singer-songwriter's first album in three y...,ErinY,2006-09-12,10.0,It is great to have John Mayer back. This alb...,great john mayer back definitely best really s...,123.0,22.0,0.9001,0.0,0.389,0.611,1.0,1.0,great john mayer back definitely good really s...


In [121]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 83415 entries, 0 to 85499
Data columns (total 22 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         83415 non-null  object 
 1   artist        83415 non-null  object 
 2   label         83042 non-null  object 
 3   release_date  83415 non-null  object 
 4   metascore     83415 non-null  float64
 5   user_score    83415 non-null  object 
 6   genre         83415 non-null  object 
 7   summary       82114 non-null  object 
 8   name          83415 non-null  object 
 9   date          83415 non-null  object 
 10  rating        83415 non-null  float64
 11  review        83415 non-null  object 
 12  clean_review  83415 non-null  object 
 13  length        83414 non-null  float64
 14  word_count    83414 non-null  float64
 15  sentiment     83414 non-null  float64
 16  negative      83414 non-null  float64
 17  neutral       83414 non-null  float64
 18  positive      83414 non-nu

In [122]:
stopwords_list=stopwords.words('english')+list(string.punctuation)+['album','albums', 'songs', 'song', 'music', 'like', 'one']

### Tokenize

In [123]:
tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')


In [124]:
#function to remove all stopwords, punctuations & unimportant words from the reviews and make a list
def reduce(text):
    tokens = tokenizer.tokenize(text) # tokenize every review
    removed = [token.lower() for token in tokens if token.lower() not in stopwords_list]
    return removed

In [138]:
clean_reviews = df['clean_review']
target = df['sent_class']

In [139]:
# remove all stopwords, punctuations & unimportant words from the reviews and make a list
processed_data = list(map(reduce, clean_reviews))

### Lemmatization

In [140]:
lemmatizer = WordNetLemmatizer()

In [141]:
lem_review = []
for j in processed_data:
    lem = ' '.join([lemmatizer.lemmatize(w) for w in j])
    lem_review.append(lem)

### TF-IDF Vectorization

In [142]:
XL = lem_review
yL = target

In [143]:
XL_train, XL_test, yL_train, yL_test = train_test_split(XL, yL, test_size=0.2, random_state=1)
tfVectorizer = TfidfVectorizer()

XL_train_tf = tfVectorizer.fit_transform(XL_train)
XL_test_tf = tfVectorizer.transform(XL_test)

#### Baseline Model

In [None]:
#Fitting & predicting the Dummy Classifier (Baseline Model)
from sklearn.dummy import DummyClassifier
dclf = DummyClassifier() 

In [None]:
dclf.fit(XL_train_tf, yL_train)
yL_preds = dclf.predict(XL_test_tf)
print('dummy accuracy:',accuracy_score(yL_test, yL_preds),
      'dummy forest f1:',f1_score(yL_test, yL_preds, average = 'weighted'))

#### Random Forest

In [None]:
rf_classifier = RandomForestClassifier(n_estimators=250)

In [None]:
startTime = datetime.now()

rf_classifier.fit(XL_train_tf, yL_train)
yL_preds = rf_classifier.predict(XL_test_tf)
print('random forest accuracy:',accuracy_score(yL_test, yL_preds),
      'random forest f1:',f1_score(yL_test, yL_preds, average = 'weighted'))
print(datetime.now() - startTime)

#### Naive Bayes

In [144]:
nb_classifier = MultinomialNB()

In [145]:
startTime = datetime.now()

nb_classifier.fit(XL_train_tf, yL_train)
yL_preds = nb_classifier.predict(XL_test_tf)
print('naive bayes accuracy:',accuracy_score(yL_test, yL_preds),
      'naive bayes f1:',f1_score(yL_test, yL_preds, average = 'weighted'))
print(datetime.now() - startTime)

naive bayes accuracy: 0.7851705328777798 naive bayes f1: 0.7110343514866779
0:00:00.065859


#### SVM - Final Model

In [146]:
svc_classifier = SVC(kernel='rbf', C= 1.0, class_weight = 'balanced')

In [37]:
startTime = datetime.now()

svc_classifier.fit(XL_train_tf, yL_train)
yL_preds = svc_classifier.predict(XL_test_tf)
print('support vector machine accuracy:',accuracy_score(yL_test, yL_preds),
      'support vector machine f1:',f1_score(yL_test, yL_preds, average = 'weighted'))
print(datetime.now() - startTime)

support vector machine accuracy: 0.8900124918208316 support vector machine f1: 0.8908538384465301
0:38:21.635190


#### Grid SearchCV Naive Bayes Model

In [41]:
nb_params = {'alpha': [0.01,0.03,0.05,0.07,0.09,0.11,0.13,0.15,0.17,0.19],
              'fit_prior': [True, False],
              'class_prior': [[-1,0,1],[1,0,-1]]}

In [42]:
grid_nb = GridSearchCV(nb_classifier, param_grid=nb_params, cv=7, scoring='accuracy', verbose =1, n_jobs=-1)
grid_nb.fit(XL_train_tf, yL_train)

Fitting 7 folds for each of 40 candidates, totalling 280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed:    7.5s finished
  self.class_log_prior_ = np.log(class_prior)
  self.class_log_prior_ = np.log(class_prior)


GridSearchCV(cv=7, estimator=MultinomialNB(), n_jobs=-1,
             param_grid={'alpha': [0.01, 0.03, 0.05, 0.07, 0.09, 0.11, 0.13,
                                   0.15, 0.17, 0.19],
                         'class_prior': [[-1, 0, 1], [1, 0, -1]],
                         'fit_prior': [True, False]},
             scoring='accuracy', verbose=1)

In [43]:
# examine the best model
print(grid_nb.best_score_)
# Dictionary containing the parameters (min_samples_split) used to generate that score
print(grid_nb.best_params_)
# Shows default parameters that we did not specify
print(grid_nb.best_estimator_)
#Identify the best score during fitting with cross-validation

0.7696117078159256
{'alpha': 0.01, 'class_prior': [1, 0, -1], 'fit_prior': True}
MultinomialNB(alpha=0.01, class_prior=[1, 0, -1])


In [44]:
yL_preds = grid_nb.best_estimator_.predict(XL_test_tf)
print('naive bayes accuracy:',accuracy_score(yL_test, yL_preds),
      'naive bayes f1:',f1_score(yL_test, yL_preds, average = 'weighted'))

naive bayes accuracy: 0.7734221640592469 naive bayes f1: 0.6746073845032925


### Modeling with spaCy

Modeling review text with spaCy was used to try to better process the large amount of text in the data and handle the context of the review and see if it resulted in better performing models.

In [55]:
XS = df['sp_lm']
yS = target

In [56]:
XS_train, XS_test, yS_train, yS_test = train_test_split(XS, yS, test_size=0.2, random_state=1)
tfVectorizer = TfidfVectorizer()

XS_train_tf = tfVectorizer.fit_transform(XS_train)
XS_test_tf = tfVectorizer.transform(XS_test)

#### Random Forest

In [57]:
rf_classifier = RandomForestClassifier(n_estimators=250)

In [58]:
startTime = datetime.now()

rf_classifier.fit(XS_train_tf, yS_train)
yS_preds = rf_classifier.predict(XS_test_tf)
print('random forest accuracy:',accuracy_score(yS_test, yS_preds),
      'random forest f1:',f1_score(yS_test, yS_preds, average = 'weighted'))
print(datetime.now() - startTime)

random forest accuracy: 0.8515852715483909 random forest f1: 0.8235248252766394
0:08:16.000135


#### Naive Bayes

In [59]:
nb_classifier = MultinomialNB()

In [60]:
startTime = datetime.now()

nb_classifier.fit(XS_train_tf, yS_train)
yS_preds = nb_classifier.predict(XS_test_tf)
print('naive bayes accuracy:',accuracy_score(yS_test, yS_preds),
      'naive bayes f1:',f1_score(yS_test, yS_preds, average = 'weighted'))
print(datetime.now() - startTime)

naive bayes accuracy: 0.7887097733626792 naive bayes f1: 0.7165917504732271
0:00:00.081752


#### SVM

In [61]:
svc_classifier = SVC(kernel='linear', C = 1.0)

In [63]:
svc_classifier = SVC(kernel='rbf', C= 1.0, class_weight = 'balanced')

In [64]:
startTime = datetime.now()

svc_classifier.fit(XS_train_tf, yS_train)
yS_preds = svc_classifier.predict(XS_test_tf)
print('support vector machine accuracy:',accuracy_score(yS_test, yL_preds),
      'support vector machine f1:',f1_score(yS_test, yS_preds, average = 'weighted'))
print(datetime.now() - startTime)

support vector machine accuracy: 0.7734221640592469 support vector machine f1: 0.8892753959425289
0:43:17.802017
