## Modeling

In [32]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score, precision_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
import string
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from datetime import datetime

%matplotlib inline

In [2]:
df = pd.read_csv('clean_critics.csv', engine='python')

In [4]:
df = df.drop(columns = 'Unnamed: 0')

In [11]:
df = df[df['clean_review'].notna()]
df = df[df['sent_class'].notna()]

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,artist,label,release_date,metascore,user_score,genre,summary,name,...,clean_review,length,word_count,sentiment,negative,neutral,positive,sent_class,rating_sent,sp_lm
0,0,The Wanting,Glenn Jones,Thrill Jockey,2011-09-13,77.0,tbd,Adult Alternative,The fourth solo acoustic album for the guitari...,Mojo,...,best dreams oct p,103.0,8.0,0.7845,0.0,0.225,0.775,1.0,1.0,good dream oct p
1,1,The Wanting,Glenn Jones,Thrill Jockey,2011-09-13,77.0,tbd,Adult Alternative,The fourth solo acoustic album for the guitari...,Boston Globe,...,jones furthers exploratory path hes committed ...,351.0,43.0,0.9274,0.078,0.493,0.428,1.0,1.0,jones further exploratory path s commit tran...
2,2,The Wanting,Glenn Jones,Thrill Jockey,2011-09-13,77.0,tbd,Adult Alternative,The fourth solo acoustic album for the guitari...,AllMusic,...,jones performances compositions years touched ...,372.0,56.0,0.4767,0.0,0.812,0.188,1.0,1.0,jones performances composition year touch them...
3,3,The Wanting,Glenn Jones,Thrill Jockey,2011-09-13,77.0,tbd,Adult Alternative,The fourth solo acoustic album for the guitari...,Drowned In Sound,...,wanting boasts technical excellence cosy welco...,197.0,20.0,0.8932,0.0,0.457,0.543,1.0,1.0,want boast technical excellence cosy welcoming...
4,4,The Wanting,Glenn Jones,Thrill Jockey,2011-09-13,77.0,tbd,Adult Alternative,The fourth solo acoustic album for the guitari...,PopMatters,...,makes space creates landscape invites,151.0,17.0,0.2732,0.0,0.656,0.344,1.0,1.0,make space create landscape invite


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110768 entries, 0 to 110896
Data columns (total 22 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   title         110768 non-null  object 
 1   artist        110768 non-null  object 
 2   label         110355 non-null  object 
 3   release_date  110768 non-null  object 
 4   metascore     110768 non-null  float64
 5   user_score    110768 non-null  object 
 6   genre         110768 non-null  object 
 7   summary       109085 non-null  object 
 8   name          110768 non-null  object 
 9   date          43840 non-null   object 
 10  rating        110768 non-null  float64
 11  review        110768 non-null  object 
 12  clean_review  110768 non-null  object 
 13  length        110768 non-null  float64
 14  word_count    110768 non-null  float64
 15  sentiment     110768 non-null  float64
 16  negative      110768 non-null  float64
 17  neutral       110768 non-null  float64
 18  posi

In [6]:
stopwords_list=stopwords.words('english')+list(string.punctuation)+['album','albums', 'songs', 'song', 'music', 'like', 'one']

### Tokenize

In [7]:
tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')

In [8]:
#function to remove all stopwords, punctuations & unimportant words from the reviews and make a list
def reduce(text):
    tokens = tokenizer.tokenize(text) # tokenize every review
    removed = [token.lower() for token in tokens if token.lower() not in stopwords_list]
    return removed

In [18]:
df['clean_review']=df['clean_review'].apply(str)


In [19]:
clean_reviews = df['clean_review']
target = df['sent_class']

In [20]:
# remove all stopwords, punctuations & unimportant words from the reviews and make a list
processed_data = list(map(reduce, clean_reviews))

In [21]:
processed_data 

[['best', 'dreams', 'oct', 'p'],
 ['jones',
  'furthers',
  'exploratory',
  'path',
  'hes',
  'committed',
  'tranquil',
  'yet',
  'compelling',
  'acoustic',
  'steelstring',
  'guitar',
  'compositions',
  'built',
  'thoughtful',
  'open',
  'tunings',
  'kind',
  'expressive',
  'bottleneck',
  'guitar',
  'even',
  'win',
  'fail',
  'even',
  'banjo',
  'great',
  'swamp',
  'way',
  'rout'],
 ['jones',
  'performances',
  'compositions',
  'years',
  'touched',
  'themes',
  'yearning',
  'past',
  'soft',
  'echo',
  'present',
  'day',
  'stand',
  'reason',
  'title',
  'cover',
  'arta',
  'guitarplaying',
  'cat',
  'looking',
  'moonshould',
  'sum',
  'feeling',
  'wanting',
  'well'],
 ['wanting',
  'boasts',
  'technical',
  'excellence',
  'cosy',
  'welcoming',
  'atmosphere',
  'simple',
  'combination',
  'perhaps',
  'hugely',
  'rewarding'],
 ['makes', 'space', 'creates', 'landscape', 'invites'],
 ['perhaps',
  'whatever',
  'hes',
  'wishing',
  'doesnt',
  's

### Lemmatization

In [22]:
lemmatizer = WordNetLemmatizer()

In [23]:
lem_review = []
for j in processed_data:
    lem = ' '.join([lemmatizer.lemmatize(w) for w in j])
    lem_review.append(lem)

### TF-IDF Vectorization

In [24]:
XL = lem_review
yL = target

In [25]:
XL_train, XL_test, yL_train, yL_test = train_test_split(XL, yL, test_size=0.2, random_state=1)
tfVectorizer = TfidfVectorizer()

XL_train_tf = tfVectorizer.fit_transform(XL_train)
XL_test_tf = tfVectorizer.transform(XL_test)

### Baseline Model

In [51]:
#Fitting & predicting the Dummy Classifier (Baseline Model)
from sklearn.dummy import DummyClassifier
dclf = DummyClassifier() 

In [52]:
dclf.fit(XL_train_tf, yL_train)
yL_preds = dclf.predict(XL_test_tf)
print('dummy accuracy:',accuracy_score(yL_test, yL_preds),
      'dummy forest f1:',f1_score(yL_test, yL_preds, average = 'weighted'))

dummy accuracy: 0.48132443161313604 dummy forest f1: 0.48142467455421245




### Random Forest

In [53]:
rf_classifier = RandomForestClassifier(n_estimators=250)

In [54]:
startTime = datetime.now()

rf_classifier.fit(XL_train_tf, yL_train)
yL_preds = rf_classifier.predict(XL_test_tf)
print('random forest accuracy:',accuracy_score(yL_test, yL_preds),
      'random forest f1:',f1_score(yL_test, yL_preds, average = 'weighted'))
print(datetime.now() - startTime)

random forest accuracy: 0.8153644893540238 random forest f1: 0.7966156506921931
0:11:20.793319


### Naive Bayes

In [26]:
nb_classifier = MultinomialNB()

In [29]:
startTime = datetime.now()

nb_classifier.fit(XL_train_tf, yL_train)
yL_preds = nb_classifier.predict(XL_test_tf)
print('naive bayes accuracy:',accuracy_score(yL_test, yL_preds),
      'naive bayes f1:',f1_score(yL_test, yL_preds, average = 'weighted'))
print(datetime.now() - startTime)

naive bayes accuracy: 0.6492281303602059 naive bayes f1: 0.5132713495834365
0:00:00.190882


### SVM - Final Model

In [30]:
svc_classifier = SVC(kernel='rbf', C= 1.0, class_weight = 'balanced')

In [62]:
startTime = datetime.now()

svc_classifier.fit(XL_train_tf, yL_train)
yL_preds = svc_classifier.predict(XL_test_tf)
print('support vector machine accuracy:',accuracy_score(yL_test, yL_preds),
      'support vector machine f1:',f1_score(yL_test, yL_preds, average = 'weighted'))
print(datetime.now() - startTime)

support vector machine accuracy: 0.8425658607001083 support vector machine f1: 0.8437809748890294
0:58:18.315062


### Grid SearchCV Naive Bayes Model

In [64]:
nb_params = {'alpha': [0.01,0.03,0.05,0.07,0.09,0.11,0.13,0.15,0.17,0.19],
              'fit_prior': [True, False],
              'class_prior': [[-1,0,1],[1,0,-1]]}

In [65]:
grid_nb = GridSearchCV(nb_classifier, param_grid=nb_params, cv=7, scoring='accuracy', verbose =1, n_jobs=-1)
grid_nb.fit(XL_train_tf, yL_train)

Fitting 7 folds for each of 40 candidates, totalling 280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed:    6.8s finished
  self.class_log_prior_ = np.log(class_prior)
  self.class_log_prior_ = np.log(class_prior)


GridSearchCV(cv=7, estimator=MultinomialNB(), n_jobs=-1,
             param_grid={'alpha': [0.01, 0.03, 0.05, 0.07, 0.09, 0.11, 0.13,
                                   0.15, 0.17, 0.19],
                         'class_prior': [[-1, 0, 1], [1, 0, -1]],
                         'fit_prior': [True, False]},
             scoring='accuracy', verbose=1)

In [66]:
# examine the best model
print(grid_nb.best_score_)
# Dictionary containing the parameters (min_samples_split) used to generate that score
print(grid_nb.best_params_)
# Shows default parameters that we did not specify
print(grid_nb.best_estimator_)
#Identify the best score during fitting with cross-validation

0.641606893491904
{'alpha': 0.01, 'class_prior': [1, 0, -1], 'fit_prior': True}
MultinomialNB(alpha=0.01, class_prior=[1, 0, -1])


In [67]:
yL_preds = grid_nb.best_estimator_.predict(XL_test_tf)
print('naive bayes accuracy:',accuracy_score(yL_test, yL_preds),
      'naive bayes f1:',f1_score(yL_test, yL_preds, average = 'weighted'))

naive bayes accuracy: 0.6459761818837965 naive bayes f1: 0.5070367750809013


### Modeling with spaCy

Modeling review text spaCy was used to try to better process the large amount of text in the data and handle the context of the review and see if it resulted in better performing models.

In [41]:
XS = df['sp_lm']
yS = target

In [42]:
XS_train, XS_test, yS_train, yS_test = train_test_split(XS, yS, test_size=0.2, random_state=1)
tfVectorizer = TfidfVectorizer()

XS_train_tf = tfVectorizer.fit_transform(XS_train)
XS_test_tf = tfVectorizer.transform(XS_test)

#### Random Forest

In [43]:
rf_classifier = RandomForestClassifier(n_estimators=250)

In [44]:
startTime = datetime.now()

rf_classifier.fit(XS_train_tf, yS_train)
yS_preds = rf_classifier.predict(XS_test_tf)
print('random forest accuracy:',accuracy_score(yS_test, yS_preds),
      'random forest f1:',f1_score(yS_test, yS_preds, average = 'weighted'))
print(datetime.now() - startTime)

random forest accuracy: 0.8061620353662937 random forest f1: 0.7853735547614679
0:10:40.732026


#### Naive Bayes

In [45]:
nb_classifier = MultinomialNB()

In [46]:
startTime = datetime.now()

nb_classifier.fit(XS_train_tf, yS_train)
yS_preds = nb_classifier.predict(XS_test_tf)
print('naive bayes accuracy:',accuracy_score(yS_test, yS_preds),
      'naive bayes f1:',f1_score(yS_test, yS_preds, average = 'weighted'))
print(datetime.now() - startTime)

naive bayes accuracy: 0.6474197040779502 naive bayes f1: 0.510363029492351
0:00:00.078231


#### SVM

In [47]:
svc_classifier = SVC(kernel='linear', C = 1.0)

In [48]:
svc_classifier = SVC(kernel='rbf', C= 1.0, class_weight = 'balanced')

In [50]:
startTime = datetime.now()

svc_classifier.fit(XS_train_tf, yS_train)
yS_preds = svc_classifier.predict(XS_test_tf)
print('support vector machine accuracy:',accuracy_score(yS_test, yS_preds),
      'support vector machine f1:',f1_score(yS_test, yS_preds, average = 'weighted'))
print(datetime.now() - startTime)

support vector machine accuracy: 0.8360700108264164 support vector machine f1: 0.8380091409789721
0:57:06.546422
