# Model Fitting and Testing

### Importing Data

In [132]:
import pandas as pd

data = pd.read_csv("allcleanwithbrand.csv")
data["review"] = data["title"] + data["desc"]
data["review"] = data["review"].fillna("")

In [162]:
from sklearn.model_selection import train_test_split
training_x, testing_x, train_y, test_y = train_test_split(data["review"], data["Brand"], test_size = 0.25, random_state = 12)


In [134]:
print(training_y.value_counts())
print(testing_y.value_counts())

Sony         777
Nintendo     560
Microsoft    486
Other        274
Arcade       131
Meta          30
Name: Brand, dtype: int64
Sony         235
Nintendo     195
Microsoft    157
Other        106
Arcade        46
Meta          14
Name: Brand, dtype: int64


In [135]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

lemmatizer = nltk.stem.WordNetLemmatizer()
tokencomp = []
for review in list(training_x):
    tokens = nltk.word_tokenize(str(review).lower())
    lemmatized_token = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum()]
    tokencomp.append([token for token in lemmatized_token if token not in stopwords.words('english')])

comp = []
for review in tokencomp:
    comp.append(" ".join(review))
vectorizer = TfidfVectorizer(ngram_range = (1,2), min_df = 2)
vectorizer.fit(comp)

train_x = vectorizer.transform(training_x)
test_x = vectorizer.transform(testing_x)

### Naive Bayes

In [136]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
NBmodel = MultinomialNB()

NBmodel.fit(train_x, train_y)
y_pred_NB = NBmodel.predict(test_x)

acc_NB = accuracy_score(test_y, y_pred_NB)
print("Naive Bayes model Accuracy:: {:.2f}%".format(acc_NB*100))

Naive Bayes model Accuracy:: 73.97%


### Logistic Model

In [137]:
from sklearn.linear_model import LogisticRegression
Logitmodel = LogisticRegression()

Logitmodel.fit(train_x, train_y)
y_pred_logit = Logitmodel.predict(test_x)

acc_logit = accuracy_score(test_y, y_pred_logit)
print("Logit model Accuracy:: {:.2f}%".format(acc_logit*100))

Logit model Accuracy:: 84.99%


### Random Forest

In [138]:
from sklearn.ensemble import RandomForestClassifier

RFmodel = RandomForestClassifier(n_estimators=50, max_depth=6, bootstrap=True, random_state=0)

RFmodel.fit(train_x, train_y)
y_pred_RF = RFmodel.predict(test_x)

acc_RF = accuracy_score(test_y, y_pred_RF)
print("Random Forest Model Accuracy: {:.2f}%".format(acc_RF*100))

Random Forest Model Accuracy: 63.75%


### SVC Model

In [139]:
from sklearn.svm import LinearSVC
SVMmodel = LinearSVC()

SVMmodel.fit(train_x, train_y)
y_pred_SVM = SVMmodel.predict(test_x)

acc_SVM = accuracy_score(test_y, y_pred_SVM)
print("SVM model Accuracy: {:.2f}%".format(acc_SVM*100))

SVM model Accuracy: 88.45%


### Neural Network

In [140]:
from sklearn.neural_network import MLPClassifier
DLmodel = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(3,2), random_state=1)

DLmodel.fit(train_x, train_y)
y_pred_DL= DLmodel.predict(test_x)

acc_DL = accuracy_score(test_y, y_pred_DL)
print("DL model Accuracy: {:.2f}%".format(acc_DL*100))

DL model Accuracy: 73.97%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


### Recurring Neural Network

In [169]:
import numpy as np 

docs_x = []
docs_train_x = []
docs_test_x = []
for review in training_x:
    docs_x.append(nltk.word_tokenize(str(review).lower()))
    docs_train_x.append(nltk.word_tokenize(str(review).lower()))
for review in testing_x:
    docs_x.append(nltk.word_tokenize(str(review).lower()))
    docs_test_x.append(nltk.word_tokenize(str(review).lower()))

from collections import Counter
words = [j for i in docs_x for j in i]
count_words = Counter(words)
total_words = len(words)
sorted_words = count_words.most_common(total_words)
vocab_to_int = {w: i+1 for i, (w,c) in enumerate(sorted_words)} 

text_int = []
for i in docs_train_x:
    r = [vocab_to_int[w] for w in i]
    text_int.append(r)


text_test_int = []
for i in docs_test_x:
    r = [vocab_to_int[w] for w in i]
    text_test_int.append(r)
    


In [184]:
from keras.preprocessing import sequence 
from keras.models import Sequential 
from keras.layers import Dense, Embedding, Flatten 
from keras.layers import LSTM
max_features = total_words
maxlen = 250
batch_size = 32

x_train = sequence.pad_sequences(text_int, maxlen=maxlen)
x_test = sequence.pad_sequences(text_test_int, maxlen=maxlen)

encoded_train = [0 if label =='Sony' else 1 if label == "Nintendo" else 2 if label == "Microsoft" else 3 if label == "Arcade" else 4 if label == "Meta" else 5 for label in train_y]
encoded_test = [0 if label =='Sony' else 1 if label == "Nintendo" else 2 if label == "Microsoft" else 3 if label == "Arcade" else 4 if label == "Meta" else 5 for label in test_y]

model = Sequential()
model.add(Embedding(max_features, 20, input_length=maxlen))
model.add(LSTM(100, dropout=0.10, recurrent_dropout=0.10))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train.tolist(), encoded_train, batch_size=batch_size, epochs=2, validation_data=(x_test.tolist(), encoded_test))

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x2126446d040>