In [1]:
import re
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer 
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC  
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
nltk.download('punkt', 'stopwords', 'wordnet')

True

**Import Data**

In [2]:
ytrn = np.load('/Users/dorisveronicaavedikian/Desktop/Spring 2022/CSCI 6366/Paper and Competition /csci6366-data-mining-competition-utrgv/y_train.npy').ravel()
xtrn = open('X_train.txt','r', encoding = 'utf-8').readlines()
xtest = open('X_test.txt','r', encoding = 'utf-8').readlines()

We use the following code to process and create a dictionary of all words present across all the documents. 
The dictionary will contain all unique words across the corpus and each word in the dictionary will be treated 
as a feature.

In [3]:
count_vectorize = CountVectorizer()
feature_vector = count_vectorize.fit(xtrn)
features = feature_vector.get_feature_names()
print("Total # of unique words or features: ", len(features))
print("Random Sample:", random.sample(features, 10))

Total # of unique words or features:  5658
Random Sample: ['olds', 'drying', 'stunning', 'safes', 'stupid', 'charging', 'playthrough', 'realistic', 'negeves', 'shelter']


In [4]:
train_features = count_vectorize.transform(xtrn)
print("Dimensions (# of Reviews, # of Features):", train_features.shape)

Dimensions (# of Reviews, # of Features): (1000, 5658)


**Function to preprocess the reviews.**

In [None]:
def CleanData(data_to_clean):
    
    clean_data=[]
    ps=PorterStemmer()
    wnl = WordNetLemmatizer()
    
    removewords = ['peopl', 'much', 'go', 'make', 'realli', 'thing', 'well', 
            'even', 'time', 'one', 'still', 'tri', 'get', 'also', 'way', 'first', 'say']
    
    stopwords_nltk = stopwords.words('english')
    removewords += stopwords_nltk
    
    for text in data_to_clean:
        review = re.sub('[^a-zA-Z]', ' ', text).lower()
        
        stemlem = [ps.stem(k) and wnl.lemmatize(k) for k in word_tokenize(review) if not k
                     in set(stopwords.words('english'))]
        
        for word in stemlem:
            if word in removewords:
                stemlem.remove(word)
                
        clean_text = ' '.join(stemlem)
        clean_data.append(clean_text)
        
    return(clean_data)  

In [7]:
trn_data = pd.DataFrame({'Before': xtrn,'Label': ytrn})
test_data = pd.DataFrame({'Before': xtest})

clean_xtrn, clean_xtest = [CleanData(k) for k in [xtrn, xtest]]

trn_data['After'] = clean_xtrn
test_data['After'] = clean_xtest

In [8]:
count_vectorize = CountVectorizer()
feature_vector = count_vectorize.fit(clean_xtrn)
features = feature_vector.get_feature_names()
print("# of unique words after cleaning data: ", len(features))

# of unique words after cleaning data:  4825


In [9]:
train_features = count_vectorize.transform(clean_xtrn)
print("Dimensions after cleaning (# of Reviews, # of Features):", train_features.shape)

Dimensions after cleaning (# of Reviews, # of Features): (1000, 4825)


The following feature count was used after cleaning the data to determine which words were showing up the most 
that arent traditional english stop words and might not be helpful (i.m.o) but are repeated a lot. The feature count after 
cleaning was used to come up with the list of words in "removewords" above in CleanData(data_to_clean).

In [11]:
count_vectorize = CountVectorizer()

feature_vector = count_vectorize.fit(clean_xtrn)
features = feature_vector.get_feature_names()

train_ds_features = count_vectorize.transform(clean_xtrn)
feature_counts=np.sum(train_ds_features.toarray(), axis=0)

feature_counts=pd.DataFrame(dict(features = features, counts = feature_counts))
feature_counts.sort_values('counts', ascending=False) [0:20]

Unnamed: 0,features,counts
1718,game,1439
298,bad,879
3137,play,237
2436,like,226
1820,good,195
1693,fun,167
4769,would,133
1845,great,131
3424,really,126
526,buy,107


In [14]:
trn_data[['Before', 'After']].sample(5)

Unnamed: 0,Before,After
8,great game play it like everyday i recomend th...,great game play like everyday recomend everyone
276,its just relly fun\n,relly fun
835,"i don't like high textures, sorry.\n",like high texture sorry
95,meow meow!\n,meow meow
845,"takes a long time to finish each matches, and ...",take long finish match anyways im tf item lol


**Training Models**

In [15]:
xtr, xval, ytr, yval = train_test_split(trn_data.After, ytrn, test_size= 0.20)

In [16]:
vec = CountVectorizer()
tfidf_vec = TfidfTransformer()

xtrn_vec = vec.fit_transform(xtr)
xtrn_tfidf1 = tfidf_vec.fit_transform(xtrn_vec)
xtest_vec = vec.transform(xval)
xtest_tfidf1 = tfidf_vec.transform(xtest_vec)

In [17]:
model1 = LogisticRegression()
x = model1.fit(xtrn_tfidf1, ytr)
print("Model 1 / Logistic Regression Accuracy")
print("Training:", accuracy_score(ytr, x.predict(xtrn_tfidf1)))
print("Testing:", accuracy_score(yval, x.predict(xtest_tfidf1)))

Model 1 / Logistic Regression Accuracy
Training: 0.9625
Testing: 0.805


In [18]:
model2 = LinearSVC()
x = model2.fit(xtrn_tfidf1, ytr)
print("Model 2 / LinearSVC Accuracy")
print("Training:", accuracy_score(ytr, x.predict(xtrn_tfidf1)))
print("Testing:", accuracy_score(yval, x.predict(xtest_tfidf1)))

Model 2 / LinearSVC Accuracy
Training: 0.99625
Testing: 0.79


In [19]:
model3 = MLPClassifier()
x = model3.fit(xtrn_tfidf1, ytr)
print("Model 3 / MLPClassifier Accuracy")
print("Training:", accuracy_score(ytr, x.predict(xtrn_tfidf1)))
print("Testing:", accuracy_score(yval, x.predict(xtest_tfidf1)))

Model 3 / MLPClassifier Accuracy
Training: 0.9975
Testing: 0.775


In [20]:
model4 = MultinomialNB()
x = model4.fit(xtrn_tfidf1, ytr)
print("Model 4 / MultinomialNB Accuracy")
print("Training:", accuracy_score(ytr, x.predict(xtrn_tfidf1)))
print("Testing:", accuracy_score(yval, x.predict(xtest_tfidf1)))

Model 4 / MultinomialNB Accuracy
Training: 0.97625
Testing: 0.79


**Final Models - Testing Data**

In [21]:
vec = CountVectorizer()
tfidf_vec = TfidfTransformer()

xtrn_vec = vec.fit_transform(trn_data.After)
xtrn_tfidf1 = tfidf_vec.fit_transform(xtrn_vec)
xtest_vec = vec.transform(test_data.After)
xtest_tfidf1 = tfidf_vec.transform(xtest_vec)

In [22]:
LSVC = LinearSVC()
LSVC.fit(xtrn_tfidf1, ytrn)
ypred = LSVC.predict(xtest_tfidf1)

df = pd.DataFrame()
df['id'] = range(1, xtest_tfidf1.shape[0] + 1)
df['label'] = ypred
df.to_csv('LSVC.csv', index = False)

In [23]:
MLP = MLPClassifier()
MLP.fit(xtrn_tfidf1, ytrn)
pred_test = MLP.predict(xtest_tfidf1)

df = pd.DataFrame()
df['id'] = range(1, xtest_tfidf1.shape[0] + 1)
df['label'] = pred_test
df.to_csv('MLP.csv', index = False)

In [24]:
lg = LogisticRegression()
lg.fit(xtrn_tfidf1, ytrn)
ypred=lg.predict(xtest_tfidf1)

df = pd.DataFrame()
df['id'] = range(1, xtest_tfidf1.shape[0] + 1)
df['label'] = ypred
df.to_csv('LG.csv', index = False)

In [25]:
NB = MultinomialNB()
NB.fit(xtrn_tfidf1, ytrn)
ypred=lg.predict(xtest_tfidf1)

df = pd.DataFrame()
df['id'] = range(1, xtest_tfidf1.shape[0] + 1)
df['label'] = ypred
df.to_csv('NB.csv', index = False)