In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn import metrics

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm

import string

In [2]:
# Load the training reviews and keep a 1.5% random subsample of the rows.
data = pd.read_csv("./data/X_train.csv")
# random_state pins the subsample: without it every kernel restart draws a
# different set of rows, so the fitted model cannot be reproduced.
data = data.sample(frac=0.015, random_state=42)
# Rows with a missing 'Summary' are kept (that filter is disabled):
# data = data[data['Summary'].notna()]
# Rows with a missing 'Text' are dropped; text_process() calls .lower() on
# each value, which requires a string.
data = data[data['Text'].notna()]

In [3]:
# Feature frame: the 'Id' and 'Text' columns; target: the 'Score' column.
X = data.loc[:, ['Id', 'Text']]
Y = data.loc[:, 'Score']
# Bare last expression so the notebook renders the feature frame.
X

Unnamed: 0,Id,Text
22573,27293,Ambitious dancer uses the horny goat leader of...
635987,772828,Currently everybody thinks of him as directing...
1098855,1335223,Beth (Kristen Bell) is a N.Y. curator who take...
968490,1176851,Rob Halford (aka The Metal God) and his bandma...
995135,1209158,yeah so what am i going to say you don't alrea...
...,...,...
64811,78614,I'm not going to waste time on the story of Ci...
437926,532004,This was a great movie! My dad was in the Navy...
399347,485138,I love how Jim Jarmusch lets the viewer decide...
628862,764126,I had forgotten about this series. It wasn't o...


In [4]:
def text_process(text, stop_words=None):
    """Lower-case and tokenize ``text``, keeping alphabetic non-stop-word tokens.

    Parameters
    ----------
    text : str
        Raw text to process.
    stop_words : iterable of str, optional
        Words to discard. Defaults to NLTK's English stop-word list.
        Callers processing many documents can pass a precomputed collection
        to avoid reloading the list on each call.

    Returns
    -------
    list of str
        Lower-cased tokens, in their original order, for which
        ``str.isalpha()`` is true and which are not in ``stop_words``.
    """
    if stop_words is None:
        # Load the list once per call. The original evaluated
        # stopwords.words('english') again for every single token.
        stop_words = stopwords.words('english')
    # A set gives O(1) membership tests instead of scanning a list per token.
    stop_words = frozenset(stop_words)

    tokens = word_tokenize(text.lower())
    return [tok for tok in tokens if tok.isalpha() and tok not in stop_words]
    
def text_process_fast(reviewText, stop_words=None):
    """Strip punctuation from ``reviewText`` and drop stop words.

    Punctuation characters (``string.punctuation``) are deleted, not replaced
    by spaces, so ``"don't"`` becomes ``"dont"``. The remaining text is split
    on whitespace. Surviving words keep their original casing; only the
    stop-word comparison is case-insensitive.

    Parameters
    ----------
    reviewText : str
        Raw text to process.
    stop_words : iterable of str, optional
        Lower-case words to discard. Defaults to NLTK's English stop-word
        list. Callers processing many documents can pass a precomputed
        collection to avoid reloading the list on each call.

    Returns
    -------
    list of str
        Whitespace-separated words, in order, whose lower-cased form is not
        in ``stop_words``.
    """
    if stop_words is None:
        # Load the list once per call. The original evaluated
        # stopwords.words('english') again for every single word.
        stop_words = stopwords.words('english')
    # A set gives O(1) membership tests instead of scanning a list per word.
    stop_words = frozenset(stop_words)

    nopunc_text = ''.join(ch for ch in reviewText if ch not in string.punctuation)
    return [word for word in nopunc_text.split() if word.lower() not in stop_words]

In [5]:
# Smoke test: run the tokenizer on the second sampled review (position 1).
text_process(X['Text'].iloc[1])

['currently',
 'everybody',
 'thinks',
 'directingpan',
 'labyrinthor',
 'thehellboymovies',
 'beginning',
 'career',
 'guillermo',
 'del',
 'toro',
 'honed',
 'directorial',
 'skills',
 'truly',
 'brilliant',
 'unique',
 'movie',
 'called',
 'cronos',
 'expertly',
 'blended',
 'alchemy',
 'vampirism',
 'creeping',
 'psychological',
 'dealer',
 'jes',
 'uacute',
 'gris',
 'federico',
 'luppi',
 'handling',
 'angel',
 'statue',
 'finds',
 'insectile',
 'metal',
 'object',
 'bottom',
 'bites',
 'injecting',
 'strange',
 'fluid',
 'soon',
 'jes',
 'uacute',
 'finds',
 'addicted',
 'device',
 'finds',
 'slowly',
 'restoring',
 'youth',
 'strength',
 'party',
 'also',
 'finds',
 'giving',
 'hunger',
 'wealthy',
 'dying',
 'businessman',
 'determined',
 'find',
 'device',
 'sends',
 'brutal',
 'nephew',
 'angel',
 'ron',
 'perlman',
 'find',
 'angel',
 'even',
 'kills',
 'jes',
 'uacute',
 'old',
 'man',
 'tell',
 'wants',
 'know',
 'jes',
 'uacute',
 'rises',
 'undead',
 'creature',
 'still

In [6]:
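# Disabled experiment: fit TF-IDF on the review text alone.
# (The vectorizer must receive the 'Text' column; iterating the DataFrame X
# itself would yield its column labels, not the reviews.)
# tfidf = TfidfVectorizer(analyzer=text_process, max_features=2000)
# vocab = tfidf.fit_transform(X['Text'])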

In [7]:
# print(vocab)

In [8]:
# Two classifiers are instantiated here; the pipeline cell uses `svc` as its
# final step.
nb = MultinomialNB()
svc = SVC(
    C=1,
    gamma='scale',
    class_weight='balanced',
)

In [9]:
# TF-IDF features (tokenized by text_process, vocabulary capped at 25000
# terms) feeding the SVC defined in an earlier cell.
pipeline = Pipeline(
    [
        ('Tf-idf', TfidfVectorizer(analyzer=text_process, max_features=25000)),
        ('classifier', svc),
    ],
    verbose=True,
)
# Hold out 15% of the sample. random_state pins the split so the same rows
# land in train/test on every run; without it the split is not reproducible.
# NOTE(review): x_test / y_test are not scored in the visible cells.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.15, random_state=42
)
# Only the 'Text' column is fed to the model; 'Id' stays in x_train/x_test.
pipeline.fit(x_train['Text'], y_train)

[Pipeline] ............ (step 1 of 2) Processing Tf-idf, total= 7.4min
[Pipeline] ........ (step 2 of 2) Processing classifier, total= 3.9min


Pipeline(steps=[('Tf-idf',
                 TfidfVectorizer(analyzer=<function text_process at 0x0000022A0C3F8430>,
                                 max_features=25000)),
                ('classifier', SVC(C=1, class_weight='balanced'))],
         verbose=True)

In [10]:
# Load the rows that need a predicted Score.
predictionSet = pd.read_csv("./data/prediction.csv")
# predictionSet = predictionSet[predictionSet['Summary'].notnull()]
# Missing reviews are filled with a placeholder instead of being dropped, so
# every row still gets a prediction. The result is assigned back to the
# column: calling fillna(..., inplace=True) on predictionSet['Text'] is a
# chained in-place update, which under pandas Copy-on-Write does not modify
# predictionSet and would leave NaN values for text_process() to choke on.
predictionSet['Text'] = predictionSet['Text'].fillna("na")

In [11]:
# Model input for inference: the review text column only.
x_predict = predictionSet.loc[:, 'Text']

In [14]:
# Run the fitted pipeline over the prediction texts and store the labels in
# a 'Score' column on the prediction frame.
predicted_scores = pipeline.predict(x_predict)
predictionSet['Score'] = predicted_scores
# Disabled offline variant, scoring the held-out split instead:
# x_test['Score'] = pipeline.predict(x_test['Text'])
# x_test = x_test.sort_values(by=['Id'])
# x_test
# Bare last expression so the notebook renders the frame with its new column.
predictionSet

Unnamed: 0,Id,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Time,Summary,Text,Score
0,5,0005019281,A2L0G56BNOTX6S,0,0,1383696000,Dickens updated.,This has been a favorite movie of mine for a l...,5.0
1,11,0005019281,A33EWPXESP9GQH,0,0,1390780800,Good Version,Even though i don't care for Henry Winklers a...,3.0
2,17,0005019281,A13KAQO9F5X0FN,0,0,1389657600,the fonz does scrooge,Anorher good movie for holiday watchers..a lit...,4.0
3,46,0005019281,A306NASGVUDFKF,10,14,1132963200,A refreshing twist on a Holiday classic,My wife and I grew up in New Hampshire where t...,5.0
4,47,0005019281,A38G1NN5SD81GD,0,1,1384905600,Not my favorite,"This is a first for me, I didn't like this mov...",2.0
...,...,...,...,...,...,...,...,...,...
299995,1697520,B00LH9ROKM,AYB0IXBPBJ20A,0,1,1404345600,"Basically an Episode of Criminal Minds, See It...",Just how seriously one should take Scott Derri...,4.0
299996,1697522,B00LT1JHLW,AU73NIGESSIRE,25,88,1405555200,"July 17, 2014 - the first day of pre-order (wi...",Let's be clear - the 5 stars are for the serie...,5.0
299997,1697524,B00LT1JHLW,A3PPYOJBMFBP6U,3,10,1405728000,Please Include The 'Batman In Color' Bumper Wh...,I would also like to see the original 20th Cen...,1.0
299998,1697527,B00LT1JHLW,A2CA2Q6JS6CQAE,10,14,1405987200,Finally on dvd and blu-ray The Batman TV Series,Finally to be released on DVD and Blu-Ray Nove...,5.0


In [15]:
# Keep only the two columns written to the submission file.
submission = predictionSet.loc[:, ['Id', 'Score']]
# Disabled offline variant built from the held-out split:
# submission_offline = x_test[['Id', 'Score']]
# print(submission_offline.head())
# index=False keeps the DataFrame index out of the CSV.
submission.to_csv(path_or_buf="./data/submission.csv", index=False)
# submission_offline.to_csv("./data/submission_offline.csv", index=False)