In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn import metrics

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm

import string

In [2]:
# Load the training reviews and keep a 2% random subsample of the rows.
# A fixed random_state makes the subsample (and every output that depends on
# it) reproducible across Restart & Run All.
data = pd.read_csv("./data/X_train.csv")
data = data.sample(frac=0.02, random_state=42)
# data = data[data['Summary'].notna()]
# Drop rows with no review text: the text-processing helpers call string
# methods on each 'Text' value and would fail on NaN.
data = data[data['Text'].notna()]

In [3]:
# Features keep 'Id' next to 'Text' so predictions can be matched back to
# their rows; the target is the 'Score' column.
X, Y = data[['Id', 'Text']], data['Score']
X

Unnamed: 0,Id,Text
59409,71960,"Sorry, but I have to agree with a previous rev..."
471301,572612,So bad I tried to sell it to blockbuster today...
452358,549599,"this movie was awful, nothing like the first m..."
1136165,1380532,"127 HOURSSTARRING: James Franco, Kate Mara, Am..."
872044,1059653,I was caught off guard by how interesting the ...
...,...,...
533109,647734,This movie works on several levels.As a semi-d...
1038346,1261701,"without all of the fancy CGI, and other modern..."
620049,753402,Probably the single most important reason that...
559502,679879,While Queer As Folk is fascinating in that it ...


In [4]:
def text_process(text):
    """Reduce a text to its lower-cased, alphabetic, non-stop-word tokens.

    Args:
        text: The string to process.

    Returns:
        A list of tokens, in their original order, produced by
        ``word_tokenize`` on the lower-cased text, keeping only tokens that
        are purely alphabetic (``str.isalpha``) and that are not in
        ``stopwords.words('english')``.
    """
    # Fetch the stop-word list once per call and use a set for membership
    # tests, instead of calling stopwords.words('english') for every token.
    english_stopwords = set(stopwords.words('english'))
    return [
        word
        for word in word_tokenize(text.lower())
        if word.isalpha() and word not in english_stopwords
    ]
    
def text_process_fast(reviewText):
    """Strip punctuation and stop words from a review using plain string ops.

    Args:
        reviewText: The review string to process.

    Returns:
        A list of whitespace-separated words, in their original order and
        original casing, after every character in ``string.punctuation`` has
        been removed and every word whose lower-cased form is in
        ``stopwords.words('english')`` has been dropped.
    """
    # Fetch the stop-word list once per call and use a set for membership
    # tests, instead of calling stopwords.words('english') for every word.
    english_stopwords = set(stopwords.words('english'))
    nopunc_text = ''.join(ch for ch in reviewText if ch not in string.punctuation)
    return [word for word in nopunc_text.split() if word.lower() not in english_stopwords]

def tokenizer(text):
    """Tokenize ``text`` with ``word_tokenize``, keeping only alphabetic tokens.

    Returns:
        A list of the tokens for which ``str.isalpha`` is true, in order.
    """
    alphabetic_tokens = []
    for token in word_tokenize(text):
        if token.isalpha():
            alphabetic_tokens.append(token)
    return alphabetic_tokens

In [5]:
# Sanity check: run the full text_process pipeline on the second sampled review.
text_process(X['Text'].iloc[1])

['bad',
 'tried',
 'sell',
 'blockbuster',
 'today',
 'didnt',
 'even',
 'come',
 'system',
 'complete',
 'waste',
 'money']

In [6]:
# tfidf = TfidfVectorizer(analyzer=text_process, max_features=2000)
# vocab = tfidf.fit_transform(X)

In [7]:
# print(vocab)

In [8]:
# Estimators for the review-score model.
# The support-vector classifier uses balanced class weights.
svc = SVC(gamma='scale', class_weight='balanced', C=0.85)
# TF-IDF features built with the custom alphabetic-token tokenizer, capped at
# 21000 vocabulary entries.
tfidf = TfidfVectorizer(
    max_features=21000,
    stop_words='english',
    lowercase=True,
    tokenizer=tokenizer,
)
# Naive Bayes estimator; the pipeline built in this notebook uses `svc`, so
# this looks like a leftover alternative -- TODO confirm before removing.
nb = MultinomialNB()

In [9]:
# Chain TF-IDF vectorization and the SVM classifier; verbose=True prints the
# time spent in each step while fitting.
pipeline = Pipeline([('tf-idf', tfidf), ('classifier', svc)], verbose=True)
# Hold out 15% of the sampled rows. A fixed random_state keeps the split, and
# therefore the fitted model and the hold-out predictions, reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.15, random_state=42)
# Only the 'Text' column is fed to the model; 'Id' is kept in x_train/x_test
# for bookkeeping.
pipeline.fit(x_train['Text'], y_train)

[Pipeline] ............ (step 1 of 2) Processing tf-idf, total=  22.2s
[Pipeline] ........ (step 2 of 2) Processing classifier, total= 8.0min


Pipeline(steps=[('tf-idf',
                 TfidfVectorizer(max_features=21000, stop_words='english',
                                 tokenizer=<function tokenizer at 0x000001C4FADDFB80>)),
                ('classifier', SVC(C=0.85, class_weight='balanced'))],
         verbose=True)

In [10]:
# Load the rows that need a predicted score.
predictionSet = pd.read_csv("./data/prediction.csv")
# predictionSet = predictionSet[predictionSet['Summary'].notnull()]
# Replace missing review text with the placeholder "na" so every row can be
# tokenized. Assign the result back to the column: calling
# fillna(..., inplace=True) on a column selection is chained assignment, which
# warns on recent pandas and does not update the frame under Copy-on-Write.
predictionSet['Text'] = predictionSet['Text'].fillna("na")

In [11]:
# Review text of the rows to score, as a Series.
x_predict = predictionSet.loc[:, 'Text']

In [12]:
# Score every row of the prediction set with the fitted pipeline and store the
# predicted labels in a 'Score' column.
predicted_scores = pipeline.predict(x_predict)
predictionSet['Score'] = predicted_scores

In [13]:
# Predict on the held-out rows.
test_predictions = pipeline.predict(x_test['Text'])
# Report hold-out accuracy. The true labels are looked up by x_test's index
# (assumes the index labels are unique -- TODO confirm), so the comparison
# stays aligned even if this cell is re-run after x_test has been sorted.
print("Hold-out accuracy:", metrics.accuracy_score(y_test.loc[x_test.index], test_predictions))
# Build a new frame with assign() rather than writing a column into x_test,
# which is derived from a slice and can trigger SettingWithCopyWarning.
x_test = x_test.assign(Score=test_predictions).sort_values(by=['Id'])
x_test

Unnamed: 0,Id,Text,Score
93,110,Joseph is amazing story to read and watch. A ...,5.0
437,512,"This is a very basic cooking dvd, which I thin...",4.0
652,786,I love this movie...It has a real sactifying e...,5.0
1325,1613,i'm a fan of olivia dehaviland. she is a wond...,5.0
2187,2644,"If you are reading this, you're already famili...",5.0
...,...,...,...
1396306,1696034,I watched this movie in the theatre and was ve...,4.0
1396345,1696081,"Comedy? Spy thriller? One thing is for sure,...",5.0
1396758,1696584,This film can't decide whether it wants to be ...,3.0
1397049,1696946,I loved this little movie and I think you will...,4.0


In [14]:
# Keep only the columns written to the submission files.
output_columns = ['Id', 'Score']
submission = predictionSet[output_columns]
submission_offline = x_test[output_columns]
print(submission.head())
# Write both frames without the pandas index column.
for frame, destination in (
    (submission, "./data/submission_fast.csv"),
    (submission_offline, "./data/submission_offline_fast.csv"),
):
    frame.to_csv(destination, index=False)

   Id  Score
0   5    5.0
1  11    4.0
2  17    4.0
3  46    5.0
4  47    2.0
