In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import metrics

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm

import string

In [2]:
# Load the training reviews, keep a 0.6% random subsample, and drop rows whose
# review text is missing. The sample is seeded so that a fresh
# "Restart & Run All" selects the same rows every time.
# Rows with a missing 'Summary' are deliberately kept (that filter is disabled):
# data = data[data['Summary'].notna()]
data = (
    pd.read_csv("./data/X_train.csv")
    .sample(frac=0.006, random_state=42)
    .loc[lambda frame: frame['Text'].notna()]
)

In [3]:
# Model inputs: the review id (kept so predictions can be joined back to their
# rows later) and the raw review text.
X = data.loc[:, ['Id', 'Text']]
# Target: the score column of the same rows.
Y = data.loc[:, 'Score']
X

Unnamed: 0,Id,Text
1173575,1426079,Countless specials have been made on dinosaurs...
153915,186828,Enjoyed the film! Complicated story. We are cu...
486238,590709,"...A BEAUTIFUL MIND is really a fine film, wel..."
182260,221240,I've seen the Godfather trilogy many times ove...
792364,962922,It's tough to fathom a G-rated animated film g...
...,...,...
397737,483189,Seeing the R-Rated DVD version of SUPERNOVA ma...
1127361,1369936,THE ROAD TO CHRISTMAS IS A VERY MOVING FAMILY ...
961453,1168340,"From the title ""Dog Tags,"" and the ""come hithe..."
1087883,1321937,This movie was pretty good. The only troubling...


In [4]:
# Lazily filled cache of stop-word sets, keyed by language. Previously
# `stopwords.words('english')` was evaluated inside the comprehension
# condition, i.e. once per token, and searched linearly as a list.
_STOPWORD_CACHE = {}


def _get_stopwords(language='english'):
    """Return the NLTK stop-word list for `language` as a cached frozenset."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def text_process(text):
    """Tokenize a review into lower-case, purely alphabetic, non-stop-word tokens.

    Args:
        text: the review as a single string.

    Returns:
        A list of tokens in their original order. The text is lower-cased and
        split with NLTK's `word_tokenize`; tokens that are not entirely
        alphabetic (punctuation, numbers, mixed tokens) and English stop words
        are dropped.
    """
    stop_words = _get_stopwords()
    tokens = word_tokenize(text.lower())
    return [word for word in tokens if word.isalpha() and word not in stop_words]


def text_process_fast(reviewText):
    """Cheaper analyzer: strip punctuation, split on whitespace, drop stop words.

    Args:
        reviewText: the review as a single string.

    Returns:
        A list of whitespace-separated words with every `string.punctuation`
        character removed. Stop words are matched case-insensitively, but the
        words that are kept retain their original casing.
    """
    stop_words = _get_stopwords()
    without_punctuation = ''.join(
        char for char in reviewText if char not in string.punctuation
    )
    return [
        word for word in without_punctuation.split()
        if word.lower() not in stop_words
    ]

In [5]:
# Sanity check: run the analyzer on the second sampled review (position 1).
sample_review = X['Text'].to_numpy()[1]
text_process(sample_review)

['enjoyed',
 'film',
 'complicated',
 'story',
 'currently',
 'living',
 'germany',
 'reason',
 'fascinated',
 'whole',
 'ludwig',
 'story',
 'great',
 'job']

In [6]:
# tfidf = TfidfVectorizer(analyzer=text_process, max_features=2000)
# vocab = tfidf.fit_transform(X)

In [7]:
# print(vocab)

In [8]:
# Bag-of-words features feeding a multinomial Naive Bayes classifier.
# NOTE: the first step is labelled 'Tf-idf' but is a CountVectorizer (raw term
# counts, no idf weighting); the label is kept so the step name stays stable.
pipeline = Pipeline(
    [
        ('Tf-idf', CountVectorizer(analyzer=text_process)),
        ('classifier', MultinomialNB()),
    ],
    verbose=True,
)
# Hold out 10% of the sampled rows; the split is seeded so the held-out set is
# the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.1, random_state=42
)
# The vectorizer expects an iterable of strings, so only the 'Text' column is
# passed (X also carries 'Id').
pipeline.fit(x_train['Text'], y_train)
# Optional hold-out evaluation (disabled). predict must receive the 'Text'
# column, not the whole frame, and the report has to be printed to be seen:
# prediction = pipeline.predict(x_test['Text'])
# print(metrics.classification_report(y_test, prediction))

[Pipeline] ............ (step 1 of 2) Processing Tf-idf, total= 3.5min
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.0s


Pipeline(steps=[('Tf-idf',
                 CountVectorizer(analyzer=<function text_process at 0x0000019B7605BA60>)),
                ('classifier', MultinomialNB())],
         verbose=True)

In [9]:
# Load the rows that need a predicted score.
predictionSet = pd.read_csv("./data/prediction.csv")
# predictionSet = predictionSet[predictionSet['Summary'].notnull()]
# Missing review text is replaced with the placeholder "na" rather than dropped
# (presumably so every row can still be scored). The result is assigned back to
# the column: calling fillna(inplace=True) on the selected column is a chained
# in-place update, which pandas warns about and which does not modify the
# parent DataFrame once Copy-on-Write is enabled.
predictionSet['Text'] = predictionSet['Text'].fillna("na")

In [10]:
# Review text of the rows to be scored; this is the input for pipeline.predict.
x_predict = predictionSet.loc[:, 'Text']

In [11]:
# Offline check: score the held-out rows instead of the real prediction set.
# Real-submission variant (disabled):
# predictionSet['Score'] = pipeline.predict(x_predict)
predicted_scores = pipeline.predict(x_test['Text'])
x_test['Score'] = predicted_scores
# Order by review id so the rows line up with the submission layout.
x_test = x_test.sort_values(by=['Id'])
x_test

Unnamed: 0,Id,Text,Score
117,136,This is an accurate and realistic depiction of...,5.0
390,456,If there was ever a show on T.V. that could pu...,5.0
1734,2109,I saw this film many years ago and it left an ...,4.0
2119,2561,I bought this VHS movie because it brought bac...,5.0
5446,6568,Loved it....True pure SCIFY...at its best........,5.0
...,...,...,...
1389274,1687472,I don't know how Hollywood found all my uncles...,5.0
1391623,1690307,"Cute movie for kids; blot was ok, not the best...",5.0
1392281,1691120,Although a little slower in the beginning whil...,5.0
1393562,1692669,I had the good fortune of hearing Kate Mulgrew...,5.0


In [12]:
# Write the held-out predictions in submission format (Id, Score).
# Real-submission variant (disabled):
# submission = predictionSet[['Id', 'Score']]
# submission.to_csv("./data/submission.csv", index=False)
submission_offline = x_test.loc[:, ['Id', 'Score']]
preview = submission_offline.head()
print(preview)
# index=False keeps the DataFrame's row labels out of the CSV.
submission_offline.to_csv("./data/submission_offline.csv", index=False)

        Id  Score
117    136    5.0
390    456    5.0
1734  2109    4.0
2119  2561    5.0
5446  6568    5.0
