In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm

In [2]:
# Load the training reviews and keep a 7% random subsample so the
# per-row tokenization in the following cells stays tractable.
data = pd.read_csv("./data/X_train.csv")
# Fixed seed: without it every Restart & Run All draws a different
# subsample, so the vocabularies, the model and the outputs are not reproducible.
data = data.sample(frac=0.07, random_state=42)
# Only the 'Text' column is tokenized downstream, so rows are filtered on
# missing 'Text' only; a missing 'Summary' is not a reason to drop a row.
data = data[data['Text'].notna()]

In [3]:
stop_words = set(stopwords.words('english'))

# One vocabulary per score: every lower-cased, alphabetic, non-stop-word token
# that occurs in at least one review text carrying that score.
# NOTE(review): the 0.0 bag looks like it stays empty (the '0words' column is
# all zeros in the displayed output) — confirm whether score 0 can occur.
# NOTE(review): the bags are built from every row of `data`, including rows a
# later cell holds out as the test split, so the offline test is not fully unseen.
word_bags = {0.0: set(), 1.0: set(), 2.0: set(), 3.0: set(), 4.0: set(), 5.0: set()}

# Iterate only the two columns that are needed instead of materializing a
# Series per row with iterrows(); passing `total` lets tqdm show a real bar.
for text, score in tqdm(zip(data['Text'], data['Score']), total=len(data)):
    tokens = word_tokenize(text.lower())
    # Grow the bag in place. Rebinding it to `bag.union(...)` copied the whole
    # (ever growing) set once per row, which dominated the cell's runtime.
    word_bags[score].update(
        word for word in tokens
        if word not in stop_words and word.isalpha()
    )

97823it [05:32, 294.57it/s]


In [4]:
def get_score_word_column(df, score):
    """Count, for each row of ``df``, the tokens found in one score's word bag.

    Each row's 'Text' is lower-cased and tokenized with ``word_tokenize``;
    tokens that are stop words or not purely alphabetic are discarded. Every
    remaining token that is a member of ``word_bags[score]`` adds one to the
    row's count (repeated tokens are counted each time they occur).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Text' column of strings. Missing values must be filled
        or dropped beforehand, because ``.lower()`` is called on every value.
    score : float
        Key into the module-level ``word_bags`` dictionary.

    Returns
    -------
    list of int
        One count per row, in row order.

    Raises
    ------
    KeyError
        If ``score`` is not a key of ``word_bags``.
    """
    # Look the bag up once rather than once per token.
    bag = word_bags[score]
    score_vec = []
    # Only 'Text' is read, so iterate that column directly; iterrows() would
    # build a full Series for every row just to pull one value out of it.
    for text in tqdm(df['Text'], total=len(df)):
        tokens = word_tokenize(text.lower())
        matches = sum(
            1 for word in tokens
            if word not in stop_words and word.isalpha() and word in bag
        )
        score_vec.append(matches)
    return score_vec

In [5]:
# Add one count column per score bag: each column holds, per review, how many
# of its filtered tokens occur in that score's vocabulary.
train_score_columns = {
    '0words': 0.0,
    '1words': 1.0,
    '2words': 2.0,
    '3words': 3.0,
    '4words': 4.0,
    '5words': 5.0,
}
for column_name, bag_score in train_score_columns.items():
    data[column_name] = get_score_word_column(data, bag_score)

97823it [01:34, 1036.19it/s]
97823it [01:35, 1027.52it/s]
97823it [01:35, 1027.90it/s]
97823it [01:34, 1033.22it/s]
97823it [01:34, 1035.61it/s]
97823it [01:34, 1033.64it/s]


In [6]:
# Show the sampled training frame, now carrying the six '<score>words' count columns.
data

Unnamed: 0,Id,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,0words,1words,2words,3words,4words,5words
445096,540767,B00003CXXR,A1ZKK7GPHUFIMH,0,0,2.0,995500800,Great movie destroyed by a bad ending.,Jurrassic park 3 is a great addition to the se...,0,50,54,50,52,53
358308,435426,6304773307,AQ01Q3070LT29,2,3,3.0,1058227200,Can a straight Bronx man live with a clean gay...,After catching his brother having sex with his...,0,54,55,58,57,57
1295874,1574335,B008QP3ZYE,ANV4I96IQ28T,5,9,4.0,1379462400,"Great Work, Awful Presentation",The Story of Film is a great piece of work and...,0,82,80,81,85,84
1203782,1462681,B005IZLPKQ,AHBNPMHDWZ7SB,3,4,4.0,1341792000,The Avengers Saga Enters the Golden Age,After years of awaiting a movie worth this his...,0,794,802,810,824,820
713603,867243,B0006GAO54,A82LIVYSX6WZ9,7,10,5.0,1109030400,Angel says goodbye but he will live on forever,Angel (David Boreanaz) is now the head honcho ...,0,477,491,503,519,532
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1112160,1351351,B003L77G7Y,A23N0VB0K2QUA,0,0,5.0,1356652800,loved every episode,american dad often gets overshadowed by family...,0,18,18,18,18,18
1324367,1608870,B00AF6B22E,A2Z29N73CUH3GB,0,0,4.0,1395273600,Very Good Movie,"Flight is a very good movie. Great story, gre...",0,20,20,20,20,20
483597,587515,B00005JKJM,A1NVW4NUXPOSJB,0,2,3.0,1074816000,Slow but well done !,The acting and filming are very good. Lighting...,0,31,31,31,31,31
514171,624740,B00005JMZK,A3ROE64EVHDTTV,1,1,4.0,1207180800,But My Mommy Told Me There Were No Real Monste...,The Good Things*Cool special effects and some ...,0,169,171,171,172,172


In [7]:
# Target as a one-column frame; everything else (ids, text, metadata and the
# word-count features) stays together in X so the held-out rows keep their 'Id'.
Y = pd.DataFrame(data['Score'])
X = data.drop(columns=['Score'])

# Hold out 10% for offline evaluation. The seed is fixed so the same rows land
# in the test split on every run; without it results change run to run.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.1, random_state=42
)

# train_test_split already returns DataFrames for DataFrame inputs, so no
# re-wrapping is needed. x_test is copied explicitly because a later cell adds
# a predicted 'Score' column to it and it must not be tied to X.
x_test = x_test.copy()

In [8]:
# Identifier, raw-text and metadata columns that are not fed to the model;
# dropping them leaves only the per-score word-count columns.
non_feature_columns = [
    'Id', 'Text', 'Summary', 'ProductId', 'UserId',
    'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Time',
]
x_train_features, x_test_features = (
    split.drop(columns=non_feature_columns) for split in (x_train, x_test)
)

In [9]:
# Rows to be scored for the submission. Unlike the training frame, no row may
# be dropped here, so missing review text is replaced with the placeholder
# "na" to keep the tokenizer from failing on NaN.
predictionSet = pd.read_csv("./data/prediction.csv")
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form warns on recent pandas and does
# not modify the frame at all once Copy-on-Write is enabled.
predictionSet['Text'] = predictionSet['Text'].fillna("na")

In [10]:
# Build the same six per-score count columns on the prediction rows, using the
# vocabularies collected from the training sample.
prediction_score_columns = {
    '0words': 0.0,
    '1words': 1.0,
    '2words': 2.0,
    '3words': 3.0,
    '4words': 4.0,
    '5words': 5.0,
}
for column_name, bag_score in prediction_score_columns.items():
    predictionSet[column_name] = get_score_word_column(predictionSet, bag_score)

# Strip identifiers, raw text, metadata and the 'Score' column so that only
# the word-count features remain.
prediction_non_features = [
    'Id', 'Text', 'Summary', 'ProductId', 'UserId',
    'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Time', 'Score',
]
x_predict = predictionSet.drop(columns=prediction_non_features)
x_predict

300000it [04:45, 1051.92it/s]
300000it [04:51, 1030.91it/s]
300000it [04:53, 1023.29it/s]
300000it [04:50, 1032.60it/s]
300000it [04:51, 1028.74it/s]
300000it [04:59, 1003.20it/s]


Unnamed: 0,0words,1words,2words,3words,4words,5words
0,0,22,23,23,23,23
1,0,10,10,10,10,10
2,0,12,12,12,12,12
3,0,28,29,30,29,29
4,0,6,6,6,6,6
...,...,...,...,...,...,...
299995,0,355,365,365,382,373
299996,0,164,172,172,176,181
299997,0,17,17,17,18,17
299998,0,483,515,538,570,570


In [11]:
# Fit a 3-nearest-neighbours classifier on the word-count features.
# (A GaussianNB fit on the same inputs was the alternative left disabled here.)
train_labels = y_train.to_numpy().ravel()  # one-column frame -> 1-D label array
model = KNeighborsClassifier(n_neighbors=3)
model.fit(x_train_features, train_labels)

# Predicted scores for the submission rows and for the held-out split; in
# x_test the 'Score' column therefore holds predictions, not the true labels.
predictionSet['Score'] = model.predict(x_predict)
x_test['Score'] = model.predict(x_test_features)

x_test = x_test.sort_values(by='Id')
x_test

Unnamed: 0,Id,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Time,Summary,Text,0words,1words,2words,3words,4words,5words,Score
55,63,0005019281,A38JFF6SGZEO37,0,0,1388793600,The Best Film about Scrooge!,I love An American Christmas Carol with Henry ...,0,14,15,15,15,15,5.0
175,205,0307141985,A2JP0URFHXP6DO,11,12,1141171200,A THANKSGIVING TRADITION,It's been a number of years since I've seen Mo...,0,105,109,116,113,116,3.0
268,313,0307142493,A20VTMEUX1RLFZ,0,0,1382918400,A Classic,I watched this program as a kid and every year...,0,20,20,20,20,20,1.0
286,331,0307142493,A22BPC7LRSGSYQ,2,2,1386547200,Christmas Classic,I caught this on ABC-TV tonight as I was flick...,0,72,73,73,73,74,5.0
461,544,0310263662,A2NP4TXHLHC7K9,8,13,1091491200,The movie is great. The DVD is so so,One of the great reasons for buying DVD's is t...,0,94,94,94,95,96,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1396294,1696022,B00JA3RPAG,A175WEJB723MO9,0,1,1405468800,Something for everyone over the pg-13 rating,"I am 48 father of two, but act like I'm 20(som...",0,29,29,29,29,29,1.0
1396625,1696418,B00JAQJMJ0,A3GUNXVYK34085,0,2,1405728000,Ornate Silliness,This film was shoved out of the theaters and o...,0,84,90,88,90,89,2.0
1396818,1696656,B00JJ3EH6C,A3Q9SC3WO1GO33,0,0,1405296000,Tom Hardy's One Man Show,"Tom Hardy Is basically the film, and his perfo...",0,14,14,14,14,14,3.0
1396862,1696713,B00JK7QTZE,A234GGIGZWC63U,0,1,1405987200,A true love story,A true love story that takes you back to anoth...,0,20,20,20,20,20,1.0


In [12]:
# Keep only the id and the predicted score, preview the submission, then write
# both the real submission and the held-out ("offline") predictions to CSV.
submission = predictionSet.loc[:, ['Id', 'Score']]
submission_offline = x_test.loc[:, ['Id', 'Score']]
print(submission.head())

exports = (
    (submission, "./data/submission.csv"),
    (submission_offline, "./data/submission_offline.csv"),
)
for frame, csv_path in exports:
    frame.to_csv(csv_path, index=False)

   Id  Score
0   5    2.0
1  11    5.0
2  17    1.0
3  46    3.0
4  47    5.0
