In [1]:
from nltk.probability import FreqDist
import json
import numpy as np

### Loading data from unzipped file

In [2]:
# NOTE: extract "reviews_Home_and_Kitchen_5.json" first to obtain "Home_and_Kitchen_5.json"
# (WinRAR on Windows was used for this). The path below is relative to the working directory.
path = "Home_and_Kitchen_5.json"

In [3]:
# Each non-empty line of the file is parsed as one JSON document.
# Iterate over the file handle directly instead of calling readlines(), so the raw
# text of every line is not kept in memory alongside the parsed records.
with open(path, encoding='utf-8') as f:
    # Skip blank lines (e.g. a trailing empty line) rather than letting json.loads fail on them.
    data = [json.loads(line) for line in f if line.strip()]

In [4]:
len(data)

551682

In [5]:
# Keep only the three fields used downstream, stored per review as a mutable
# [reviewText, helpful, overall] list so later cells can overwrite entries in place.
proccessed_data = [
    [review["reviewText"], review["helpful"], review["overall"]]
    for review in data
]

Checking the number of unique words in the data set before preprocessing:

In [6]:
fdist1 = FreqDist()

In [7]:
# Build the distribution in one pass from the lower-cased, whitespace-separated words.
# Constructing it here (instead of incrementing an existing FreqDist) means that
# re-running this cell recomputes the counts rather than adding to the previous totals.
fdist1 = FreqDist(
    word.lower()
    for line in proccessed_data
    for word in line[0].split()
)
fdist1

FreqDist({'the': 2772778, 'i': 1591274, 'and': 1541022, 'a': 1434474, 'to': 1416472, 'it': 1226405, 'is': 863426, 'of': 832332, 'this': 732618, 'for': 715249, ...})

In [8]:
print(fdist1)

<FreqDist with 838417 samples and 53925892 outcomes>


In [9]:
# np_data = np.array(proccessed_data)
# print(np_data) 
# len(np_data)

##  Filtering Punctuation

In [10]:
from string import punctuation
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [11]:
punct_set = set(punctuation)

In [12]:
# Deletion table built once: every character of string.punctuation is mapped to None.
_PUNCT_TABLE = str.maketrans('', '', punctuation)


def filter_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Punctuation characters are deleted, not replaced by a space, so
    ``"it's"`` becomes ``"its"`` and ``"good.Bad"`` becomes ``"goodBad"``.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        A new string without any character from ``string.punctuation``.
    """
    # str.translate performs the per-character filtering in a single call
    # instead of a Python-level loop over every character.
    return text.translate(_PUNCT_TABLE)

In [13]:
# Strip punctuation from the review text (first field) of every record, in place.
# The loop variable is deliberately not called `data`: that name holds the raw
# review list loaded from the JSON file, and rebinding it here would break any
# re-run of the cells that read it.
for record in proccessed_data:
    record[0] = filter_punctuation(record[0])

In [14]:
# Recount the lower-cased, whitespace-separated words of every review text.
fdist_wo_punct = FreqDist(
    token.lower()
    for review in proccessed_data
    for token in review[0].split()
)
print(fdist_wo_punct)

<FreqDist with 481174 samples and 53739451 outcomes>


The number of unique words went down from 838417 to 481174 after removing the punctuation :)

## Tokenizing

In [15]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gaston\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
# Replace the review text (first field) of every record with its token list, in place.
# The loop variable is deliberately not called `data`: that name holds the raw
# review list loaded from the JSON file, and rebinding it here would break any
# re-run of the cells that read it.
for record in proccessed_data:
    record[0] = word_tokenize(record[0])

In [17]:
fdist_wo_punct.most_common(50)

[('the', 2780773),
 ('i', 1606253),
 ('and', 1557137),
 ('a', 1438350),
 ('to', 1425398),
 ('it', 1387669),
 ('is', 876047),
 ('of', 835434),
 ('this', 759706),
 ('for', 728397),
 ('in', 612086),
 ('that', 551863),
 ('my', 467378),
 ('with', 446288),
 ('you', 432856),
 ('have', 428896),
 ('on', 425902),
 ('but', 399901),
 ('not', 355774),
 ('so', 322844),
 ('as', 308031),
 ('was', 305702),
 ('are', 304115),
 ('one', 270744),
 ('use', 243819),
 ('they', 241978),
 ('be', 239917),
 ('its', 239749),
 ('very', 232904),
 ('or', 218627),
 ('like', 202988),
 ('just', 200785),
 ('if', 194194),
 ('up', 193151),
 ('out', 187099),
 ('can', 183018),
 ('great', 179985),
 ('at', 177328),
 ('these', 176098),
 ('when', 175176),
 ('all', 171446),
 ('had', 166292),
 ('them', 164924),
 ('well', 161751),
 ('would', 161676),
 ('will', 158631),
 ('more', 147408),
 ('good', 145571),
 ('from', 144893),
 ('than', 139689)]

## Filtering Stop Words: 

In [18]:
from nltk.corpus import stopwords

In [19]:
stop_words = set(stopwords.words('english'))

In [20]:
def remove_stopwords(text):
    """Return a new list with the tokens of ``text`` that are not in ``stop_words``.

    Each token is lower-cased only for the membership test; tokens that are
    kept retain their original casing and order.
    """
    kept = []
    for token in text:
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

In [21]:
# Drop stop words from the token list (first field) of every record, in place.
# The loop variable is deliberately not called `data`: that name holds the raw
# review list loaded from the JSON file, and rebinding it here would break any
# re-run of the cells that read it.
for record in proccessed_data:
    record[0] = remove_stopwords(record[0])

## Frequency of words after removing punctuation & stop words:

In [22]:
# Count the lower-cased tokens that remain in every review's token list.
fdist_wo_stop_words = FreqDist(
    token.lower()
    for review in proccessed_data
    for token in review[0]
)
print("Frequency of words after removing stop words:", fdist_wo_stop_words)

Frequency of words after removing stop words: <FreqDist with 481023 samples and 27368065 outcomes>


In [23]:
# most common words in data set
fdist_wo_stop_words.most_common(50)

[('one', 270744),
 ('use', 243819),
 ('like', 202988),
 ('great', 179985),
 ('well', 161751),
 ('would', 161676),
 ('good', 145571),
 ('get', 136486),
 ('easy', 129296),
 ('time', 118144),
 ('really', 114609),
 ('dont', 113747),
 ('much', 109164),
 ('little', 106839),
 ('used', 106190),
 ('also', 106111),
 ('coffee', 105085),
 ('love', 103695),
 ('water', 100722),
 ('make', 94297),
 ('bought', 91486),
 ('nice', 89382),
 ('clean', 84660),
 ('product', 84548),
 ('made', 83895),
 ('even', 82685),
 ('works', 82030),
 ('im', 80594),
 ('using', 76578),
 ('put', 75824),
 ('ive', 75558),
 ('set', 75234),
 ('small', 74070),
 ('two', 73704),
 ('price', 71142),
 ('first', 68355),
 ('still', 66993),
 ('work', 65862),
 ('better', 65386),
 ('need', 65110),
 ('size', 63284),
 ('top', 62384),
 ('buy', 62174),
 ('quality', 61197),
 ('enough', 60855),
 ('pan', 60791),
 ('perfect', 60398),
 ('years', 59130),
 ('got', 58625),
 ('way', 56894)]

In [24]:
# Sanity check: collect any vocabulary entry that is still a stop word.
# If the filtering worked, the list is empty and the length shown below is 0.
stop_words_in_data = [token for token in fdist_wo_stop_words if token in stop_words]
len(stop_words_in_data)

0

### Creating sentiment labels for helpfulness:

In [25]:
# TODO: confirm with Gaston whether 0.55 is a good threshold, and whether reviews
# with zero votes or a ratio between 0.45 and 0.55 should get a separate neutral label.
def create_sntmtl_label(help_list, threshold=0.55):
    """Turn a two-element vote pair into a binary helpfulness label.

    ``help_list[0] / help_list[1]`` is used as the helpfulness ratio
    (presumably helpful votes over total votes -- confirm against the data set).

    Parameters
    ----------
    help_list : sequence of two numbers
        Numerator at index 0, denominator at index 1.
    threshold : float, optional
        The ratio must be strictly greater than this value for a positive
        label. Defaults to 0.55.

    Returns
    -------
    int
        ``1`` when the ratio exceeds ``threshold``; ``-1`` otherwise, including
        when the denominator is 0 (no ratio can be computed).
    """
    numerator, denominator = help_list[0], help_list[1]
    # Guard against division by zero: such pairs are labelled negative.
    if denominator == 0:
        return -1
    return 1 if numerator / denominator > threshold else -1

In [26]:
# Replace the vote pair (second field) of every record with its helpfulness label, in place.
# The loop variable is deliberately not called `data`: that name holds the raw
# review list loaded from the JSON file, and rebinding it here would break any
# re-run of the cells that read it.
for record in proccessed_data:
    record[1] = create_sntmtl_label(record[1])

## Creating sentiment labels for overall ratings:

In [27]:
def create_rating_sntmtl(rating: float):
    """Map a rating to a sentiment class.

    Returns ``-1`` for ratings below 3, ``0`` for a rating of exactly 3,
    and ``1`` for everything else.
    """
    if rating == 3:
        return 0
    return -1 if rating < 3 else 1

In [28]:
# Replace the raw rating (third field) of every record with its sentiment class, in place.
for row in proccessed_data:
    row[2] = create_rating_sntmtl(row[2])

## Creating a pandas DataFrame:

In [29]:
import pandas as pd
from pandas import DataFrame

In [30]:
df = DataFrame(proccessed_data, columns=['Tokenized Review','Helpfulness','Rating'])

In [31]:
df.head()

Unnamed: 0,Tokenized Review,Helpfulness,Rating
0,"[daughter, wanted, book, price, Amazon, best, ...",-1,1
1,"[bought, zoku, quick, pop, daughterr, zoku, qu...",-1,1
2,"[shortage, pop, recipes, available, free, web,...",1,1
3,"[book, must, get, Zoku, also, highly, recommen...",1,1
4,"[cookbook, great, really, enjoyed, reviewing, ...",-1,1


In [35]:
df.to_csv("Tokenized.csv", index=False)

In [36]:
SEED = 1234

In [37]:
import pandas as pd
import ast

In [38]:
df= pd.read_csv("Tokenized.csv")
df.head()

Unnamed: 0,Tokenized Review,Helpfulness,Rating
0,"['daughter', 'wanted', 'book', 'price', 'Amazo...",-1,1
1,"['bought', 'zoku', 'quick', 'pop', 'daughterr'...",-1,1
2,"['shortage', 'pop', 'recipes', 'available', 'f...",1,1
3,"['book', 'must', 'get', 'Zoku', 'also', 'highl...",1,1
4,"['cookbook', 'great', 'really', 'enjoyed', 're...",-1,1


In [43]:
df.shape

(551682, 4)

In [40]:
# The CSV round-trip stored each token list as its string representation;
# parse it back into a real Python list.
df["Text"] = df["Tokenized Review"].map(ast.literal_eval)

In [41]:
from sklearn.model_selection import train_test_split

# First cut: hold out 20% of the rows as the test set (the overall target is 60/20/20).
# The single-column "Rating" frame is passed only as a second array to split; its two
# parts are bound to the throwaway names `__` and `_`.
X_train, X_test, __, _ = train_test_split(
    df[['Text', "Helpfulness", "Rating"]],
    df[['Rating']],
    test_size=0.20,
    random_state=SEED,
)

In [44]:
# Second cut: 25% of the remaining 80% becomes the validation set (60/20/20 overall).
# Only the feature frame is split, so this cell no longer reads the throwaway `__`
# (which is also one of IPython's output-history names). With the same random_state
# and row count, the selected rows are the same as when a second array was passed.
# NOTE: X_train is overwritten here, so re-run the first split before re-running this cell.
X_train, X_val = train_test_split(X_train, test_size=0.25, random_state=SEED)

In [48]:
import os

# to_csv does not create missing directories, so make sure ./data exists
# before writing the three splits.
os.makedirs("./data", exist_ok=True)
X_train.to_csv("./data/Train.csv",index=False)
X_val.to_csv("./data/Val.csv",index=False)
X_test.to_csv("./data/Test.csv",index=False)