In [1]:
import json
import nltk
import timeit
import pickle

In [5]:
# Load the stop-word list into a set. Each line of the file is treated as one
# entry; surrounding whitespace is stripped and the entry is lower-cased, so
# text must be lower-cased before it is filtered against this set.
# The context manager guarantees the handle is closed even if reading fails.
with open("stopwords_list.dat") as f:
    stopwords = {word.strip().lower() for word in f}

In [6]:
# Raw datasets: the cells below read these files one JSON object per line.
trainfile, devfile = "../A1_Data/train.json", "../A1_Data/dev.json"

# Destinations for the pickled (rating, tokens) lists built from each dataset.
trainpickle, devpickle = "tokenised_train.pickle", "tokenised_dev.pickle"

### Pickling tokenised dataset

In [9]:
# Tokenise the dev set into a list of (rating, tokens) tuples.
# Each non-blank line of `devfile` is parsed as one JSON object with a
# "review" text field and a "ratings" field. The review is lower-cased before
# tokenising so that it matches the lower-cased entries in `stopwords`.
tokenised_list = []
with open(devfile, "r") as f:  # closed automatically, even if a line fails to parse
    start_time = timeit.default_timer()
    for s in f:
        # Skip whitespace-only lines (e.g. a trailing newline at end of file),
        # which would otherwise make json.loads raise.
        if not s.strip():
            continue
        json_dump = json.loads(s)
        r = json_dump["review"].lower()
        r = nltk.word_tokenize(r)
        r = [word for word in r if word not in stopwords]
        tokenised_list.append((json_dump["ratings"], r))
    print("tokenising dev", timeit.default_timer() - start_time)

11.460372282002936


In [11]:
# Persist the tokenised list currently held in `tokenised_list` to `devpickle`
# and report how long the write (including closing the file) took.
# NOTE(review): `tokenised_list` is reused for both dev and train; make sure the
# dev tokenising cell was the last one run before executing this cell.
start_time = timeit.default_timer()
with open(devpickle, "wb") as f:  # ensures the file is flushed and closed on error too
    pickle.dump(tokenised_list, f)
print("pickling dev", timeit.default_timer() - start_time)

pickling dev 0.22221641799842473


In [7]:
# Tokenise the training set into a list of (rating, tokens) tuples.
# Each non-blank line of `trainfile` is parsed as one JSON object with a
# "review" text field and a "ratings" field.
tokenised_list = []
with open(trainfile, "r") as f:  # closed automatically, even if a line fails to parse
    start_time = timeit.default_timer()
    for s in f:
        # Skip whitespace-only lines (e.g. a trailing newline at end of file),
        # which would otherwise make json.loads raise.
        if not s.strip():
            continue
        json_dump = json.loads(s)
        # Lower-case before tokenising, exactly as the dev cell does: the
        # stop-word set is lower-cased, so without this capitalised stop words
        # ("The", "And", ...) slip through and train/dev tokens disagree in case.
        # Any existing tokenised_train.pickle must be regenerated after this fix.
        r = json_dump["review"].lower()
        r = nltk.word_tokenize(r)
        r = [word for word in r if word not in stopwords]
        tokenised_list.append((json_dump["ratings"], r))
    print("tokenising train", timeit.default_timer() - start_time)

57.07665412499773


In [8]:
# Persist the tokenised list currently held in `tokenised_list` to `trainpickle`
# and report how long the write (including closing the file) took.
# NOTE(review): `tokenised_list` is reused for both dev and train; make sure the
# train tokenising cell was the last one run before executing this cell.
start_time = timeit.default_timer()
with open(trainpickle, "wb") as f:  # ensures the file is flushed and closed on error too
    pickle.dump(tokenised_list, f)
print("pickling train", timeit.default_timer() - start_time)

### Pickling TFIDF Vectors

In [4]:
import nltk
import numpy as np
import json
import pickle
import timeit
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [8]:
# TF-IDF vectoriser that tokenises with NLTK, drops the project stop words and
# ignores terms that appear in fewer than 5 documents (min_df=5).
# `stop_words` is documented as 'english', a list, or None; recent scikit-learn
# releases validate the type and reject a set, so pass a list. Sorting makes
# the stored parameter (and therefore the pickled vectoriser) deterministic.
# NOTE(review): relies on `stopwords` from the stop-word loading cell.
vectorizer = TfidfVectorizer(
    tokenizer=nltk.word_tokenize,
    stop_words=sorted(stopwords),
    min_df=5,
)

In [9]:
# Read the raw training reviews and their ratings into two parallel lists.
# Each non-blank line of `trainfile` is parsed as one JSON object with a
# "review" field and a "ratings" field.
train_data = []
train_labels = []
with open(trainfile, "r") as f:  # closed automatically, even if a line fails to parse
    for s in f:
        # Skip whitespace-only lines (e.g. a trailing newline at end of file),
        # which would otherwise make json.loads raise.
        if not s.strip():
            continue
        json_dump = json.loads(s)
        train_data.append(json_dump["review"])
        train_labels.append(json_dump["ratings"])

In [10]:
# Fit the TF-IDF vectoriser on the raw training reviews and build the training
# feature matrix. This is the expensive step of the notebook: the recorded run
# below took roughly 800 seconds.
start = timeit.default_timer()
feature_vector = vectorizer.fit_transform(train_data)
# Printed shape is (number of documents, number of vocabulary terms kept).
print(feature_vector.shape)
# Elapsed wall-clock seconds since `start` (includes the shape print above).
print(timeit.default_timer()-start)

(1000000, 85627)
802.7324854979997


In [11]:
# Persist the training TF-IDF matrix and the fitted vectoriser.
# Context managers close (and therefore flush) each file deterministically,
# instead of leaving the handles open until garbage collection.
with open("baseline_tfidf_train_vector.pickle", "wb") as out:
    pickle.dump(feature_vector, out)
with open("baseline_vectorizer.pickle", "wb") as out:
    pickle.dump(vectorizer, out)

In [12]:
# Read the raw dev reviews and their ratings into two parallel lists.
# Each non-blank line of `devfile` is parsed as one JSON object with a
# "review" field and a "ratings" field.
dev_data = []
dev_labels = []
with open(devfile, "r") as f:  # closed automatically, even if a line fails to parse
    for s in f:
        # Skip whitespace-only lines (e.g. a trailing newline at end of file),
        # which would otherwise make json.loads raise.
        if not s.strip():
            continue
        json_dump = json.loads(s)
        dev_data.append(json_dump["review"])
        dev_labels.append(json_dump["ratings"])

In [13]:
# Vectorise the dev reviews with the already-fitted vectoriser. transform()
# (not fit_transform()) is used, so the dev matrix has the same columns as the
# training matrix.
dev_feature_vector = vectorizer.transform(dev_data)
# Bare last expression so the notebook displays the matrix shape.
dev_feature_vector.shape

(200000, 85627)

In [14]:
# Persist the dev TF-IDF matrix. The context manager closes (and therefore
# flushes) the file deterministically instead of leaving the handle open
# until garbage collection.
with open("baseline_tfidf_dev_vector.pickle", "wb") as out:
    pickle.dump(dev_feature_vector, out)