In [1]:
import nltk
import nltk.sentiment
import numpy as np
import json
import pickle
import timeit
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import re
# import snowballstemmer



In [2]:
# Load the stop-word list: one entry per line, stripped and lower-cased.
# The context manager closes the file even if reading raises.
with open("../stopwords_list.dat") as f:
    stopwords = {word.strip().lower() for word in f}

In [3]:
# Load the special symbols to strip from tokens: one entry per line,
# stripped and lower-cased. The context manager closes the file even if
# reading raises.
with open("../spl_symbols.dat") as f:
    spl_symb = {word.strip().lower() for word in f}

In [4]:
# Relative paths of the training and development corpora.
trainfile = "../../A1_Data/train.json"
devfile = "../../A1_Data/dev.json"

### Pickling TFIDF vectors with preprocessing

In [5]:
# Runs of three or more consecutive upper-case ASCII letters.
cap_block = re.compile(r'[A-Z][A-Z][A-Z]+')
# Digits, optionally followed by dots and more digits (e.g. "12", "12.50").
# Raw string: "\d" and "\." are not valid escapes in a normal string literal
# and trigger an invalid-escape-sequence warning on current Python versions.
numeral = re.compile(r'\d+\.*\d*')

In [6]:
# stemmer = snowballstemmer.stemmer("english")

In [7]:
def remove_spl_symbols(word):
    """Return `word` with every character that appears in `spl_symb` removed.

    The result may be an empty string when all characters are filtered out.
    """
    return "".join(ch for ch in word if ch not in spl_symb)

In [8]:
def preprocess(text):
    """Turn a raw review string into a list of cleaned tokens.

    Steps, in order:
      1. lower-case the text and tokenize it with ``nltk.word_tokenize``;
      2. drop tokens found in ``stopwords``;
      3. propagate negation with ``nltk.sentiment.util.mark_negation``;
      4. strip ``spl_symb`` characters from every token;
      5. discard tokens that became empty.

    Stemming and number normalisation are intentionally not applied here.
    """
    tokens = nltk.word_tokenize(text.lower())
    kept = [token for token in tokens if token not in stopwords]
    marked = nltk.sentiment.util.mark_negation(kept)
    # Symbols are stripped only AFTER negation marking -- presumably because
    # mark_negation relies on punctuation tokens to end a negation scope.
    stripped = (remove_spl_symbols(token) for token in marked)
    return [token for token in stripped if len(token) > 0]

In [12]:
# TF-IDF over unigrams and bigrams, using `preprocess` as the tokenizer.
vectorizer = TfidfVectorizer(
    tokenizer=preprocess,
    lowercase=False,        # preprocess() lower-cases the text itself
    strip_accents="ascii",
    ngram_range=(1, 2),     # unigrams and bigrams
    min_df=3,               # ignore terms seen in fewer than 3 documents
    max_features=201031,    # cap on vocabulary size
    sublinear_tf=True,      # use 1 + log(tf) instead of raw tf
)

In [10]:
# Read the training corpus: one JSON object per line, each providing a
# "review" text and its "ratings" label.
train_data = []
train_labels = []
# `with` closes the file even if a line fails to parse.
with open(trainfile, "r") as f:
    for s in f:
        # Skip blank lines (e.g. a stray trailing newline) instead of
        # letting json.loads fail on them.
        if not s.strip():
            continue
        json_dump = json.loads(s)
        train_data.append(json_dump["review"])
        train_labels.append(json_dump["ratings"])

In [13]:
# Fit the vectorizer on the training reviews and build their feature matrix,
# then report the matrix shape and the elapsed wall-clock seconds.
fit_started = timeit.default_timer()
feature_vector = vectorizer.fit_transform(train_data)
print(feature_vector.shape)
print(timeit.default_timer() - fit_started)

(1000000, 201031)
2361.3233779040165


In [14]:
# Persist the training feature matrix and the fitted vectorizer.
# Context managers flush and close each file deterministically instead of
# leaving the handles to the garbage collector.
with open("neg_tfidf_train_vector.pickle", "wb") as fout:
    pickle.dump(feature_vector, fout)
# NOTE(review): pickle stores functions by reference, so loading this
# vectorizer presumably requires `preprocess` to be defined/importable in the
# loading session -- confirm where it is consumed.
with open("neg_vectorizer.pickle", "wb") as fout:
    pickle.dump(vectorizer, fout)

In [15]:
# Drop the training texts and their feature matrix; train_labels is kept.
del train_data, feature_vector

In [16]:
# Read the development corpus: one JSON object per line, each providing a
# "review" text and its "ratings" label.
dev_data = []
dev_labels = []
# `with` closes the file even if a line fails to parse.
with open(devfile, "r") as f:
    for s in f:
        # Skip blank lines (e.g. a stray trailing newline) instead of
        # letting json.loads fail on them.
        if not s.strip():
            continue
        json_dump = json.loads(s)
        dev_data.append(json_dump["review"])
        dev_labels.append(json_dump["ratings"])

In [17]:
# Project the development reviews with the already-fitted vectorizer, then
# report the matrix shape and the elapsed wall-clock seconds.
transform_started = timeit.default_timer()
dev_feature_vector = vectorizer.transform(dev_data)
print(dev_feature_vector.shape)
print(timeit.default_timer() - transform_started)

(200000, 201031)
344.0937866587192


In [18]:
# Persist the development feature matrix; the context manager flushes and
# closes the file deterministically instead of leaving the handle open.
with open("neg_tfidf_dev_vector.pickle", "wb") as fout:
    pickle.dump(dev_feature_vector, fout)

In [19]:
# Write the learned vocabulary to disk, one feature name per line.
# scikit-learn 1.0 added `get_feature_names_out` and 1.2 removed
# `get_feature_names`; use whichever the installed version provides.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# `with` closes (and flushes) the file even if a write fails.
with open("feature_names.txt", "w") as f:
    for i in feature_names:
        f.write(i + "\n")