In [1]:
# I/O handling and profiling
import json
import pickle
import timeit

# feature extraction
import re
import nltk
import nltk.sentiment
import snowballstemmer

# feature representation
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [2]:
# hand-crafted corpus of stopwords relevant to Yelp dataset
# (one entry per line; each entry is stripped of surrounding whitespace and lowercased)
# `with` closes the file even if reading fails part-way through
with open("stopwords_list.dat") as f:
    stopwords = {word.strip().lower() for word in f}

In [3]:
# list of special symbols to remove during preprocessing
# (one entry per line; each entry is stripped of surrounding whitespace and lowercased)
# `with` closes the file even if reading fails part-way through
with open("spl_symbols.dat") as f:
    spl_symb = {word.strip().lower() for word in f}

In [4]:
# input datasets: JSON-lines files, read one record per line further below
trainfile = "data/train.json"
devfile = "data/dev.json"

#### Preprocessing Text

Preprocessing steps relevant to the task:
* convert to lowercase: handle sparsity due to different capitalization
* stem words: reduce vocabulary size <sup>#</sup>
* substitute numbers with a text signifying that there was a number: exact value of numbers often not important <sup>#</sup>
* remove stopwords
* remove punctuation marks
* mark words following a negation with the suffix `_NEG`, up to the first punctuation mark: differentiate negated phrases from positive ones
* remove infrequent words: reduce vocabulary size, Zipf's Law (remove words occurring $<3$ times)
* use n-grams: add local sequential information to features (make use of bigrams/trigrams)

<sup>#</sup> these steps decreased accuracy and are therefore not part of the final model

In [5]:
# Matches a run of digits, optionally followed by dots and more digits
# (e.g. "12", "7.30"). Written as a raw string so that `\d` and `\.` reach
# the regex engine as-is instead of being invalid string escape sequences.
numeral = re.compile(r'\d+\.*\d*')

In [6]:
# English Snowball stemmer. Only referenced by the stemming step in
# `preprocess`, which is currently commented out.
stemmer = snowballstemmer.stemmer("english")

In [7]:
def remove_spl_symbols(word):
    """Return `word` with every character found in `spl_symb` dropped.

    The result may be an empty string when all characters are special symbols.
    """
    return "".join(ch for ch in word if ch not in spl_symb)

In [8]:
def preprocess(text):
    """Turn a raw review string into a list of cleaned tokens.

    Steps, in order:
      1. lowercase the text and tokenize it with NLTK;
      2. drop tokens present in `stopwords`;
      3. tag tokens in a negation scope via `nltk.sentiment.util.mark_negation`;
      4. strip special symbols from every token;
      5. discard tokens that became empty after stripping.

    Stemming and number substitution are intentionally disabled (see the
    commented lines below).

    Parameters
    ----------
    text : str
        The raw document.

    Returns
    -------
    list of str
        The processed tokens.
    """
    tokens = nltk.word_tokenize(text.lower())

    # disabled: stemming
    # tokens = stemmer.stemWords(tokens)

    # disabled: substitute all numbers with a common text
    # tokens = [numeral.sub("_numeral_", tok) for tok in tokens]

    content_tokens = [tok for tok in tokens if tok not in stopwords]

    # propagate negation while punctuation tokens are still present
    negated_tokens = nltk.sentiment.util.mark_negation(content_tokens)

    # strip special symbols; tokens made up solely of such symbols become ""
    stripped_tokens = (remove_spl_symbols(tok) for tok in negated_tokens)

    # keep only the tokens that still have content
    return [tok for tok in stripped_tokens if tok]

In [9]:
# TF-IDF features over unigrams and bigrams, tokenized by `preprocess`.
vectorizer = TfidfVectorizer(
    tokenizer=preprocess,
    lowercase=False,        # preprocess() already lowercases the text
    strip_accents="ascii",
    ngram_range=(1, 2),     # unigrams + bigrams
    min_df=3,               # ignore terms seen in fewer than 3 documents
    max_features=312000,    # cap on vocabulary size
    sublinear_tf=True,      # use 1 + log(tf) instead of raw term frequency
)

#### Preprocessing and Vectorizing train set

In [10]:
# Read the train set: one JSON object per line, each providing the review
# text under "review" and its label under "ratings".
train_data = []
train_labels = []
# `with` guarantees the file is closed even if a line fails to parse
with open(trainfile, "r", encoding="utf-8") as f:
    for line in f:
        # tolerate blank lines (e.g. a trailing newline at the end of the file)
        if not line.strip():
            continue
        record = json.loads(line)
        train_data.append(record["review"])
        train_labels.append(record["ratings"])

In [11]:
# Fit the vectorizer on the train reviews and vectorize them in one pass.
start = timeit.default_timer()
feature_vector = vectorizer.fit_transform(train_data)
print("Number of train examples, Number of features:",feature_vector.shape)
# elapsed wall-clock time in seconds (includes the print above)
print(timeit.default_timer()-start)

Number of train examples, Number of features: (100000, 312000)
156.76875230800215


In [12]:
# Persist the train feature matrix and the fitted vectorizer.
# `with` makes sure each file is flushed and closed right after the dump.
with open("train_vector.pickle", "wb") as out:
    pickle.dump(feature_vector, out)
# NOTE: the vectorizer holds a reference to `preprocess`; pickle stores
# functions by name, so `preprocess` must be defined wherever this is loaded.
with open("vectorizer.pickle", "wb") as out:
    pickle.dump(vectorizer, out)

In [13]:
# drop the references to the train reviews and their feature matrix
del train_data, feature_vector

#### Preprocessing and Vectorizing dev set

In [14]:
# Read the dev set: one JSON object per line, each providing the review
# text under "review" and its label under "ratings".
dev_data = []
dev_labels = []
# `with` guarantees the file is closed even if a line fails to parse
with open(devfile, "r", encoding="utf-8") as f:
    for line in f:
        # tolerate blank lines (e.g. a trailing newline at the end of the file)
        if not line.strip():
            continue
        record = json.loads(line)
        dev_data.append(record["review"])
        dev_labels.append(record["ratings"])

In [15]:
# Vectorize the dev reviews with the already-fitted vectorizer
# (transform only, no re-fitting).
start = timeit.default_timer()
dev_feature_vector = vectorizer.transform(dev_data)
print(dev_feature_vector.shape)
# elapsed wall-clock time in seconds (includes the print above)
print(timeit.default_timer()-start)

(5000, 312000)
7.1499977499988745


In [16]:
# Persist the dev feature matrix; `with` flushes and closes the file
# right after the dump.
with open("dev_vector.pickle", "wb") as out:
    pickle.dump(dev_feature_vector, out)

#### Selected features

In [17]:
# Write the selected vocabulary to disk, one feature name per line.
# scikit-learn 1.0 deprecated get_feature_names() and 1.2 removed it in
# favour of get_feature_names_out(); use whichever the installed version has.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# `with` closes the file even if a write fails
with open("feature_names.txt", "w") as f:
    for i in feature_names:
        f.write(i + "\n")