# Sentiment classification

## Generate vocabulary for yelp and IMDB

Output IMDB-vocab.txt and YELP-vocab.txt. Each file should contain the 10,000 words in the vocabulary, their corresonding id, and their frequency. Each line is a word, its numeric id, and its frequency all tab separated. Example:

the 1 20456 <br>
a   2 18003<br>
and 3 16830<br>
of  4 15456<br>
in  5 15016<br>
...
<br>
where the is the word, 1 is the id of the word, and 20456 is the frequency of the word.

In [1]:
import re
import string
from string import digits
import csv
import collections
import csv


In [94]:
import numpy as np
col = [ [ ["this","good","good"],["this","is","not","good"] ],  [["yo","damn","good"],["this","is","not","good"]] ]
vocab_dict = {'this':0, 'is':1,'good':2,'not':3,'damn':4,'yo':5}

import time

t1 = time.time()
for review in col:
    v2 = [0]*len(vocab_dict)
    for sentence in review:
        v1 = [0]*len(vocab_dict)
        for word in sentence:
            i = vocab_dict.get(word)
            v1[i] += 1
            v2[i] += 1
        sentence_vectors.append(v1)
    review_vectors.append(v2)
X_sentences = np.array(vectors)
X_reviews = np.array(review_vectors)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

Cs = [1,10,30,50,100]
solvers = ['sag']
params_grid = {'C':Cs,'solver':solvers}

logisticRegression_clf = LogisticRegression()

        
clf = GridSearchCV(logisticRegression_clf,params_grid,cv=10,scoring = 'accuracy')
clf.fit(x_reviews, y_reviews)

clf.predict(x_sentences)





In [153]:
def strim(line):
    #remove <br /><br />
    line = line.replace("<br /><br />", ' ');
    #remove digits
    line = line.translate(str.maketrans('', '', digits))
    #remove punctuations
    line = re.sub('['+string.punctuation+']', '', line)
    #lower case
    line = line.lower()
    return line

def get_top_words(textfile, top=10000):    
    ''' Returns the most common words in the textfile.'''
    words = collections.Counter()
    with open(textfile) as textfile:
         for line in textfile:
            line = strim(line)
            #how often each word appears
            words.update(line.split())
    return dict(words.most_common(top))

IMDB_dict = get_top_words("IMDB-train.txt")
yelp_dict = get_top_words("yelp-train.txt")


def generate_vocab(fileToWrite, vocab_dict):
    with open(fileToWrite, 'w') as csv_file:
    writer = csv.writer(csv_file)
    i = 1
    for key, value in vocab_dict.items():
        vocab_dict[key] = (i,value)
        writer.writerow([str(key)+'\t'+ str(i) + '\t' + str(value)])
        i+=1;

#generate IMDB-vocab
generate_vocab('IMDB-vocab.txt',IMDB_dict)
#generate yelp-vocab
generate_vocab('yelp-vocab.txt',yelp_dict)

## Convert train, test, valid to id-id-...-label format for yelp and IMDB

For train/valid/test file, each line is a data point. Each review should be represented as space separates ids for corresponding words in the review3 and the class label in mentioned in the end of the review as tab separated. Example:

<b> 100 8 3 1034 0 </b>

Here 0 is the class label and rest of the numbers represent a 4 word review.

In [162]:
def getWordId(word, vocab_dict):
    if word in vocab_dict:
        return str(vocab_dict[word][0])
    else: return '0'
             
#print(getWordId("eagle"))

def convert(fileToRead,fileToWrite,vocab_dict):
    with open(fileToRead) as openfile:
        reader = csv.reader(openfile,delimiter='\t')
        with open(fileToWrite,'w') as file:
            writer = csv.writer(file, delimiter='\t')
            i = 0
            for line in reader:
                lineToList = strim(line[0]).split()
                idsList = list(map(lambda word: getWordId(word,vocab_dict), lineToList))
                #print (idsList)
                idsLine = ' '.join(idsList)
                result = [idsLine,line[1]]
                writer.writerow(result)
    openfile.close()
            
print ('converting...')
convert("IMDB-train.txt","IMDB-train.txt",IMDB_dict)
convert("IMDB-valid.txt","IMDB-valid.txt",IMDB_dict)
convert("IMDB-test.txt","IMDB-test.txt",IMDB_dict)
        
convert("yelp-train.txt","yelp-train.txt",yelp_dict)
convert("yelp-valid.txt","yelp-valid.txt",yelp_dict)
convert("yelp-test.txt","yelp-test.txt",yelp_dict)

print('done converting')


converting...
done converting


## Make BBOW and FBOW data for train, test and valid

### BBOW 

x = <br>
0 0 0 1 ... 1 <br>
0 1 0 0 ... 0 <br>
... <br>
0 0 0 1 ... 1 <br>
0 1 0 0 ... 0 <br>

y = <br>
0 <br>
1 <br>
.. <br>
0 <br>
1 <br>

### FBOW

x = <br>
12 3 0 9 ... 1 <br>
30 1 28 10 ... 0 <br>
... <br>
10 20 30 1 ... 1 <br>
22 1 17 25 ... 0 <br>

y = <br>
0 <br>
1 <br>
.. <br>
0 <br>
1 <br>

In [2]:
#bbow_data(file) return x=[[0,0,1...],[0,0,..] ] , y= [[0],[1],....]
def bbow_data(file):
    size = 0
    with open(file) as f:
        reader = csv.reader(f,delimiter='\t')
        size = len(list(reader))
    x = np.zeros((size,10000))
    y = np.zeros((size,1))
    with open(file) as f:
        reader = csv.reader(f,delimiter='\t')
        line_i = 0
        for line in reader:
            #print (bbow_v(line[0]))
            
            lineToList = line[0].split()
            for id in lineToList:
                id = int(id)
                if(id !=0):
                    x[line_i][id-1] = 1
            
            
            y[line_i][0] = line[1]
            line_i += 1
    
    return x,y
#print ('running')
#x,y = bbow("IMDB-train-2.txt")
#print (x)
#print(y)

In [3]:
#fbow return x=[[20,10,1...],[21,99,..]] , y= [[0],[1],....]
def fbow_data(file):
    size = 0
    with open(file) as f:
        reader = csv.reader(f,delimiter='\t')
        size = len(list(reader))
    x = np.zeros((size,10000))
    y = np.zeros((size,1))
    
    with open(file) as f:
        reader = csv.reader(f,delimiter='\t')
        line_i = 0
        for line in reader:
            lineToList = line[0].split()
            for id in lineToList:
                id = int(id)
                if(id !=0):
                    x[line_i][id-1] = lineToList.count(str(id))
                    
            y[line_i][0] = line[1]
            
            line_i += 1
    
    return x,y
#print ('running')
#x,y = fbow("yelp-train-2.txt")
#print (x)
#print(y)

#### Load BBOW and FBOW data for IMDB and YELP ONLY ONCE

In [4]:
yelp_bbow_train = bbow_data("yelp-train.txt")
yelp_bbow_test = bbow_data("yelp-test.txt")
yelp_bbow_valid = bbow_data("yelp-valid.txt")

yelp_fbow_train = fbow_data("yelp-train.txt")
yelp_fbow_test = fbow_data("yelp-test.txt")
yelp_fbow_valid = fbow_data("yelp-valid.txt")

IMDB_bbow_train = bbow_data("IMDB-train.txt")
IMDB_bbow_test = bbow_data("IMDB-test.txt")
IMDB_bbow_valid = bbow_data("IMDB-valid.txt")

IMDB_fbow_train = fbow_data("IMDB-train.txt")
IMDB_fbow_test = fbow_data("IMDB-test.txt")
IMDB_fbow_valid = fbow_data("IMDB-valid.txt")



In [38]:
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.utils import check_X_y
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, classification_report
from sklearn.model_selection import PredefinedSplit

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import numpy as np
from enum import Enum
from random import randint

In [120]:
class Classifier(Enum):
    Random = 1
    Majority = 2
    BernouilliNB = 3
    GaussianNB = 4
    LinearSVM = 5
    DecisionTree = 6

#takes 3 datasets train,test,valid (IMDB or yelp)
def a3(train,test,valid,classifier):
    #read 3 train.txt, test.txt, valid.txt
    x_train,y_train = train
    x_test,y_test = test
    x_valid,y_valid = valid
    
    y_train = y_train.ravel()
    y_test = y_test.ravel()
    y_valid = y_valid.ravel()
    
    
    xc,yc = check_X_y(x_train,y_train)
    
    x_train_valid = np.concatenate((x_train, x_valid))
    y_train_valid = np.concatenate((y_train, y_valid))

    
    
    
     
    # majority classifier
    if(classifier == Classifier.Majority):
        major_clf = DummyClassifier(strategy="most_frequent")
        major_clf.fit(xc,yc)

        y_major_pred = major_clf.predict(x_test)
        print("Major classifier")
        #print (y_major_pred)
        print("Accuracy score: ", accuracy_score(y_test,y_major_pred))
        print("F1 score: ", f1_score(y_test,y_major_pred,average="macro"))
        #print(classification_report(y_test,y_major_pred))
        return
    
    
    # rand classifier
    if(classifier == Classifier.Random):
        rand_clf = DummyClassifier(strategy="uniform")
        rand_clf.fit(xc,yc)

        y_rand_pred = rand_clf.predict(x_test)
        print("Random classifier")
        #print (y_rand_pred)
        print("Accuracy score: ", accuracy_score(y_test,y_rand_pred))
        print("F1 score: ", f1_score(y_test,y_rand_pred,average="macro"))
        #print(classification_report(y_test,y_rand_pred))
        return

    
    if(classifier == Classifier.BernouilliNB):
        #NAIVE BAYES
        # bernouille NB
        
        
        alpha_range = [0,0.01,0.1,0.5,1,2,3,4]
        
        best_alpha_f1 = (alpha_range[0],0)
        alpha_f1_record = []
        for a in alpha_range:
            berNB_clf = BernoulliNB(alpha=a, binarize=0.0, class_prior=None, fit_prior=True)
            berNB_clf.fit(x_train, y_train)
            #BernoulliNB(alpha=a, binarize=0.1, class_prior=None, fit_prior=True)
            y_berNB_pred = berNB_clf.predict(x_valid)
            F1 = f1_score(y_valid,y_berNB_pred,average='macro')
            alpha_f1_record.append( (a,F1))
            if(F1 > best_alpha_f1[1]):
                best_alpha_f1 = (a,F1)
        
        
        #
        
        #
        best_alpha = best_alpha_f1[0]
        print ("best alpha ",best_alpha)
        berNB_clf = BernoulliNB(alpha=best_alpha, binarize=0.0, class_prior=None, fit_prior=True)
        berNB_clf.fit(x_train, y_train)
        
        print("Bernouilli NB classifier")
        
        print("alpha range: ", alpha_range)
        print (alpha_f1_record)
        #print(y_berNB_pred)
        print("Accuracy score on test set: ", accuracy_score(y_test,berNB_clf.predict(x_test)))
        print("F1 score train: ", f1_score(y_train,berNB_clf.predict(x_train),average='macro'))
        print("F1 score valid: ", f1_score(y_valid,berNB_clf.predict(x_valid),average='macro'))
        print("F1 score test: ", f1_score(y_test,berNB_clf.predict(x_test),average='macro'))
        #print(classification_report(y_test,y_berNB_pred))
        return

    # gaussian NB
    if(classifier == Classifier.GaussianNB):
        gauNB_clf = GaussianNB()
        gauNB_clf.fit(x_train, y_train)

        y_gauNB_pred = gauNB_clf.predict(x_test)
        print("Gaussian NB classifier")
        #print(y_gauNB_pred)
        print ("No hyperparameters to be learned")
        print("Accuracy score on test set: ", accuracy_score(y_test,y_gauNB_pred))
        print("F1 score train: ", f1_score(y_train,gauNB_clf.predict(x_train),average='macro'))
        print("F1 score valid: ", f1_score(y_valid,gauNB_clf.predict(x_valid),average='macro'))
        print("F1 score test: ", f1_score(y_test,y_gauNB_pred,average='macro'))
        #print(classification_report(y_test,y_gauNB_pred))
        return

    
    # SVM
    if(classifier == Classifier.LinearSVM):
        
        ps = PredefinedSplit([0]*len(x_train) + [1]*len(x_valid))
       
        
        linearSVM_clf = LinearSVC()
        
        Cs = [1,10,100]
        tols = [1e-5,1e-4,1e-3,1e-2]
        max_iters = [1,10,100]
        
        print("C range ", Cs)
        print("tol range ", tols)
        print("max_iter range ", max_iters)
        params_grid = {'C':Cs,'tol':tols,'max_iter':max_iters}
        
        clf = RandomizedSearchCV(linearSVM_clf,params_grid,cv=2,scoring = 'f1_macro')
    
        clf.fit(x_train_valid, y_train_valid)
        print(clf.best_params_)
        print("Accuracy score on test set: ", accuracy_score(y_test,clf.predict(x_test)))
        print("F1 score train: ", f1_score(y_train,clf.predict(x_train),average='macro'))
        print("F1 score valid: ", f1_score(y_valid,clf.predict(x_valid),average='macro'))
        print("F1 score test: ", f1_score(y_test,clf.predict(x_test),average='macro'))
        return
    
    
    # DT
    if(classifier == Classifier.DecisionTree):
        DT_clf = DecisionTreeClassifier(random_state=0)
        
        params_grid = {'max_depth':[10,20,None],'min_samples_leaf':[1,5,10,15],'criterion':['gini']}
        print ('range ',params_grid)
        
        clf = RandomizedSearchCV(DT_clf,params_grid,cv=2,verbose=1)
        clf.fit(x_train_valid,y_train_valid)
        print(clf.best_params_)
        #print("Accuracy score on test set: ", accuracy_score(y_test,clf.predict(x_test)))
        print("F1 score train: ", f1_score(y_train,clf.predict(x_train),average='macro'))
        print("F1 score valid: ", f1_score(y_valid,clf.predict(x_valid),average='macro'))
        print("F1 score test: ", f1_score(y_test,clf.predict(x_test),average='macro'))
        return


# -----YELP BBOW-----

### Random Classifier & Majority Classifier (YELP BBOW)

In [85]:
#Question a
a3(yelp_bbow_train, yelp_bbow_test, yelp_bbow_valid, Classifier.Random)
a3(yelp_bbow_train, yelp_bbow_test, yelp_bbow_valid, Classifier.Majority)

Random classifier
Accuracy score:  0.206
F1 score:  0.19171164871579427
Major classifier
Accuracy score:  0.351
F1 score:  0.10392301998519615


  'precision', 'predicted', average, warn_for)


### Bernouilli Naive Bayes Classifier (YELP BBOW)

In [89]:
a3(yelp_bbow_train, yelp_bbow_test, yelp_bbow_valid, Classifier.BernouilliNB)

  'setting alpha = %.1e' % _ALPHA_MIN)


0.01
Bernouilli NB classifier
alpha range:  [0, 0.01, 0.1, 0.5, 1, 2, 3, 4]
[(0, 0.28119867051278813), (0.01, 0.3834688068328239), (0.1, 0.36928058341966186), (0.5, 0.3484823877249245), (1, 0.32787582466183035), (2, 0.3017452114165385), (3, 0.264402649039394), (4, 0.2442534522248531)]
Accuracy score on test set:  0.4375
F1 score train:  0.7701132102709616
F1 score valid:  0.3834688068328239
F1 score test:  0.3610618444272477


### Linear SVM Classifier (YELP BBOW)

In [110]:
a3(yelp_bbow_train, yelp_bbow_test, yelp_bbow_valid, Classifier.LinearSVM)

C range  [1, 10, 100]
tol range  [1e-05, 0.0001, 0.001, 0.01]
max_iter range  [1, 10, 100]
{'tol': 1e-05, 'max_iter': 10, 'C': 10}
Accuracy score on test set:  0.466
F1 score train:  0.8968780457501515
F1 score valid:  0.8939968795056135
F1 score test:  0.4232184434136304


### Decision Tree Classifier (YELP BBOW)

In [None]:
print('running...')
a3(yelp_bbow_train, yelp_bbow_test, yelp_bbow_valid, Classifier.DecisionTree)

# ---- YELP FBOW-----

### Gaussian Naive Bayes Classifier (YELP FBOW)

In [114]:
a3(yelp_fbow_train, yelp_fbow_test, yelp_fbow_valid, Classifier.GaussianNB)

Gaussian NB classifier
No hyperparameters to be learned
Accuracy score on test set:  0.275
F1 score train:  0.6554568775139135
F1 score valid:  0.2377725803613623
F1 score test:  0.2313890394496927


### Linear SVM Classifier (YELP FBOW)

In [118]:
a3(yelp_fbow_train, yelp_fbow_test, yelp_fbow_valid, Classifier.LinearSVM)

C range  [1, 10, 100]
tol range  [1e-05, 0.0001, 0.001, 0.01]
max_iter range  [1, 10, 100]
{'tol': 1e-05, 'max_iter': 10, 'C': 1}
Accuracy score on test set:  0.4885
F1 score train:  0.8089433356523472
F1 score valid:  0.8011106117374922
F1 score test:  0.44080885452995844


### Decision Tree Classifier (YELP FBOW)

In [121]:
a3(yelp_fbow_train, yelp_fbow_test, yelp_fbow_valid, Classifier.DecisionTree)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  1.8min finished


{'min_samples_leaf': 1, 'max_depth': 10, 'criterion': 'gini'}
F1 score train:  0.4677385842989718
F1 score valid:  0.44628993660866617
F1 score test:  0.2785697732802583


# ----- IMDB BBOW-----

### Random & Majority Classifier (IMDB BBOW)

In [86]:
a3(IMDB_bbow_train, IMDB_bbow_test, IMDB_bbow_valid,Classifier.Random)
a3(IMDB_bbow_train, IMDB_bbow_test, IMDB_bbow_valid,Classifier.Majority)

Random classifier
Accuracy score:  0.4966
F1 score:  0.49659145494199275
Major classifier
Accuracy score:  0.5
F1 score:  0.3333333333333333


  'precision', 'predicted', average, warn_for)


### Bernouilli Naive Bayes Classifier (IMDB BBOW)

In [90]:
a3(IMDB_bbow_train, IMDB_bbow_test, IMDB_bbow_valid, Classifier.BernouilliNB)

  'setting alpha = %.1e' % _ALPHA_MIN)


0.1
Bernouilli NB classifier
alpha range:  [0, 0.01, 0.1, 0.5, 1, 2, 3, 4]
[(0, 0.8416702830954341), (0.01, 0.8434652477196463), (0.1, 0.8440634968239715), (0.5, 0.8428540489524542), (1, 0.8421430136279198), (2, 0.8417288666218146), (3, 0.8409158444817308), (4, 0.8410958605831917)]
Accuracy score on test set:  0.83248
F1 score train:  0.8717611744131653
F1 score valid:  0.8440634968239715
F1 score test:  0.8323167704370142


### Linear SVM Classifier (IMDB BBOW)

In [111]:
a3(IMDB_bbow_train, IMDB_bbow_test, IMDB_bbow_valid,Classifier.LinearSVM)

C range  [1, 10, 100]
tol range  [1e-05, 0.0001, 0.001, 0.01]
max_iter range  [1, 10, 100]
{'tol': 0.0001, 'max_iter': 10, 'C': 1}
Accuracy score on test set:  0.84324
F1 score train:  0.9592489229232437
F1 score valid:  0.9584786044400819
F1 score test:  0.8430564325257395


### Decision Tree Classifier (IMDB BBOW)

In [None]:
a3(IMDB_bbow_train, IMDB_bbow_test, IMDB_bbow_valid,Classifier.DecisionTree)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


# -----IMDB FBOW-----

### Gaussian Naive Bayes Classifier (IMDB FBOW)

In [115]:
a3(IMDB_fbow_train, IMDB_fbow_test, IMDB_fbow_valid,Classifier.GaussianNB)

Gaussian NB classifier
No hyperparameters to be learned
Accuracy score on test set:  0.64556
F1 score train:  0.7743603382010854
F1 score valid:  0.7048652925838272
F1 score test:  0.6295127390846781


### Linear SVM Classifier (IMDB FBOW)

In [116]:
a3(IMDB_fbow_train, IMDB_fbow_test, IMDB_fbow_valid,Classifier.LinearSVM)

C range  [1, 10, 100]
tol range  [1e-05, 0.0001, 0.001, 0.01]
max_iter range  [1, 10, 100]
{'tol': 0.0001, 'max_iter': 10, 'C': 10}
Accuracy score on test set:  0.85024
F1 score train:  0.9265944012734234
F1 score valid:  0.9266881755358323
F1 score test:  0.8502187181168699


### Decision Tree Classifier (IMDB FBOW)

In [None]:
a3(IMDB_fbow_train, IMDB_fbow_test, IMDB_fbow_valid,Classifier.DecisionTree)