# Sentiment Classification

In [1]:
# Python version

import sys
sys.version

'3.8.3 (default, Jul  2 2020, 17:30:36) [MSC v.1916 64 bit (AMD64)]'

In [2]:
# Importing packages

import numpy as np
import pandas as pd
import regex as re
import heapq

In [3]:
# importing the dataset

yelp_train = pd.read_table("./data/yelp-train.txt", header=None)
yelp_valid = pd.read_table("./data/yelp-valid.txt", header=None)
yelp_test = pd.read_table("./data/yelp-test.txt", header=None)

imdb_train = pd.read_table("./data/imdb-train.txt", header=None)
imdb_valid = pd.read_table("./data/imdb-valid.txt", header=None)
imdb_test = pd.read_table("./data/imdb-test.txt", header=None)

# 1- Converting to Binary bag-of-words and Frequency bag-of-words

### First step: Clean data and word vocabulary

In [4]:
# Function to clean the data

def pre_processing_steps(data):
    
    clean_data = []
    for phrase in data[0]:
        # 1. Remove any punctuation.
        without_punctuation = re.sub("[^a-zA-Z0-9]", " ", phrase)
        # 2. Convert to lower case. 
        to_lower_case = without_punctuation.lower()
        clean_data.append(to_lower_case)
    
    return clean_data

In [5]:
# Cleaning yelp and IMDB data

train_yelp = pre_processing_steps(yelp_train)
valid_yelp = pre_processing_steps(yelp_valid)
test_yelp = pre_processing_steps(yelp_test)

train_imdb = pre_processing_steps(imdb_train)
valid_imdb = pre_processing_steps(imdb_valid)
test_imdb = pre_processing_steps(imdb_test)

### Second step: Frequency of each word in the training set

In [6]:
# Function to count the most frequent words in a vocabulary

def mostFreq(clean_data):
     
    wordfreq = {}
    for phrase in clean_data:
        tokens = phrase.split()
        for token in tokens:
            if token not in wordfreq.keys():
                wordfreq[token] = 1
            else:
                wordfreq[token] += 1
                
    most_freq = heapq.nlargest(10000, wordfreq, key=wordfreq.get)
    
    return most_freq, wordfreq

In [7]:
# Counting the 10000 most frequent word for yelp and IMDB dataset

mostFreq_yelp, corpus_yelp = mostFreq(train_yelp)
mostFreq_imdb, corpus_imdb = mostFreq(train_imdb)

### Third step: Generate a 10,000 dimensional feature vector

In [8]:
# Function to create a binary bag-of-words

def binaryBoW(clean_data, most_freq):
    
    binary_bow = []
    for phrase in clean_data:
        phrase_tokens = phrase.split()
        phrase_vec = []
        for token in most_freq:
            if token in phrase_tokens:
                phrase_vec.append(1)
            else:
                phrase_vec.append(0)
                
        binary_bow.append(phrase_vec)
    binary_bow = np.asarray(binary_bow)
    binary_bow = pd.DataFrame(binary_bow)
    
    return binary_bow

In [9]:
# Creating the binary bag-of-words for yelp dataset

X_train_yelp_bbow = binaryBoW(train_yelp, mostFreq_yelp)
X_valid_yelp_bbow = binaryBoW(valid_yelp, mostFreq_yelp)
X_test_yelp_bbow = binaryBoW(test_yelp, mostFreq_yelp)

In [10]:
# Creating the binary bag-of-words for IMDB dataset

X_train_imdb_bbow = binaryBoW(train_imdb, mostFreq_imdb)
X_valid_imdb_bbow = binaryBoW(valid_imdb, mostFreq_imdb)
X_test_imdb_bbow = binaryBoW(test_imdb, mostFreq_imdb)

In [11]:
# Checking the train data for yelp

X_train_yelp_bbow

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,1,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,1,1,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,1,1,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,0,1,0,1,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
6996,1,1,1,1,1,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
6997,1,1,1,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
6998,1,1,1,1,1,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Checking the valid data for yelp

X_valid_yelp_bbow

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,1,1,1,1,1,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,1,0,0,0,0,0
2,1,1,1,1,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,1,1,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,1,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,1,1,1,1,1,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
997,1,1,1,1,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
998,1,1,1,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Checking the test data for yelp

X_test_yelp_bbow

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,1,1,1,1,1,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,1,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1,1,1,1,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1996,1,1,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1997,0,1,0,1,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1998,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Checking the train data for IMDB

X_train_imdb_bbow

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,1,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
14996,1,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
14997,1,1,0,1,1,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
14998,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Checking the valid data for IMDB

X_valid_imdb_bbow

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,1,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
9996,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
9997,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
9998,1,1,0,1,1,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Checking the test data for IMDB

X_test_imdb_bbow

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,1,1,1,1,1,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
24996,1,1,1,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
24997,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
24998,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Function to create a frequency bag-of-words

from sklearn.feature_extraction.text import CountVectorizer

def frequencyBoW(clean_data, most_freq):
    
    cvec = CountVectorizer(vocabulary = most_freq).fit(clean_data)
    data_freqbow = cvec.transform(clean_data)
    data_freqbow = data_freqbow.toarray().astype('float64')
    sum_occurences = data_freqbow.sum(axis=1)
    for i in range(len(data_freqbow)):
        if data_freqbow.sum(axis=1)[i] != 0:
            data_freqbow[i] = data_freqbow[i] / sum_occurences[i]

    fequencyBoW = pd.DataFrame(data_freqbow)
    
    return fequencyBoW

In [18]:
# Creating the frequency bag-of-words for yelp dataset

X_train_yelp_freqbow = frequencyBoW(train_yelp, mostFreq_yelp)
X_valid_yelp_freqbow = frequencyBoW(valid_yelp, mostFreq_yelp)
X_test_yelp_freqbow = frequencyBoW(test_yelp, mostFreq_yelp)

In [19]:
# Creating the frequency bag-of-words for IMDB dataset

X_train_imdb_freqbow = frequencyBoW(train_imdb, mostFreq_imdb)
X_valid_imdb_freqbow = frequencyBoW(valid_imdb, mostFreq_imdb)
X_test_imdb_freqbow = frequencyBoW(test_imdb, mostFreq_imdb)

In [20]:
# Checking the train data for IMDB

X_train_imdb_freqbow

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,0.047619,0.000000,0.0,0.023810,0.000000,0.071429,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.064748,0.021583,0.0,0.021583,0.014388,0.043165,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.122449,0.040816,0.0,0.020408,0.040816,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.038043,0.021739,0.0,0.016304,0.029891,0.027174,0.043478,0.013587,0.021739,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.045802,0.030534,0.0,0.015267,0.007634,0.015267,0.000000,0.038168,0.022901,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,0.059603,0.013245,0.0,0.019868,0.026490,0.026490,0.013245,0.013245,0.013245,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14996,0.081152,0.039267,0.0,0.047120,0.023560,0.010471,0.010471,0.002618,0.028796,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14997,0.071429,0.035714,0.0,0.023810,0.035714,0.023810,0.047619,0.000000,0.011905,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14998,0.049608,0.044386,0.0,0.023499,0.007833,0.033943,0.026110,0.033943,0.026110,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Checking the valid data for IMDB

X_valid_imdb_freqbow

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,0.043333,0.040000,0.0,0.063333,0.020000,0.026667,0.033333,0.033333,0.020000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.053097,0.079646,0.0,0.035398,0.035398,0.008850,0.035398,0.017699,0.017699,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.079470,0.039735,0.0,0.026490,0.026490,0.000000,0.079470,0.019868,0.026490,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.032258,0.026882,0.0,0.026882,0.037634,0.010753,0.032258,0.021505,0.016129,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.043478,0.086957,0.0,0.043478,0.000000,0.000000,0.000000,0.000000,0.043478,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.079412,0.029412,0.0,0.035294,0.032353,0.023529,0.011765,0.032353,0.014706,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,0.052533,0.026266,0.0,0.018762,0.020638,0.011257,0.022514,0.026266,0.018762,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,0.040323,0.016129,0.0,0.008065,0.024194,0.016129,0.032258,0.008065,0.032258,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,0.063063,0.036036,0.0,0.036036,0.018018,0.009009,0.072072,0.000000,0.009009,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Checking the test data for IMDB

X_test_imdb_freqbow

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,0.028571,0.038095,0.0,0.009524,0.038095,0.019048,0.028571,0.009524,0.033333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.077519,0.023256,0.0,0.038760,0.023256,0.007752,0.000000,0.031008,0.023256,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.041379,0.034483,0.0,0.006897,0.020690,0.013793,0.013793,0.034483,0.006897,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.076923,0.054945,0.0,0.010989,0.010989,0.043956,0.000000,0.010989,0.010989,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.038462,0.038462,0.0,0.016484,0.010989,0.010989,0.021978,0.010989,0.005495,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0.009091,0.036364,0.0,0.018182,0.018182,0.045455,0.000000,0.018182,0.009091,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24996,0.025000,0.025000,0.0,0.025000,0.037500,0.006250,0.000000,0.031250,0.006250,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24997,0.057402,0.021148,0.0,0.051360,0.036254,0.015106,0.018127,0.012085,0.021148,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24998,0.061404,0.057018,0.0,0.017544,0.035088,0.008772,0.017544,0.017544,0.017544,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Getting y for the different dataset

y_train_yelp = yelp_train[1]
y_valid_yelp = yelp_valid[1]
y_test_yelp = yelp_test[1]

y_train_imdb = imdb_train[1]
y_valid_imdb = imdb_valid[1]
y_test_imdb = imdb_test[1]

In [24]:
# Submitting yelp-vocab.txt

import operator
yelp_vocab = sorted(corpus_yelp.items(), key=operator.itemgetter(1), reverse=True)[:10000]
yelp_vocab = pd.DataFrame(data= yelp_vocab)
yelp_vocab["id"] = 1
yelp_vocab["id"] = list(range(1,10001,1)) 
yelp_vocab = pd.DataFrame(yelp_vocab, columns=[0,"id", 1])
yelp_vocab.columns = ["word", "id", "frequency"]
yelp_vocab.to_csv('yelp-vocab.csv', index = False)

In [25]:
# Submitting IMDB-vocab.txt

imdb_vocab = sorted(corpus_imdb.items(), key=operator.itemgetter(1), reverse=True)[:10000]
imdb_vocab = pd.DataFrame(data= imdb_vocab)
imdb_vocab["id"] = 1
imdb_vocab["id"] = list(range(1,10001,1)) 
imdb_vocab = pd.DataFrame(imdb_vocab, columns=[0,"id", 1])
imdb_vocab.columns = ["word", "id", "frequency"]
imdb_vocab.to_csv('IMDB-vocab.csv', index = False)

In [26]:
# code to create data for submission

def dataForSubmission(clean_data, most_freq, data_vocab, y):

    dataToSubmit = []
    for phrase in clean_data:
        phrase_tokens = phrase.split()
        phrase_vec = []
        for token in phrase_tokens:
            if token in most_freq:
                A = data_vocab[data_vocab["word"]==token]
                A = A.iat[0,1]
                phrase_vec.append(A)
            else:
                phrase_vec.append(0)
                
        dataToSubmit.append(phrase_vec)
    dataToSubmit = np.asarray(dataToSubmit)
    dataToSubmit = pd.DataFrame(dataToSubmit)
    dataToSubmit["y"] = y
    
    return dataToSubmit

In [27]:
Yelp_valid = dataForSubmission(valid_yelp, mostFreq_yelp, yelp_vocab, y_valid_yelp)
Yelp_train = dataForSubmission(train_yelp, mostFreq_yelp, yelp_vocab, y_train_yelp)
Yelp_test = dataForSubmission(test_yelp, mostFreq_yelp, yelp_vocab, y_test_yelp)

In [28]:
IMDB_train = dataForSubmission(train_imdb, mostFreq_imdb, imdb_vocab, y_train_imdb)
IMDB_valid = dataForSubmission(valid_imdb, mostFreq_imdb, imdb_vocab, y_valid_imdb)
IMDB_test = dataForSubmission(test_imdb, mostFreq_imdb, imdb_vocab, y_test_imdb)

In [29]:
Yelp_valid.to_csv('Yelp_valid.csv', index = False)
Yelp_train.to_csv('Yelp_train.csv', index = False)
Yelp_test.to_csv('Yelp_test.csv', index = False)

IMDB_train.to_csv('IMDB_train.csv', index = False)
IMDB_valid.to_csv('IMDB_valid.csv', index = False)
IMDB_test.to_csv('IMDB_test.csv', index = False)

# 2- Yelp dataset with binary bag-of-words

## a) Random classifier and the majority-class classifier

### Random Classifier

In [132]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score

In [133]:
yelp_random_clf = DummyClassifier(strategy='uniform', random_state=5)
yelp_random_clf.fit(X_train_yelp_bbow, y_train_yelp)

DummyClassifier(random_state=5, strategy='uniform')

In [134]:
yelp_random_clf_y_pred_train = yelp_random_clf.predict(X_train_yelp_bbow)
yelp_random_clf_y_pred_test = yelp_random_clf.predict(X_test_yelp_bbow)

yelp_random_clf_F1_measure_train = f1_score(y_train_yelp, yelp_random_clf_y_pred_train, average="macro")
yelp_random_clf_F1_measure_test = f1_score(y_test_yelp, yelp_random_clf_y_pred_test, average="macro")

print(yelp_random_clf_F1_measure_train)
print(yelp_random_clf_F1_measure_test)

0.18093428588837118
0.17429892537504088


In [33]:
# Report performance of the random classifier

random_clf_yelp = {'F1_measure_train': yelp_random_clf_F1_measure_train, 
                   'F1_measure_test': yelp_random_clf_F1_measure_test}
random_clf_yelp = pd.DataFrame(data = random_clf_yelp, index=["Random classifier Performance"])
random_clf_yelp.to_csv('Assigment3_2005119_Q2_a_randclf.csv', index=True)

### Majority-class classifier

In [151]:
def majorityClassifier(y_train, y_test):
    
    y_pred_train = np.zeros(len(y_train))
    y_pred_test = np.zeros(len(y_test))
    majority_class = pd.Series(y_train).value_counts().idxmax()
    y_pred_train[:] = majority_class
    y_pred_test[:] = majority_class
    f1_measure_train = f1_score(y_train, y_pred_train, average="macro")
    f1_measure_test = f1_score(y_test, y_pred_test, average="macro")
    
    return f1_measure_train, f1_measure_test

In [152]:
yelp_majority_clf_F1_measure_train, yelp_majority_clf_F1_measure_test = majorityClassifier(y_train_yelp, y_test_yelp)

print(yelp_majority_clf_F1_measure_train)
print(yelp_majority_clf_F1_measure_test)

0.10426700464723279
0.10392301998519615


In [36]:
# Report the performance of the majority classifier

majority_clf_yelp = {'F1_measure_train': yelp_majority_clf_F1_measure_train, 
                     'F1_measure_test': yelp_majority_clf_F1_measure_test}
majority_clf_yelp = pd.DataFrame(data = majority_clf_yelp, index=["Majority-class classifier Performance"])
majority_clf_yelp.to_csv('Assigment3_2005119_Q2_a_majoclf.csv', index=True)

## b) Naive Bayes, Decision Trees and Linear SVM

### Naive Bayes/BernoulliNB classifier

In [37]:
# creating a function to search for the best alpha from 0 to 1

from sklearn.naive_bayes import BernoulliNB

def gridSearchForNaiveBayes(X_train, y_train, X_valid, y_valid):
    
    F_measure_valid = {}
    for alpha in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]:
        naiveBayes_clf = BernoulliNB(alpha=alpha)
        naiveBayes_clf.fit(X_train, y_train)
        y_pred_valid = naiveBayes_clf.predict(X_valid)
        naiveBayes_clf_F1_measure = f1_score(y_valid, y_pred_valid, average ="macro")
        F_measure_valid[alpha] = naiveBayes_clf_F1_measure
    
    return F_measure_valid

In [38]:
yelp_nb_clf_F1_measure = gridSearchForNaiveBayes(X_train_yelp_bbow, y_train_yelp, X_valid_yelp_bbow, y_valid_yelp)
yelp_nb_clf_F1_measure

{0.1: 0.35964001570135207,
 0.2: 0.3679136598912025,
 0.3: 0.3611576065520078,
 0.4: 0.34987672698417865,
 0.5: 0.3557683355688238,
 0.6: 0.35348863715721257,
 0.7: 0.3412864866459319,
 0.8: 0.3356222688453187,
 0.9: 0.3332404661802079,
 1: 0.33018856067105257}

In [39]:
# Best alpha

yelp_best_alpha = pd.Series(yelp_nb_clf_F1_measure).idxmax()
yelp_best_alpha

0.2

In [40]:
# Prediction on the train data with the best alpha

yelp_naiveBayes_clf = BernoulliNB(alpha = yelp_best_alpha)
yelp_naiveBayes_clf.fit(X_train_yelp_bbow, y_train_yelp)
yelp_naiveBayes_clf_y_pred_train = yelp_naiveBayes_clf.predict(X_train_yelp_bbow)
yelp_naiveBayes_clf_F1_measure_train = f1_score(y_train_yelp, yelp_naiveBayes_clf_y_pred_train, average ="macro")
yelp_naiveBayes_clf_F1_measure_train

0.6846622161210074

In [41]:
# Prediction on the test data for the best alpha

yelp_naiveBayes_clf_y_pred_test = yelp_naiveBayes_clf.predict(X_test_yelp_bbow)
yelp_naiveBayes_clf_F1_measure_test = f1_score(y_test_yelp, yelp_naiveBayes_clf_y_pred_test, average ="macro")
yelp_naiveBayes_clf_F1_measure_test

0.38176470414150865

### Decision Trees: In order to avoid overfiting we have 2 great possibilities to tune the hyperparameter:
    1- pre-pruning parameters (max_depth, max_leaf_nodes, min_samples_leaf etc.)
    2- post-pruning (ccp_alphas)

For computation issue, we have decided to tune 2 hyperparameters of the pre-pruning (max_depth and max_leaf_nodes)

In [42]:
# creating a function to search for the best couple max_depth/max_leaf nodes

from sklearn.tree import DecisionTreeClassifier

def prePruningForDecisionTree(X_train, y_train, X_valid, y_valid):
    
    F_measure_valid = {}
    for max_depth in [None,10,20,30,40,50,60,70]:
        for max_leaf_node in [None,10,20,30,40,50,60,70]:
            decisionTree_clf = DecisionTreeClassifier(max_depth = max_depth, max_leaf_nodes = max_leaf_node,
                                                     random_state= 5)
            decisionTree_clf.fit(X_train, y_train)
            y_pred_valid = decisionTree_clf.predict(X_valid)
            decisionTree_clf_F1_measure = f1_score(y_valid, y_pred_valid, average ="macro")
            F_measure_valid[str(max_depth) + "_" + str(max_leaf_node)] = decisionTree_clf_F1_measure
    
    return F_measure_valid

In [43]:
yelp_dt_clf_F1_measure = prePruningForDecisionTree(X_train_yelp_bbow, y_train_yelp, X_valid_yelp_bbow, y_valid_yelp)
yelp_dt_clf_F1_measure

{'None_None': 0.28166520302980935,
 'None_10': 0.21701473344953973,
 'None_20': 0.2635663438284466,
 'None_30': 0.2815000145225813,
 'None_40': 0.27691557302402076,
 'None_50': 0.3080295143123063,
 'None_60': 0.32024881459323795,
 'None_70': 0.3200197124572032,
 '10_None': 0.2947555087488034,
 '10_10': 0.21701473344953973,
 '10_20': 0.2635663438284466,
 '10_30': 0.2746980592585576,
 '10_40': 0.31030027839146457,
 '10_50': 0.3105135092983559,
 '10_60': 0.3088013698116824,
 '10_70': 0.3009431182326061,
 '20_None': 0.292611133446289,
 '20_10': 0.21701473344953973,
 '20_20': 0.2635663438284466,
 '20_30': 0.2815000145225813,
 '20_40': 0.27691557302402076,
 '20_50': 0.3080295143123063,
 '20_60': 0.32024881459323795,
 '20_70': 0.3200197124572032,
 '30_None': 0.29215792694449016,
 '30_10': 0.21701473344953973,
 '30_20': 0.2635663438284466,
 '30_30': 0.2815000145225813,
 '30_40': 0.27691557302402076,
 '30_50': 0.3080295143123063,
 '30_60': 0.32024881459323795,
 '30_70': 0.3200197124572032,
 '40

In [44]:
# Best couple max_depth/max_leaf nodes

yelp_best_dt_hyperparam = pd.Series(yelp_dt_clf_F1_measure).idxmax()
yelp_best_dt_hyperparam

'None_60'

In [153]:
# Prediction on the train data with the best couple max_depth/max_leaf

yelp_decisiontrees_clf = DecisionTreeClassifier(max_leaf_nodes = 60, random_state= 5)
yelp_decisiontrees_clf.fit(X_train_yelp_bbow, y_train_yelp)
yelp_decisiontrees_clf_y_pred_train = yelp_decisiontrees_clf.predict(X_train_yelp_bbow)
yelp_decisiontrees_clf_F1_measure_train = f1_score(y_train_yelp, yelp_decisiontrees_clf_y_pred_train, average ="macro")
yelp_decisiontrees_clf_F1_measure_train

0.37260697024184625

In [154]:
# Prediction on the test data with the best couple max_depth/max_leaf

yelp_decisiontrees_clf_y_pred_test = yelp_decisiontrees_clf.predict(X_test_yelp_bbow)
yelp_decisiontrees_clf_F1_measure_test = f1_score(y_test_yelp, yelp_decisiontrees_clf_y_pred_test, average ="macro")
yelp_decisiontrees_clf_F1_measure_test

0.3027913551628164

### Linear SVM: C is the regularization parameter

In [47]:
# creating a function to search for the best C from 1 to 100

from sklearn.svm import LinearSVC

def gridSearchForLinearSVC(X_train, y_train, X_valid, y_valid):
    
    F_measure_valid = {}
    for C in [1,10,20,30,40,50,60,70,80,90,100]:
        linearSVC_clf = LinearSVC(C = C, random_state=5, loss="hinge")
        linearSVC_clf.fit(X_train, y_train)
        y_pred_valid = linearSVC_clf.predict(X_valid)
        linearSVC_clf_F1_measure = f1_score(y_valid, y_pred_valid, average ="macro")
        F_measure_valid[C] = linearSVC_clf_F1_measure
    
    return F_measure_valid

In [48]:
import warnings
warnings.filterwarnings("ignore")

yelp_lsvc_clf_F1_measure = gridSearchForLinearSVC(X_train_yelp_bbow, y_train_yelp, X_valid_yelp_bbow, y_valid_yelp)
yelp_lsvc_clf_F1_measure

{1: 0.39707948730695647,
 10: 0.40796729908620843,
 20: 0.40988600937734543,
 30: 0.4169221874110124,
 40: 0.4136180608169787,
 50: 0.41577681702784586,
 60: 0.41378545108886194,
 70: 0.4144687495182063,
 80: 0.4157614944467003,
 90: 0.41643229268921383,
 100: 0.41670146881239667}

In [49]:
# Best C

yelp_best_C = pd.Series(yelp_lsvc_clf_F1_measure).idxmax()
yelp_best_C

30

In [50]:
# Prediction on the train data with the best C

yelp_linearsvc_clf = LinearSVC(C = yelp_best_C, random_state=5, loss="hinge")
yelp_linearsvc_clf.fit(X_train_yelp_bbow, y_train_yelp)
yelp_linearsvc_clf_y_pred_train = yelp_linearsvc_clf.predict(X_train_yelp_bbow)
yelp_linearsvc_clf_F1_measure_train = f1_score(y_train_yelp, yelp_linearsvc_clf_y_pred_train, average ="macro")
yelp_linearsvc_clf_F1_measure_train

0.9988230596221033

In [51]:
# Prediction on the test data with the best C

yelp_linearsvc_clf_y_pred_test = yelp_linearsvc_clf.predict(X_test_yelp_bbow)
yelp_linearsvc_clf_F1_measure_test = f1_score(y_test_yelp, yelp_linearsvc_clf_y_pred_test, average ="macro")
yelp_linearsvc_clf_F1_measure_test

0.39669066810332193

## d) Report training, validation, and test F1-measure for all the classifiers (with best hyper-parameter configuration).

In [52]:
# Report the performance on validation test for alpha for Naive Bayes

alpha_nb_yelp = pd.DataFrame(data = yelp_nb_clf_F1_measure, index=["F1-measure"])
alpha_nb_yelp.to_csv('Assigment3_2005119_Q2_d_alpha.csv', index=True)

In [53]:
# Report the performance on train and test data for Naive Bayes

naivesBayes_yelp = {'F1_measure_train': yelp_naiveBayes_clf_F1_measure_train, 
                   'F1_measure_test': yelp_naiveBayes_clf_F1_measure_test}
naivesBayes_yelp = pd.DataFrame(data = naivesBayes_yelp, index=["Naive Bayes Performance"])
naivesBayes_yelp.to_csv('Assigment3_2005119_Q2_d_nb.csv', index=True)

In [155]:
# Report the performance on validation test for max depth for Decision Trees

hyperparam_dt_yelp = pd.DataFrame(data = yelp_dt_clf_F1_measure, index=["F1-measure"])
hyperparam_dt_yelp.to_csv('Assigment3_2005119_Q2_d_hyperparam.csv', index=True)

In [156]:
# Report the performance on train and test data for Decision Trees

decisionTrees_yelp = {'F1_measure_train': yelp_decisiontrees_clf_F1_measure_train, 
                   'F1_measure_test': yelp_decisiontrees_clf_F1_measure_test}
decisionTrees_yelp = pd.DataFrame(data = decisionTrees_yelp, index=["Decision Trees Performance"])
decisionTrees_yelp.to_csv('Assigment3_2005119_Q2_d_dt.csv', index=True)

In [56]:
# Report the performance on validation test C for Linear SVC

c_lsvc_yelp = pd.DataFrame(data = yelp_lsvc_clf_F1_measure, index=["F1-measure"])
c_lsvc_yelp.to_csv('Assigment3_2005119_Q2_d_Cparam.csv', index=True)

In [57]:
# Report the performance on train and test data for Linear SVC

linearSVC_yelp = {'F1_measure_train': yelp_linearsvc_clf_F1_measure_train, 
                   'F1_measure_test': yelp_linearsvc_clf_F1_measure_test}
linearSVC_yelp = pd.DataFrame(data = linearSVC_yelp, index=["Linear SVC Performance"])
linearSVC_yelp.to_csv('Assigment3_2005119_Q2_d_lsvc.csv', index=True)

# 3- Yelp dataset with frequency bag-of-words

## a) Naive Bayes, Decision Trees and Linear SVM

### Naive Bayes/GaussianNB classifier

In [58]:
from sklearn.naive_bayes import GaussianNB

yelp_naiveBayes_clf = GaussianNB()
yelp_naiveBayes_clf.fit(X_train_yelp_freqbow, y_train_yelp)
yelp_naiveBayes_clf_y_pred_train = yelp_naiveBayes_clf.predict(X_train_yelp_freqbow)
yelp_naiveBayes_clf_F1_measure_train = f1_score(y_train_yelp, yelp_naiveBayes_clf_y_pred_train, average ="macro")
yelp_naiveBayes_clf_F1_measure_train

0.7818166417123237

In [59]:
# Prediction on the test data

yelp_naiveBayes_clf_y_pred_test = yelp_naiveBayes_clf.predict(X_test_yelp_freqbow)
yelp_naiveBayes_clf_F1_measure_test = f1_score(y_test_yelp, yelp_naiveBayes_clf_y_pred_test, average ="macro")
yelp_naiveBayes_clf_F1_measure_test

0.25302793206715596

### Decision Trees: In order to avoid overfiting we have 2 great possibilities to tune the hyperparameter:
    1- pre-pruning parameters (max_depth, max_leaf_nodes, min_samples_leaf etc.)
    2- post-pruning (ccp_alphas)

For computation issue, we have decided to tune 2 hyperparameters of the pre-pruning (max_depth and max_leaf_nodes)

In [60]:
yelp_dt_clf_F1_measure = prePruningForDecisionTree(X_train_yelp_freqbow, y_train_yelp, X_valid_yelp_freqbow, y_valid_yelp)
yelp_dt_clf_F1_measure

{'None_None': 0.30024993317321613,
 'None_10': 0.2572593871172099,
 'None_20': 0.29488328978857004,
 'None_30': 0.26220380125236525,
 'None_40': 0.2834488810942,
 'None_50': 0.3278964996902152,
 'None_60': 0.3267542682696005,
 'None_70': 0.32060447309750445,
 '10_None': 0.3069670462411807,
 '10_10': 0.2572593871172099,
 '10_20': 0.29488328978857004,
 '10_30': 0.25326850043478427,
 '10_40': 0.30546712600216963,
 '10_50': 0.3015475432193213,
 '10_60': 0.30515559514350815,
 '10_70': 0.3090394525583511,
 '20_None': 0.2912299350865731,
 '20_10': 0.2572593871172099,
 '20_20': 0.29488328978857004,
 '20_30': 0.26220380125236525,
 '20_40': 0.2834488810942,
 '20_50': 0.3278964996902152,
 '20_60': 0.3267542682696005,
 '20_70': 0.32060447309750445,
 '30_None': 0.31314035031581106,
 '30_10': 0.2572593871172099,
 '30_20': 0.29488328978857004,
 '30_30': 0.26220380125236525,
 '30_40': 0.2834488810942,
 '30_50': 0.3278964996902152,
 '30_60': 0.3267542682696005,
 '30_70': 0.32060447309750445,
 '40_None'

In [61]:
# Best couple max_depth/max_leaf nodes

best_dt_hyperparam = pd.Series(yelp_dt_clf_F1_measure).idxmax()
best_dt_hyperparam

'None_50'

In [62]:
# Prediction on the train data with the best couple max_depth/max_leaf

yelp_decisiontrees_clf = DecisionTreeClassifier(max_leaf_nodes = 50, random_state= 5)
yelp_decisiontrees_clf.fit(X_train_yelp_freqbow, y_train_yelp)
yelp_decisiontrees_clf_y_pred_train = yelp_decisiontrees_clf.predict(X_train_yelp_freqbow)
yelp_decisiontrees_clf_F1_measure_train = f1_score(y_train_yelp, yelp_decisiontrees_clf_y_pred_train, average ="macro")
yelp_decisiontrees_clf_F1_measure_train

0.3719816076105148

In [63]:
# Prediction on the test data with the best couple max_depth/max_leaf

yelp_decisiontrees_clf_y_pred_test = yelp_decisiontrees_clf.predict(X_test_yelp_freqbow)
yelp_decisiontrees_clf_F1_measure_test = f1_score(y_test_yelp, yelp_decisiontrees_clf_y_pred_test, average ="macro")
yelp_decisiontrees_clf_F1_measure_test

0.29202714250481937

### Linear SVM: C is the regularization parameter

In [64]:
yelp_lsvc_clf_F1_measure = gridSearchForLinearSVC(X_train_yelp_freqbow, y_train_yelp, X_valid_yelp_freqbow, y_valid_yelp)
yelp_lsvc_clf_F1_measure

{1: 0.36295764171168776,
 10: 0.38773599593863056,
 20: 0.4109716732127847,
 30: 0.4207843887652981,
 40: 0.4303973080455855,
 50: 0.42919910946579787,
 60: 0.42949166345183476,
 70: 0.44180121955078366,
 80: 0.4255512105694052,
 90: 0.4314049667627812,
 100: 0.4320736914584865}

In [65]:
# Best C

best_C = pd.Series(yelp_lsvc_clf_F1_measure).idxmax()
best_C

70

In [66]:
# Prediction on the train data with the best C

yelp_linearsvc_clf = LinearSVC(C = best_C, random_state=5, loss="hinge")
yelp_linearsvc_clf.fit(X_train_yelp_freqbow, y_train_yelp)
yelp_linearsvc_clf_y_pred_train = yelp_linearsvc_clf.predict(X_train_yelp_freqbow)
yelp_linearsvc_clf_F1_measure_train = f1_score(y_train_yelp, yelp_linearsvc_clf_y_pred_train, average ="macro")
yelp_linearsvc_clf_F1_measure_train

0.7619013158765364

In [67]:
# Prediction on the test data with the best C

yelp_linearsvc_clf_y_pred_test = yelp_linearsvc_clf.predict(X_test_yelp_freqbow)
yelp_linearsvc_clf_F1_measure_test = f1_score(y_test_yelp, yelp_linearsvc_clf_y_pred_test, average ="macro")
yelp_linearsvc_clf_F1_measure_test

0.43692061490272954

## c) Report training, validation, and test F1-measure for all the classifiers (with best hyper-parameter configuration).

In [68]:
# Report the performance on train and test data for Naive Bayes

naivesBayes_yelp = {'F1_measure_train': yelp_naiveBayes_clf_F1_measure_train, 
                   'F1_measure_test': yelp_naiveBayes_clf_F1_measure_test}
naivesBayes_yelp = pd.DataFrame(data = naivesBayes_yelp, index=["Naive Bayes Performance"])
naivesBayes_yelp.to_csv('Assigment3_2005119_Q3_c_nb.csv', index=True)

In [69]:
# Report the performance on validation test for max depth/max leaf for Decision Trees

hyperparam_dt_yelp = pd.DataFrame(data = yelp_dt_clf_F1_measure, index=["F1-measure"])
hyperparam_dt_yelp.to_csv('Assigment3_2005119_Q3_c_hyperparam.csv', index=True)

In [70]:
# Report the performance on train and test data for Decision Trees

decisionTrees_yelp = {'F1_measure_train': yelp_decisiontrees_clf_F1_measure_train, 
                   'F1_measure_test': yelp_decisiontrees_clf_F1_measure_test}
decisionTrees_yelp = pd.DataFrame(data = decisionTrees_yelp, index=["Decision Trees Performance"])
decisionTrees_yelp.to_csv('Assigment3_2005119_Q3_c_dt.csv', index=True)

In [71]:
# Report the performance on validation test C for Linear SVC

c_lsvc_yelp = pd.DataFrame(data = yelp_lsvc_clf_F1_measure, index=["F1-measure"])
c_lsvc_yelp.to_csv('Assigment3_2005119_Q3_c_Cparam.csv', index=True)

In [72]:
# Report the performance on train and test data for Linear SVC

linearSVC_yelp = {'F1_measure_train': yelp_linearsvc_clf_F1_measure_train, 
                   'F1_measure_test': yelp_linearsvc_clf_F1_measure_test}
linearSVC_yelp = pd.DataFrame(data = linearSVC_yelp, index=["Linear SVC Performance"])
linearSVC_yelp.to_csv('Assigment3_2005119_Q3_c_lsvc.csv', index=True)

# 4- IMDB dataset with binary bag-of-words

## a) Random classifier

### Random classifier

In [73]:
imdb_random_clf = DummyClassifier(strategy='uniform', random_state=37)
imdb_random_clf.fit(X_train_imdb_bbow, y_train_imdb)

DummyClassifier(random_state=37, strategy='uniform')

In [74]:
imdb_random_clf_y_pred_train = imdb_random_clf.predict(X_train_imdb_bbow)
imdb_random_clf_y_pred_test = imdb_random_clf.predict(X_test_imdb_bbow)

imdb_random_clf_F1_measure_train = f1_score(y_train_imdb, imdb_random_clf_y_pred_train)
imdb_random_clf_F1_measure_test = f1_score(y_test_imdb, imdb_random_clf_y_pred_test)

print(imdb_random_clf_F1_measure_train)
print(imdb_random_clf_F1_measure_test)

0.49936830906310264
0.49964083326682096


In [75]:
# Report performance of the random classifier

random_clf_imdb = {'F1_measure_train': imdb_random_clf_F1_measure_train, 
                   'F1_measure_test': imdb_random_clf_F1_measure_test}
random_clf_imdb = pd.DataFrame(data = random_clf_imdb, index=["Random classifier Performance"])
random_clf_imdb.to_csv('Assigment3_2005119_Q4_a.csv', index=True)

## b) Naive Bayes, Decision Trees and Linear SVM

### Naive Bayes/BernoulliNB classifier: single hyper-parameter alpha (controls model complexity). Large alpha more smoothing (less complex model) and small alpha less smoothing (more complex model)

In [76]:
# creating a function to search for the best alpha from 0 to 1

def gridSearchForBernoulliNB(X_train, y_train, X_valid, y_valid):
    
    F_measure_valid = {}
    for alpha in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]:
        naiveBayes_clf = BernoulliNB(alpha=alpha)
        naiveBayes_clf.fit(X_train, y_train)
        y_pred_valid = naiveBayes_clf.predict(X_valid)
        naiveBayes_clf_F1_measure = f1_score(y_valid, y_pred_valid)
        F_measure_valid[alpha] = naiveBayes_clf_F1_measure
    
    return F_measure_valid

In [77]:
imdb_nb_clf_F1_measure = gridSearchForBernoulliNB(X_train_imdb_bbow, y_train_imdb, X_valid_imdb_bbow, y_valid_imdb)
imdb_nb_clf_F1_measure

{0.1: 0.844421906693712,
 0.2: 0.8443903428687362,
 0.3: 0.843984581050923,
 0.4: 0.8432805521721478,
 0.5: 0.8430775477060496,
 0.6: 0.8427251497613971,
 0.7: 0.8427787934186473,
 0.8: 0.8426292796911512,
 0.9: 0.8423299786520281,
 1: 0.8420624427946709}

In [78]:
# Best alpha

imdb_best_alpha = pd.Series(imdb_nb_clf_F1_measure).idxmax()
imdb_best_alpha

0.1

In [79]:
# Prediction on the train data with the best alpha

imdb_naiveBayes_clf = BernoulliNB(alpha = imdb_best_alpha)
imdb_naiveBayes_clf.fit(X_train_imdb_bbow, y_train_imdb)
imdb_naiveBayes_clf_y_pred_train = imdb_naiveBayes_clf.predict(X_train_imdb_bbow)
imdb_naiveBayes_clf_F1_measure_train = f1_score(y_train_imdb, imdb_naiveBayes_clf_y_pred_train)
imdb_naiveBayes_clf_F1_measure_train

0.8711598321829747

In [80]:
# Prediction on the test data for the best alpha

imdb_naiveBayes_clf_y_pred_test = imdb_naiveBayes_clf.predict(X_test_imdb_bbow)
imdb_naiveBayes_clf_F1_measure_test = f1_score(y_test_imdb, imdb_naiveBayes_clf_y_pred_test)
imdb_naiveBayes_clf_F1_measure_test

0.8278224974200206

### Decision Trees: In order to avoid overfiting we have 2 great possibilities to tune the hyperparameter:
    1- pre-pruning parameters (max_depth, max_leaf_nodes, min_samples_leaf etc.)
    2- post-pruning (ccp_alphas)

For computation issue, we have decided to tune 2 hyperparameters of the pre-pruning (max_depth and max_leaf_nodes)

In [81]:
def pruningForDecisionTree(X_train, y_train, X_valid, y_valid):
    
    F_measure_valid = {}
    for max_depth in [None,10,20,30,40,50]:
        for max_leaf_node in [None,10,20,30,40,50]:
            decisionTree_clf = DecisionTreeClassifier(max_depth = max_depth, max_leaf_nodes = max_leaf_node,
                                                     random_state= 5)
            decisionTree_clf.fit(X_train, y_train)
            y_pred_valid = decisionTree_clf.predict(X_valid)
            decisionTree_clf_F1_measure = f1_score(y_valid, y_pred_valid)
            F_measure_valid[str(max_depth) + "_" + str(max_leaf_node)] = decisionTree_clf_F1_measure
    
    return F_measure_valid

In [82]:
imdb_dt_clf_F1_measure = pruningForDecisionTree(X_train_imdb_bbow, y_train_imdb, X_valid_imdb_bbow, y_valid_imdb)
imdb_dt_clf_F1_measure

{'None_None': 0.7022488755622189,
 'None_10': 0.7329795021961932,
 'None_20': 0.7354711396151948,
 'None_30': 0.7487149646009116,
 'None_40': 0.7458944708969001,
 'None_50': 0.7482758620689657,
 '10_None': 0.74877916440586,
 '10_10': 0.7329795021961932,
 '10_20': 0.7456368362421092,
 '10_30': 0.7447280799112097,
 '10_40': 0.7545979084024522,
 '10_50': 0.7549037250314917,
 '20_None': 0.7416981132075472,
 '20_10': 0.7329795021961932,
 '20_20': 0.7354711396151948,
 '20_30': 0.7487149646009116,
 '20_40': 0.7464309993201903,
 '20_50': 0.7488042854409797,
 '30_None': 0.729107475262075,
 '30_10': 0.7329795021961932,
 '30_20': 0.7354711396151948,
 '30_30': 0.7487149646009116,
 '30_40': 0.7458944708969001,
 '30_50': 0.7482758620689657,
 '40_None': 0.7198815984213123,
 '40_10': 0.7329795021961932,
 '40_20': 0.7354711396151948,
 '40_30': 0.7487149646009116,
 '40_40': 0.7458944708969001,
 '40_50': 0.7482758620689657,
 '50_None': 0.7171607018935263,
 '50_10': 0.7329795021961932,
 '50_20': 0.7354711

In [83]:
# Best couple max_depth/max_leaf nodes

imdb_best_dt_hyperparam = pd.Series(imdb_dt_clf_F1_measure).idxmax()
imdb_best_dt_hyperparam

'10_50'

In [84]:
# Prediction on the train data with the best couple max_depth/max_leaf

imdb_decisiontrees_clf = DecisionTreeClassifier(max_depth = 10, max_leaf_nodes = 50, random_state= 37)
imdb_decisiontrees_clf.fit(X_train_imdb_bbow, y_train_imdb)
imdb_decisiontrees_clf_y_pred_train = imdb_decisiontrees_clf.predict(X_train_imdb_bbow)
imdb_decisiontrees_clf_F1_measure_train = f1_score(y_train_imdb, imdb_decisiontrees_clf_y_pred_train)
imdb_decisiontrees_clf_F1_measure_train

0.7735905577856329

In [85]:
# Prediction on the test data with the best couple max_depth/max_leaf

imdb_decisiontrees_clf_y_pred_test = imdb_decisiontrees_clf.predict(X_test_imdb_bbow)
imdb_decisiontrees_clf_F1_measure_test = f1_score(y_test_imdb, imdb_decisiontrees_clf_y_pred_test)
imdb_decisiontrees_clf_F1_measure_test

0.7579144732097787

### Linear SVM: C is the regularization parameter

In [86]:
def hyperparamForLinearSVC(X_train, y_train, X_valid, y_valid):
    
    F_measure_valid = {}
    for c in [1,10,20,30,40,50,60,70,80,90]:
        linearSVC_clf = LinearSVC(C = c, random_state=5, loss="hinge")
        linearSVC_clf.fit(X_train, y_train)
        y_pred_valid = linearSVC_clf.predict(X_valid)
        linearSVC_clf_F1_measure = f1_score(y_valid, y_pred_valid)
        F_measure_valid[c] = linearSVC_clf_F1_measure
    
    return F_measure_valid

In [87]:
imdb_lsvc_clf_F1_measure = hyperparamForLinearSVC(X_train_imdb_bbow, y_train_imdb, X_valid_imdb_bbow, y_valid_imdb)
imdb_lsvc_clf_F1_measure

{1: 0.8455823293172691,
 10: 0.8443149859381278,
 20: 0.8443149859381278,
 30: 0.8443149859381278,
 40: 0.8443149859381278,
 50: 0.8443149859381278,
 60: 0.8443149859381278,
 70: 0.8443149859381278,
 80: 0.8443149859381278,
 90: 0.8443149859381278}

In [88]:
# Best C

imdb_best_C = pd.Series(imdb_lsvc_clf_F1_measure).idxmax()
imdb_best_C

1

In [89]:
# Prediction on the train data with the best C

imdb_linearsvc_clf = LinearSVC(C = imdb_best_C, random_state=5, loss="hinge")
imdb_linearsvc_clf.fit(X_train_imdb_bbow, y_train_imdb)
imdb_linearsvc_clf_y_pred_train = imdb_linearsvc_clf.predict(X_train_imdb_bbow)
imdb_linearsvc_clf_F1_measure_train = f1_score(y_train_imdb, imdb_linearsvc_clf_y_pred_train)
imdb_linearsvc_clf_F1_measure_train

1.0

In [90]:
# Prediction on the test data with the best C

imdb_linearsvc_clf_y_pred_test = imdb_linearsvc_clf.predict(X_test_imdb_bbow)
imdb_linearsvc_clf_F1_measure_test = f1_score(y_test_imdb, imdb_linearsvc_clf_y_pred_test)
imdb_linearsvc_clf_F1_measure_test

0.8321438651667271

## d) Report training, validation, and test F1-measure for all the classifiers (with best hyper-parameter configuration).

In [91]:
# Report the performance on validation test for alpha for Naive Bayes

alpha_nb_imdb = pd.DataFrame(data = imdb_nb_clf_F1_measure, index=["F1-measure"])
alpha_nb_imdb.to_csv('Assigment3_2005119_Q4_d_alpha.csv', index=True)

In [92]:
# Report the performance on train and test data for Naive Bayes

naivesBayes_imdb = {'F1_measure_train': imdb_naiveBayes_clf_F1_measure_train, 
                   'F1_measure_test': imdb_naiveBayes_clf_F1_measure_test}
naivesBayes_imdb = pd.DataFrame(data = naivesBayes_imdb, index=["Naive Bayes Performance"])
naivesBayes_imdb.to_csv('Assigment3_2005119_Q4_d_nb.csv', index=True)

In [93]:
# Report the performance on validation test for max depth for Decision Trees

hyperparam_dt_imdb = pd.DataFrame(data = imdb_dt_clf_F1_measure, index=["F1-measure"])
hyperparam_dt_imdb.to_csv('Assigment3_2005119_Q4_d_hyperparam.csv', index=True)

In [94]:
# Report the performance on train and test data for Decision Trees

decisionTrees_imdb = {'F1_measure_train': imdb_decisiontrees_clf_F1_measure_train, 
                   'F1_measure_test': imdb_decisiontrees_clf_F1_measure_test}
decisionTrees_imdb = pd.DataFrame(data = decisionTrees_imdb, index=["Decision Trees Performance"])
decisionTrees_imdb.to_csv('Assigment3_2005119_Q4_d_dt.csv', index=True)

In [95]:
# Report the performance on validation test C for Linear SVC

c_lsvc_imdb = pd.DataFrame(data = imdb_lsvc_clf_F1_measure, index=["F1-measure"])
c_lsvc_imdb.to_csv('Assigment3_2005119_Q4_d_Cparam.csv', index=True)

In [96]:
# Report the performance on train and test data for Linear SVC

linearSVC_imdb = {'F1_measure_train': imdb_linearsvc_clf_F1_measure_train, 
                   'F1_measure_test': imdb_linearsvc_clf_F1_measure_test}
linearSVC_imdb = pd.DataFrame(data = linearSVC_imdb, index=["Linear SVC Performance"])
linearSVC_imdb.to_csv('Assigment3_2005119_Q4_d_lsvc.csv', index=True)

# 5- IMDB dataset with frequency bag-of-words

## a) Naive Bayes, Decision Trees and Linear SVM

### Naive Bayes/GaussianNB classifier

In [97]:
imdb_naiveBayes_clf = GaussianNB()
imdb_naiveBayes_clf.fit(X_train_imdb_freqbow, y_train_imdb)
imdb_naiveBayes_clf_y_pred_train = imdb_naiveBayes_clf.predict(X_train_imdb_freqbow)
imdb_naiveBayes_clf_F1_measure_train = f1_score(y_train_imdb, imdb_naiveBayes_clf_y_pred_train)
imdb_naiveBayes_clf_F1_measure_train

0.8489434608794975

In [98]:
# Prediction on the test data

imdb_naiveBayes_clf_y_pred_test = imdb_naiveBayes_clf.predict(X_test_imdb_freqbow)
imdb_naiveBayes_clf_F1_measure_test = f1_score(y_test_imdb, imdb_naiveBayes_clf_y_pred_test)
imdb_naiveBayes_clf_F1_measure_test

0.6384412273513612

### Decision Trees: In order to avoid overfiting we have 2 great possibilities to tune the hyperparameter:
    1- pre-pruning parameters (max_depth, max_leaf_nodes, min_samples_leaf etc.)
    2- post-pruning (ccp_alphas)

For computation issue, we have decided to tune 2 hyperparameters of the pre-pruning (max_depth and max_leaf_nodes)

In [99]:
imdb_dt_clf_F1_measure = pruningForDecisionTree(X_train_imdb_freqbow, y_train_imdb, X_valid_imdb_freqbow, y_valid_imdb)
imdb_dt_clf_F1_measure

{'None_None': 0.700486448922863,
 'None_10': 0.7236219712327444,
 'None_20': 0.7239695252979098,
 'None_30': 0.727973133148953,
 'None_40': 0.7380743342112964,
 'None_50': 0.7360426119550206,
 '10_None': 0.7390219099565498,
 '10_10': 0.7236219712327444,
 '10_20': 0.7251997661274606,
 '10_30': 0.7365353886643482,
 '10_40': 0.7388878556816068,
 '10_50': 0.7370190511766904,
 '20_None': 0.7268394967594358,
 '20_10': 0.7236219712327444,
 '20_20': 0.7239695252979098,
 '20_30': 0.727973133148953,
 '20_40': 0.7380743342112964,
 '20_50': 0.7380485893416927,
 '30_None': 0.7168750605796259,
 '30_10': 0.7236219712327444,
 '30_20': 0.7239695252979098,
 '30_30': 0.727973133148953,
 '30_40': 0.7380743342112964,
 '30_50': 0.7360426119550206,
 '40_None': 0.7084841961052941,
 '40_10': 0.7236219712327444,
 '40_20': 0.7239695252979098,
 '40_30': 0.727973133148953,
 '40_40': 0.7380743342112964,
 '40_50': 0.7360426119550206,
 '50_None': 0.7018749386472956,
 '50_10': 0.7236219712327444,
 '50_20': 0.723969525

In [100]:
# Best couple max_depth/max_leaf nodes

best_dt_hyperparam = pd.Series(imdb_dt_clf_F1_measure).idxmax()
best_dt_hyperparam

'10_None'

In [101]:
# Prediction on the train data with the best couple max_depth/max_leaf

imdb_decisiontrees_clf = DecisionTreeClassifier(max_depth= 10, random_state= 5)
imdb_decisiontrees_clf.fit(X_train_imdb_freqbow, y_train_imdb)
imdb_decisiontrees_clf_y_pred_train = imdb_decisiontrees_clf.predict(X_train_imdb_freqbow)
imdb_decisiontrees_clf_F1_measure_train = f1_score(y_train_imdb, imdb_decisiontrees_clf_y_pred_train)
imdb_decisiontrees_clf_F1_measure_train

0.797141625084704

In [102]:
# Prediction on the test data with the best couple max_depth/max_leaf

imdb_decisiontrees_clf_y_pred_test = imdb_decisiontrees_clf.predict(X_test_imdb_freqbow)
imdb_decisiontrees_clf_F1_measure_test = f1_score(y_test_imdb, imdb_decisiontrees_clf_y_pred_test)
imdb_decisiontrees_clf_F1_measure_test

0.7427241837789925

### Linear SVM: C is the regularization parameter

In [103]:
imdb_lsvc_clf_F1_measure = hyperparamForLinearSVC(X_train_imdb_freqbow, y_train_imdb, X_valid_imdb_freqbow, y_valid_imdb)
imdb_lsvc_clf_F1_measure

{1: 0.721996517701683,
 10: 0.8305829508837028,
 20: 0.848235294117647,
 30: 0.8573114367194414,
 40: 0.864099772344848,
 50: 0.8668908227848101,
 60: 0.868189459112034,
 70: 0.8698154395713434,
 80: 0.8719976277552635,
 90: 0.8739095955590801}

In [104]:
# Best C

best_C = pd.Series(imdb_lsvc_clf_F1_measure).idxmax()
best_C

90

In [105]:
# Prediction on the train data with the best C

imdb_linearsvc_clf = LinearSVC(C = best_C, random_state=5, loss="hinge")
imdb_linearsvc_clf.fit(X_train_imdb_freqbow, y_train_imdb)
imdb_linearsvc_clf_y_pred_train = imdb_linearsvc_clf.predict(X_train_imdb_freqbow)
imdb_linearsvc_clf_F1_measure_train = f1_score(y_train_imdb, imdb_linearsvc_clf_y_pred_train)
imdb_linearsvc_clf_F1_measure_train

0.9138616485555262

In [106]:
# Prediction on the test data with the best C

imdb_linearsvc_clf_y_pred_test = imdb_linearsvc_clf.predict(X_test_imdb_freqbow)
imdb_linearsvc_clf_F1_measure_test = f1_score(y_test_imdb, imdb_linearsvc_clf_y_pred_test)
imdb_linearsvc_clf_F1_measure_test

0.8727200318598168

## c) Report training, validation, and test F1-measure for all the classifiers (with best hyper-parameter configuration).

In [107]:
# Report the performance on train and test data for Naive Bayes

naivesBayes_imdb = {'F1_measure_train': imdb_naiveBayes_clf_F1_measure_train, 
                   'F1_measure_test': imdb_naiveBayes_clf_F1_measure_test}
naivesBayes_imdb = pd.DataFrame(data = naivesBayes_imdb, index=["Naive Bayes Performance"])
naivesBayes_imdb.to_csv('Assigment3_2005119_Q5_c_nb.csv', index=True)

In [108]:
# Report the performance on validation test for max depth/max leaf for Decision Trees

hyperparam_dt_imdb = pd.DataFrame(data = imdb_dt_clf_F1_measure, index=["F1-measure"])
hyperparam_dt_imdb.to_csv('Assigment3_2005119_Q5_c_hyperparam.csv', index=True)

In [109]:
# Report the performance on train and test data for Decision Trees

decisionTrees_imdb = {'F1_measure_train': imdb_decisiontrees_clf_F1_measure_train, 
                   'F1_measure_test': imdb_decisiontrees_clf_F1_measure_test}
decisionTrees_imdb = pd.DataFrame(data = decisionTrees_imdb, index=["Decision Trees Performance"])
decisionTrees_imdb.to_csv('Assigment3_2005119_Q5_c_dt.csv', index=True)

In [110]:
# Report the performance on validation test C for Linear SVC

c_lsvc_imdb = pd.DataFrame(data = imdb_lsvc_clf_F1_measure, index=["F1-measure"])
c_lsvc_imdb.to_csv('Assigment3_2005119_Q5_c_Cparam.csv', index=True)

In [111]:
# Report the performance on train and test data for Linear SVC

linearSVC_imdb = {'F1_measure_train': imdb_linearsvc_clf_F1_measure_train, 
                   'F1_measure_test': imdb_linearsvc_clf_F1_measure_test}
linearSVC_imdb = pd.DataFrame(data = linearSVC_imdb, index=["Linear SVC Performance"])
linearSVC_imdb.to_csv('Assigment3_2005119_Q5_c_lsvc.csv', index=True)