In [30]:
import os
import gc
import re
import string
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import fetch_20newsgroups

SEED = 551

### Util Functions

In [2]:
def text_preprocessing(text):
    # Remove special characters
    alphanumeric = lambda x: re.sub(r"""\w*\d\w*""", ' ', x)
    # Remove punctuations
    punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x.lower())
    # Remove white spaces
    strip = lambda x: str.rsplit(x)
    # Remove stopwords
    remove_stopwords = lambda x: " ".join([w for w in x if w not in stopwords.words('english')])
    return text.map(alphanumeric).map(punc_lower).map(strip).map(remove_stopwords)

In [3]:
def cross_validation_split(total_instances, k=5):
    fold_size = total_instances // k
    # Get shuffled indices
    inds = np.random.permutation(total_instances)

    for f in range(k):
        # Take the f'th fold to be validation data
        validation_inds = inds[f * fold_size : (f+1) * fold_size]

        # Take the remaining k - f folds to be testing data. Essentially
        # this means take everything except validation_inds
        train_inds = np.delete(inds, validation_inds)

        yield validation_inds, train_inds


# This is called for each hyperparameter combination we want to test
def kfoldCV(all_training_data, all_training_labels, model, k=5):
    total_instances = len(all_training_data)
    accuracies = []
    # For each fold, go through all
    for fold, (val_inds, train_inds) in enumerate(cross_validation_split(total_instances, k)):
        # Training
        X_train = all_training_data[train_inds]
        y_train = all_training_labels[train_inds]
        # Validation
        X_val = all_training_data[val_inds]
        y_val = all_training_labels[val_inds]

        model.fit(X_train, y_train)
        pred_validation_labels = model.predict(X_val)
        accuracy = evaluate_acc(pred_validation_labels, y_val)

        accuracies.append(accuracy)

    return np.sum(accuracies) / k


def evaluate_acc(Predicted_label, True_label):
    return np.sum(True_label == Predicted_label)/True_label.shape[0]

### Naïve Bayes Model

In [4]:
class MultinomialNB(object):
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        
    def __str__(self):
        return f'Multinomial NB model with alpha = {self.alpha}'

    def fit(self, X, y):
        count_sample = X.shape[0]
        separated = [[x for x, t in zip(X, y) if t == c] for c in np.unique(y)]
        self.class_log_prior_ = [np.log(len(i) / count_sample) for i in separated]
        count = np.array([np.array(i).sum(axis=0) for i in separated]) + self.alpha
        self.feature_log_prob_ = np.log(count / count.sum(axis=1)[np.newaxis].T)
        return self

    def predict_log_proba(self, X):       
        return [(self.feature_log_prob_ * x).sum(axis=1) + self.class_log_prior_
                for x in X]

    def predict(self, X):
        return np.argmax(self.predict_log_proba(X), axis=1)

### Logistic Regression Model

In [5]:
class LogisticRegression:
    def __init__(self, learning_rate=0.1, max_iter=100, C=0.1, regularization='l2',
                 fit_intercept=True, multi_class=False, 
                 threshold = 0.5):
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.fit_intercept = fit_intercept
        self.theta = None
        self.h = None
        self.threshold = threshold
        self.multi_class = multi_class
        self.regularization = regularization
        self.C = C
        
    def __str__(self):
        return f'LR model with learning_rate = {self.learning_rate}, max_iter = {self.max_iter}, C = {self.C}, regularization = {self.regularization}, fit_intercept = {self.fit_intercept}, threshold = {self.threshold}'
        
    def _sigmoid(self, z):
            return 1 / (1 + np.exp(-z))
    
    def _softmax(self, z):
        return np.exp(z) / np.sum(np.exp(z), axis=0)
            
    def _add_intercept(self, X):
        intercept = np.ones((X.shape[0], 1))
        X = np.hstack((intercept, X))
        return X

    def _gradient_descent(self, X, y):
        if self.multi_class:
            self.theta = np.zeros((X.shape[1], np.max(y)+1))
            y_ohe = np.zeros((y.size, np.max(y)+1))
            rows = np.arange(y.size)
            y_ohe[rows, y] = 1            
            bias = np.random.random(np.max(y)+1)
            for i in range(self.max_iter):
                z = X @ self.theta + bias
                h = self._softmax(z)
                if self.regularization == 'l2':
                    w_grad = (1/y.size)*(self.C*np.matmul(X.T, (h - y_ohe))+np.sum(self.theta))    
                elif self.regularization == 'l1':
                    w_grad = (1/y.size)*(self.C*np.matmul(X.T, (h - y_ohe))) 
                b_grad = (1/y.size)*np.sum(h - y_ohe)
                self.theta = self.theta - self.learning_rate * w_grad
                bias = bias - self.learning_rate * b_grad
        else:
            self.theta = np.random.random(X.shape[1])
            for i in range(self.epoch):
                z = np.matmul(X, self.theta)
                h = self._sigmoid(z)
                gradient = np.dot(X.T, (h-y)) / y.size
                self.theta = self.theta - (self.learning_rate * gradient)
                
    def fit(self, X, y):
        if self.fit_intercept:
            X = self._add_intercept(X)
        self._gradient_descent(X, y)
        
    def predict_proba(self, X):
        assert self.theta is not None
        
        if self.fit_intercept:
            X = self._add_intercept(X)
        
        if self.multi_class:
            return np.argmax(self._softmax(np.matmul(X, self.theta)), axis=1)
        else:
            return self._sigmoid(np.dot(X, self.theta))
        
    def predict(self, X):
        if self.multi_class:
            return self.predict_proba(X)
        else:
            return self.predict_proba(X) >= self.threshold

### 20 News Group Dataset

In [63]:
news_train = fetch_20newsgroups(subset='train',random_state=SEED, remove=('headers','footers','quotes'))
news_test = fetch_20newsgroups(subset='test', random_state=SEED, remove=('headers','footers','quotes'))

In [64]:
train_df = pd.DataFrame({'data': news_train.data, 'target': news_train.target})
train_df.head()

Unnamed: 0,data,target
0,"There's also Billy Jack, The Wild One, Smokey ...",8
1,\n\nI didn't see any smilies in this message s...,10
2,"\nPlease, define cell church. I missed it som...",15
3,\njust picked out this one point because it st...,15
4,\n\n\n\nCan somebody reconcile the apparent co...,0


In [65]:
test_df = pd.DataFrame({'data': news_test.data, 'target': news_test.target})
test_df.head()

Unnamed: 0,data,target
0,"Sorry about the garbage code, the following is...",5
1,\n\n\n\n\n,3
2,"From article <1993May12.111030@IASTATE.EDU>, b...",17
3,Hi \n\nI have heard about a linear mode for th...,3
4,I've had a Valentine for about 9 months now an...,7


In [66]:
# Experiment with 20%, 40%, 60%, 80% data
X_train, _, y_train, _ = train_test_split(train_df['data'], train_df['target'], test_size=0.8,random_state=123)

In [67]:
X_train = text_preprocessing(train_df.data)

In [68]:
y_train = train_df.target.to_numpy()

In [69]:
X_test = text_preprocessing(test_df.data)

In [70]:
y_test = test_df.target.to_numpy()

In [71]:
# Use TF-IDF Vectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', min_df=3)
X_train = vectorizer.fit_transform(X_train).toarray()
vocab = vectorizer.vocabulary_
vectorizer_1 = TfidfVectorizer(vocabulary=vocab)
X_test = vectorizer_1.fit_transform(X_test).toarray()

In [72]:
# For bigrams and trigrams
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer(stop_words='english', ngram_range=(3, 3)) # (2, 2) for bigrams
# X_train = vectorizer.fit_transform(X_train).toarray()
# vocab = vectorizer.vocabulary_
# vectorizer_1 = CountVectorizer(vocabulary=vocab)
# X_test = vectorizer_1.fit_transform(test_df.data).toarray()

#### Hyperparameter tuning and results for the Naïve Bayes model

In [73]:
# Grid Search with cross validation for the best parameters for the dataset
alphas = [0.5, 1.0, 1.5, 2.0]

best_accuracy = -1
best_model = None
for alpha in alphas:
    nb_model = MultinomialNB(alpha=alpha)
    validation_acc = kfoldCV(X_train, y_train, nb_model, k=5)
    print(f'Average validation accuracy for {nb_model} is {round(validation_acc, 2)}')
    if validation_acc > best_accuracy:
        best_accuracy = validation_acc
        best_model = nb_model

print(f'Best validation accuracy of {round(best_accuracy, 2)}')

Average validation accuracy for Multinomial NB model with alpha = 0.5 is 0.86
Average validation accuracy for Multinomial NB model with alpha = 1.0 is 0.84
Average validation accuracy for Multinomial NB model with alpha = 1.5 is 0.82
Average validation accuracy for Multinomial NB model with alpha = 2.0 is 0.81
Best validation accuracy of 0.86


In [74]:
test_accuracy = kfoldCV(X_test, y_test, best_model, k=5)

In [75]:
print(f'Accuracy on the test set {round(test_accuracy, 4)}')

Accuracy on the test set 0.8633


#### Hyperparameter tuning and results for the Logistic Regression model

In [76]:
params = {
    'regularization': ['l2'],
    'C': [1, 0.5, 0.2],
    'learning_rate': [0.001, 0.01, 0.05, 0.1],
    'max_iter': [1, 5, 10],
} 

In [77]:
keys, values = zip(*params.items())
experiments = [dict(zip(keys, v)) for v in itertools.product(*values)]

In [78]:
gc.collect()
best_accuracy = -1
best_model = None
accuracies = []
for e in experiments:
    mlr = LogisticRegression(learning_rate=e['learning_rate'], max_iter=e['max_iter'], C=e['C'], 
                                  regularization=e['regularization'], fit_intercept=True, 
                                  multi_class=True, threshold = 0.5)
    validation_acc = kfoldCV(X_train, y_train, mlr, k=5)
    print(f'Average validation accuracy for {mlr} is {round(validation_acc, 4)}')
    accuracies.append(validation_acc)
    if validation_acc > best_accuracy:
        best_accuracy = validation_acc
        best_model = mlr
    gc.collect()

print(f'Best validation accuracy of {round(best_accuracy, 2)} found for {mlr}')

Average validation accuracy for LR model with learning_rate = 0.001, max_iter = 1, C = 1, regularization = l2, fit_intercept = True, threshold = 0.5 is 0.7729
Average validation accuracy for LR model with learning_rate = 0.001, max_iter = 5, C = 1, regularization = l2, fit_intercept = True, threshold = 0.5 is 0.7729
Average validation accuracy for LR model with learning_rate = 0.001, max_iter = 10, C = 1, regularization = l2, fit_intercept = True, threshold = 0.5 is 0.7732
Average validation accuracy for LR model with learning_rate = 0.01, max_iter = 1, C = 1, regularization = l2, fit_intercept = True, threshold = 0.5 is 0.7725
Average validation accuracy for LR model with learning_rate = 0.01, max_iter = 5, C = 1, regularization = l2, fit_intercept = True, threshold = 0.5 is 0.771
Average validation accuracy for LR model with learning_rate = 0.01, max_iter = 10, C = 1, regularization = l2, fit_intercept = True, threshold = 0.5 is 0.7728
Average validation accuracy for LR model with le

In [83]:
test_accuracy = kfoldCV(X_test, y_test, best_model, k=5)

In [84]:
print(f'Accuracy on the test set {round(test_accuracy, 4)}')

Accuracy on the test set 0.7811


### Sentiment140 Dataset

In [41]:
header_list = ['sentiment', 'ids', 'date', 'flag', 'user', 'text']
data = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin-1', names=header_list)
X = data.text
y = data.sentiment
# Reducing the data to 10% of original data
X, _, y, _ = train_test_split(X, y, test_size=0.90,random_state=123)

In [43]:
# Experiment with 20%, 40%, 60%, 80% data
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.8,random_state=123)

In [44]:
X_train = text_preprocessing(x_train)

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', min_df=3)
X_train = vectorizer.fit_transform(X_train).toarray()
vocab = vectorizer.vocabulary_

In [None]:
# # For bigrams and trigrams
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer(stop_words='english', ngram_range=(3, 3))
# X_train = vectorizer.fit_transform(X_train).toarray()
# vocab = vectorizer.vocabulary_

In [47]:
y_train = y_train.to_numpy()

In [48]:
header_list = ['sentiment', 'ids', 'date', 'flag', 'user', 'text']
test_data = pd.read_csv('new_testdata.manual.2009.06.14.csv', encoding='latin-1', names=header_list)

In [49]:
X_test = test_data.text
y_test = test_data.sentiment

In [50]:
X_test = text_preprocessing(X_test)
y_test = y_test.to_numpy()

In [51]:
vectorizer_1 = TfidfVectorizer(vocabulary=vocab)
X_test = vectorizer_1.fit_transform(X_test).toarray()

#### Hyperparameter tuning and results for the Naïve Bayes model

In [52]:
alphas = [0.5, 1.0, 1.5, 2.0]

best_accuracy = -1
best_model = None
for alpha in alphas:
    nb_model = MultinomialNB(alpha=alpha)
    validation_acc = kfoldCV(X_train, y_train, nb_model, k=5)
    print(f'Average validation accuracy for {nb_model} is {round(validation_acc, 2)}')
    if validation_acc > best_accuracy:
        best_accuracy = validation_acc
        best_model = nb_model

print(f'Best validation accuracy of {round(best_accuracy, 2)} found for {best_model}')

Average validation accuracy for Multinomial NB model with alpha = 0.5 is 0.4
Average validation accuracy for Multinomial NB model with alpha = 1.0 is 0.4
Average validation accuracy for Multinomial NB model with alpha = 1.5 is 0.4
Average validation accuracy for Multinomial NB model with alpha = 2.0 is 0.4
Best validation accuracy of 0.4 found for Multinomial NB model with alpha = 1.0


In [53]:
test_accuracy = kfoldCV(X_test, y_test, best_model, k=5)

In [54]:
print(f'Accuracy on the test set {round(test_accuracy, 4)}')

Accuracy on the test set 0.4479


#### Hyperparameter tuning and results for the Logistic Regression model

In [55]:
params = {
    'regularization': ['l2'],
    'C': [1, 0.5, 0.2],
    'learning_rate': [0.001, 0.01, 0.05, 0.1],
    'max_iter': [1, 5, 10],
} 

In [57]:
keys, values = zip(*params.items())
experiments = [dict(zip(keys, v)) for v in itertools.product(*values)]

In [60]:
gc.collect()
best_accuracy = -1
best_model = None
accuracies = []
for e in experiments:
    mlr = LogisticRegression(learning_rate=e['learning_rate'], max_iter=e['max_iter'], C=e['C'], 
                                  regularization=e['regularization'], fit_intercept=True, 
                                  multi_class=True, threshold = 0.5)
    validation_acc = kfoldCV(X_train, y_train, mlr, k=5)
    print(f'Average validation accuracy for {mlr} is {round(validation_acc, 4)}')
    accuracies.append(validation_acc)
    if validation_acc > best_accuracy:
        best_accuracy = validation_acc
        best_model = mlr
    gc.collect()

print(f'Best validation accuracy of {round(best_accuracy, 2)} found for {mlr}')

Average validation accuracy for LR model with learning_rate = 0.001, max_iter = 1, C = 1, regularization = l2, fit_intercept = True, threshold = 0.5 is 0.4008
Average validation accuracy for LR model with learning_rate = 0.001, max_iter = 5, C = 1, regularization = l2, fit_intercept = True, threshold = 0.5 is 0.4005
Average validation accuracy for LR model with learning_rate = 0.001, max_iter = 10, C = 1, regularization = l2, fit_intercept = True, threshold = 0.5 is 0.3992
Average validation accuracy for LR model with learning_rate = 0.01, max_iter = 1, C = 1, regularization = l2, fit_intercept = True, threshold = 0.5 is 0.4
Average validation accuracy for LR model with learning_rate = 0.01, max_iter = 5, C = 1, regularization = l2, fit_intercept = True, threshold = 0.5 is 0.4005
Average validation accuracy for LR model with learning_rate = 0.01, max_iter = 10, C = 1, regularization = l2, fit_intercept = True, threshold = 0.5 is 0.4006
Average validation accuracy for LR model with lear

In [61]:
test_accuracy = kfoldCV(X_test, y_test, best_model, k=5)

In [62]:
print(f'Accuracy on the test set {round(test_accuracy, 4)}')

Accuracy on the test set 0.8676
