In [33]:
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import numpy as np
from data_importing import preprocess_data, text_to_features
import scipy

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Load the corpus through the project-local data_importing helpers.
# NOTE(review): presumably the first return value holds the raw texts and `y` the
# labels; the third return value is deliberately discarded — confirm against
# data_importing.preprocess_data.
raw_X, y, _, words_vocabulary = preprocess_data()

# Convert every example into a feature vector using the vocabulary.
X = text_to_features(raw_X, words_vocabulary)

# Hold out 20% of the examples for evaluation; the fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Logistic Regression Model

In [84]:
class LogisticRegression():
    """Binary logistic regression trained with full-batch gradient ascent.

    The weight vector ``w`` has ``n_features + 1`` entries: one per feature plus
    a trailing bias weight that pairs with the column of ones appended by
    ``add_bias``.

    Attributes:
        w: Weight vector (bias last), initialised to zeros.
        lr: Step size applied to the (un-normalised) gradient.
        n_epochs: Number of full passes over the training data in ``fit``.
        log_likelihood_history: Average log-likelihood per example, recorded at
            every epoch of the most recent ``fit`` call *before* that epoch's
            weight update.
    """

    def __init__(self, n_features, n_epochs=100, lr=0.005):
        """Create an untrained model.

        Args:
            n_features: Number of input features (the bias is added internally).
            n_epochs: Number of gradient steps performed by ``fit``.
            lr: Learning rate; defaults to the previously hard-coded 0.005.
        """
        # Account for bias
        self.w = np.zeros(n_features + 1)
        self.lr = lr
        self.n_epochs = n_epochs
        self.log_likelihood_history = []

    def prob(self, x):
        """Probability that x belongs to the positive class (label 1).

        Args:
            x: A feature vector that already includes the trailing bias term,
                or a 2-D array with one such vector per row.

        Returns:
            A scalar probability for a single vector, or a 1-D array of
            probabilities for a 2-D input.
        """
        # np.dot(x, w) equals np.dot(w, x) for a single vector and additionally
        # supports a batch of rows.
        return scipy.special.expit(np.dot(x, self.w))

    def add_bias(self, X):
        """Return X as a 2-D array with a column of ones appended (bias input)."""
        X = np.asarray(X)
        return np.hstack((X, np.ones((X.shape[0], 1))))

    def fit(self, X, y):
        """Fit the weights by maximising the log-likelihood of the labels.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: Array-like of n_samples binary labels (0 or 1).

        Raises:
            ValueError: If X and y do not contain the same number of examples.
        """
        X_with_biases = self.add_bias(X)
        # Flatten so a column vector of labels cannot silently broadcast
        # against the 1-D probability vector.
        y = np.asarray(y, dtype=float).ravel()

        n_samples = X_with_biases.shape[0]
        if y.shape[0] != n_samples:
            raise ValueError(
                "X and y must have the same number of examples, got {} and {}".format(
                    n_samples, y.shape[0]
                )
            )

        self.log_likelihood_history = []

        for _ in range(self.n_epochs):
            logits = X_with_biases @ self.w
            ps = scipy.special.expit(logits)

            # Bernoulli log-likelihood: sum_i [y_i * z_i - log(1 + exp(z_i))].
            # logaddexp keeps this finite even for very large |z_i|.
            log_likelihood = np.sum(y * logits - np.logaddexp(0.0, logits))
            self.log_likelihood_history.append(float(log_likelihood / n_samples))

            # Gradient of the log-likelihood with respect to w, for the whole batch.
            gradient = X_with_biases.T @ (y - ps)

            # Don't divide gradient by number of examples otherwise step becomes too small
            # Batch gradient ascent on the log-likelihood
            self.w += self.lr * gradient

    def predict(self, X):
        """Predict a 0/1 label for every row of X.

        Args:
            X: 2-D array-like of shape (n_samples, n_features), without bias.

        Returns:
            A list of ints; 1 where the predicted probability is >= 0.5, else 0.
        """
        probabilities = self.prob(self.add_bias(X))
        return np.where(probabilities >= 0.5, 1, 0).tolist()

## Test

In [85]:
# NOTE: LogisticRegression must be the class defined in this notebook. The
# sklearn import in the comparison section rebinds the same name, so re-run the
# class definition cell first if that import has already been executed.
n_features = len(X[0])  # one weight per feature column; the bias is added by the model
model = LogisticRegression(n_features, n_epochs=500)
model.fit(X_train, y_train)

In [86]:
# Accuracy of the hand-written model on the held-out split.
y_pred = model.predict(X_test)
n_correct = sum(1 for predicted, expected in zip(y_pred, y_test) if predicted == expected)

accuracy_pct = n_correct * 100 / len(X_test)
print("{0:.2f}% of sentences are correctly classified".format(accuracy_pct))

82.50% of sentences are correctly classified


## Comparison with sklearn

In [7]:
from sklearn.linear_model import LogisticRegression

In [8]:
# NOTE: LogisticRegression here is scikit-learn's estimator, imported in the
# cell above; that import shadows the hand-written class of the same name, and
# `model` / `y_pred` / `n_correct` below overwrite the earlier results.
model = LogisticRegression()
model.fit(X_train, y_train)

# Same accuracy computation as for the hand-written model, for a direct comparison.
y_pred = model.predict(X_test)
n_correct = sum(1 for predicted, expected in zip(y_pred, y_test) if predicted == expected)

accuracy_pct = n_correct * 100 / len(X_test)
print("{0:.2f}% of sentences are correctly classified".format(accuracy_pct))

81.50% of sentences are correctly classified
