In [3]:
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import numpy as np
from data_importing import preprocess_data, text_to_features

%load_ext autoreload
%autoreload 2

In [4]:
X, y, _, words_vocabulary = preprocess_data()
X = text_to_features(X, words_vocabulary)
            
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Logistic Regression Model

In [55]:
class LogisticRegression():
    def __init__(self, n_features, n_epochs=100):
        # Account for bias
        self.w = np.zeros(n_features + 1)
        self.lr = 0.01
        self.n_epochs = n_epochs
        
    def logistic_function(self, x):
        return 1 / (1 + np.e ** (-x))
        
    def prob(self, x):
        """Probability that x belongs to the first class"""
        return self.logistic_function(np.dot(self.w, x))
    
    def add_bias(self, X):
        return np.append(X, np.ones((len(X), 1)), axis=1)
    
    def fit(self, X, y):
        X_with_biases = self.add_bias(X)
        for k in range(self.n_epochs):
            gradient = np.zeros(len(self.w))
            total_log_loss = 0
            print("weights: {}".format(self.w))

            # Compute gradient
            for i, x in enumerate(X_with_biases):
                p = self.prob(x)
                for j in range(len(gradient)):
                    gradient[j] += (y[i] - p) * x[j]
                log_loss = -(p if y[i] else 1 - p) 
                total_log_loss += log_loss
            
            # Don't divide gradient by number of examples otherwise step becomes too small
            # Batch Gradient Descent
            self.w += self.lr * gradient
                
            avg_log_loss = total_log_loss / len(X)
                
            print("Epoch {}, avg log loss: {}".format(k, avg_log_loss))
            
    def predict(self, X):
        return [1 if self.prob(x) >= 0.5 else 0 for x in self.add_bias(X)]

## Test

In [59]:
model = LogisticRegression(len(X[0]), n_epochs=150)
model.fit(X_train, y_train)

weights: [0. 0. 0. ... 0. 0. 0.]
Epoch 0, avg log loss: -0.5
weights: [-0.005 -0.015 -0.02  ...  0.    -0.005 -0.07 ]
Epoch 1, avg log loss: -0.5211786459545394
weights: [-0.00551311 -0.02803333 -0.03729169 ...  0.         -0.00986253
  0.13802717]
Epoch 2, avg log loss: -0.5360756857390196
weights: [-0.01587318 -0.04603247 -0.06150241 ...  0.         -0.01542143
 -0.54863224]
Epoch 3, avg log loss: -0.5469680987297589
weights: [ 0.00432945 -0.04869893 -0.06467028 ...  0.         -0.0185988
  1.31622036]
Epoch 4, avg log loss: -0.507234920687012
weights: [-0.04028587 -0.08483825 -0.11148971 ...  0.         -0.02714125
 -2.24830681]
Epoch 5, avg log loss: -0.5152570214163604
weights: [-9.99397177e-04 -7.58375209e-02 -1.01891402e-01 ...  0.00000000e+00
 -2.75529762e-02  1.53042637e+00]
Epoch 6, avg log loss: -0.5080010331850897
weights: [-0.04689551 -0.11308418 -0.14948084 ...  0.         -0.03645135
 -2.17180332]
Epoch 7, avg log loss: -0.5216586514940353
weights: [-0.00774157 -0.104290

In [60]:
y_pred = model.predict(X_test)
n_correct = sum(1 for i, _ in enumerate(y_pred) if y_pred[i] == y_test[i])

print("{0:.2f}% of sentences are correctly classified".format(n_correct * 100 / len(X_test)))

82.00% of sentences are correctly classified


## Comparison with sklearn

In [30]:
from sklearn.linear_model import LogisticRegression

In [31]:
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
n_correct = sum(1 for i, _ in enumerate(y_pred) if y_pred[i] == y_test[i])

print("{0:.2f}% of sentences are correctly classified".format(n_correct * 100 / len(X_test)))

81.50% of sentences are correctly classified
