In [1]:
import math
import numpy as np

In [2]:
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import numpy as np

tknzr = TweetTokenizer()
sws = stopwords.words('english')

In [3]:
X = []
y = []
word_to_index = {}
words_vocabulary = []


def preprocess_data():
    idx = 0
    with open('data/amazon_cells_labelled.txt', 'r') as f:
        for l in f:
            sentence, sentiment = l.split('\t')
            sentiment = int(sentiment)
            y.append(sentiment)

            words = tknzr.tokenize(sentence)
            # Remove stopwords does not improve accuracy but makes the model faster as we have less words to process
            words = [word for word in words if word not in sws]
            X.append(words)
            
            for word in words:
                if word not in word_to_index:
                    word_to_index[word] = idx
                    words_vocabulary.append(word)
                    idx += 1
    
    return X, y

def text_to_features(X):
    X_feat = []
    for x in X:
        x_feat = [1 if word in x else 0 for word in words_vocabulary]
        X_feat.append(x_feat)
    
    return X_feat
        
X, y = preprocess_data()
X = text_to_features(X)
            
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Logistic Regression Model

In [34]:
class LogisticRegression():
    def __init__(self, n_features):
        self.w = np.zeros(n_features)
        self.lr = 0.01
        self.n_epochs = 50
        
    def prob(self, x):
        """Probability that x belongs to the first class"""
        return math.e ** (np.dot(self.w, x)) / (1 + math.e ** (np.dot(self.w, x)))
    
    def fit(self, X, y):
        for k in range(self.n_epochs):
            gradient = np.zeros(len(self.w))
            total_log_loss = 0
            print("weights: {}".format(self.w))

            for i, x in enumerate(X):
                p = self.prob(x)
                for j in range(len(gradient)):
                    gradient[j] -= (y[i] - p) * x[j]
                log_loss = -(p if y[i] else 1 - p) 
                total_log_loss += log_loss

            for j in range(len(self.w)):
                self.w[j] = self.w[j] - self.lr * gradient[j]
                
            avg_log_loss = total_log_loss / len(X)
                
            print("Epoch {}, avg log loss: {}".format(k, avg_log_loss))
            
    def predict(self, X):
        return [1 if self.prob(x) >= 0.5 else 0 for x in X]

In [35]:
model = LogisticRegression(len(X[0]))

In [36]:
n_correct = 0

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
n_correct = sum(1 for i, _ in enumerate(y_pred) if y_pred[i] == y_test[i])

print("{0:.2f}% of sentences are correctly classified".format(n_correct * 100 / len(X_test)))

weights: [0. 0. 0. ... 0. 0. 0.]
Epoch 0, avg log loss: -0.5
weights: [-0.005 -0.015 -0.02  ...  0.     0.    -0.005]
Epoch 1, avg log loss: -0.5209261504091711
weights: [-0.00707297 -0.02890279 -0.03833086 ...  0.          0.
 -0.0100375 ]
Epoch 2, avg log loss: -0.5384455269170965
weights: [-0.0107872  -0.04359115 -0.05798188 ...  0.          0.
 -0.01511979]
Epoch 3, avg log loss: -0.5548847789840932
weights: [-0.01082037 -0.05667262 -0.07499605 ...  0.          0.
 -0.01997521]
Epoch 4, avg log loss: -0.5681343064887328
weights: [-0.01403005 -0.07109049 -0.09409497 ...  0.          0.
 -0.02479077]
Epoch 5, avg log loss: -0.5818273285073506
weights: [-0.01243363 -0.08327349 -0.1096133  ...  0.          0.
 -0.02926547]
Epoch 6, avg log loss: -0.5920242949027255
weights: [-0.01553357 -0.09742364 -0.12812278 ...  0.          0.
 -0.03371675]
Epoch 7, avg log loss: -0.6040058981611636
weights: [-0.0126728  -0.10872383 -0.14223119 ...  0.          0.
 -0.03777629]
Epoch 8, avg log loss