# Logistic regression (using GloVe)
The idea behind this approach is to average the word vectors over every tweet, and use this average vectors to train logistic regression.

Before running this notebook, make sure to have saved a embedding matrix named "embeddings.npy" beforehand. To do this, follow the instructions in Readme.md

## I) Imports

In [189]:
import numpy as np
import pickle

from logreg import *
from helpers import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## II) Prepare features

### Load our GloVe word embeddings from file ...

In [4]:
# load word embeddings
embeddings = np.load('embeddings.npy')
# load vocabulary
with open('vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

### ... ord load pretrained embeddings

### Average word vectors over tweets

In [100]:
# Average vectors for training tweets

x_train = np.zeros((0, embeddings.shape[1]))
y_train = np.asarray([])

with open('pos_train.txt') as f:
    for line in f:
        total = np.zeros((1, embeddings.shape[1]))
        wordcount = 0
        for word in line.strip().split():
            index = vocab.get(word, -1);
            # skip words for which we have no embedding
            if(index != -1):
                total += embeddings[index, :]
                wordcount += 1
        if(wordcount > 0):
            mean = total / wordcount
            x_train = np.append(x_train, mean, axis=0)
            y_train = np.append(y_train, 1)
        
with open('neg_train.txt') as f:
    for line in f:
        total = np.zeros((1, embeddings.shape[1]))
        wordcount = 0
        for word in line.strip().split():
            index = vocab.get(word, -1);
            # skip words for which we have no embedding
            if(index != -1):
                total += embeddings[index, :]
                wordcount += 1
        if(wordcount > 0):
            mean = total / wordcount
            x_train = np.append(x_train, mean, axis=0)
            y_train = np.append(y_train, -1)

In [103]:
# [Optional] save variables

np.save('x_train', x_train)
np.save('y_train', y_train)

In [178]:
# [Optional] load variables to save time

x_train = np.load('x_train.npy')
y_train = np.load('y_train.npy')

In [111]:
# Average words 

x_submission = np.zeros((0, embeddings.shape[1]))
embeddings_mean = np.expand_dims(np.mean(embeddings, axis=0), axis=0)

with open('test_data.txt') as f:
    for line in f:
        total = np.zeros((1, embeddings.shape[1]))
        wordcount = 0
        # TODO: filter out the IDs
        for word in line.strip().split():
            index = vocab.get(word, -1);
            # skip words for which we have no embedding
            if(index != -1):
                total += embeddings[index, :]
                wordcount += 1
        if(wordcount > 0):
            mean = total / wordcount
            x_submission = np.append(x_submission, mean, axis=0)
        else:
            # in case that we have no embedding for any word of the tweet
            # just use the overall mean of the embeddings
            x_submission = np.append(x_submission, embeddings_mean, axis=0)

In [112]:
# [Optional] save variables

np.save('x_submission', x_submission)

In [186]:
# [Optional] load variables to save time

x_submission = np.load('x_submission.npy')

## III) Train the model

In [179]:
# set aside a small portion for validation
np.random.seed(4133)
np.random.shuffle(x_train)
np.random.shuffle(y_train)

x_test = x_train[0:10000, :]
y_test = y_train[0:10000]
x_train_log = x_train[10001:, :]
y_train_log = y_train[10001:]

# train using logistic regression (SGD)
initial_w = np.random.rand(embeddings.shape[1])
epochs = 50
batch_size = 10
gamma = 0.0001
lambda_ = 0.05
print_every = int(50000 / batch_size)

weights, loss = reg_logistic_regression(y_train_log, x_train_log, initial_w, epochs, batch_size, gamma, lambda_, print_every)

epoch	 1 	loss:  7.397889551882617
epoch	 1 	loss:  7.310262854673712
epoch	 1 	loss:  7.256834514203162
epoch	 2 	loss:  7.194713854586844
epoch	 2 	loss:  7.153948949222492
epoch	 2 	loss:  7.136153655741204
epoch	 3 	loss:  7.089086464402596
epoch	 3 	loss:  7.070922796714321
epoch	 3 	loss:  7.052130447356217
epoch	 4 	loss:  7.026688884754656
epoch	 4 	loss:  7.013396232726991
epoch	 4 	loss:  7.003113173044302
epoch	 5 	loss:  6.991018839991395
epoch	 5 	loss:  6.981029060951287
epoch	 5 	loss:  6.9754783894156445
epoch	 6 	loss:  6.965948806117815
epoch	 6 	loss:  6.960269312570624
epoch	 6 	loss:  6.959923317978275
epoch	 7 	loss:  6.951230434433854
epoch	 7 	loss:  6.949458512216098
epoch	 7 	loss:  6.948083237312386
epoch	 8 	loss:  6.942848726614485
epoch	 8 	loss:  6.942260721381591
epoch	 8 	loss:  6.94178359712171
epoch	 9 	loss:  6.939054554094291
epoch	 9 	loss:  6.938797938075486
epoch	 9 	loss:  6.936865105319706
epoch	 10 	loss:  6.935862402197973
epoch	 10 	loss:  6

## IV) Test predictions

### Tests on a local validation set

In [180]:
y_pred = predict_logistic_labels(weights, x_test)
accuracy = get_accuracy(y_test, y_pred)
print(accuracy)

0.5049


### Predict labels for the test dataset, prepare submission csv file

In [188]:
filename = 'logreg1.csv'

y_submission = predict_logistic_labels(weights, x_submission)
ids = np.arange(len(y_submission)) + 1

create_csv_submission(ids, y_submission, filename)