# Logistic regression (using GloVe)
The idea behind this approach is to average the word vectors of every tweet and to use these averaged vectors as features for training a logistic regression model.

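Before running this notebook, make sure you have saved an embedding matrix named "embeddings.npy" beforehand. To do this, follow the instructions in Readme.md. (Note: the loading cell below reads "embeddings200.npy", so adjust the file name there or here so that both agree.)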

## I) Imports

In [1]:
import numpy as np
import pickle

from logreg import *
from helpers import *

%load_ext autoreload
%autoreload 2

## II) Prepare features

### Load our GloVe word embeddings from file ...

In [2]:
# GloVe embedding matrix: one row per vocabulary entry
embeddings = np.load(file='embeddings200.npy')

# vocabulary mapping used to look up a word's row in `embeddings`
# NOTE: pickle.load executes arbitrary code from the file -- only load a
# vocab.pkl that you created yourself.
with open('vocab.pkl', 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

### ... or load pretrained embeddings

### Average word vectors over tweets

In [3]:
# Average vectors for training tweets

# Appending one row per tweet is impractically slow. However, we cannot know
# in advance how many tweets will be stored (tweets for which we have no
# embedding at all are skipped). Therefore we allocate an array that is
# deliberately too big for x_train and cut off the unused rows at the end.
allocate_columns = 3000000  # number of pre-allocated rows (one per tweet)
x_train = np.zeros((allocate_columns, embeddings.shape[1]))
y_train = np.zeros(allocate_columns)


def _append_averaged_tweets(path, label, features, labels, start, print_every=100000):
    """Write the mean word vector of every tweet in `path` into `features`.

    Reads the notebook-level `vocab` (via `vocab.get(word, -1)`) and
    `embeddings`. Tweets in which no word has an embedding are skipped.

    Args:
        path: text file containing one tweet per line.
        label: value stored in `labels` for every tweet that is kept.
        features: pre-allocated 2-D array, one mean vector per row.
        labels: pre-allocated 1-D array, one label per row.
        start: index of the first free row in `features` / `labels`.
        print_every: print a progress line whenever the number of filled
            rows reaches a multiple of this value.

    Returns:
        Index of the next free row, i.e. the number of rows filled so far.

    Raises:
        IndexError: if there are more usable tweets than pre-allocated rows.
    """
    row = start
    with open(path) as tweets:
        for line in tweets:
            # skip words for which we have no embedding
            lookups = (vocab.get(word, -1) for word in line.split())
            indices = [index for index in lookups if index != -1]
            if not indices:
                continue
            if row >= features.shape[0]:
                raise IndexError(
                    "more than %d usable tweets: increase allocate_columns"
                    % features.shape[0]
                )
            # float64 accumulation matches summing into a float64 buffer
            features[row, :] = embeddings[indices, :].mean(axis=0, dtype=np.float64)
            labels[row] = label
            row += 1
            # Report only right after a row was stored; otherwise a run of
            # skipped tweets would repeat the same progress line.
            if row % print_every == 0:
                print(str(row), " tweets processed")
    return row


counter = _append_averaged_tweets('pos_train.txt', 1, x_train, y_train, 0)
counter = _append_averaged_tweets('neg_train.txt', -1, x_train, y_train, counter)

# Cut the unused rows. Rows are filled contiguously from 0, so the number of
# filled rows is the only information needed (no dependence on label values).
x_train = x_train[:counter]
y_train = y_train[:counter]

# Shuffle tweets
x_train, y_train = shuffle(x_train, y_train)

100000  tweets processed
200000  tweets processed
300000  tweets processed
400000  tweets processed
500000  tweets processed
600000  tweets processed
700000  tweets processed


KeyboardInterrupt: 

In [None]:
# [Optional] persist the averaged training features and labels
# (np.save appends the ".npy" extension to these names)

np.save(file='x_train_own_200', arr=x_train)
np.save(file='y_train', arr=y_train)

In [None]:
# [Optional] restore previously saved training features and labels
# instead of recomputing the averages

x_train, y_train = (
    np.load(file='x_train_own_200.npy'),
    np.load(file='y_train.npy'),
)

In [None]:
# Average words for testing data

# Pre-allocate more rows than needed; the unused rows are cut off at the end.
allocate_columns = 300000  # number of pre-allocated rows (one per tweet)
x_submission = np.zeros((allocate_columns, embeddings.shape[1]))
# fallback feature vector for tweets without any known word
embeddings_mean = np.expand_dims(np.mean(embeddings, axis=0), axis=0)
counter = 0

with open('test_data.txt') as f:
    for line in f:
        if counter >= allocate_columns:
            raise IndexError(
                "more than %d test tweets: increase allocate_columns"
                % allocate_columns
            )
        # TODO: filter out the IDs
        # skip words for which we have no embedding
        lookups = (vocab.get(word, -1) for word in line.split())
        indices = [index for index in lookups if index != -1]
        if indices:
            # float64 accumulation matches summing into a float64 buffer
            x_submission[counter, :] = embeddings[indices, :].mean(axis=0, dtype=np.float64)
        else:
            # in case that we have no embedding for any word of the tweet
            # just use the overall mean of the embeddings
            x_submission[counter, :] = embeddings_mean
        # every test tweet gets a row (none are skipped)
        counter += 1
        if counter % 5000 == 0:
            print(str(counter), " tweets processed")

# Keep only the rows that were actually filled. Without this, the untouched
# all-zero rows would also receive predictions and ids in the submission.
# The copy lets the oversized buffer be released.
x_submission = x_submission[:counter].copy()

In [None]:
# [Optional] persist the averaged test features
# (np.save appends the ".npy" extension to this name)

np.save(file='x_submission_own_200', arr=x_submission)

In [None]:
# [Optional] restore previously saved test features
# instead of recomputing the averages

x_submission = np.load(file='x_submission_own_200.npy')

## III) Train the model

In [None]:
# set aside a small portion for validation
# (x_train / y_train were shuffled when they were built, so the first rows
# are a random sample)
n_validation = 100000
x_test = x_train[:n_validation, :]
y_test = y_train[:n_validation]
# The training part starts exactly where the validation part ends; starting
# at n_validation + 1 would silently drop one sample.
x_train_log = x_train[n_validation:, :]
y_train_log = y_train[n_validation:]

# standardize
#x_train_log, mean, std = standardize(x_train_log)

# add offset
x_train_log = add_offset(x_train_log)

# train using logistic regression (SGD)
# NOTE: no random seed is set, so the initial weights differ between runs.
initial_w = np.random.rand(x_train_log.shape[1])
epochs = 3
batch_size = 10
gamma = 0.00005
lambda_ = 0.1
print_every = 500000 // batch_size

weights, loss = reg_logistic_regression(y_train_log, x_train_log, initial_w, epochs, batch_size, gamma, lambda_, print_every)

# free up memory
del x_train_log
del y_train_log

## IV) Test predictions

### Tests on a local validation set

In [None]:
# Evaluate the trained `weights` on the validation rows (x_test / y_test)
# that were set aside before training.

# standardize (disabled, like the standardization step in the training cell;
# re-enable both together, since `mean` and `std` come from that step)
#x_test_log = standardize_test(x_test, mean, std)

# add offset (same helper that is applied to the training features)
x_test_log = add_offset(x_test)

# predict labels for the validation rows and compare them with y_test
y_pred = predict_logistic_labels(weights, x_test_log)
accuracy = get_accuracy(y_test, y_pred)
print(accuracy)

### Predict labels for the test dataset, prepare submission csv file

In [None]:
# name of the csv file the predictions are written to
filename = 'logreg_own_200.csv'

# add offset (same helper that is applied to the training features)
x_submission_log = add_offset(x_submission)

# standardize (disabled, like the standardization step in the training cell)
#x_submission_log = standardize_test(x_submission_log, mean, std)

# predicted label for every row of the test features
y_submission = predict_logistic_labels(weights, x_submission_log)

# ids are 1-based and follow the row order of x_submission
ids = np.arange(1, len(y_submission) + 1)

create_csv_submission(ids, y_submission, filename)