# Logistic regression (using GloVe)
The idea behind this approach is to average the word vectors of every tweet and use these averaged vectors to train a logistic regression model.

Before running this notebook, make sure you have saved an embedding matrix named "embeddings.npy". To do this, follow the instructions in Readme.md.

## I) Imports

In [1]:
import numpy as np
import pickle

from logreg import *
from helpers import *

%load_ext autoreload
%autoreload 2

## II) Prepare features

### Load our GloVe word embeddings from file ...

In [2]:
# Embedding matrix of our own GloVe run; later cells index it as
# embeddings[word_index, :].
embeddings = np.load('our_glove/embeddings200_wstp_reduced.npy')

# Vocabulary used to look up the embedding row index of a word.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('our_glove/vocab_wstp_reduced.pkl', 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

### Average word vectors over tweets

In [3]:
# Average vectors for training tweets

# Appending one row per tweet is impracticably slow, and the number of rows
# is not known in advance because tweets without any known word are skipped.
# We therefore over-allocate x_train / y_train and trim the unused tail at
# the end.
allocate_columns = 3000000  # upper bound on the number of training tweets (rows)
x_train = np.zeros((allocate_columns, embeddings.shape[1]))
y_train = np.zeros(allocate_columns)
counter = 0  # number of rows of x_train / y_train filled so far

# (file, label) pairs: tweets from pos_train.txt get label 1,
# tweets from neg_train.txt get label -1
for path, label in (('pos_train.txt', 1), ('neg_train.txt', -1)):
    with open(path) as f:
        for line in f:
            # embedding row indices of the words we have an embedding for
            lookups = (vocab.get(word, -1) for word in line.strip().split())
            indices = [index for index in lookups if index != -1]
            if not indices:
                # skip tweets without a single known word
                continue
            if counter >= allocate_columns:
                raise ValueError("more than " + str(allocate_columns)
                                 + " usable training tweets; increase allocate_columns")
            x_train[counter, :] = embeddings[indices, :].mean(axis=0)
            y_train[counter] = label
            counter += 1
            # Report progress only right after a tweet was stored; otherwise
            # the same message is repeated for every skipped tweet while
            # counter sits on a multiple of 100000 (including 0).
            if counter % 100000 == 0:
                print(str(counter), " tweets processed")

# Cut the unused rows. Rows are filled contiguously from 0, so slicing by
# counter is exact and does not depend on the values stored in the arrays.
x_train = x_train[:counter]
y_train = y_train[:counter]

# Shuffle tweets
x_train, y_train = shuffle(x_train, y_train)

100000  tweets processed


In [4]:
# Average words for testing data

allocate_columns = 100000  # upper bound on the number of test tweets (rows)
x_submission = np.zeros((allocate_columns, embeddings.shape[1]))
# fallback feature vector for tweets without any known word
embeddings_mean = np.expand_dims(np.mean(embeddings, axis=0), axis=0)
counter = 0  # number of rows of x_submission filled so far

with open('test_data.txt') as f:
    for line in f:
        # a blank line carries no "<id>,<tweet>" record; skip it instead of
        # failing on the missing comma below
        if not line.strip():
            continue
        if counter >= allocate_columns:
            raise ValueError("more than " + str(allocate_columns)
                             + " test tweets; increase allocate_columns")
        # filter out the ID and first comma
        tweet = line[(line.index(",") + 1):]
        # embedding row indices of the words we have an embedding for
        lookups = (vocab.get(word, -1) for word in tweet.strip().split())
        indices = [index for index in lookups if index != -1]
        if indices:
            x_submission[counter, :] = embeddings[indices, :].mean(axis=0)
        else:
            # in case that we have no embedding for any word of the tweet
            # just use the overall mean of the embeddings
            x_submission[counter, :] = embeddings_mean
        counter += 1
        if counter % 5000 == 0:
            print(str(counter), " tweets processed")

# Cut the unused rows. Every test tweet must keep its row (the submission ids
# are positional), so trim by the number of rows written rather than by
# testing a feature column for zero, which would also drop a real tweet whose
# feature happens to be exactly 0.
x_submission = x_submission[:counter]

5000  tweets processed
10000  tweets processed


## III) Train the model

In [5]:
# set aside a small portion for validation
testset = 10000  # number of samples held out for validation

# fail early with a clear message instead of training on an empty split
assert x_train.shape[0] > testset, "not enough samples to hold out a validation set"

x_test = x_train[0:testset, :]
y_test = y_train[0:testset]
# The training split starts exactly where the validation split ends
# (slicing from testset + 1 would silently discard the sample at index testset).
x_train_log = x_train[testset:, :]
y_train_log = y_train[testset:]

# standardize (currently disabled)
#x_train_log, mean, std = standardize(x_train_log)

# add offset
x_train_log = add_offset(x_train_log)

# train using logistic regression (SGD)
initial_w = np.random.rand(x_train_log.shape[1])
epochs = 40
batch_size = 100
gamma = 0.0001
lambda_ = 0.01
print_every = int(50000 / batch_size)

weights, loss = reg_logistic_regression(y_train_log, x_train_log, initial_w, epochs, batch_size, gamma, lambda_, print_every)

# free up memory
del x_train_log
del y_train_log

epoch	 1 	loss:  101.60808320306838
epoch	 1 	loss:  89.46890000634279
epoch	 1 	loss:  88.74502279751542
epoch	 2 	loss:  86.74261658818506
epoch	 2 	loss:  85.5287878404817
epoch	 2 	loss:  84.99114056905529
epoch	 3 	loss:  82.78421032737292
epoch	 3 	loss:  81.97269585076036
epoch	 3 	loss:  81.64709160306684
epoch	 4 	loss:  79.83286769897857
epoch	 4 	loss:  78.94698801479404
epoch	 4 	loss:  78.00850796274956
epoch	 5 	loss:  77.02217473100167
epoch	 5 	loss:  76.55057951240451
epoch	 5 	loss:  75.84759444049834
epoch	 6 	loss:  75.06225093420917
epoch	 6 	loss:  74.50552295716363
epoch	 6 	loss:  73.80550422548086
epoch	 7 	loss:  73.2934325432172
epoch	 7 	loss:  72.58927737773942
epoch	 7 	loss:  72.31241187128458
epoch	 8 	loss:  71.54655374564737
epoch	 8 	loss:  71.0374342913379
epoch	 8 	loss:  71.11263219301361
epoch	 9 	loss:  70.48907710937294
epoch	 9 	loss:  70.00935653397616
epoch	 9 	loss:  70.00763184026945
epoch	 10 	loss:  69.5531047834208
epoch	 10 	loss:  69.1

## IV) Test predictions

### Tests on a local validation set

In [6]:
# Evaluate the trained weights on the held-out validation split
# (x_test / y_test) and print the resulting accuracy.

# standardize (disabled; if re-enabled it needs the mean / std returned by
# standardize() in the training cell, which is commented out as well)
#x_test_log = standardize_test(x_test, mean, std)

# add offset -- the same add_offset() helper is applied to the training
# features before fitting, so the validation features must go through it too
x_test_log = add_offset(x_test)

# predict labels with the trained weights and compare them to the true labels
y_pred = predict_logistic_labels(weights, x_test_log)
accuracy = get_accuracy(y_test, y_pred)
print(accuracy)

0.6007


### Predict labels for the test dataset, prepare submission csv file

In [7]:
# Name of the CSV file handed to create_csv_submission below.
filename = 'logreg_preprocessed_own_reduced.csv'

# Standardisation is disabled here as well:
#x_submission_log = standardize_test(x_submission_log, mean, std)

# Run the submission features through add_offset and predict their labels
# with the trained weights.
x_submission_log = add_offset(x_submission)
y_submission = predict_logistic_labels(weights, x_submission_log)

# One id per prediction, counting from 1 in row order.
n_predictions = len(y_submission)
ids = np.arange(1, n_predictions + 1)

create_csv_submission(ids, y_submission, filename)