# Logistic regression (using GloVe)
The idea behind this approach is to average the word vectors of every tweet and to use these averaged vectors as features to train a logistic regression classifier.

Before running this notebook, make sure you have saved an embedding matrix named "embeddings.npy". To do this, follow the instructions in Readme.md.

## I) Imports

In [1]:
import numpy as np
import pickle

from logreg import *
from helpers import *

%load_ext autoreload
%autoreload 2

## II) Prepare features

### Load our GloVe word embeddings from file ...

In [2]:
# Embedding matrix: the averaging cells index it by row (one row per word)
# and use embeddings.shape[1] as the feature dimension.
embeddings = np.load('our_glove/embeddings200_reduced.npy')

# Vocabulary: looked up with vocab.get(word, -1) by the averaging cells, i.e.
# it maps a token to its row index in `embeddings`.
# NOTE(review): pickle.load executes arbitrary code from the file -- only load
# vocabulary files that were produced by this project.
with open('our_glove/vocab_reduced.pkl', 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

### Average word vectors over tweets

In [3]:
# Average vectors for training tweets
#
# Appending one row per tweet is impractically slow. We also cannot know in
# advance how many rows we will need, because tweets for which we have no
# embedding at all are skipped. We therefore allocate buffers that are too
# big and cut off the unused tail at the end.


def _average_tweets_into(path, label, features, labels, start, report_every=100000):
    """Write one averaged word vector per usable tweet of `path` into the buffers.

    Every line of `path` is treated as one whitespace-tokenised tweet. Words
    that are not in the notebook-level `vocab` are ignored; a tweet without
    any known word is skipped, so fewer rows than lines may be written.
    Reads the notebook-level `embeddings` and `vocab`.

    Parameters
    ----------
    path : str
        Text file with one preprocessed tweet per line.
    label : int
        Value written to `labels` for every tweet of this file.
    features : np.ndarray
        Preallocated (rows, embedding_dim) buffer, filled in place.
    labels : np.ndarray
        Preallocated (rows,) buffer, filled in place.
    start : int
        First free row of both buffers.
    report_every : int, optional
        Print a progress line each time this many rows have been filled.

    Returns
    -------
    int
        Index of the next free row (= total number of rows filled so far).

    Raises
    ------
    ValueError
        If the buffers are too small to hold all usable tweets.
    """
    row = start
    with open(path) as f:
        for line in f:
            # -1 is the "no embedding for this word" sentinel
            indices = [i for i in (vocab.get(word, -1) for word in line.split()) if i != -1]
            if not indices:
                # no known word at all: skip this tweet
                continue
            if row >= features.shape[0]:
                raise ValueError(
                    "preallocated buffer of %d rows is too small; increase allocate_columns"
                    % features.shape[0])
            # accumulate in float64, like the zero-initialised running total did
            features[row, :] = embeddings[indices, :].mean(axis=0, dtype=np.float64)
            labels[row] = label
            row += 1
            # report only when a row was actually added, so a run of skipped
            # tweets cannot print the same count over and over
            if row % report_every == 0:
                print(str(row), " tweets processed")
    return row


# NOTE: despite its name this is the number of preallocated *rows* (tweets)
allocate_columns = 3000000
x_train = np.zeros((allocate_columns, embeddings.shape[1]))
y_train = np.zeros(allocate_columns)

# positive tweets get label 1, negative tweets get label -1
counter = _average_tweets_into('train_pos_preprocessed.txt', 1, x_train, y_train, 0)
counter = _average_tweets_into('train_neg_preprocessed.txt', -1, x_train, y_train, counter)

# Rows are filled contiguously from the top, so exactly the first `counter`
# rows are in use; cut off the unused tail of both buffers.
x_train = x_train[:counter]
y_train = y_train[:counter]

# Shuffle tweets
x_train, y_train = shuffle(x_train, y_train)

100000  tweets processed


In [4]:
# [Optional] cache the averaged training features and labels on disk.
# np.save appends the ".npy" extension to these file names.
np.save(file='x_train', arr=x_train)
np.save(file='y_train', arr=y_train)

In [5]:
# [Optional] restore the cached training features and labels instead of
# recomputing the tweet averages.
x_train = np.load(file='x_train.npy')
y_train = np.load(file='y_train.npy')

In [6]:
# Average words for testing data
#
# Unlike the training set, no tweet may be skipped here: every line of the
# file needs exactly one row so that row i belongs to submission id i + 1.

# NOTE: despite its name this is the number of preallocated *rows* (tweets)
allocate_columns = 300000
x_submission = np.zeros((allocate_columns, embeddings.shape[1]))
# fallback feature vector for tweets without any known word
embeddings_mean = np.expand_dims(np.mean(embeddings, axis=0), axis=0)
counter = 0

with open('test_data_preprocessed.txt') as f:
    for line in f:
        if counter >= allocate_columns:
            raise ValueError(
                "preallocated buffer of %d rows is too small; increase allocate_columns"
                % allocate_columns)
        # TODO: filter out the IDs
        # -1 is the "no embedding for this word" sentinel
        indices = [i for i in (vocab.get(word, -1) for word in line.split()) if i != -1]
        if indices:
            # accumulate in float64, like the zero-initialised running total did
            x_submission[counter, :] = embeddings[indices, :].mean(axis=0, dtype=np.float64)
        else:
            # in case that we have no embedding for any word of the tweet
            # just use the overall mean of the embeddings
            x_submission[counter, :] = embeddings_mean
        counter += 1
        if counter % 5000 == 0:
            print(str(counter), " tweets processed")

# Cut off the unused tail of the buffer. Exactly `counter` rows were written;
# trimming by "column 1 is non-zero" could also drop a real tweet whose
# average happens to be 0 there and thereby shift all following ids.
x_submission = x_submission[:counter]

5000  tweets processed
10000  tweets processed


In [7]:
# [Optional] cache the averaged test-set features on disk.
# np.save appends the ".npy" extension to the file name.
np.save(file='x_submission', arr=x_submission)

In [8]:
# [Optional] restore the cached test-set features instead of recomputing them.
x_submission = np.load(file='x_submission.npy')

## III) Train the model

In [9]:
# set aside a small portion for validation
testset = 1000

# The first `testset` rows form the local validation set ...
x_test = x_train[:testset, :]
y_test = y_train[:testset]
# ... and every remaining row is used for training. The training slice has to
# start at `testset` (not `testset + 1`), otherwise the sample in row
# `testset` ends up in neither set.
x_train_log = x_train[testset:, :]
y_train_log = y_train[testset:]

# standardize
#x_train_log, mean, std = standardize(x_train_log)

# add offset
x_train_log = add_offset(x_train_log)

# train using logistic regression (SGD)
# one random initial weight per column of the offset-augmented feature matrix
initial_w = np.random.rand(x_train_log.shape[1])
epochs = 40
batch_size = 100
gamma = 0.0001    # presumably the SGD step size -- confirm in logreg.py
lambda_ = 0.01    # presumably the regularisation strength -- confirm in logreg.py
print_every = 50000 // batch_size

weights, loss = reg_logistic_regression(y_train_log, x_train_log, initial_w, epochs, batch_size, gamma, lambda_, print_every)

# free up memory
del x_train_log
del y_train_log

epoch	 1 	loss:  80.41898758565523
epoch	 1 	loss:  72.63990867432156
epoch	 1 	loss:  72.27247042275462
epoch	 2 	loss:  71.36846290841906
epoch	 2 	loss:  70.9661952624341
epoch	 2 	loss:  70.56315594898248
epoch	 3 	loss:  70.12605256449098
epoch	 3 	loss:  69.6703350456351
epoch	 3 	loss:  69.63211465423336
epoch	 4 	loss:  69.31261762711519
epoch	 4 	loss:  69.08028647668529
epoch	 4 	loss:  68.99231228631497
epoch	 5 	loss:  68.82432824619028
epoch	 5 	loss:  68.72600853653859
epoch	 5 	loss:  68.63533220042841
epoch	 6 	loss:  68.48845100318616
epoch	 6 	loss:  68.6223296969754
epoch	 6 	loss:  68.3797046782826
epoch	 7 	loss:  68.36875349583184
epoch	 7 	loss:  68.43275071749726
epoch	 7 	loss:  68.34038315851258
epoch	 8 	loss:  68.28139345077676
epoch	 8 	loss:  68.31877622957936
epoch	 8 	loss:  68.2911010593855
epoch	 9 	loss:  68.20385412113002
epoch	 9 	loss:  68.29297923386456
epoch	 9 	loss:  68.1807471177709
epoch	 10 	loss:  68.21568128259491
epoch	 10 	loss:  68.2191

## IV) Test predictions

### Tests on a local validation set

In [10]:
# Evaluate the trained weights on the held-out validation rows
# (x_test / y_test were split off in the training cell).

# standardize
# (disabled, like the standardisation of the training features)
#x_test_log = standardize_test(x_test, mean, std)

# add offset
# the validation features must go through the same add_offset step as the
# training features so that their column count matches `weights`
x_test_log = add_offset(x_test)

# predicted labels for the validation tweets
y_pred = predict_logistic_labels(weights, x_test_log)
# NOTE(review): presumably the fraction of correctly predicted labels --
# confirm against get_accuracy in the helper modules
accuracy = get_accuracy(y_test, y_pred)
print(accuracy)

0.593


### Predict labels for the test dataset, prepare submission csv file

In [11]:
# standardize
# (disabled; if re-enabled it has to read from x_submission, because
# x_submission_log does not exist yet at this point)
#x_submission_log = standardize_test(x_submission, mean, std)

# add offset, exactly as for the training and validation features
x_submission_log = add_offset(x_submission)

# predict one label per test tweet
y_submission = predict_logistic_labels(weights, x_submission_log)

# submission ids are 1-based and follow the row order of x_submission
ids = np.arange(1, len(y_submission) + 1)

# write the (id, prediction) pairs to the submission file
filename = 'logreg_preprocessing12_reduced.csv'
create_csv_submission(ids, y_submission, filename)