In [42]:
import numpy as np
import pickle
import pandas as pd
import os
import csv

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

Open our embeddings

In [2]:
emb = np.load('embeddings.npy')

Load vocabulary

In [3]:
with open("vocab.pkl", "rb") as f:
        vocab = pickle.load(f)

### Vectorize positive tweets

In [4]:
num_lines_pos = sum(1 for line in open('Datasets/twitter-datasets/train_pos_full.txt'))

In [5]:
train_pos = np.zeros((num_lines_pos,emb.shape[1]))
with open('Datasets/twitter-datasets/train_pos_full.txt') as f:
    for line_index, line in enumerate(f):
        words = line.split()
        index = [vocab[word] for word in words if word in vocab.keys()]
        line_fet = np.mean(np.array([emb[i] for i in index]),axis = 0)
        train_pos[line_index] = line_fet

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [6]:
index_to_remove_pos = np.unique([x for x,y in np.argwhere(np.isnan(train_pos))])

In [7]:
train_pos_2 = np.delete(train_pos,index_to_remove_pos,axis = 0)

### Vectorize negative tweets

In [8]:
num_lines_neg = sum(1 for line in open('Datasets/twitter-datasets/train_neg_full.txt'))

In [9]:
train_neg = np.zeros((num_lines_neg,emb.shape[1]))
with open('Datasets/twitter-datasets/train_neg_full.txt') as f:
    for line_index, line in enumerate(f):
        words = line.split()
        index = [vocab[word] for word in words if word in vocab.keys()]
        line_fet = np.mean(np.array([emb[i] for i in index]),axis = 0)
        train_neg[line_index] = line_fet

In [10]:
index_to_remove_neg = np.unique([x for x,y in np.argwhere(np.isnan(train_neg))])

In [11]:
train_neg_2 = np.delete(train_neg,index_to_remove_neg,axis = 0)

### Get total training set

In [12]:
X = np.vstack((train_pos_2,train_neg_2))
y_pos = np.ones(train_pos_2.shape[0])
y_neg = np.repeat(-1,train_neg_2.shape[0])
Y = np.hstack((y_pos,y_neg))

### Train Logistic Model

In [13]:
def split_data(x, y, ratio, myseed=1):
    """split the dataset based on the split ratio."""
    # set seed
    np.random.seed(myseed)
    # generate random indices
    num_row = len(y)
    indices = np.random.permutation(num_row)
    index_split = int(np.floor(ratio * num_row))
    index_tr = indices[: index_split]
    index_te = indices[index_split:]
    # create split
    x_tr = x[index_tr]
    x_te = x[index_te]
    y_tr = y[index_tr]
    y_te = y[index_te]
    return x_tr, x_te, y_tr, y_te

In [14]:
x_tr, x_te, y_tr, y_te = split_data(X,Y,0.8)

In [49]:
clf = LogisticRegression(penalty='l2', dual=False, tol=10e-10, C=10e-4, fit_intercept=True, intercept_scaling=1,
                         class_weight=None, random_state=None, solver='liblinear', max_iter=10000, multi_class='ovr', 
                         verbose=0, warm_start=False, n_jobs=None, l1_ratio=None).fit(x_tr,y_tr)

In [50]:
clf.score(x_te,y_te)

0.5711191335740072

### Vectorize test tweets

In [17]:
num_lines_test = sum(1 for line in open('Datasets/twitter-datasets/test_data.txt'))

In [18]:
test = np.zeros((num_lines_test,emb.shape[1]))
with open('Datasets/twitter-datasets/test_data.txt') as f:
    for line_index, line in enumerate(f):
        line = line.split(',',1)[1]
        words = line.split()
        index = [vocab[word] for word in words if word in vocab.keys()]
        line_fet = np.mean(np.array([emb[i] for i in index]),axis = 0)
        test[line_index] = line_fet

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [19]:
index_to_remove_test = np.unique([x for x,y in np.argwhere(np.isnan(test))])

In [20]:
test_2 = np.delete(test,index_to_remove_test,axis = 0)

In [21]:
prediction = clf.predict(test_2)
prediction_2 = np.insert(prediction, index_to_remove_test -1,1)

### Submission

In [23]:
def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle
    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)
    """
    with open(name, 'w') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            writer.writerow({'Id':int(r1),'Prediction':int(r2)})

In [24]:
create_csv_submission(range(1,10001), prediction_2, 'submission.csv')