In [150]:
import numpy as np
import pickle
import pandas as pd
import os
import csv

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import LinearSVC

# First try with given scripts (BAD)

Open our embeddings

In [2]:
emb = np.load('embeddings.npy')

Load vocabulary

In [3]:
with open("vocab.pkl", "rb") as f:
        vocab = pickle.load(f)

### Vectorize positive tweets

In [4]:
num_lines_pos = sum(1 for line in open('Datasets/twitter-datasets/train_pos_full.txt'))

In [5]:
train_pos = np.zeros((num_lines_pos,emb.shape[1]))
with open('Datasets/twitter-datasets/train_pos_full.txt') as f:
    for line_index, line in enumerate(f):
        words = line.split()
        index = [vocab[word] for word in words if word in vocab.keys()]
        line_fet = np.mean(np.array([emb[i] for i in index]),axis = 0)
        train_pos[line_index] = line_fet

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [6]:
index_to_remove_pos = np.unique([x for x,y in np.argwhere(np.isnan(train_pos))])

In [7]:
train_pos_2 = np.delete(train_pos,index_to_remove_pos,axis = 0)

### Vectorize negative tweets

In [8]:
num_lines_neg = sum(1 for line in open('Datasets/twitter-datasets/train_neg_full.txt'))

In [9]:
train_neg = np.zeros((num_lines_neg,emb.shape[1]))
with open('Datasets/twitter-datasets/train_neg_full.txt') as f:
    for line_index, line in enumerate(f):
        words = line.split()
        index = [vocab[word] for word in words if word in vocab.keys()]
        line_fet = np.mean(np.array([emb[i] for i in index]),axis = 0)
        train_neg[line_index] = line_fet

In [10]:
index_to_remove_neg = np.unique([x for x,y in np.argwhere(np.isnan(train_neg))])

In [11]:
train_neg_2 = np.delete(train_neg,index_to_remove_neg,axis = 0)

### Get total training set

In [12]:
X = np.vstack((train_pos_2,train_neg_2))
y_pos = np.ones(train_pos_2.shape[0])
y_neg = np.repeat(-1,train_neg_2.shape[0])
Y = np.hstack((y_pos,y_neg))

In [13]:
np.save('X',X)
np.save('Y',Y)

In [64]:
X = np.load('X.npy')
Y = np.load('Y.npy')

### Polynomial

In [36]:
def build_poly(x, degree):
    """polynomial basis functions for input data x, for j=0 up to j=degree."""
    poly = np.ones((len(x), 1))
    for deg in range(1, degree+1):
        poly = np.c_[poly, np.power(x, deg)]
    return poly

In [37]:
X = build_poly(X,3)

___

In [65]:
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=True, order='C')
X = poly.fit_transform(X)

In [67]:
X

array([[ 1.00000000e+00,  2.96126954e-01, -3.82799708e-02, ...,
         1.45567212e-03, -6.46967335e-03,  2.87541904e-02],
       [ 1.00000000e+00,  3.15962548e-01, -1.83284904e-02, ...,
         3.12987105e-05, -6.29418917e-04,  1.26576516e-02],
       [ 1.00000000e+00,  3.08120552e-01, -1.70786095e-01, ...,
         1.47494328e-05,  2.48567688e-04,  4.18903535e-03],
       ...,
       [ 1.00000000e+00,  3.28799104e-01, -8.18543578e-02, ...,
         3.01726021e-03, -8.18731286e-03,  2.22162118e-02],
       [ 1.00000000e+00,  2.52777100e-01, -1.10337703e-01, ...,
         1.83283686e-03,  1.72449017e-03,  1.62254830e-03],
       [ 1.00000000e+00,  3.21354338e-01,  9.62912834e-02, ...,
         1.86356312e-04,  2.05703618e-03,  2.27059539e-02]])

### Standardize

In [68]:
std = StandardScaler()

In [69]:
X = std.fit_transform(X)

### Split

In [70]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

In [6]:
np.save('X_train',X_train)
np.save('X_test',X_test)
np.save('Y_train',Y_train)
np.save('Y_test',Y_test)

In [7]:
X_train = np.load('X_train.npy')
X_test = np.load('X_test.npy')
Y_train = np.load('Y_train.npy')
Y_test = np.load('Y_test.npy')

array([-1., -1.,  1., ..., -1.,  1.,  1.])

### Logistic

In [71]:
logi = LogisticRegression(penalty='l2', dual=False, tol=10e-10, C=0.5, fit_intercept=True, intercept_scaling=1, 
                          class_weight=None, random_state=None, solver='warn', max_iter=100, multi_class='warn', 
                          verbose=1, warm_start=False, n_jobs=None, l1_ratio=None)

In [None]:
logi.fit(X_train,Y_train)



[LibLinear]

In [56]:
logi.score(X_test,Y_test)

0.5519145574367231

# SVM

In [57]:
svm = LinearSVC(penalty='l2', loss='squared_hinge', dual=False, tol=10e-10, C=0.5, multi_class='ovr', 
                fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=1, random_state=None, 
                max_iter=1000)

In [58]:
svm.fit(X_train,Y_train)

[LibLinear]

LinearSVC(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=1e-09,
          verbose=1)

In [59]:
svm.score(X_test,Y_test)

0.5521745652369571

### Vectorize test tweets

In [24]:
num_lines_test = sum(1 for line in open('Datasets/twitter-datasets/test_data.txt'))

In [25]:
test = np.zeros((num_lines_test,emb.shape[1]))
with open('Datasets/twitter-datasets/test_data.txt') as f:
    for line_index, line in enumerate(f):
        line = line.split(',',1)[1]
        words = line.split()
        index = [vocab[word] for word in words if word in vocab.keys()]
        line_fet = np.mean(np.array([emb[i] for i in index]),axis = 0)
        test[line_index] = line_fet

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [26]:
index_to_remove_test = np.unique([x for x,y in np.argwhere(np.isnan(test))])

In [27]:
test_2 = np.delete(test,index_to_remove_test,axis = 0)

In [28]:
test_2 = std.fit_transform(test_2)

In [29]:
test_2 = build_poly(test_2,2)

In [30]:
prediction = clf.predict(test_2)
prediction_2 = np.insert(prediction, index_to_remove_test -1,-1)

# Word2Vec
## Vocabulary vectorizing
Read words in positive and neg tweets 

In [125]:
from gensim.models import word2vec
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [126]:
f = open("Datasets/twitter-datasets/train_pos.txt")
tweets_pos = [line.split() for line in f.readlines()]
f.close()

In [127]:
f = open("Datasets/twitter-datasets/train_neg.txt")
tweets_neg = [line.split() for line in f.readlines()]
f.close()

Vectorize the words

In [128]:
size = 200 

In [129]:
model = word2vec.Word2Vec(sentences=tweets_pos + tweets_neg,size = size)

2019-11-26 08:41:51,919 : INFO : collecting all words and their counts
2019-11-26 08:41:51,920 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-11-26 08:41:51,952 : INFO : PROGRESS: at sentence #10000, processed 141898 words, keeping 11411 word types
2019-11-26 08:41:51,978 : INFO : PROGRESS: at sentence #20000, processed 285174 words, keeping 18950 word types
2019-11-26 08:41:52,003 : INFO : PROGRESS: at sentence #30000, processed 429701 words, keeping 25161 word types
2019-11-26 08:41:52,035 : INFO : PROGRESS: at sentence #40000, processed 571007 words, keeping 30591 word types
2019-11-26 08:41:52,063 : INFO : PROGRESS: at sentence #50000, processed 714851 words, keeping 35669 word types
2019-11-26 08:41:52,091 : INFO : PROGRESS: at sentence #60000, processed 858157 words, keeping 40301 word types
2019-11-26 08:41:52,121 : INFO : PROGRESS: at sentence #70000, processed 1003256 words, keeping 44631 word types
2019-11-26 08:41:52,151 : INFO : PROGRESS: at

## Embeeding
### Positive

In [130]:
train_pos = np.zeros((len(tweets_pos),size))
for index, tokens in enumerate(tweets_pos):
    vect = [model.wv[token] for token in tokens if token in model.wv]
    train_pos[index] = np.mean(vect, axis = 0)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [131]:
index_to_remove_pos = np.unique([x for x,y in np.argwhere(np.isnan(train_pos))])

In [132]:
train_pos_2 = np.delete(train_pos,index_to_remove_pos,axis = 0)

### Negative

In [133]:
train_neg = np.zeros((len(tweets_neg),size))
for index, tokens in enumerate(tweets_neg):
    vect = [model.wv[token] for token in tokens if token in model.wv]
    train_neg[index] = np.mean(vect, axis = 0)

In [134]:
index_to_remove_neg = np.unique([x for x,y in np.argwhere(np.isnan(train_neg))])

In [135]:
train_neg_2 = np.delete(train_neg,index_to_remove_neg,axis = 0)

### Test

In [None]:
f = open("Datasets/twitter-datasets/test_data.txt")
tweets_test = [line.split() for line in f.readlines()]
f.close()

In [None]:
test = np.zeros((len(tweets_test),size))
for index, tokens in enumerate(tweets_test):
    vect = [model.wv[token] for token in tokens if token in model.wv]
    test[index] = np.mean(vect, axis = 0)

In [None]:
index_to_remove_test = np.unique([x for x,y in np.argwhere(np.isnan(test))])

In [None]:
test_2 = np.delete(test,index_to_remove_test,axis = 0)

## Combine
Combine pos and neg to have full training 

In [136]:
X = np.vstack((train_pos_2,train_neg_2))
y_pos = np.ones(train_pos_2.shape[0])
y_neg = np.repeat(-1,train_neg_2.shape[0])
Y = np.hstack((y_pos,y_neg))

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

## Train
Logistic Regression with Cross-validation so don't need to split 

In [168]:
logiCV = LogisticRegressionCV(Cs=10, fit_intercept=True, cv=5, dual=False, penalty='l2', scoring=None,
                     solver='newton-cg', tol=0.0001, max_iter=1000, class_weight=None, n_jobs=-1, verbose=1,
                     refit=True, intercept_scaling=1.0, multi_class='ovr', random_state=None, l1_ratios=None)

In [None]:
logiCV.fit(X,Y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


## Predict

In [164]:
prediction = logiCV.predict(test_2)
prediction_2 = np.insert(prediction, index_to_remove_test -1,-1)

### Submission

In [123]:
def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle
    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)
    """
    with open(name, 'w') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            writer.writerow({'Id':int(r1),'Prediction':int(r2)})

In [124]:
create_csv_submission(range(1,10001), prediction_2, 'submission.csv')

### Accuracy

In [165]:
solution = pd.read_csv('derived_solution.csv').Prediction
print("Accuracy : {:.02f}%".format(100*np.mean(solution == prediction_2)))

Accuracy : 75.66%


Cross validation with solver :
- lbfgs : 75.66