In [159]:
import numpy as np
import pickle
import pandas as pd
import os
import csv

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import LinearSVC

# First try with given scripts (BAD)

Open our embeddings

In [160]:
# Load the embedding matrix; the vectorising loops below index its rows with vocab[word].
emb = np.load('embeddings.npy')

Load vocabulary

In [161]:
# Load the word -> embedding-row lookup (the vectorising loops index emb with vocab[word]).
# NOTE(review): pickle.load can execute arbitrary code — only open a vocab.pkl from a trusted source.
with open("vocab.pkl", "rb") as f:
        vocab = pickle.load(f)

### Vectorize positive tweets

In [162]:
# Count the positive tweets up front so the feature matrix can be pre-allocated.
# The file is opened in a `with` block so the handle is closed right away instead of
# being left open until the garbage collector reclaims it.
with open('Datasets/twitter-datasets/train_pos_full_cleaned.txt') as f:
    num_lines_pos = sum(1 for _ in f)

In [163]:
# Represent every positive tweet by the mean of the embeddings of its in-vocabulary words.
# A tweet without any known word produces an all-NaN row (mean over nothing); those rows
# are located and removed in the next cells.
train_pos = np.zeros((num_lines_pos, emb.shape[1]))
with open('Datasets/twitter-datasets/train_pos_full_cleaned.txt') as f:
    for row, tweet in enumerate(f):
        word_ids = [vocab[token] for token in tweet.split() if token in vocab]
        train_pos[row] = np.mean(emb[word_ids], axis=0)

In [164]:
# Sorted indices of the rows that contain at least one NaN (tweets with no known word).
# np.flatnonzero always yields an integer index array, even when no row is affected;
# the previous np.unique([...]) form returned an empty *float* array in that case, which
# is not a valid index for np.delete (older NumPy warns, recent NumPy raises).
index_to_remove_pos = np.flatnonzero(np.isnan(train_pos).any(axis=1))

In [165]:
# Drop the rows flagged above; train_pos itself is left untouched (np.delete returns a copy).
train_pos_2 = np.delete(train_pos,index_to_remove_pos,axis = 0)

  """Entry point for launching an IPython kernel.


### Vectorize negative tweets

In [166]:
# Count the negative tweets up front so the feature matrix can be pre-allocated.
# The file is opened in a `with` block so the handle is closed right away instead of
# being left open until the garbage collector reclaims it.
with open('Datasets/twitter-datasets/train_neg_full_cleaned.txt') as f:
    num_lines_neg = sum(1 for _ in f)

In [167]:
# Represent every negative tweet by the mean of the embeddings of its in-vocabulary words.
# A tweet without any known word produces an all-NaN row (mean over nothing); those rows
# are located and removed in the next cells.
train_neg = np.zeros((num_lines_neg, emb.shape[1]))
with open('Datasets/twitter-datasets/train_neg_full_cleaned.txt') as f:
    for row, tweet in enumerate(f):
        word_ids = [vocab[token] for token in tweet.split() if token in vocab]
        train_neg[row] = np.mean(emb[word_ids], axis=0)

In [168]:
# Sorted indices of the rows that contain at least one NaN (tweets with no known word).
# np.flatnonzero always yields an integer index array, even when no row is affected;
# the previous np.unique([...]) form returned an empty *float* array in that case, which
# is not a valid index for np.delete (older NumPy warns, recent NumPy raises).
index_to_remove_neg = np.flatnonzero(np.isnan(train_neg).any(axis=1))

In [169]:
# Drop the rows flagged above; train_neg itself is left untouched (np.delete returns a copy).
train_neg_2 = np.delete(train_neg,index_to_remove_neg,axis = 0)

  """Entry point for launching an IPython kernel.


### Get total training set

In [170]:
# Build the full training set: positive rows first, then negative rows,
# labelled +1 and -1 respectively.
y_pos = np.ones(len(train_pos_2))
y_neg = np.full(len(train_neg_2), -1)
X = np.concatenate((train_pos_2, train_neg_2), axis=0)
Y = np.concatenate((y_pos, y_neg))

In [171]:
# Cache features and labels on disk (written as X.npy / Y.npy) so they need not be recomputed.
np.save('X',X)
np.save('Y',Y)

In [172]:
# Reload the cached features and labels (entry point when resuming without re-vectorising).
X = np.load('X.npy')
Y = np.load('Y.npy')

### Polynomial

In [126]:
def build_poly(x, degree):
    """Expand x into the polynomial basis [1, x, x**2, ..., x**degree].

    The result starts with a single column of ones, followed by the
    element-wise powers of x for every exponent from 1 to degree, placed
    side by side in increasing order of the exponent. A 1-D x contributes
    one column per exponent; a 2-D x contributes all of its columns per exponent.
    """
    bias = np.ones((len(x), 1))
    powers = [np.power(x, exponent) for exponent in range(1, degree + 1)]
    return np.column_stack([bias] + powers)

In [None]:
# Replace X by its degree-3 polynomial expansion.
# NOTE(review): X is overwritten, so running this cell twice expands the already-expanded matrix.
X = build_poly(X,3)

___

In [None]:
# Alternative expansion with scikit-learn: degree 2, cross terms included
# (interaction_only=False) plus a bias column (include_bias=True).
# NOTE(review): X is overwritten here as well; if the build_poly cell above was also run,
# the two expansions compound.
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=True, order='C')
X = poly.fit_transform(X)

In [None]:
# Display X to inspect the transformed features.
X

### Standardize

In [173]:
# Scaler used to standardise the features; it is fitted in the next cell.
std = StandardScaler()

In [174]:
# Fit the scaler on X and replace X by its standardised version.
# NOTE(review): the scaler is fitted before the train/test split below, so the held-out
# rows also contribute to the fitted statistics.
X = std.fit_transform(X)

### Split

In [175]:
# Hold out 20% of the samples for evaluation; random_state=1 makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

In [176]:
# Cache the split on disk (written as <name>.npy).
np.save('X_train',X_train)
np.save('X_test',X_test)
np.save('Y_train',Y_train)
np.save('Y_test',Y_test)

In [177]:
# Reload the cached split (entry point when resuming without recomputing the features).
X_train = np.load('X_train.npy')
X_test = np.load('X_test.npy')
Y_train = np.load('Y_train.npy')
Y_test = np.load('Y_test.npy')

### Logistic

In [178]:
# L2-regularised logistic regression on the averaged-embedding features.
# The solver is now named explicitly: 'warn' was a transitional placeholder that resolved
# to liblinear (the fit output prints "[LibLinear]") and is not accepted by later
# scikit-learn releases. multi_class='warn' was the same kind of placeholder and is
# dropped because the labels are binary (+1 / -1).
# tol=10e-10 is 1e-9, as echoed in the estimator repr.
logi = LogisticRegression(penalty='l2', dual=False, tol=10e-10, C=0.5, fit_intercept=True, intercept_scaling=1,
                          class_weight=None, random_state=None, solver='liblinear', max_iter=100,
                          verbose=1, warm_start=False, n_jobs=None)

In [179]:
# Train the logistic regression on the training split.
logi.fit(X_train,Y_train)



[LibLinear]

LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=1e-09, verbose=1, warm_start=False)

In [154]:
# Accuracy on the held-out split.
logi.score(X_test,Y_test)

0.5966421276625049

In [180]:
# old: score kept for comparison with the cell above
logi.score(X_test,Y_test)

0.62238

# SVM

In [155]:
# Linear SVM with squared hinge loss and L2 penalty, solved in the primal (dual=False).
# tol=10e-10 is 1e-9, as echoed in the estimator repr.
svm = LinearSVC(penalty='l2', loss='squared_hinge', dual=False, tol=10e-10, C=0.5, multi_class='ovr', 
                fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=1, random_state=None, 
                max_iter=1000)

In [156]:
# Train the linear SVM on the training split.
svm.fit(X_train,Y_train)

[LibLinear]

LinearSVC(C=0.5, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=1e-09,
     verbose=1)

In [157]:
# Accuracy on the held-out split.
svm.score(X_test,Y_test)

0.595322209290848

In [34]:
# Stale cell: its execution count is lower than that of the svm.fit above, so the score
# shown below it comes from an earlier fit.
svm.score(X_test,Y_test)

0.5041910083820168

In [60]:
# Stale cell: its execution count is lower than that of the svm.fit above, so the score
# shown below it comes from an earlier fit.
svm.score(X_test,Y_test)

0.5041910083820168

In [86]:
# Stale cell: its execution count is lower than that of the svm.fit above, so the score
# shown below it comes from an earlier fit.
svm.score(X_test,Y_test)

0.5041910083820168

### Vectorize test tweets

In [None]:
# Count the test tweets up front so the feature matrix can be pre-allocated.
# The file is opened in a `with` block so the handle is closed right away instead of
# being left open until the garbage collector reclaims it.
with open('Datasets/twitter-datasets/test_data.txt') as f:
    num_lines_test = sum(1 for _ in f)

In [None]:
# Same averaging as for the training tweets. Everything up to the first comma of a test
# line (presumably the tweet id) is discarded before tokenising.
# Tweets without any known word produce an all-NaN row, handled in the next cells.
test = np.zeros((num_lines_test, emb.shape[1]))
with open('Datasets/twitter-datasets/test_data.txt') as f:
    for row, raw_line in enumerate(f):
        tweet = raw_line.split(',', 1)[1]
        word_ids = [vocab[token] for token in tweet.split() if token in vocab]
        test[row] = np.mean(emb[word_ids], axis=0)

In [None]:
# Sorted indices of the test rows that contain at least one NaN (tweets with no known word).
# np.flatnonzero always yields an integer index array, even when no row is affected;
# the previous np.unique([...]) form returned an empty *float* array in that case, which
# is not a valid index for np.delete / np.insert (older NumPy warns, recent NumPy raises).
index_to_remove_test = np.flatnonzero(np.isnan(test).any(axis=1))

In [None]:
# Drop the rows flagged above; their positions are kept in index_to_remove_test so a
# default prediction can be re-inserted for them later.
test_2 = np.delete(test,index_to_remove_test,axis = 0)

In [None]:
# Scale the test features with the scaler that was fitted on the training features.
# fit_transform would re-estimate the statistics on the test tweets, so train and test
# rows would end up on different scales. transform also fails loudly if the number of
# features differs from what the scaler was fitted on (e.g. if a polynomial expansion
# was applied to X before fitting), instead of silently rescaling.
test_2 = std.transform(test_2)

In [None]:
# Degree-2 polynomial expansion of the test features.
# NOTE(review): the training section calls build_poly(X, 3) and does so *before*
# standardising, whereas here the degree is 2 and the expansion comes *after*
# standardising — confirm which pipeline the submitted model was trained with.
test_2 = build_poly(test_2,2)

In [None]:
# `clf` was never defined in this notebook; bind it explicitly to the model to submit.
# The logistic regression is used here — swap in `svm` to submit the SVM instead.
clf = logi
prediction = clf.predict(test_2)
# Give the tweets that were dropped before prediction (no known word) the label -1, at
# their original positions. np.insert interprets indices relative to the array *before*
# insertion, so the k-th removed row (0-based) must be inserted at
# (original index - k); subtracting a constant 1 misplaces the entries.
removed = np.asarray(index_to_remove_test, dtype=int)
prediction_2 = np.insert(prediction, removed - np.arange(removed.size), -1)

# Word2Vec
## Vocabulary vectorizing
Read the words of the positive and negative tweets.

In [29]:
from gensim.models import word2vec
import gensim
import logging
import tempfile
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [31]:
# Tokenise every positive tweet on whitespace. The `with` block closes the file even if
# reading fails part-way, which the manual open()/close() pair did not guarantee.
with open("Datasets/twitter-datasets/train_pos_full_cleaned.txt") as f:
    tweets_pos = [line.split() for line in f]

In [32]:
# Tokenise every negative tweet on whitespace. The `with` block closes the file even if
# reading fails part-way, which the manual open()/close() pair did not guarantee.
with open("Datasets/twitter-datasets/train_neg_full_cleaned.txt") as f:
    tweets_neg = [line.split() for line in f]

Vectorize the words

In [33]:
# Parameters for Word2vec (passed to the Word2Vec constructor in the next cell)
# size: dimensionality of the word vectors; also the width of the tweet feature matrices below
size = 300
# min_count: passed as min_count; the training log reports the vocabulary kept at effective_min_count=5
min_count = 5
# epoch: number of training passes, passed as `iter`
epoch = 10

In [34]:
# Train word vectors on all training tweets (positive + negative): CBOW (sg=0) with
# negative sampling (hs=0, negative=5), a fixed seed and a single worker.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names — confirm against the
# installed gensim version before re-running.
# NOTE(review): the name `model` is rebound to the Keras network in the RNN section, so the
# embedding cells below must run before that section.
model = word2vec.Word2Vec(sentences=tweets_pos + tweets_neg, corpus_file=None, size=size, alpha=0.025, window=5,
                          min_count=min_count, max_vocab_size=None, sample=0.001, seed=1, workers=1, min_alpha=0.0001, sg=0,
                          hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, iter=epoch, null_word=0, trim_rule=None,
                          sorted_vocab=1, batch_words=10000, compute_loss=False, callbacks=(), max_final_vocab=None)

2019-12-11 15:10:31,545 : INFO : collecting all words and their counts
2019-12-11 15:10:31,548 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-12-11 15:10:31,594 : INFO : PROGRESS: at sentence #10000, processed 127472 words, keeping 12836 word types
2019-12-11 15:10:31,666 : INFO : PROGRESS: at sentence #20000, processed 256855 words, keeping 20142 word types
2019-12-11 15:10:31,711 : INFO : PROGRESS: at sentence #30000, processed 385671 words, keeping 26057 word types
2019-12-11 15:10:31,765 : INFO : PROGRESS: at sentence #40000, processed 514151 words, keeping 31480 word types
2019-12-11 15:10:31,831 : INFO : PROGRESS: at sentence #50000, processed 643058 words, keeping 36364 word types
2019-12-11 15:10:31,892 : INFO : PROGRESS: at sentence #60000, processed 772564 words, keeping 40846 word types
2019-12-11 15:10:31,943 : INFO : PROGRESS: at sentence #70000, processed 901334 words, keeping 45115 word types
2019-12-11 15:10:31,989 : INFO : PROGRESS: at 

2019-12-11 15:10:35,996 : INFO : PROGRESS: at sentence #720000, processed 9301095 words, keeping 205074 word types
2019-12-11 15:10:36,069 : INFO : PROGRESS: at sentence #730000, processed 9430444 words, keeping 207037 word types
2019-12-11 15:10:36,126 : INFO : PROGRESS: at sentence #740000, processed 9558939 words, keeping 208953 word types
2019-12-11 15:10:36,199 : INFO : PROGRESS: at sentence #750000, processed 9689112 words, keeping 210850 word types
2019-12-11 15:10:36,228 : INFO : PROGRESS: at sentence #760000, processed 9819464 words, keeping 212716 word types
2019-12-11 15:10:36,289 : INFO : PROGRESS: at sentence #770000, processed 9949877 words, keeping 214578 word types
2019-12-11 15:10:36,354 : INFO : PROGRESS: at sentence #780000, processed 10079045 words, keeping 216375 word types
2019-12-11 15:10:36,387 : INFO : PROGRESS: at sentence #790000, processed 10209081 words, keeping 218180 word types
2019-12-11 15:10:36,433 : INFO : PROGRESS: at sentence #800000, processed 1033

2019-12-11 15:10:39,745 : INFO : PROGRESS: at sentence #1430000, processed 19078868 words, keeping 359312 word types
2019-12-11 15:10:39,785 : INFO : PROGRESS: at sentence #1440000, processed 19225539 words, keeping 361430 word types
2019-12-11 15:10:39,825 : INFO : PROGRESS: at sentence #1450000, processed 19372052 words, keeping 363486 word types
2019-12-11 15:10:39,860 : INFO : PROGRESS: at sentence #1460000, processed 19518827 words, keeping 365528 word types
2019-12-11 15:10:39,899 : INFO : PROGRESS: at sentence #1470000, processed 19665445 words, keeping 367671 word types
2019-12-11 15:10:39,937 : INFO : PROGRESS: at sentence #1480000, processed 19811735 words, keeping 369824 word types
2019-12-11 15:10:39,976 : INFO : PROGRESS: at sentence #1490000, processed 19958006 words, keeping 372046 word types
2019-12-11 15:10:40,021 : INFO : PROGRESS: at sentence #1500000, processed 20105476 words, keeping 374098 word types
2019-12-11 15:10:40,079 : INFO : PROGRESS: at sentence #1510000,

2019-12-11 15:10:43,407 : INFO : PROGRESS: at sentence #2140000, processed 29495123 words, keeping 489887 word types
2019-12-11 15:10:43,455 : INFO : PROGRESS: at sentence #2150000, processed 29641809 words, keeping 491495 word types
2019-12-11 15:10:43,510 : INFO : PROGRESS: at sentence #2160000, processed 29787640 words, keeping 493054 word types
2019-12-11 15:10:43,561 : INFO : PROGRESS: at sentence #2170000, processed 29934864 words, keeping 494729 word types
2019-12-11 15:10:43,631 : INFO : PROGRESS: at sentence #2180000, processed 30081595 words, keeping 496425 word types
2019-12-11 15:10:43,655 : INFO : collected 496960 word types from a corpus of 30133876 raw words and 2183552 sentences
2019-12-11 15:10:43,656 : INFO : Loading a fresh vocabulary
2019-12-11 15:10:44,082 : INFO : effective_min_count=5 retains 83846 unique words (16% of original 496960, drops 413114)
2019-12-11 15:10:44,083 : INFO : effective_min_count=5 leaves 29564409 word corpus (98% of original 30133876, drops

2019-12-11 15:11:46,608 : INFO : EPOCH 1 - PROGRESS: at 88.94% examples, 326419 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:11:47,618 : INFO : EPOCH 1 - PROGRESS: at 90.45% examples, 326956 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:11:48,621 : INFO : EPOCH 1 - PROGRESS: at 91.87% examples, 327280 words/s, in_qsize 2, out_qsize 0
2019-12-11 15:11:49,633 : INFO : EPOCH 1 - PROGRESS: at 93.37% examples, 327779 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:11:50,634 : INFO : EPOCH 1 - PROGRESS: at 94.81% examples, 328090 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:11:51,635 : INFO : EPOCH 1 - PROGRESS: at 96.24% examples, 328381 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:11:52,643 : INFO : EPOCH 1 - PROGRESS: at 97.67% examples, 328627 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:11:53,657 : INFO : EPOCH 1 - PROGRESS: at 99.10% examples, 328832 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:11:54,284 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-

2019-12-11 15:12:59,268 : INFO : EPOCH 2 - PROGRESS: at 95.27% examples, 328948 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:13:00,281 : INFO : EPOCH 2 - PROGRESS: at 96.67% examples, 329052 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:13:01,294 : INFO : EPOCH 2 - PROGRESS: at 98.07% examples, 329148 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:13:02,301 : INFO : EPOCH 2 - PROGRESS: at 99.48% examples, 329271 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:13:02,670 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-12-11 15:13:02,672 : INFO : EPOCH - 2 : training on 30133876 raw words (22519505 effective words) took 68.4s, 329339 effective words/s
2019-12-11 15:13:03,711 : INFO : EPOCH 3 - PROGRESS: at 1.57% examples, 314161 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:13:04,730 : INFO : EPOCH 3 - PROGRESS: at 3.20% examples, 322938 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:13:05,743 : INFO : EPOCH 3 - PROGRESS: at 4.80% examples, 324230 words/s, in_qs

2019-12-11 15:14:12,598 : INFO : EPOCH 3 - PROGRESS: at 74.56% examples, 233908 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:14:13,606 : INFO : EPOCH 3 - PROGRESS: at 75.58% examples, 234106 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:14:14,625 : INFO : EPOCH 3 - PROGRESS: at 76.83% examples, 234995 words/s, in_qsize 2, out_qsize 0
2019-12-11 15:14:15,629 : INFO : EPOCH 3 - PROGRESS: at 78.06% examples, 235798 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:14:16,637 : INFO : EPOCH 3 - PROGRESS: at 79.33% examples, 236776 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:14:17,639 : INFO : EPOCH 3 - PROGRESS: at 80.65% examples, 237846 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:14:18,646 : INFO : EPOCH 3 - PROGRESS: at 81.77% examples, 238273 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:14:19,683 : INFO : EPOCH 3 - PROGRESS: at 82.45% examples, 237226 words/s, in_qsize 2, out_qsize 0
2019-12-11 15:14:20,739 : INFO : EPOCH 3 - PROGRESS: at 82.89% examples, 235373 words/s, in_qsiz

2019-12-11 15:15:25,944 : INFO : EPOCH 4 - PROGRESS: at 58.44% examples, 272511 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:15:26,959 : INFO : EPOCH 4 - PROGRESS: at 59.69% examples, 273074 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:15:27,969 : INFO : EPOCH 4 - PROGRESS: at 61.01% examples, 273954 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:15:28,977 : INFO : EPOCH 4 - PROGRESS: at 62.35% examples, 274971 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:15:29,997 : INFO : EPOCH 4 - PROGRESS: at 63.51% examples, 274964 words/s, in_qsize 2, out_qsize 0
2019-12-11 15:15:31,008 : INFO : EPOCH 4 - PROGRESS: at 64.60% examples, 274692 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:15:32,029 : INFO : EPOCH 4 - PROGRESS: at 65.92% examples, 275404 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:15:33,052 : INFO : EPOCH 4 - PROGRESS: at 67.26% examples, 276234 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:15:34,059 : INFO : EPOCH 4 - PROGRESS: at 68.57% examples, 276969 words/s, in_qsiz

2019-12-11 15:16:38,044 : INFO : EPOCH 5 - PROGRESS: at 72.06% examples, 380674 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:16:39,055 : INFO : EPOCH 5 - PROGRESS: at 73.59% examples, 380346 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:16:40,061 : INFO : EPOCH 5 - PROGRESS: at 75.05% examples, 379727 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:16:41,076 : INFO : EPOCH 5 - PROGRESS: at 76.58% examples, 379419 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:16:42,083 : INFO : EPOCH 5 - PROGRESS: at 78.15% examples, 379321 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:16:43,098 : INFO : EPOCH 5 - PROGRESS: at 79.71% examples, 379186 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:16:44,106 : INFO : EPOCH 5 - PROGRESS: at 81.27% examples, 379088 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:16:45,110 : INFO : EPOCH 5 - PROGRESS: at 82.83% examples, 379044 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:16:46,121 : INFO : EPOCH 5 - PROGRESS: at 84.29% examples, 378478 words/s, in_qsiz

2019-12-11 15:17:49,851 : INFO : EPOCH 6 - PROGRESS: at 90.60% examples, 377742 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:17:50,852 : INFO : EPOCH 6 - PROGRESS: at 92.15% examples, 377740 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:17:51,870 : INFO : EPOCH 6 - PROGRESS: at 93.71% examples, 377623 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:17:52,885 : INFO : EPOCH 6 - PROGRESS: at 95.30% examples, 377665 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:17:53,900 : INFO : EPOCH 6 - PROGRESS: at 96.89% examples, 377697 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:17:54,913 : INFO : EPOCH 6 - PROGRESS: at 98.48% examples, 377737 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:17:55,902 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-12-11 15:17:55,903 : INFO : EPOCH - 6 : training on 30133876 raw words (22519410 effective words) took 59.6s, 377646 effective words/s
2019-12-11 15:17:56,919 : INFO : EPOCH 7 - PROGRESS: at 1.96% examples, 401005 words/s, in_

2019-12-11 15:19:00,978 : INFO : EPOCH 8 - PROGRESS: at 11.26% examples, 383661 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:19:01,994 : INFO : EPOCH 8 - PROGRESS: at 13.18% examples, 384734 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:19:03,005 : INFO : EPOCH 8 - PROGRESS: at 15.09% examples, 385793 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:19:04,010 : INFO : EPOCH 8 - PROGRESS: at 17.01% examples, 386857 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:19:05,026 : INFO : EPOCH 8 - PROGRESS: at 18.92% examples, 387273 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:19:06,033 : INFO : EPOCH 8 - PROGRESS: at 20.79% examples, 387310 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:19:07,047 : INFO : EPOCH 8 - PROGRESS: at 22.71% examples, 387699 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:19:08,052 : INFO : EPOCH 8 - PROGRESS: at 24.62% examples, 388254 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:19:09,060 : INFO : EPOCH 8 - PROGRESS: at 26.52% examples, 388713 words/s, in_qsiz

2019-12-11 15:20:13,298 : INFO : EPOCH 9 - PROGRESS: at 36.02% examples, 390610 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:20:14,314 : INFO : EPOCH 9 - PROGRESS: at 37.92% examples, 390638 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:20:15,317 : INFO : EPOCH 9 - PROGRESS: at 39.82% examples, 390891 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:20:16,322 : INFO : EPOCH 9 - PROGRESS: at 41.68% examples, 390772 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:20:17,324 : INFO : EPOCH 9 - PROGRESS: at 43.58% examples, 391040 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:20:18,337 : INFO : EPOCH 9 - PROGRESS: at 45.49% examples, 391090 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:20:19,344 : INFO : EPOCH 9 - PROGRESS: at 47.39% examples, 391237 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:20:20,355 : INFO : EPOCH 9 - PROGRESS: at 49.28% examples, 391315 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:20:21,358 : INFO : EPOCH 9 - PROGRESS: at 51.00% examples, 391198 words/s, in_qsiz

2019-12-11 15:21:25,023 : INFO : EPOCH 10 - PROGRESS: at 58.97% examples, 390475 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:21:26,028 : INFO : EPOCH 10 - PROGRESS: at 60.53% examples, 390074 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:21:27,035 : INFO : EPOCH 10 - PROGRESS: at 62.07% examples, 389431 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:21:28,052 : INFO : EPOCH 10 - PROGRESS: at 63.63% examples, 388929 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:21:29,069 : INFO : EPOCH 10 - PROGRESS: at 65.23% examples, 388658 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:21:30,087 : INFO : EPOCH 10 - PROGRESS: at 66.82% examples, 388387 words/s, in_qsize 1, out_qsize 0
2019-12-11 15:21:31,106 : INFO : EPOCH 10 - PROGRESS: at 68.41% examples, 388134 words/s, in_qsize 2, out_qsize 0
2019-12-11 15:21:32,114 : INFO : EPOCH 10 - PROGRESS: at 69.97% examples, 387799 words/s, in_qsize 2, out_qsize 0
2019-12-11 15:21:33,120 : INFO : EPOCH 10 - PROGRESS: at 71.53% examples, 387501 words/s

## Embedding
### Positive

In [35]:
# One feature row per positive tweet: the mean of the Word2Vec vectors of its known tokens.
# A tweet with no known token gives an all-NaN row, removed in the next cells.
train_pos = np.zeros((len(tweets_pos), size))
for row, tweet_tokens in enumerate(tweets_pos):
    known_vectors = [model.wv[tok] for tok in tweet_tokens if tok in model.wv]
    train_pos[row] = np.mean(known_vectors, axis=0)

In [36]:
# Sorted indices of the rows that contain at least one NaN (tweets with no known token).
# np.flatnonzero always yields an integer index array, even when no row is affected;
# the previous np.unique([...]) form returned an empty *float* array in that case, which
# is not a valid index for np.delete (older NumPy warns, recent NumPy raises).
index_to_remove_pos = np.flatnonzero(np.isnan(train_pos).any(axis=1))

In [37]:
# Drop the rows flagged above; train_pos itself is left untouched (np.delete returns a copy).
train_pos_2 = np.delete(train_pos,index_to_remove_pos,axis = 0)

  """Entry point for launching an IPython kernel.


### Negative

In [38]:
# One feature row per negative tweet: the mean of the Word2Vec vectors of its known tokens.
# A tweet with no known token gives an all-NaN row, removed in the next cells.
train_neg = np.zeros((len(tweets_neg), size))
for row, tweet_tokens in enumerate(tweets_neg):
    known_vectors = [model.wv[tok] for tok in tweet_tokens if tok in model.wv]
    train_neg[row] = np.mean(known_vectors, axis=0)

In [39]:
# Sorted indices of the rows that contain at least one NaN (tweets with no known token).
# np.flatnonzero always yields an integer index array, even when no row is affected;
# the previous np.unique([...]) form returned an empty *float* array in that case, which
# is not a valid index for np.delete (older NumPy warns, recent NumPy raises).
index_to_remove_neg = np.flatnonzero(np.isnan(train_neg).any(axis=1))

In [40]:
# Drop the rows flagged above; train_neg itself is left untouched (np.delete returns a copy).
train_neg_2 = np.delete(train_neg,index_to_remove_neg,axis = 0)

  """Entry point for launching an IPython kernel.


### Test

In [41]:
# Test lines carry a prefix before the first comma (the first-try section strips it with
# line.split(',', 1)[1]). Tokenising the raw line glues that prefix onto the first word,
# which then can never be found in the Word2Vec vocabulary, so the prefix is removed here
# as well. [-1] keeps the whole line if it happens to contain no comma.
with open("Datasets/twitter-datasets/test_data.txt") as f:
    tweets_test = [line.split(',', 1)[-1].split() for line in f]

In [42]:
# One feature row per test tweet: the mean of the Word2Vec vectors of its known tokens.
# A tweet with no known token gives an all-NaN row (NumPy warns about the empty mean);
# those rows are removed in the next cells.
test = np.zeros((len(tweets_test), size))
for row, tweet_tokens in enumerate(tweets_test):
    known_vectors = [model.wv[tok] for tok in tweet_tokens if tok in model.wv]
    test[row] = np.mean(known_vectors, axis=0)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [43]:
# Sorted indices of the test rows that contain at least one NaN (tweets with no known token).
# np.flatnonzero always yields an integer index array, even when no row is affected;
# the previous np.unique([...]) form returned an empty *float* array in that case, which
# is not a valid index for np.delete / np.insert (older NumPy warns, recent NumPy raises).
index_to_remove_test = np.flatnonzero(np.isnan(test).any(axis=1))

In [44]:
# Drop the rows flagged above; their positions stay in index_to_remove_test so a default
# prediction can be re-inserted for them in the "Predict" section.
test_2 = np.delete(test,index_to_remove_test,axis = 0)

## Combine
Combine the positive and negative tweets into the full training set.

In [45]:
# Build the full training set: positive rows first, then negative rows,
# labelled +1 and -1 respectively.
y_pos = np.ones(len(train_pos_2))
y_neg = np.full(len(train_neg_2), -1)
X = np.concatenate((train_pos_2, train_neg_2), axis=0)
Y = np.concatenate((y_pos, y_neg))

In [46]:
# Cache the Word2Vec features, labels and filtered test features on disk.
# NOTE(review): index_to_remove_test is not saved, yet the "Predict" cell needs it;
# resuming from the cached files alone leaves it undefined.
np.save('Word2vec_X',X)
np.save('Word2vec_Y',Y)
np.save('Word2vec_test',test_2)

In [2]:
# Reload the cached Word2Vec features (entry point when resuming without retraining the embeddings).
X = np.load('Word2vec_X.npy')
Y = np.load('Word2vec_Y.npy')
test_2 = np.load('Word2vec_test.npy')

In [None]:
# Hold out 20% of the samples for evaluation; random_state=1 makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

## Train
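Plain logistic regression is trained on the split made above. `LogisticRegressionCV` (further below) cross-validates internally, so it is fitted on the full data and needs no manual split.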

In [None]:
# L2-regularised logistic regression using the SAG solver on all cores (n_jobs=-1).
# C=10e2 is 1000.
# NOTE(review): the notes at the end of the notebook report the best run with
# "sag with C = 1, tol = 0.0001" — confirm which C is intended here.
log = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=10e2, fit_intercept=True, intercept_scaling=1,
                         class_weight=None, random_state=None, solver='sag', max_iter=100000, multi_class='ovr',
                         verbose=0, warm_start=False, n_jobs=-1, l1_ratio=None)

In [None]:
# Train on the training split.
log.fit(X_train,Y_train)

In [None]:
# Accuracy on the held-out split.
log.score(X_test,Y_test)

In [None]:
# Logistic regression with built-in cross-validation: 4 folds (cv=4) over a grid of
# 5 candidate values of C (Cs=5), SAG solver, all cores; refit=True is set so a final
# model is available after the search.
logiCV = LogisticRegressionCV(Cs=5, fit_intercept=True, cv=4, dual=False, penalty='l2', scoring=None,
                     solver='sag', tol=0.0001, max_iter=10000, class_weight=None, n_jobs=-1, verbose=0,
                     refit=True, intercept_scaling=1.0, multi_class='ovr', random_state=None, l1_ratios=None)

In [None]:
# Fit on the full data set; the folds are created internally by LogisticRegressionCV.
logiCV.fit(X,Y)

# Vader

In [7]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [37]:
# Read the raw test tweets for VADER. Test lines carry a prefix before the first comma
# (the first-try section strips it with line.split(',', 1)[1]); it is removed here too so
# it does not get glued onto the first word of the tweet. [-1] keeps the whole line if it
# contains no comma. The `with` block guarantees the file is closed.
with open("Datasets/twitter-datasets/test_data.txt") as f:
    tweets = [line.split(',', 1)[-1] for line in f]

In [107]:
# NOTE(review): this displays the entire list. The captured output shows raw strings,
# whereas the Word2Vec section builds tweets_pos as lists of tokens — this cell was run
# against a different definition of tweets_pos. Prefer tweets_pos[:5] for a preview.
tweets_pos

['i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15\n',
 "because your logic is so dumb , i won't even crop out your name or your photo . tsk . <url>\n",
 '" just put casper in a box ! " looved the battle ! #crakkbitch\n',
 "thanks sir > > don't trip lil mama ... just keep doin ya thang !\n",
 'visiting my brother tmr is the bestest birthday gift eveerrr ! ! !\n',
 'yay ! ! #lifecompleted . tweet / facebook me to let me know please\n',
 '#1dnextalbumtitle : feel for you / rollercoaster of life . song cocept : life , #yolo , becoming famous ? <3 14 #followmeplz ! <3 x15\n',
 "workin hard or hardly workin rt at hardee's with my future coworker <user>\n",
 "i saw . i'll be replying in a bit .\n",
 'this is were i belong\n',
 'anddd to cheer #nationals2013 ?\n',
 'we send an invitation to shop on-line ! here you will find everything you need - without leaving home ... <url>\n',
 'just woke up , finna go to church\n',
 '

In [74]:
# Demo: remove English stop words from a single tweet and print the text before/after.
# `stopwords` was used without ever being imported; both nltk imports now sit at the top
# of the cell so it runs on a fresh kernel.
# NOTE(review): `tt` is not defined anywhere in the visible notebook; the captured output
# matches the first positive tweet — define it before running this cell.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

print(tt)
stop_words = set(stopwords.words('english'))
tokens = word_tokenize(tt)
result = ' '.join(token for token in tokens if token not in stop_words)
print(result)

i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15

dunno justin read mention . justin god knows , hope follow # believe 15


In [77]:
# `sid` was never created in the visible notebook; build the VADER analyser here so the
# cell runs on a fresh kernel. polarity_scores returns the neg / neu / pos / compound scores.
sid = SentimentIntensityAnalyzer()
sid.polarity_scores('just woke up , finna go to church ')

{'neg': 0.0, 'neu': 0.686, 'pos': 0.314, 'compound': 0.4939}

In [82]:
from tqdm.autonotebook import tqdm

  """Entry point for launching an IPython kernel.


In [102]:
# Label every test tweet with VADER: -1 when the tweet is scored entirely neutral or when
# its negative share exceeds its positive share, +1 otherwise.
# The previous version also tokenised each tweet and filtered stop words, but never used
# the result (the raw tweet was scored), so that dead work — and its dependency on the
# undefined `stopwords` / `word_tokenize` names — is removed. Predictions are unchanged.
# NOTE(review): if scoring the stop-word-filtered text was the intent, that is a behaviour
# change to evaluate separately.
sid = SentimentIntensityAnalyzer()  # never created elsewhere in the visible notebook
prediction_2 = []
for tweet in tqdm(tweets):
    ss = sid.polarity_scores(tweet)
    is_negative = ss['neu'] == 1 or ss['neg'] > ss['pos']
    prediction_2.append(-1 if is_negative else 1)

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




# RNN

In [158]:
! pip install tensorflow


Collecting tensorflow
  Using cached https://files.pythonhosted.org/packages/2c/72/6b3264aa2889b7dde7663464b99587d95cd6a5f3b9b30181f14d78a63e64/tensorflow-2.0.0-cp37-cp37m-macosx_10_11_x86_64.whl
Collecting wrapt>=1.11.1 (from tensorflow)
[31mthinc 6.12.1 has requirement msgpack<0.6.0,>=0.5.6, but you'll have msgpack 0.6.2 which is incompatible.[0m
[31mthinc 6.12.1 has requirement wrapt<1.11.0,>=1.10.0, but you'll have wrapt 1.11.2 which is incompatible.[0m
[31mspacy 2.0.16 has requirement regex==2018.01.10, but you'll have regex 2019.4.14 which is incompatible.[0m
Installing collected packages: wrapt, tensorflow
  Found existing installation: wrapt 1.10.11
[31mCannot uninstall 'wrapt'. It is a distutils installed project and thus we cannot accurately determine which files belong to it which would lead to only a partial uninstall.[0m


In [9]:
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.layers import SpatialDropout1D

from keras_preprocessing import text


from sklearn.model_selection import train_test_split


from gensim.models import word2vec
import gensim
import logging
import tempfile
import pandas as pd
import numpy as np

# Do not truncate long text in displayed DataFrame cells.
# NOTE(review): -1 is the legacy spelling of "no limit"; newer pandas releases expect None —
# confirm against the installed version.
pd.set_option('display.max_colwidth', -1)
# Emit INFO-level log records with timestamp, level and message.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Using TensorFlow backend.


ModuleNotFoundError: No module named 'tensorflow'

In [2]:
# Load the cleaned positive / negative tweets; the first CSV column becomes the index.
pos_df = pd.read_csv("Datasets/twitter-datasets/train_pos_cleaned.csv", index_col=0)
neg_df = pd.read_csv("Datasets/twitter-datasets/train_neg_cleaned.csv", index_col=0)

In [3]:
# Stack both classes into a single frame (positive rows first).
train = pd.concat([pos_df,neg_df])

In [4]:
# Shuffle all rows (frac=1) with a fixed seed so the order is reproducible.
train = train.sample(frac=1, random_state = 1)

Unnamed: 0,tweets,label
49673,k fine lah if liddat i also sleep loh haiz nights twitter,1
71551,going to be told im blind tomorrow ok slight exaggeration but yeah having my eyes tested then a day of uni work just need it done,0
5506,sometimes its nice to just buy a little bit of jewelry #liasophia,1
38370,this is my tweet,1
36930,im not always nice but i dont have a reason not to be,1


In [6]:
# Discard rows with missing values.
train = train.dropna()

In [11]:
# Load the cleaned test tweets; the first CSV column becomes the index.
# NOTE(review): this rebinds `test`, which the earlier sections use for a NumPy feature matrix.
test = pd.read_csv("Datasets/twitter-datasets/test_data_cleaned.csv", index_col=0)

In [20]:
# Vocabulary cap handed to the tokenizer (num_words); the most frequent words are kept.
MAX_NB_WORDS = 100000
# Length every tweet is padded / truncated to (used as maxlen in pad_sequences).
MAX_SEQUENCE_LENGTH = 50
# Width of the trainable Embedding layer of the network.
EMBEDDING_DIM = 300
tokenizer = text.Tokenizer(num_words=MAX_NB_WORDS)
# The word index is built from the training AND the test tweets.
tokenizer.fit_on_texts(np.hstack((train.tweets.values,test.tweets.values)))
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 100260 unique tokens.


In [21]:
# Turn every training tweet into a sequence of word ids, then pad / truncate each
# sequence to MAX_SEQUENCE_LENGTH so they form one rectangular array.
X = tokenizer.texts_to_sequences(train.tweets.values)
X = sequence.pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (199976, 50)


In [22]:
# Target column; the sample rows displayed above show labels 1 and 0.
Y = train.label

In [23]:
# Hold out 20% of the tweets for validation (reproducible split) and print the shapes as a sanity check.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 1)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(159980, 50) (159980,)
(39996, 50) (39996,)


In [26]:
batch_size = 1024

print('Build model...')
# Trainable embedding -> one LSTM layer -> single sigmoid output unit.
# Variants tried earlier and left disabled: SpatialDropout1D(0.4) after the embedding,
# Dense(64) before the output layer.
model = Sequential([
    Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]),
    LSTM(256, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

# try using different optimizers and different optimizer configs
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print('Train...')
# The held-out split doubles as validation data during training and as the final test set.
model.fit(X_train, Y_train, batch_size=batch_size, epochs=3, validation_data=(X_test, Y_test))
score, acc = model.evaluate(X_test, Y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Build model...
Train...


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 159980 samples, validate on 39996 samples
Epoch 1/3
Epoch 2/3

KeyboardInterrupt: 

In [127]:
# Encode and pad the unlabeled test tweets with the same tokenizer and length as the training data.
# NOTE(review): this rebinds X_test, which until here held the validation split.
X_test = tokenizer.texts_to_sequences(test.tweets.values)
X_test = sequence.pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)

In [128]:
# Sigmoid outputs of the network for the test tweets (thresholded at 0.5 in the next cell).
y_pred_prob = model.predict(X_test)

In [129]:
# Convert the sigmoid outputs to submission labels: -1 below 0.5, +1 otherwise
# (same shape and dtype as the probabilities).
y_pred = np.where(y_pred_prob < 0.5, -1, 1).astype(y_pred_prob.dtype)

In [130]:
# Flatten to 1-D so each prediction lines up with one id in the submission
# (presumably the network output has shape (n, 1) — TODO confirm).
y_pred = y_pred.flatten()

## Predict

In [None]:
# Predict with the Word2Vec logistic regression on the filtered test features.
# NOTE(review): this rebinds y_pred, replacing the RNN predictions computed above.
prediction = log.predict(test_2)
# Give the tweets that were dropped before prediction (no known token) the label -1, at
# their original positions. np.insert interprets indices relative to the array *before*
# insertion, so the k-th removed row (0-based) must be inserted at
# (original index - k); subtracting a constant 1 misplaces the entries.
removed = np.asarray(index_to_remove_test, dtype=int)
y_pred = np.insert(prediction, removed - np.arange(removed.size), -1)

### Submission

In [109]:
def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle.

    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)

    The file has a header row "Id,Prediction" followed by one row per
    (id, prediction) pair; both values are written as integers. If ids and
    y_pred differ in length, rows are written only up to the shorter one.
    """
    # newline='' lets the csv module control line endings itself; without it an
    # extra carriage return is written per row on platforms that translate '\n'.
    with open(name, 'w', newline='') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            writer.writerow({'Id': int(r1), 'Prediction': int(r2)})

In [110]:
# Ids are 1-based and follow the order of the predictions. The range is derived from
# y_pred instead of a hard-coded 10001, so the call also works for a test set of another size.
create_csv_submission(range(1, len(y_pred) + 1), y_pred, 'submission.csv')

### Accuracy

In [131]:
# Compare the predictions with the reference labels and report the share of matches.
solution = pd.read_csv('derived_solution.csv')['Prediction']
accuracy = 100 * np.mean(solution == y_pred)
print("Accuracy : {:.02f}%".format(accuracy))

Accuracy : 75.95%


Cross validation with solver :
- lbfgs : 75.66%
- newton-cg : 75.60%
- sag : 75.69%

Best on Aicrowd: (76.90%)
- not full tweets
- sag with C = 1, tol = 0.0001