In [1]:
import numpy as np
import pickle
import pandas as pd
import os
import csv

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import LinearSVC

# First try with given scripts (BAD)

Open our embeddings

In [88]:
emb = np.load('embeddings.npy')

Load vocabulary

In [None]:
with open("vocab.pkl", "rb") as f:
        vocab = pickle.load(f)

### Vectorize positive tweets

In [None]:
num_lines_pos = sum(1 for line in open('Datasets/twitter-datasets/train_pos_full.txt'))

In [None]:
train_pos = np.zeros((num_lines_pos,emb.shape[1]))
with open('Datasets/twitter-datasets/train_pos.txt') as f:
    for line_index, line in enumerate(f):
        words = line.split()
        index = [vocab[word] for word in words if word in vocab.keys()]
        line_fet = np.mean(np.array([emb[i] for i in index]),axis = 0)
        train_pos[line_index] = line_fet

In [None]:
index_to_remove_pos = np.unique([x for x,y in np.argwhere(np.isnan(train_pos))])

In [None]:
train_pos_2 = np.delete(train_pos,index_to_remove_pos,axis = 0)

### Vectorize negative tweets

In [None]:
num_lines_neg = sum(1 for line in open('Datasets/twitter-datasets/train_neg_full.txt'))

In [None]:
train_neg = np.zeros((num_lines_neg,emb.shape[1]))
with open('Datasets/twitter-datasets/train_neg.txt') as f:
    for line_index, line in enumerate(f):
        words = line.split()
        index = [vocab[word] for word in words if word in vocab.keys()]
        line_fet = np.mean(np.array([emb[i] for i in index]),axis = 0)
        train_neg[line_index] = line_fet

In [None]:
index_to_remove_neg = np.unique([x for x,y in np.argwhere(np.isnan(train_neg))])

In [None]:
train_neg_2 = np.delete(train_neg,index_to_remove_neg,axis = 0)

### Get total training set

In [86]:
X = np.vstack((train_pos_2,train_neg_2))
y_pos = np.ones(train_pos_2.shape[0])
y_neg = np.repeat(-1,train_neg_2.shape[0])
Y = np.hstack((y_pos,y_neg))

NameError: name 'train_pos_2' is not defined

In [None]:
np.save('X',X)
np.save('Y',Y)

In [None]:
X = np.load('X.npy')
Y = np.load('Y.npy')

### Polynomial

In [None]:
def build_poly(x, degree):
    """polynomial basis functions for input data x, for j=0 up to j=degree."""
    poly = np.ones((len(x), 1))
    for deg in range(1, degree+1):
        poly = np.c_[poly, np.power(x, deg)]
    return poly

In [None]:
X = build_poly(X,3)

___

In [None]:
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=True, order='C')
X = poly.fit_transform(X)

In [None]:
X

### Standardize

In [None]:
std = StandardScaler()

In [None]:
X = std.fit_transform(X)

### Split

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

In [None]:
np.save('X_train',X_train)
np.save('X_test',X_test)
np.save('Y_train',Y_train)
np.save('Y_test',Y_test)

In [None]:
X_train = np.load('X_train.npy')
X_test = np.load('X_test.npy')
Y_train = np.load('Y_train.npy')
Y_test = np.load('Y_test.npy')

### Logistic

In [None]:
logi = LogisticRegression(penalty='l2', dual=False, tol=10e-10, C=0.5, fit_intercept=True, intercept_scaling=1, 
                          class_weight=None, random_state=None, solver='warn', max_iter=100, multi_class='warn', 
                          verbose=1, warm_start=False, n_jobs=None, l1_ratio=None)

In [None]:
logi.fit(X_train,Y_train)

In [None]:
logi.score(X_test,Y_test)

# SVM

In [None]:
svm = LinearSVC(penalty='l2', loss='squared_hinge', dual=False, tol=10e-10, C=0.5, multi_class='ovr', 
                fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=1, random_state=None, 
                max_iter=1000)

In [None]:
svm.fit(X_train,Y_train)

In [None]:
svm.score(X_test,Y_test)

### Vectorize test tweets

In [None]:
num_lines_test = sum(1 for line in open('Datasets/twitter-datasets/test_data.txt'))

In [None]:
test = np.zeros((num_lines_test,emb.shape[1]))
with open('Datasets/twitter-datasets/test_data.txt') as f:
    for line_index, line in enumerate(f):
        line = line.split(',',1)[1]
        words = line.split()
        index = [vocab[word] for word in words if word in vocab.keys()]
        line_fet = np.mean(np.array([emb[i] for i in index]),axis = 0)
        test[line_index] = line_fet

In [None]:
index_to_remove_test = np.unique([x for x,y in np.argwhere(np.isnan(test))])

In [None]:
test_2 = np.delete(test,index_to_remove_test,axis = 0)

In [None]:
test_2 = std.fit_transform(test_2)

In [None]:
test_2 = build_poly(test_2,2)

In [None]:
prediction = clf.predict(test_2)
prediction_2 = np.insert(prediction, index_to_remove_test -1,-1)

# Word2Vec
## Vocabulary vectorizing
Read words in positive and neg tweets 

In [132]:
from gensim.models import word2vec
import gensim
import logging
import tempfile
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [10]:
f = open("Datasets/twitter-datasets/train_pos_cleaned.txt")
tweets_pos = [line.split() for line in f.readlines()]
f.close()

In [11]:
f = open("Datasets/twitter-datasets/train_neg_cleaned.txt")
tweets_neg = [line.split() for line in f.readlines()]
f.close()

Vectorize the words

In [12]:
# Parameters for Word2vec
size = 300
min_count = 5
epoch = 10

In [13]:
model = word2vec.Word2Vec(sentences=tweets_pos + tweets_neg, corpus_file=None, size=size, alpha=0.025, window=5,
                          min_count=min_count, max_vocab_size=None, sample=0.001, seed=1, workers=1, min_alpha=0.0001, sg=0,
                          hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, iter=epoch, null_word=0, trim_rule=None,
                          sorted_vocab=1, batch_words=10000, compute_loss=False, callbacks=(), max_final_vocab=None)

2019-12-09 16:01:52,170 : INFO : collecting all words and their counts
2019-12-09 16:01:52,174 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-12-09 16:01:52,204 : INFO : PROGRESS: at sentence #10000, processed 135260 words, keeping 13571 word types
2019-12-09 16:01:52,234 : INFO : PROGRESS: at sentence #20000, processed 273051 words, keeping 21361 word types
2019-12-09 16:01:52,264 : INFO : PROGRESS: at sentence #30000, processed 409541 words, keeping 27684 word types
2019-12-09 16:01:52,293 : INFO : PROGRESS: at sentence #40000, processed 545956 words, keeping 33450 word types
2019-12-09 16:01:52,324 : INFO : PROGRESS: at sentence #50000, processed 682534 words, keeping 38692 word types
2019-12-09 16:01:52,353 : INFO : PROGRESS: at sentence #60000, processed 819941 words, keeping 43497 word types
2019-12-09 16:01:52,381 : INFO : PROGRESS: at sentence #70000, processed 956698 words, keeping 48110 word types
2019-12-09 16:01:52,409 : INFO : PROGRESS: at 

2019-12-09 16:02:29,426 : INFO : EPOCH 10 - PROGRESS: at 32.67% examples, 583116 words/s, in_qsize 1, out_qsize 0
2019-12-09 16:02:30,432 : INFO : EPOCH 10 - PROGRESS: at 62.29% examples, 578838 words/s, in_qsize 2, out_qsize 0
2019-12-09 16:02:31,443 : INFO : EPOCH 10 - PROGRESS: at 89.36% examples, 577511 words/s, in_qsize 1, out_qsize 0
2019-12-09 16:02:31,854 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-12-09 16:02:31,855 : INFO : EPOCH - 10 : training on 2736336 raw words (1977467 effective words) took 3.4s, 575140 effective words/s
2019-12-09 16:02:32,868 : INFO : EPOCH 11 - PROGRESS: at 32.67% examples, 582330 words/s, in_qsize 1, out_qsize 0
2019-12-09 16:02:33,872 : INFO : EPOCH 11 - PROGRESS: at 62.96% examples, 586041 words/s, in_qsize 1, out_qsize 0
2019-12-09 16:02:34,878 : INFO : EPOCH 11 - PROGRESS: at 88.68% examples, 573903 words/s, in_qsize 1, out_qsize 0
2019-12-09 16:02:35,315 : INFO : worker thread finished; awaiting finish of 0 more thre

## Embeeding
### Positive

In [14]:
train_pos = np.zeros((len(tweets_pos),size))
for index, tokens in enumerate(tweets_pos):
    vect = [model.wv[token] for token in tokens if token in model.wv]
    train_pos[index] = np.mean(vect, axis = 0)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [15]:
index_to_remove_pos = np.unique([x for x,y in np.argwhere(np.isnan(train_pos))])

In [16]:
train_pos_2 = np.delete(train_pos,index_to_remove_pos,axis = 0)

### Negative

In [17]:
train_neg = np.zeros((len(tweets_neg),size))
for index, tokens in enumerate(tweets_neg):
    vect = [model.wv[token] for token in tokens if token in model.wv]
    train_neg[index] = np.mean(vect, axis = 0)

In [18]:
index_to_remove_neg = np.unique([x for x,y in np.argwhere(np.isnan(train_neg))])

In [19]:
train_neg_2 = np.delete(train_neg,index_to_remove_neg,axis = 0)

### Test

In [None]:
f = open("Datasets/twitter-datasets/test_data.txt")
tweets_test = [line.split() for line in f.readlines()]
f.close()

In [None]:
test = np.zeros((len(tweets_test),size))
for index, tokens in enumerate(tweets_test):
    vect = [model.wv[token] for token in tokens if token in model.wv]
    test[index] = np.mean(vect, axis = 0)

In [None]:
index_to_remove_test = np.unique([x for x,y in np.argwhere(np.isnan(test))])

In [None]:
test_2 = np.delete(test,index_to_remove_test,axis = 0)

## Combine
Combine pos and neg to have full training 

In [20]:
X = np.vstack((train_pos_2,train_neg_2))
y_pos = np.ones(train_pos_2.shape[0])
y_neg = np.repeat(-1,train_neg_2.shape[0])
Y = np.hstack((y_pos,y_neg))

In [None]:
np.save('Word2vec_X',X)
np.save('Word2vec_Y',Y)
np.save('Word2vec_test',test_2)

In [None]:
X = np.load('Word2vec_X.npy')
Y = np.load('Word2vec_Y.npy')
test_2 = np.load('Word2vec_test.npy')

In [21]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

## Train
Logistic Regression with Cross-validation so don't need to split 

In [None]:
log = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=10e2, fit_intercept=True, intercept_scaling=1,
                         class_weight=None, random_state=None, solver='sag', max_iter=100000, multi_class='ovr',
                         verbose=0, warm_start=False, n_jobs=-1, l1_ratio=None)

In [None]:
log.fit(X_train,Y_train)

In [None]:
log.score(X_test,Y_test)

In [None]:
logiCV = LogisticRegressionCV(Cs=5, fit_intercept=True, cv=4, dual=False, penalty='l2', scoring=None,
                     solver='sag', tol=0.0001, max_iter=10000, class_weight=None, n_jobs=-1, verbose=0,
                     refit=True, intercept_scaling=1.0, multi_class='ovr', random_state=None, l1_ratios=None)

In [None]:
logiCV.fit(X,Y)

# Vader

In [7]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [37]:
f = open("Datasets/twitter-datasets/test_data.txt")
tweets = [line for line in f.readlines()]
f.close()

In [107]:
tweets_pos

['i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15\n',
 "because your logic is so dumb , i won't even crop out your name or your photo . tsk . <url>\n",
 '" just put casper in a box ! " looved the battle ! #crakkbitch\n',
 "thanks sir > > don't trip lil mama ... just keep doin ya thang !\n",
 'visiting my brother tmr is the bestest birthday gift eveerrr ! ! !\n',
 'yay ! ! #lifecompleted . tweet / facebook me to let me know please\n',
 '#1dnextalbumtitle : feel for you / rollercoaster of life . song cocept : life , #yolo , becoming famous ? <3 14 #followmeplz ! <3 x15\n',
 "workin hard or hardly workin rt at hardee's with my future coworker <user>\n",
 "i saw . i'll be replying in a bit .\n",
 'this is were i belong\n',
 'anddd to cheer #nationals2013 ?\n',
 'we send an invitation to shop on-line ! here you will find everything you need - without leaving home ... <url>\n',
 'just woke up , finna go to church\n',
 '

In [74]:
print(tt)
stop_words = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize
tokens = word_tokenize(tt)
result = [i for i in tokens if not i in stop_words]
result = ' '.join(result)
print (result)

i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15

dunno justin read mention . justin god knows , hope follow # believe 15


In [77]:
sid.polarity_scores('just woke up , finna go to church ')

{'neg': 0.0, 'neu': 0.686, 'pos': 0.314, 'compound': 0.4939}

In [82]:
from tqdm.autonotebook import tqdm

  """Entry point for launching an IPython kernel.


In [102]:
prediction_2 = []
stop_words = set(stopwords.words('english'))
for tweet in tqdm(tweets):
    tokens = word_tokenize(tweet)
    result = [i for i in tokens if not i in stop_words]
    result = ' '.join(result)

    ss = sid.polarity_scores(tweet)
    if ss['neu'] == 1:
        prediction_2.append(-1)
    elif ss['neg'] > ss['pos']:
        prediction_2.append(-1)
    else:
        prediction_2.append(1)

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




# RNN

In [1]:
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.layers import SpatialDropout1D

from keras_preprocessing import text


from sklearn.model_selection import train_test_split


from gensim.models import word2vec
import gensim
import logging
import tempfile
import pandas as pd
import numpy as np

pd.set_option('display.max_colwidth', -1)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Using TensorFlow backend.


In [2]:
pos_df = pd.read_csv("Datasets/twitter-datasets/train_pos_cleaned.csv", index_col=0)
neg_df = pd.read_csv("Datasets/twitter-datasets/train_neg_cleaned.csv", index_col=0)

In [3]:
train = pd.concat([pos_df,neg_df])

In [4]:
train = train.sample(frac=1, random_state = 1)

Unnamed: 0,tweets,label
49673,k fine lah if liddat i also sleep loh haiz nights twitter,1
71551,going to be told im blind tomorrow ok slight exaggeration but yeah having my eyes tested then a day of uni work just need it done,0
5506,sometimes its nice to just buy a little bit of jewelry #liasophia,1
38370,this is my tweet,1
36930,im not always nice but i dont have a reason not to be,1


In [6]:
train = train.dropna()

In [11]:
test = pd.read_csv("Datasets/twitter-datasets/test_data_cleaned.csv", index_col=0)

In [20]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 100000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 50
# This is fixed.
EMBEDDING_DIM = 300
tokenizer = text.Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(np.hstack((train.tweets.values,test.tweets.values)))
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 100260 unique tokens.


In [21]:
X = tokenizer.texts_to_sequences(train.tweets.values)
X = sequence.pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (199976, 50)


In [22]:
Y = train.label

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 1)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(159980, 50) (159980,)
(39996, 50) (39996,)


In [26]:
batch_size = 1024

print('Build model...')
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
#model.add(SpatialDropout1D(0.4))
model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.2))
#model.add(Dense(64))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model.fit(X_train, Y_train,
          batch_size=batch_size,
          epochs=3,
          validation_data=(X_test, Y_test))
score, acc = model.evaluate(X_test, Y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Build model...
Train...


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 159980 samples, validate on 39996 samples
Epoch 1/3
Epoch 2/3

KeyboardInterrupt: 

In [127]:
X_test = tokenizer.texts_to_sequences(test.tweets.values)
X_test = sequence.pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)

In [128]:
y_pred_prob = model.predict(X_test)

In [129]:
y_pred = np.ones_like(y_pred_prob)
y_pred[y_pred_prob<0.5] = -1

In [130]:
y_pred = y_pred.flatten()

## Predict

In [None]:
prediction = log.predict(test_2)
y_pred = np.insert(prediction, index_to_remove_test -1,-1)

### Submission

In [109]:
def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle
    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)
    """
    with open(name, 'w') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            writer.writerow({'Id':int(r1),'Prediction':int(r2)})

In [110]:
create_csv_submission(range(1,10001), y_pred, 'submission.csv')

### Accuracy

In [131]:
solution = pd.read_csv('derived_solution.csv').Prediction
print("Accuracy : {:.02f}%".format(100*np.mean(solution == y_pred)))

Accuracy : 75.95%


Cross validation with solver :
- lbfgs : 75.66
- newton-cg : 75.60%
- sag : 75.69%

Best on Aicrowd: (76.90%)
- not full tweets
- sag with C = 1, tol = 0.0001