In [2]:
import sklearn
import re
import numpy as np
import scipy as sp
import pandas as pd
import glove_solution
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
from sklearn.preprocessing import PolynomialFeatures

In [3]:
# Import all vector words
all_words = np.load('embeddings.npy')

In [4]:
# Import vocab and transform it into an array
vocab_cut = pd.read_fwf('vocab_cut.txt', sep = "\t", header = None, error_bad_lines=False)
vocab_cut.columns = ['word']
voc = vocab_cut.values
voc

array([['<user>'],
       ['!'],
       ['i'],
       ...,
       ['#1dson'],
       ['#18'],
       ['#10fac']], dtype=object)

In [5]:
# Load train set positiv and train set negativ
tweet_pos = pd.read_csv('data/train_pos.txt', sep = '\t', header = None)
tweet_neg = pd.read_csv('data/train_neg.txt', sep = '\t', header = None)
tweet_test = pd.read_csv('data/test_data.txt', sep = '\t', header = None)

In [6]:
tweet_pos.head()

Unnamed: 0,0
0,<user> i dunno justin read my mention or not ....
1,"because your logic is so dumb , i won't even c..."
2,<user> just put casper in a box ! looved the...
3,<user> <user> thanks sir > > don't trip lil ma...
4,visiting my brother tmr is the bestest birthda...


In [7]:
tweet_neg.head()

Unnamed: 0,0
0,vinco tresorpack 6 ( difficulty 10 of 10 objec...
1,glad i dot have taks tomorrow ! ! #thankful #s...
2,1-3 vs celtics in the regular season = were fu...
3,<user> i could actually kill that girl i'm so ...
4,<user> <user> <user> i find that very hard to ...


In [8]:
tweet_test.head()

Unnamed: 0,0
0,"1,sea doo pro sea scooter ( sports with the po..."
1,"2,<user> shucks well i work all week so now i ..."
2,"3,i cant stay away from bug thats my baby"
3,"4,<user> no ma'am ! ! ! lol im perfectly fine ..."
4,"5,whenever i fall asleep watching the tv , i a..."


In [9]:
def tweet_to_vect(tweet):
    """ Map a tweet to a vector using the
        vector of each word that is present
        in the tweet"""
    word_array = re.findall(r'\w+', tweet)
    vect = np.zeros(20)
    for word in word_array:
        i = np.argwhere(voc == word)
        if(i.shape[0] != 0): 
            vect += all_words[i[0][0]]
    return vect/len(word_array)

In [10]:
# Test
tweet_to_vect('i love machine learning')

array([ 0.08483506,  0.18905336,  0.13301682,  0.28761705,  0.15026169,
        0.16017279, -0.12708856, -0.16646328, -0.28536286,  0.23276146,
       -0.03472253, -0.06886545, -0.33791882, -0.17811641, -0.39209849,
       -0.05214223,  0.27099058, -0.06285391,  0.33469796,  0.25988553])

In [11]:
# Add attributes to the dataframes
tweet_pos['vect'] = ""
tweet_pos['pred'] = 1
tweet_neg['vect'] = ""
tweet_neg['pred'] = -1

tweet_pos.columns = ['tweet', 'vect', 'pred']
tweet_neg.columns = ['tweet', 'vect', 'pred']

In [12]:
tweet_pos.vect = tweet_pos.tweet.apply(lambda x: tweet_to_vect(x))
tweet_neg.vect = tweet_neg.tweet.apply(lambda x: tweet_to_vect(x))

In [13]:
tweet_pos.head()

Unnamed: 0,tweet,vect,pred
0,<user> i dunno justin read my mention or not ....,"[0.12187350126213467, 0.2392204874081336, 0.17...",1
1,"because your logic is so dumb , i won't even c...","[0.029361259698287817, 0.2845059508864009, 0.1...",1
2,<user> just put casper in a box ! looved the...,"[0.03723710238158048, 0.010057370590752113, 0....",1
3,<user> <user> thanks sir > > don't trip lil ma...,"[0.01164429224360294, -0.1313781541898918, 0.1...",1
4,visiting my brother tmr is the bestest birthda...,"[0.12157701280012964, 0.07784914951046543, 0.0...",1


In [14]:
tweet_neg.head()

Unnamed: 0,tweet,vect,pred
0,vinco tresorpack 6 ( difficulty 10 of 10 objec...,"[0.01879816550451843, 0.003126974568930025, 0....",-1
1,glad i dot have taks tomorrow ! ! #thankful #s...,"[0.2500810171748917, 0.05640994032588838, 0.18...",-1
2,1-3 vs celtics in the regular season = were fu...,"[0.08948027579672979, 0.1839799238063547, 0.22...",-1
3,<user> i could actually kill that girl i'm so ...,"[0.11785501831281628, 0.19467555676440992, 0.3...",-1
4,<user> <user> <user> i find that very hard to ...,"[0.08321729153265517, -0.12035350613718467, 0....",-1


In [15]:
# Map all -1 to 0
def neg_to_zero(array):
    ret = np.zeros(len(array))
    for i in range(len(array)):
        if(array[i] == 1):
            ret[i] = 1
    return ret

In [16]:
# Map all 0 to -1
def zero_to_neg(array):
    ret = np.ones(len(array))
    for i in range(len(array)):
        if(array[i] == 0):
            ret[i] = -1
    return ret

In [17]:
# Create train features and train predictions
all_tweets = tweet_neg.append(tweet_pos)
all_tweets = all_tweets.reset_index().drop(['index', 'tweet'], axis = 1)

In [18]:
# Create train features and train predictions
x_train = np.vstack(all_tweets.values[:, 0])
y_train = neg_to_zero(all_tweets.values[:, 1])

In [19]:
# Store the mean and standard deviation of x_train
mean = np.mean(x_train)
std = np.std(x_train)
# Standardize x_train
x_train = (x_train - mean) / std

In [20]:
def clean_data(array):
    """ Clean the data by deleting the id
        placed in the front of the tweet."""
    ret = np.zeros(len(array))
    for i in range(len(array)):
        drop_id = len(str(i+1)) + 1
        array[i, 0] = array[i, 0][int(drop_id):]
    return array

In [44]:
# Load test set 
tweets_test = pd.read_csv('data/test_data.txt', sep = '\t', header = None)
tweets_test.head()

Unnamed: 0,0
0,"1,sea doo pro sea scooter ( sports with the po..."
1,"2,<user> shucks well i work all week so now i ..."
2,"3,i cant stay away from bug thats my baby"
3,"4,<user> no ma'am ! ! ! lol im perfectly fine ..."
4,"5,whenever i fall asleep watching the tv , i a..."


In [22]:
# Clean test set
tweets_tst = clean_data(tweets_test.values)

In [23]:
# Map all tweets to the corresponding vector
x_test = [tweet_to_vect(str(x)) for x in tweets_tst]

In [None]:
# Create the model using logistic regression and predict the values
# logistic = LogisticRegression(solver = 'lbfgs')
# model = logistic.fit(x_train, y_train)

In [24]:
# Create the model using logistic regression and
# feature reduction with 5-fold cross validation
model = RFECV(LogisticRegression(solver = 'lbfgs'), step = 1, cv = 5)
model = model.fit(x_train, y_train)
model.support_ 

array([ True,  True, False,  True,  True,  True,  True,  True,  True,
        True, False,  True, False,  True,  True,  True,  True, False,
       False,  True])

In [25]:
# Print indices of non relevant features
np.argwhere(model.support_ == False)

array([[ 2],
       [10],
       [12],
       [17],
       [18]])

In [26]:
# Delete non relevant features from train set
x_tr = np.delete(x_train, [2, 10, 12, 17, 18], axis = 1)
x_tr.shape

(196970, 15)

In [45]:
# Standardize test setp and delete non relevant features
x_te = np.vstack(x_test)
x_te = (x_te - mean) / std
x_te = np.delete(x_te, [2, 10, 12, 17, 18], axis = 1)
x_te.shape

(10000, 15)

In [46]:
# Polynomial expension of degree 4
poly = PolynomialFeatures(2)
x_tr_poly = poly.fit_transform(x_tr)
x_te_poly = poly.fit_transform(x_te)
x_tr_poly.shape, x_te_poly.shape

((196970, 136), (10000, 136))

In [47]:
# Create a new model usign the same as before but with non
# relevant features deleted
log = LogisticRegression(solver = 'lbfgs', max_iter = 500)
mod = log.fit(x_tr_poly, y_train)

In [48]:
# Predict the values
y_test = mod.predict(x_te_poly)
y_pred = zero_to_neg(y_test)

In [55]:
def build_submission(y_pred):
    """ Build submission and save it into the
        folder 'prediction'."""
    ret = np.ones((len(y_pred), 2))
    for i in range(len(y_pred)):
        ret[i] = np.array([i+1, y_pred[i]])
    ret = ret.astype(int)
    sub = pd.DataFrame(data = ret)
    sub.columns = ['Id', 'Prediction']
    sub.to_csv('predictions/pred6.csv', index=None)

In [56]:
build_submission(y_pred)