In [34]:
%matplotlib inline
import sklearn
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.preprocessing import PolynomialFeatures

from functions import *
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Import and clean the data

In [26]:
# Load the word vectors stored as a NumPy array
all_vects = np.load('voc/embeddings.npy')

# Load the reduced ("cut") vocabulary
# NOTE(review): read_fwf is a fixed-width parser; confirm that `sep` has any
# effect here and that the words are parsed as intended.
vocab_cut = pd.read_fwf('voc/vocab_cut.txt', sep = "\t", header = None)
voc_cut = vocab_cut.values

# Load the vocabulary with counts, splitting columns on runs of whitespace.
# sep = r"\s+" is the documented replacement for delim_whitespace = True,
# which is deprecated since pandas 2.2.
vocab = pd.read_csv('voc/vocab.txt', header = None, sep = r"\s+")
voc = vocab.values

In [52]:
# Load the positive and negative training tweets
# (sep = "\r\n" presumably keeps every line in a single column — confirm)
tweet_pos = pd.read_csv('data/train_pos.txt', header = None, sep = "\r\n", engine = 'python')
tweet_neg = pd.read_csv('data/train_neg.txt', header = None, sep = "\r\n", engine = 'python')

# Label the two classes: 1 for the positive file, 0 for the negative file
tweet_pos['pred'] = 1
tweet_neg['pred'] = 0

tweet_pos.columns = ['tweet', 'pred']
tweet_neg.columns = ['tweet', 'pred']

# Stack negatives first, then positives, with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index
# replaces both the append and the reset_index/drop('index') steps.
all_tweets = pd.concat([tweet_neg, tweet_pos], ignore_index = True)

# Build the feature vectors and the integer labels
x_train_tweet = np.vstack(all_tweets.values[:, 0])
# NOTE(review): np.vstack yields an (n, 1) column, so each x is a 1-element
# array and str(x) is the array's printed form, not the bare tweet text —
# confirm that tweet_to_vect is meant to receive that.
x_train = [tweet_to_vect(str(x), voc_cut, all_vects) for x in x_train_tweet]
y_train = all_tweets.values[:, 1].astype(int)

In [53]:
# Load and construct test set
# Read the raw test file (sep = "\r\n" presumably keeps each line in a single
# column — confirm), pass the raw values through the project's clean_data
# helper, then convert every cleaned entry to a feature vector with the same
# vocabulary and embeddings used for the training set.
tweet_test = pd.read_csv('data/test_data.txt', header = None, sep = "\r\n", engine = 'python')
tweet_clean = clean_data(tweet_test.values)
x_test = [tweet_to_vect(str(x), voc_cut, all_vects) for x in tweet_clean]

In [54]:
# Compute the standardization statistics from the training vectors.
# NOTE(review): no axis is given, so each statistic is one scalar taken over
# every entry of x_train rather than one value per feature column — confirm
# that this global scaling is intended.
mean, std = np.mean(x_train), np.std(x_train)

In [55]:
# Standardize train and test set
# Both sets are scaled with the statistics computed from the training set.
# NOTE(review): x_train and x_test are rebound to their scaled versions, so
# running this cell a second time without recomputing them scales the data
# twice — re-run the loading cells first.
x_train = (x_train - mean) / std
x_test = (x_test - mean) / std

In [78]:
# Recursive feature elimination around a linear-kernel SVM: one feature is
# removed per round (step = 1) and the feature subset is selected with
# 5-fold cross validation.
# NOTE(review): kernel SVC fit time grows quickly with the number of samples;
# confirm this is feasible on the full training set.
clf = SVC(kernel = 'linear')
model = RFECV(estimator = clf, step = 1, cv = 5)

In [None]:
# Fit the selector in place (fit returns the estimator itself), then display
# the boolean mask of the features it kept.
model.fit(x_train, y_train)
model.support_

In [59]:
# Show the indices of the features the selector rejected
# (positions where the support mask is False)
np.argwhere(~model.support_)

array([[ 2],
       [10],
       [12],
       [17]])

In [71]:
# Keep only the features selected by the fitted RFECV model, in both sets.
# The mask is read from the model instead of hand-copying the rejected indices
# from a printed output, so it cannot go stale when the selector is re-fit,
# and train and test are guaranteed to drop the same columns.
selected = model.support_
x_tr = np.asarray(x_train)[:, selected]
x_te = np.asarray(x_test)[:, selected]
x_tr.shape, x_te.shape

((200000, 16), (10000, 16))

In [72]:
# Polynomial expansion of degree 2 (bias, linear, squared and pairwise
# interaction terms)
poly = PolynomialFeatures(2)
x_tr_poly = poly.fit_transform(x_tr)
# Reuse the transformer fitted on the training set instead of re-fitting it
# on the test set
x_te_poly = poly.transform(x_te)
x_tr_poly.shape, x_te_poly.shape

((200000, 153), (10000, 153))

In [73]:
# Train the final classifier: a logistic regression (lbfgs solver, at most
# 500 iterations) fitted on the polynomial expansion of the selected features.
# This is a different estimator from the SVM used for feature selection.
log = LogisticRegression(solver = 'lbfgs', max_iter = 500)
mod = log.fit(x_tr_poly, y_train)

In [74]:
# Predict the values
# Note: despite its name, y_test holds the model's predictions for the test
# set, not ground-truth labels.
y_test = mod.predict(x_te_poly)
# zero_to_neg is a project helper; presumably it maps the 0 labels to -1 for
# the submission format — verify in functions.py
y_pred = zero_to_neg(y_test)

In [76]:
# Hand the predictions to the project's submission helper; '7' is presumably
# the identifier/suffix of the output file — verify in functions.py
build_submission(y_pred, '7')