In [5]:
%matplotlib inline
import sklearn
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import math
import nltk
nltk.download('punkt')

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_extraction.text import TfidfVectorizer

from functions import *
%load_ext autoreload
%autoreload 2

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/franckdessimoz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Import and clean the data

In [6]:
# Import all vector words
# NOTE(review): presumably one embedding row per entry of the cut
# vocabulary loaded below -- confirm against how the file was produced.
all_vects = np.load('voc/embeddings.npy')

# Import vocab cut
# NOTE(review): read_fwf parses fixed-width columns; confirm that the `sep`
# argument has any effect here before relying on it.
vocab_cut = pd.read_fwf('voc/vocab_cut.txt', sep = "\t", header = None)
voc_cut = vocab_cut.values

# Import vocab with count
# `sep = r"\s+"` splits on runs of whitespace. It replaces the deprecated
# `delim_whitespace = True`, which pandas documents as equivalent.
vocab = pd.read_csv('voc/vocab.txt', header = None, sep = r"\s+")
voc = vocab.values

In [7]:
# Load and construct train sets
# NOTE(review): the "\r\n" separator with the python engine is presumably
# there so that every line lands in a single column -- confirm on the data.
tweet_pos = pd.read_csv('data/train_pos.txt', header = None, sep = "\r\n", engine = 'python')
tweet_neg = pd.read_csv('data/train_neg.txt', header = None, sep = "\r\n", engine = 'python')

# Label column: 1 for the positive file, 0 for the negative file.
tweet_pos['pred'] = 1
tweet_neg['pred'] = 0

tweet_pos.columns = ['tweet', 'pred']
tweet_neg.columns = ['tweet', 'pred']

# Stack negatives first, then positives, with a fresh 0..N-1 index.
# pd.concat replaces DataFrame.append (removed in pandas 2.0), and
# ignore_index = True replaces the reset_index().drop('index') step.
all_tweets = pd.concat([tweet_neg, tweet_pos], ignore_index = True)

In [8]:
# Build one large "document" per class by joining the first column
# (the tweet text) of each frame with single spaces.
doc_pos = ' '.join(tweet_pos.iloc[:, 0])
doc_neg = ' '.join(tweet_neg.iloc[:, 0])

# Fit a word-level TF-IDF on the two class documents. Row 0 of the result
# corresponds to doc_pos and row 1 to doc_neg (the order passed below).
vectorizer = TfidfVectorizer(analyzer = 'word')
vectorized_train = vectorizer.fit_transform([doc_pos, doc_neg])

# Term -> column index mapping learned by the vectorizer.
dict_ = vectorizer.vocabulary_

# Reshape each class row to a flat (n_terms,) shape.
# NOTE(review): vectorized_train is presumably a SciPy sparse matrix;
# confirm that np.reshape to 1-D yields what weighted_tweet_to_vect expects.
n_terms = vectorized_train.shape[1]
score_pos = np.reshape(vectorized_train[0, :], (n_terms,))
score_neg = np.reshape(vectorized_train[1, :], (n_terms,))

In [9]:
# Column 1 of all_tweets is the label, cast to int.
y_train = all_tweets.values[:, 1].astype(int)

# Column 0 of all_tweets is the tweet text, stacked into one row per tweet.
# NOTE(review): each row produced by np.vstack looks like a length-1 array,
# so str(row) would be the array's string form rather than the bare tweet --
# confirm that weighted_tweet_to_vect expects this.
x_train_tweet = np.vstack(all_tweets.values[:, 0])
x_train = [
    weighted_tweet_to_vect(str(row), voc_cut, all_vects, score_pos, score_neg, dict_)
    for row in x_train_tweet
]

In [10]:
# Load the test tweets, clean them with the project helper, then embed each
# one with the same helper and the same arguments as the training tweets.
tweet_test = pd.read_csv(
    'data/test_data.txt',
    header = None,
    sep = "\r\n",
    engine = 'python',
)
tweet_clean = clean_data(tweet_test.values)
x_test = [
    weighted_tweet_to_vect(str(row), voc_cut, all_vects, score_pos, score_neg, dict_)
    for row in tweet_clean
]

In [11]:
# Mean and standard deviation of x_train, used to standardize both sets.
# No axis is given, so each statistic is taken over all entries of x_train.
# NOTE(review): per-feature statistics (axis = 0) may have been intended --
# confirm before changing, since it would alter the trained model.
mean, std = np.mean(x_train), np.std(x_train)

In [12]:
# Standardize both sets with the statistics computed on the training set.
# This rebinds x_train and x_test, so running the cell a second time
# standardizes the already-standardized values again.
x_train, x_test = (x_train - mean) / std, (x_test - mean) / std

In [15]:
# Polynomial expansion of degree 2
poly = PolynomialFeatures(2)

# Fit the expansion on the training set only, then apply the already-fitted
# transformer to the test set instead of re-fitting it on test data.
x_tr_poly = poly.fit_transform(x_train)
x_te_poly = poly.transform(x_test)

# Display both shapes; the column counts must match.
x_tr_poly.shape, x_te_poly.shape

((200000, 231), (10000, 231))

In [None]:
# Create the model: logistic regression wrapped in recursive feature
# elimination (one feature removed per step) with 5-fold cross validation.
# max_iter must be a number; the string '200' is rejected when fit() runs.
clf = LogisticRegression(solver = 'liblinear', max_iter = 200)
model = RFECV(clf, step = 1, cv = 5)

In [None]:
# Run the cross-validated feature elimination on the expanded training
# features. fit() returns the estimator itself, so `model` stays bound to
# the same, now fitted, object.
model.fit(x_tr_poly, y_train)

# Display the boolean mask of the features that were kept.
model.support_



In [None]:
# Display the indices of the features the selector did not keep,
# i.e. the positions where the support mask is False.
np.argwhere(~model.support_)

In [None]:
# Column indices the selector rejected (support mask is False), computed
# once and reused for both sets.
dropped_cols = np.argwhere(~model.support_)[:, 0]

# Remove those columns from the expanded train and test matrices.
x_tr_poly_redcuced = np.delete(x_tr_poly, dropped_cols, axis = 1)
x_te_poly_redcuced = np.delete(x_te_poly, dropped_cols, axis = 1)

# Display both shapes; the column counts must match.
x_tr_poly_redcuced.shape, x_te_poly_redcuced.shape

In [None]:
# Fit a fresh logistic regression (lbfgs solver, up to 1000 iterations)
# on the training matrix with the non relevant features deleted.
# The matrix is bound to `x_tr_poly_redcuced` (spelling as defined in the
# feature-deletion cell); `x_tr_poly_reduced_` was never defined.
log = LogisticRegression(solver = 'lbfgs', max_iter = 1000)
mod = log.fit(x_tr_poly_redcuced, y_train)

In [None]:
# Predict the labels of the reduced test matrix.
# The matrix is bound to `x_te_poly_redcuced` (spelling as defined in the
# feature-deletion cell); `x_te_poly_reduced` was never defined.
y_test = mod.predict(x_te_poly_redcuced)

# NOTE(review): zero_to_neg presumably maps the 0 label to -1 for the
# submission format -- confirm in the functions module.
y_pred = zero_to_neg(y_test)

In [None]:
# Hand the predictions to the project helper together with the tag '9'.
# NOTE(review): the tag presumably identifies the submission file --
# confirm in the functions module.
submission_tag = '9'
build_submission(y_pred, submission_tag)