In [1]:
%matplotlib inline
import sklearn
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import math
import nltk
nltk.download('punkt')
from sklearn.model_selection import *
from sklearn.metrics import accuracy_score
import re
from sklearn.svm import SVR, LinearSVC, SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_extraction.text import TfidfVectorizer

from functions import *
%load_ext autoreload
%autoreload 2

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/franckdessimoz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Import and clean the data

In [2]:
# Import all vector words
all_vects = np.load('voc/embeddings.npy')

# Import vocab cut
vocab_cut = pd.read_fwf('voc/vocab_cut.txt', sep = "\t", header = None)
voc_cut = vocab_cut.values

# Import vocab with count
vocab = pd.read_csv('voc/vocab.txt', header = None, delim_whitespace = True)
voc = vocab.values

In [3]:
# Load and construct train sets
tweet_pos = pd.read_csv('data/train_pos.txt', header = None, sep = "\r\n", engine = 'python')
tweet_neg = pd.read_csv('data/train_neg.txt', header = None, sep = "\r\n", engine = 'python')
tweet_pos['pred'] = 1
tweet_neg['pred'] = 0

tweet_pos.columns = ['tweet', 'pred']
tweet_neg.columns = ['tweet', 'pred']

all_tweets = tweet_neg.append(tweet_pos)
all_tweets = all_tweets.reset_index().drop(['index'], axis = 1)

In [4]:
all_tweets

Unnamed: 0,tweet,pred
0,vinco tresorpack 6 ( difficulty 10 of 10 objec...,0
1,glad i dot have taks tomorrow ! ! #thankful #s...,0
2,1-3 vs celtics in the regular season = were fu...,0
3,<user> i could actually kill that girl i'm so ...,0
4,<user> <user> <user> i find that very hard to ...,0
5,wish i could be out all night tonight ! <user>,0
6,<user> i got kicked out the wgm,0
7,rt <user> <user> <user> yes she is ! u tell it...,0
8,why is she so perfect <url>,0
9,<user> hi harry ! did u havea good time in aus...,0


In [4]:
x_train_tweet = np.vstack(all_tweets.values[:, 0])
x_train = [tweet_to_vect(str(x), voc_cut, all_vects) for x in x_train_tweet]
y_train = all_tweets.values[:, 1].astype(int)

In [5]:
# Load and construct test set
tweet_test = pd.read_csv('data/test_data.txt', header = None, sep = "\r\n", engine = 'python')
tweet_clean = clean_data(tweet_test.values)
x_test = [tweet_to_vect(str(x), voc_cut, all_vects) for x in tweet_clean]

In [6]:
# Polynomial expension of degree 2
poly = PolynomialFeatures(3)
x_tr_poly = poly.fit_transform(x_train)
x_te_poly = poly.fit_transform(x_test)
x_tr_poly.shape, x_te_poly.shape

((200000, 1771), (10000, 1771))

In [7]:
mean = np.mean(x_tr_poly)
std = np.std(x_tr_poly)
x_tr_poly_std = (x_tr_poly - mean) / std

x_TR, x_TE, y_TR, y_TE = train_test_split(x_tr_poly_std, y_train, test_size = 0.2)

In [8]:
x_TR.shape, x_TE.shape, y_TR.shape, y_TE.shape

((160000, 1771), (40000, 1771), (160000,), (40000,))

In [9]:
# Create the model using svm and feature reduction with 5-fold cross validation
def classify(classifier, x_TR, y_TR, x_TE, y_TE):
    model = classifier.fit(x_TR, y_TR)  
    y_pred = model.predict(x_TE)
    return accuracy_score(y_pred, y_TE)

In [None]:
classify(LogisticRegression(solver='lbfgs', max_iter=500), x_TR, y_TR, x_TE, y_TE)

In [None]:
classify(SGDClassifier(tol=1e-3, max_iter=1000), x_TR, y_TR, x_TE, y_TE)

In [10]:
classify(LinearSVC(max_iter=500), x_TR, y_TR, x_TE, y_TE)



0.6453

In [None]:
# Print indices of non relevant features
np.argwhere(model.support_ == False)

In [None]:
# Delete non relevant features from train and test set
x_tr_poly_redcuced = np.delete(x_tr_poly, np.argwhere(model.support_ == False)[:, 0], axis = 1)
x_te_poly_redcuced = np.delete(x_te_poly, np.argwhere(model.support_ == False)[:, 0], axis = 1)
x_tr_poly_redcuced.shape, x_te_poly_redcuced.shape

In [None]:
# Create a new model usign the same as before but with non
# relevant features deleted
log = LogisticRegression(solver = 'lbfgs', max_iter = 1000)
mod = log.fit(x_tr_poly_reduced_, y_train)

In [None]:
# Predict the values
y_test = mod.predict(x_te_poly_reduced)
y_pred = zero_to_neg(y_test)

In [None]:
build_submission(y_pred, '9')