In [1]:
%matplotlib inline
import sklearn
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import math
import nltk

from nltk.corpus import stopwords
from tokenizer import tokenizer

from sklearn.linear_model import *
from sklearn.neural_network import MLPClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR, LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import *
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score

from functions import *
%load_ext autoreload
%autoreload 2

# Preprocess the data

In [2]:
# Import the data (full = False: the reduced dataset, see the later section headings)
tweet_pos, tweet_neg, tweet_test = import_data(full = False)

# Construct train set: positive tweets are labelled 1, negative tweets 0
tweet_pos['pred'] = 1
tweet_neg['pred'] = 0
tweet_pos.columns = ['tweet', 'pred']
tweet_neg.columns = ['tweet', 'pred']
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same way
all_tweets = pd.concat([tweet_neg, tweet_pos])
# drop=True discards the old (duplicated) index instead of adding it as a column
tweet_TR = all_tweets.reset_index(drop=True)

# Construct test set
tweet_clean = clean_data(tweet_test.values)
tweet_TE = tweet_clean.flatten()
# The former np.reshape(tweet_clean, (10000,)) call threw its result away and
# only served as an implicit size check; keep that check, but explicitly.
assert tweet_TE.shape == (10000,), "expected exactly 10000 test tweets"

In [3]:
# Load the Twitter stop-word list into a flat Python list
# NOTE(review): read_csv treats the first line of the file as a header by
# default, so that line does not end up in `stopw` - confirm the file really
# starts with a header row.
stopw = (
    pd.read_csv('data/stopwords/twitter-stopwords-final.txt')
    .values
    .ravel()
    .tolist()
)

In [4]:
# TF-IDF features over word n-grams of length 1 to 4
# NOTE(review): `tokenize` is not imported by name in this notebook - presumably
# it comes from `from functions import *`; confirm.
vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    stop_words=stopw,
    ngram_range=(1, 4),
)

# Column 0 holds the tweet text, column 1 the 0/1 label
X = vectorizer.fit_transform(tweet_TR.values[:, 0])
Y = tweet_TR.values[:, 1].astype(int)

In [5]:
# Hold out 20% of the samples for evaluation. The seed is fixed (42, the same
# value used for the classifiers below) so the split - and therefore every
# accuracy reported further down - is reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

# Define classifiers

In [26]:
# Baseline ("standard") classifier: an SVC with a linear kernel
clf = SVC(
    kernel='linear',
    gamma='auto',
    cache_size=2000,
)

# Train on the first 80'000 rows of the training split only
clf.fit(x_train[:80000, :], y_train[:80000])

# Compute the predictions of x_test
y_pred = clf.predict(x_test)

In [28]:
# Accuracy of the standard model (SVC) trained on 80'000 samples - half of the small dataset
# (arguments given in scikit-learn's (y_true, y_pred) order; accuracy is symmetric)
accuracy_score(y_test, y_pred)

0.841675

In [25]:
# Compute the standard model's accuracy (SVC) using 50'000 samples
# The cell used to re-score the existing `y_pred` (from the 80'000-sample fit),
# so on a fresh run it could not produce the 50'000-sample figure. Fit a
# separate baseline on the first 50'000 training rows; distinct names keep
# `clf` and `y_pred` untouched for the cells that follow.
clf_50k = SVC(kernel='linear', cache_size=2000, gamma='auto')
clf_50k.fit(x_train[:50000, :], y_train[:50000])
accuracy_score(y_test, clf_50k.predict(x_test))

0.8351

In [23]:
# Compute the standard model's accuracy (SVC) using 20'000 samples.
# The cell used to re-score the existing `y_pred` (from the 80'000-sample fit),
# so on a fresh run it could not produce the 20'000-sample figure. Fit a
# separate baseline on the first 20'000 training rows; distinct names keep
# `clf` and `y_pred` untouched for the cells that follow.
clf_20k = SVC(kernel='linear', cache_size=2000, gamma='auto')
clf_20k.fit(x_train[:20000, :], y_train[:20000])
accuracy_score(y_test, clf_20k.predict(x_test))

0.815075

# Find the best parameters using the small dataset

In [15]:
def param_selection(X, Y, nfolds, param_grid, classifier, scoring=None, n_jobs=None):
    """Grid-search `classifier` over `param_grid` and return the best parameters.

    Parameters
    ----------
    X : features handed to the search's ``fit``.
    Y : targets (predictions to learn) handed to the search's ``fit``.
    nfolds : cross-validation setting, forwarded as ``cv``.
    param_grid : dict mapping parameter names to the lists of values to try.
    classifier : the estimator to tune.
    scoring : optional scoring strategy forwarded to ``GridSearchCV``;
        ``None`` (default) keeps GridSearchCV's default behaviour.
    n_jobs : optional number of parallel jobs forwarded to ``GridSearchCV``;
        ``None`` (default) keeps GridSearchCV's default behaviour.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    grid_search = GridSearchCV(
        classifier,
        param_grid,
        cv=nfolds,
        scoring=scoring,
        n_jobs=n_jobs,
    )
    grid_search.fit(X, Y)
    return grid_search.best_params_

In [16]:
# Define the list of parameters to test
losses = ['hinge', 'squared_hinge']
tols = [1e-5, 1e-4, 1e-3]
Cs = [0.1, 1, 10]
random_state = [42]

# Create the parameter grid (these are LinearSVC hyper-parameters)
param_grid = {'loss': losses, 'tol': tols, 'C': Cs, 'random_state': random_state}

# Find the best parameters with 5-fold cross-validation.
# Tune a LinearSVC rather than `clf`: `clf` is the baseline SVC, which has no
# `loss` parameter, so a search over this grid with it is rejected on a fresh
# top-to-bottom run. The tuned values are fed into a LinearSVC afterwards.
best_parameters = param_selection(x_train, y_train, 5, param_grid, LinearSVC())

In [17]:
# Display the best parameter combination returned by param_selection
best_parameters

{'C': 1, 'loss': 'hinge', 'random_state': 42, 'tol': 1e-05}

In [18]:
# Pull the tuned hyper-parameters out of the grid-search result
C_opt, tol_opt, loss_opt = (best_parameters[key] for key in ('C', 'tol', 'loss'))

# Build a LinearSVC configured with those values
clf_optimal = LinearSVC(
    loss=loss_opt,
    tol=tol_opt,
    C=C_opt,
    random_state=42,
)

# Train it on the training split
model = clf_optimal.fit(x_train, y_train)

# Compute the predictions of x_test
y_pred = model.predict(x_test)

In [19]:
# Accuracy of the tuned model on the held-out split
# (arguments given in scikit-learn's (y_true, y_pred) order; accuracy is symmetric)
accuracy_score(y_test, y_pred)

0.852025

# Get the predictions using the optimal parameters on the full dataset

In [20]:
# Import the data (full = True: the complete dataset)
# Note: this rebinds tweet_pos / tweet_neg / tweet_TR / tweet_TE, replacing the
# small-dataset versions loaded earlier in the notebook.
tweet_pos, tweet_neg, tweet_test = import_data(full = True)

# Construct train set: positive tweets are labelled 1, negative tweets 0
tweet_pos['pred'] = 1
tweet_neg['pred'] = 0
tweet_pos.columns = ['tweet', 'pred']
tweet_neg.columns = ['tweet', 'pred']
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same way
all_tweets = pd.concat([tweet_neg, tweet_pos])
# drop=True discards the old (duplicated) index instead of adding it as a column
tweet_TR = all_tweets.reset_index(drop=True)

# Construct test set
tweet_clean = clean_data(tweet_test.values)
tweet_TE = tweet_clean.flatten()
# The former np.reshape(tweet_clean, (10000,)) call threw its result away and
# only served as an implicit size check; keep that check, but explicitly.
assert tweet_TE.shape == (10000,), "expected exactly 10000 test tweets"

In [21]:
# Re-create the TF-IDF vectorizer (word n-grams of length 1 to 4) and fit it on
# the training tweets currently held in tweet_TR
# NOTE(review): `tokenize` is not imported by name in this notebook - presumably
# it comes from `from functions import *`; confirm.
vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    stop_words=stopw,
    ngram_range=(1, 4),
)

# Column 0 holds the tweet text, column 1 the 0/1 label
X = vectorizer.fit_transform(tweet_TR.values[:, 0])
Y = tweet_TR.values[:, 1].astype(int)

In [22]:
# Hold out 20% of the samples for evaluation. The seed is fixed (42, the same
# value used for the classifier) so the split and the accuracy reported below
# are reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

In [23]:
# Fit the tuned LinearSVC on the current training split.
# NOTE(review): this refits the same `clf_optimal` object that was fitted
# earlier and bound to `model`; assuming fit() returns the estimator itself
# (scikit-learn convention), `model` and `model_optimal` are the same object and
# the earlier fit is overwritten - confirm that is intended.
model_optimal = clf_optimal.fit(x_train, y_train)



In [24]:
# Predict the held-out split and report the accuracy
# (arguments given in scikit-learn's (y_true, y_pred) order; accuracy is symmetric)
y_pred = model_optimal.predict(x_test)
accuracy_score(y_test, y_pred)

0.876936

# Output the real predictions

In [25]:
# Vectorize the cleaned test tweets with the vectorizer that was fitted on the
# training tweets (transform only - the vectorizer is not refitted here)
X_TE = vectorizer.transform(tweet_TE)

In [26]:
# Predict a label for every test tweet with the fitted model
y_pred_TE = model_optimal.predict(X_TE)

In [27]:
# Hand the test predictions and a submission name to build_submission.
# NOTE(review): the training labels are 0/1, so y_pred_TE contains 0/1 -
# confirm build_submission converts them to whatever the submission format expects.
build_submission(y_pred_TE, '14_linearSVC_optimal_3_updated_tokenizer')