In [71]:
%matplotlib inline
import sklearn
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import math
import nltk

from nltk.corpus import stopwords
from tokenizer import tokenizer

from sklearn.kernel_approximation import Nystroem

from sklearn.linear_model import *
from sklearn.neural_network import MLPClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR, LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import *
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.kernel_approximation import RBFSampler
from sklearn.metrics import accuracy_score

from sklearn.model_selection import GridSearchCV

from functions import *
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Preprocess the data

In [72]:
# Import the data (full=False; the tuning section below refers to this as the small dataset)
tweet_pos, tweet_neg, tweet_test = import_data(full = False)

# Construct the labelled train set: tweets from tweet_pos get label 1, tweets from tweet_neg get label 0
tweet_pos['pred'] = 1
tweet_neg['pred'] = 0
tweet_pos.columns = ['tweet', 'pred']
tweet_neg.columns = ['tweet', 'pred']
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
all_tweets = pd.concat([tweet_neg, tweet_pos])
# Replace the duplicated per-frame indices with a fresh 0..n-1 index
tweet_TR = all_tweets.reset_index(drop=True)

# Construct the test set as a flat 1-D array of cleaned tweets.
# The former `np.reshape(tweet_clean, (10000,))` discarded its result, so it never
# changed the data and only hard-coded the test-set size; it has been removed.
tweet_clean = clean_data(tweet_test.values)
tweet_TE = tweet_clean.flatten()

In [73]:
# Load the Twitter stop-word file and flatten its contents into a plain Python list.
# NOTE(review): read_csv treats the first line of the file as a header row by default,
# so that line is not part of the resulting list — confirm this is intended.
stopw = (
    pd.read_csv('data/stopwords/twitter-stopwords-final.txt')
    .values
    .flatten()
    .tolist()
)

In [74]:
# TF-IDF features over word 1- to 4-grams; min_df=10 drops terms seen in fewer than 10 tweets
vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    stop_words=stopw,
    min_df=10,
    ngram_range=(1, 4),
)
# Alternative kept for reference (unigrams only, no min_df):
#vectorizer = TfidfVectorizer(analyzer='word', tokenizer=tokenize, stop_words=stopw)

# Column 0 holds the tweet text, column 1 the 0/1 label
X = vectorizer.fit_transform(tweet_TR.values[:, 0])
Y = tweet_TR.values[:, 1].astype(int)

In [75]:
# Hold out 20% of the samples for evaluation. The fixed seed makes the split
# (and therefore the accuracies computed from it) reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
x_train.shape

(160000, 73967)

# Define classifiers

# Fit a classifier on the training split and score it on the held-out split
def classify(classifier, x_TR, y_TR, x_TE, y_TE):
    """Fit `classifier` on the training data and return its accuracy on the test data.

    Args:
        classifier: estimator exposing `fit(X, y)` (returning the fitted model)
            and `predict(X)`.
        x_TR: training features.
        y_TR: training labels.
        x_TE: test features.
        y_TE: true test labels.

    Returns:
        The value of `accuracy_score` for the predictions made on `x_TE`.
    """
    model = classifier.fit(x_TR, y_TR)
    y_pred = model.predict(x_TE)
    # accuracy_score's signature is (y_true, y_pred)
    return accuracy_score(y_TE, y_pred)

# Accuracy of logistic regression on the held-out split
classify(
    LogisticRegression(solver='lbfgs', max_iter=500, random_state=42),
    x_train, y_train, x_test, y_test,
)

# Accuracy of a linear model trained with stochastic gradient descent
classify(
    SGDClassifier(tol=1e-3, max_iter=1000, random_state=42),
    x_train, y_train, x_test, y_test,
)

# Accuracy of a linear support vector classifier
classify(
    LinearSVC(random_state=42),
    x_train, y_train, x_test, y_test,
)

In [76]:
# Baseline classifier: LinearSVC with default hyper-parameters.
# `fit` returns the estimator itself, so `clf` is the fitted model.
clf = LinearSVC(random_state=42).fit(x_train, y_train)
# Compute the predictions for x_test
y_pred = clf.predict(x_test)

In [77]:
# Compute the standard model's accuracy (LinearSVC); accuracy_score expects (y_true, y_pred)
accuracy_score(y_test, y_pred)

0.84185

# Find out the best parameters using the small dataset

In [78]:
def param_selection(X, Y, nfolds, param_grid, classifier):
    """Run an exhaustive grid search and return the best parameter combination.

    Args:
        X: features passed to `GridSearchCV.fit`.
        Y: target labels passed to `GridSearchCV.fit`.
        nfolds: forwarded to `GridSearchCV` as `cv` (number of cross-validation folds).
        param_grid: dict mapping parameter names to the list of values to try.
        classifier: estimator whose parameters are tuned.

    Returns:
        The `best_params_` attribute of the fitted search.
    """
    grid_search = GridSearchCV(classifier, param_grid, cv = nfolds)
    grid_search.fit(X, Y)
    return grid_search.best_params_

In [79]:
# Candidate hyper-parameter values to try for the classifier
losses = ['hinge', 'squared_hinge']
tols = [1e-5, 1e-4, 1e-3]
Cs = [0.1, 1, 10]
random_state = [42]

# Assemble the grid: parameter name -> list of candidate values
param_grid = dict(loss=losses, tol=tols, C=Cs, random_state=random_state)

# Search the grid with 5-fold cross-validation on the training split
best_parameters = param_selection(
    X=x_train,
    Y=y_train,
    nfolds=5,
    param_grid=param_grid,
    classifier=clf,
)



In [80]:
# Display the best parameter combination returned by param_selection
best_parameters

{'C': 1, 'loss': 'hinge', 'random_state': 42, 'tol': 1e-05}

In [81]:
# Pull the tuned values out of the grid-search result
C_opt, loss_opt, tol_opt = (best_parameters[key] for key in ('C', 'loss', 'tol'))

# Build a new LinearSVC configured with the tuned values and fit it;
# `fit` returns the estimator, so `model` and `clf_optimal` are the same object
clf_optimal = LinearSVC(C=C_opt, tol=tol_opt, loss=loss_opt, random_state=42)
model = clf_optimal.fit(x_train, y_train)

# Predict the labels of the held-out split
y_pred = model.predict(x_test)



In [82]:
# Compute the optimal model's accuracy; accuracy_score expects (y_true, y_pred)
accuracy_score(y_test, y_pred)

0.8489

# Get the predictions using optimal parameters on full dataset

In [83]:
# Import the data (full=True: the full dataset, per the section heading)
tweet_pos, tweet_neg, tweet_test = import_data(full = True)

# Construct the labelled train set: tweets from tweet_pos get label 1, tweets from tweet_neg get label 0
tweet_pos['pred'] = 1
tweet_neg['pred'] = 0
tweet_pos.columns = ['tweet', 'pred']
tweet_neg.columns = ['tweet', 'pred']
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
all_tweets = pd.concat([tweet_neg, tweet_pos])
# Replace the duplicated per-frame indices with a fresh 0..n-1 index
tweet_TR = all_tweets.reset_index(drop=True)

# Construct the test set as a flat 1-D array of cleaned tweets.
# The former `np.reshape(tweet_clean, (10000,))` discarded its result, so it never
# changed the data and only hard-coded the test-set size; it has been removed.
tweet_clean = clean_data(tweet_test.values)
tweet_TE = tweet_clean.flatten()

In [84]:
# TF-IDF features over word 1- to 4-grams, fitted on the full training set.
# NOTE(review): unlike the vectorizer used for tuning, no min_df is set here,
# although the submission name below mentions "df_min" — confirm this is intended.
vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words=stopw,
    tokenizer=tokenize,
    ngram_range=(1, 4),
)

# Column 0 holds the tweet text, column 1 the 0/1 label
X = vectorizer.fit_transform(tweet_TR.values[:, 0])
Y = tweet_TR.values[:, 1].astype(int)

In [85]:
# Hold out 20% of the full dataset for evaluation; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

In [86]:
# Train on the full dataset with the tuned hyper-parameters.
# Fitting a clone (same parameters, no fitted state) leaves `clf_optimal` — and `model`,
# which is the same object — trained on the small dataset instead of silently
# re-training it in place.
model_optimal = sklearn.base.clone(clf_optimal).fit(x_train, y_train)



In [87]:
# Accuracy of the tuned model on the held-out part of the full dataset;
# accuracy_score expects (y_true, y_pred)
y_pred = model_optimal.predict(x_test)
accuracy_score(y_test, y_pred)

0.877302

# Output the real predictions

In [88]:
# Turn the cleaned test tweets into features with the already-fitted `vectorizer`
# (transform only, so the vocabulary learned from the training tweets is reused)
X_TE = vectorizer.transform(tweet_TE)

In [89]:
# Predict the labels of the test tweets with the tuned model
y_pred_TE = model_optimal.predict(X_TE)

In [90]:
# Hand the test predictions to the project helper together with a submission name.
# NOTE(review): presumably writes a submission file named after the second argument — confirm in functions.py
build_submission(y_pred_TE, '16_linearSVC_optimal_3_updated_tokenizer_df_min')