In [2]:
%matplotlib inline
import sklearn
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import math
import nltk

from textblob import TextBlob, Word

from keras import models
from keras import layers

from nltk.corpus import stopwords

from tokenizer import tokenizer

from sklearn.linear_model import *
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import *
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

from functions import *
%load_ext autoreload
%autoreload 2

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Preprocess the data

In [3]:
# Import the data (full=False: the notebook's "small dataset" run).
# import_data yields the positive tweets, the negative tweets and the unlabeled test tweets.
tweet_pos, tweet_neg, tweet_test = import_data(full = False)

# Construct train set: label positives with 1 and negatives with 0,
# give both frames the same column names, then stack negatives over positives.
tweet_pos['pred'] = 1
tweet_neg['pred'] = 0
tweet_pos.columns = ['tweet', 'pred']
tweet_neg.columns = ['tweet', 'pred']
# pd.concat replaces DataFrame.append, which was deprecated and then removed in pandas 2.0.
all_tweets = pd.concat([tweet_neg, tweet_pos])
# Renumber the rows 0..n-1 (equivalent to reset_index() followed by dropping 'index').
tweet_TR = all_tweets.reset_index(drop = True)

# Construct test set.
tweet_clean = clean_data(tweet_test.values)
# flatten() already returns the 1-D array we need. The former
# np.reshape(tweet_clean, (10000,)) discarded its result and hard-coded the test-set size.
tweet_TE = tweet_clean.flatten()

In [4]:
# Build one flat stop-word list from the three Twitter stop-word files.
# File order is preserved and duplicates across files are kept.
# NOTE(review): pd.read_csv treats the first line of each file as a header by default,
# so that line does not end up in `.values` — confirm the files start with a header line.
stopw = [
    word
    for stopword_path in (
        'data/stopwords/twitter-stopwords - TA - Less.txt',
        'data/stopwords/twitter-stopwords - TA.txt',
        'data/stopwords/twitter-stopwords.txt',
    )
    for word in pd.read_csv(stopword_path).values.flatten().tolist()
]

In [26]:
# TF-IDF features over word 1- to 4-grams, using the project tokenizer
# and the merged stop-word list.
vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    stop_words=stopw,
    ngram_range=(1, 4),
)
# Fit the vocabulary on the training tweets and build the feature matrix.
X = vectorizer.fit_transform(tweet_TR['tweet'].values)
# Integer labels (1 = positive, 0 = negative).
Y = tweet_TR['pred'].values.astype(int)

In [27]:
# Hold out 20% of the samples for evaluation. The seed is fixed so the split,
# and every score computed from it, is reproducible after a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

# Define classifiers

In [28]:
# Baseline model: a seeded linear SVM trained on the training split
# (fit returns the estimator itself, so `clf` is the fitted classifier).
clf = LinearSVC(random_state=42).fit(x_train, y_train)
# Predicted labels for the held-out split.
y_pred = clf.predict(x_test)

In [29]:
# Compute the standard model's accuracy (8, 9)
# accuracy_score takes (y_true, y_pred); the order is corrected to match the API
# and the later score cells. The accuracy value itself does not change.
accuracy_score(y_test, y_pred)

0.8535

In [29]:
# Compute the standard model's accuracy (1, 4)
# accuracy_score takes (y_true, y_pred); the order is corrected to match the API
# and the later score cells. The accuracy value itself does not change.
accuracy_score(y_test, y_pred)

0.85315

# Find out the best parameters using small dataset

In [24]:
def param_selection(X, Y, nfolds, param_grid, classifier):
    """Run a cross-validated grid search and return the best parameter set.

    Args:
        X: features passed to ``GridSearchCV.fit``.
        Y: targets passed to ``GridSearchCV.fit``.
        nfolds: forwarded to ``GridSearchCV`` as ``cv`` (number of folds).
        param_grid: parameter grid forwarded to ``GridSearchCV``.
        classifier: estimator whose parameters are searched.

    Returns:
        The ``best_params_`` attribute of the fitted search.
    """
    grid_search = GridSearchCV(classifier, param_grid, cv=nfolds)
    grid_search.fit(X, Y)
    # The former bare `grid_search.best_params_` statement had no effect and was removed.
    return grid_search.best_params_

In [None]:
# Define the list of parameters to test.
# NOTE(review): 'log' and 'squared_loss' are the loss names of older scikit-learn
# releases; newer releases spell them 'log_loss' and 'squared_error'. Adjust if upgrading.
losses = ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']
alphas = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2]
tols = [1e-4, 1e-3, 1e-2, 1e-1]

# Create the parameter grid
param_grid = {'loss': losses, 'tol': tols, 'alpha': alphas}

# Find the best parameters with 5-fold cross-validation.
# The grid holds SGDClassifier parameters: 'alpha' and most of these loss names are
# not valid for the LinearSVC stored in `clf`, so GridSearchCV would reject them.
# Search over an SGDClassifier, which is also the model built from the result.
best_parameters = param_selection(x_train, y_train, 5, param_grid, SGDClassifier(random_state=42))



In [None]:
# Print the best parameters
# (the dict returned by param_selection, i.e. the grid search's best_params_)
best_parameters

In [None]:
# Apply the best parameters.
# The grid-search keys are 'tol', 'alpha' and 'loss'. The former 'gamma' lookup raised
# KeyError, and SGDClassifier has no `gamma` argument — its regularization term is `alpha`.
tol_opt = best_parameters['tol']
alpha_opt = best_parameters['alpha']
loss_opt = best_parameters['loss']

# Create a new classifier with the optimal parameters (seeded for reproducible fits).
clf_optimal = SGDClassifier(alpha=alpha_opt, tol=tol_opt, loss=loss_opt, random_state=42)
model = clf_optimal.fit(x_train, y_train)

# Compute the predictions of x_test
y_pred = model.predict(x_test)

In [None]:
# Compute the optimal model's accuracy
# accuracy_score takes (y_true, y_pred); the order is corrected to match the API
# and the later score cells. The accuracy value itself does not change.
accuracy_score(y_test, y_pred)

# Get the predictions using optimal parameters on full dataset

In [None]:
# Import the data (full=True: the notebook's "full dataset" run).
# import_data yields the positive tweets, the negative tweets and the unlabeled test tweets.
tweet_pos, tweet_neg, tweet_test = import_data(full = True)

# Construct train set: label positives with 1 and negatives with 0,
# give both frames the same column names, then stack negatives over positives.
tweet_pos['pred'] = 1
tweet_neg['pred'] = 0
tweet_pos.columns = ['tweet', 'pred']
tweet_neg.columns = ['tweet', 'pred']
# pd.concat replaces DataFrame.append, which was deprecated and then removed in pandas 2.0.
all_tweets = pd.concat([tweet_neg, tweet_pos])
# Renumber the rows 0..n-1 (equivalent to reset_index() followed by dropping 'index').
tweet_TR = all_tweets.reset_index(drop = True)

# Construct test set.
tweet_clean = clean_data(tweet_test.values)
# flatten() already returns the 1-D array we need. The former
# np.reshape(tweet_clean, (10000,)) discarded its result and hard-coded the test-set size.
tweet_TE = tweet_clean.flatten()

In [None]:
# Build one flat stop-word list from the three Twitter stop-word files.
# File order is preserved and duplicates across files are kept.
# NOTE(review): pd.read_csv treats the first line of each file as a header by default,
# so that line does not end up in `.values` — confirm the files start with a header line.
stopw = [
    word
    for stopword_path in (
        'data/stopwords/twitter-stopwords - TA - Less.txt',
        'data/stopwords/twitter-stopwords - TA.txt',
        'data/stopwords/twitter-stopwords.txt',
    )
    for word in pd.read_csv(stopword_path).values.flatten().tolist()
]

In [None]:
# TF-IDF features over word unigrams and bigrams, using the project tokenizer
# and the merged stop-word list.
vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    stop_words=stopw,
    ngram_range=(1, 2),
)
# Fit the vocabulary on the training tweets and build the feature matrix.
X = vectorizer.fit_transform(tweet_TR['tweet'].values)
# Integer labels (1 = positive, 0 = negative).
Y = tweet_TR['pred'].values.astype(int)

In [None]:
# Hold out 20% of the samples for evaluation. The seed is fixed so the split,
# and every score computed from it, is reproducible after a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

In [None]:
# Refit the tuned classifier on the full-dataset training split.
model_optimal = clf_optimal.fit(x_train, y_train)

# Refresh the held-out predictions. Without this, y_pred still holds the predictions
# from the earlier split and no longer lines up with the new y_test.
y_pred = model_optimal.predict(x_test)

# Keep track of scores per classifier (small dataset)

In [51]:
# Logistic regression, small dataset
# NOTE(review): score-log cell. It evaluates whatever y_test / y_pred the kernel holds
# when it runs; the cell that trained this logistic regression is not in the notebook.
accuracy_score(y_test, y_pred)

0.84425

In [44]:
# Logistic regression, small dataset
# NOTE(review): second recorded run under the same label as the cell above. What differed
# between the two runs is not recorded — confirm or annotate.
accuracy_score(y_test, y_pred)

0.8447

# Keep track of scores per classifier (full dataset)

In [95]:
# Logistic regression
# NOTE(review): score-log cell for the full dataset. It evaluates whatever y_test / y_pred
# the kernel holds when it runs; the training cell for this model is not in the notebook.
accuracy_score(y_test, y_pred)

0.865242

In [98]:
# LinearSVC
# NOTE(review): score-log cell for the full dataset. It evaluates whatever y_test / y_pred
# the kernel holds when it runs — presumably a LinearSVC fit on the full split; confirm.
accuracy_score(y_test, y_pred)

0.867602

In [65]:
# NOTE(review): unlabeled score-log cell — the classifier and dataset behind this
# value are not recorded; add a label or remove the cell.
accuracy_score(y_test, y_pred)

0.85035

# Output the real predictions

In [49]:
# Vectorize the cleaned test tweets with the already-fitted TF-IDF vectorizer
# (transform only, so the training vocabulary is reused and not refit).
X_TE = vectorizer.transform(tweet_TE)

In [50]:
# Predict labels for the test tweets with the tuned classifier refit on the full-dataset split.
y_pred_TE = model_optimal.predict(X_TE)

In [51]:
# Hand the test predictions to build_submission.
# NOTE(review): '12' is presumably a submission/file identifier — confirm against
# the build_submission helper's definition.
build_submission(y_pred_TE, '12')