In [None]:
import sklearn
import pandas as pd
import math
import nltk
from nltk.corpus import stopwords
from tokenizer import tokenizer
from sklearn.linear_model import *
from sklearn.svm import LinearSVC, SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from functions import *

# 1. Preprocess the data

We start preprocessing by importing the datasets. We then create a dataframe containing all training tweets together with their sentiment label, which is 0 for a negative tweet and 1 for a positive tweet. Next, we create a TfidfVectorizer with a tuned tokenizer, stopwords, ngram_range and min_df. Finally, we apply the vectorizer to the tweets and split the vectorized training tweets into a train set and a test set, in order to compute the accuracy locally.

In [None]:
# Load the reduced version of the dataset (full=False); import_data hands back
# three pieces that are forwarded to the construct_* helpers below.
tweet_pos, tweet_neg, tweet_test = import_data(full=False)

# Assemble the labelled train set and the test set from the raw pieces
tweet_TR = construct_train_set(tweet_pos, tweet_neg)
tweet_TE = construct_test_set(tweet_test)

# Read the Twitter stopword file and flatten it into a plain Python list.
# NOTE(review): read_csv uses its default header handling here, i.e. the first
# line of the file is consumed as a header and is NOT part of `stopw`.
# Confirm the file really starts with a header row; otherwise pass header=None.
stopw_table = pd.read_csv('data/twitter-stopwords.txt')
stopw = stopw_table.values.flatten().tolist()

In [None]:
# Configure the TF-IDF vectorizer: word-level analysis with a custom tokenizer
# and stopword list, n-grams of length 1 to 4, min_df set to 10.
# NOTE(review): `tokenize` is not imported by name in the import cell (which
# imports `tokenizer`); presumably it is provided by `from functions import *`
# -- confirm.
vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    stop_words=stopw,
    ngram_range=(1, 4),
    min_df=10,
)

# Fit the vectorizer on the small train set.
# Column 0 is fed to the vectorizer as text, column 1 is cast to int and used
# as the target vector.
train_values = tweet_TR.values
X = vectorizer.fit_transform(train_values[:, 0])
Y = train_values[:, 1].astype(int)

In [None]:
# Hold out 20% of the small train set for local accuracy computation.
# The split is seeded (same seed as the classifiers below) so that the reported
# accuracies and the grid-search result are reproducible across
# "Restart Kernel -> Run All" executions.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# 2. Define the standard classifiers

We define a LinearSVC classifier and compute its accuracy on the small dataset using the classifier's standard parameters.

In [None]:
# Baseline: LinearSVC with default hyper-parameters (seeded for repeatability).
# The unfitted estimator is kept in `clf` first because the grid search below
# receives this same object.
clf = LinearSVC(random_state=42)

# Train on the local train split
clf.fit(x_train, y_train)

# Predict the labels of the held-out split
y_pred = clf.predict(x_test)

In [None]:
# Compute the standard classifier's accuracy (LinearSVC).
# accuracy_score's signature is (y_true, y_pred): the ground truth goes first.
accuracy_standard = accuracy_score(y_test, y_pred)
print('Dataset: Small\nClassifier: LinearSVC\nParameters: Standard\nAccuracy: {:2.2%}'.format(accuracy_standard))

# 3. Find out the best parameters using the small dataset and Grid Search

We perform a grid search using 5-fold cross-validation in order to find the parameters that optimize the classifier. Note that we only test a few values (at most 3) for each parameter, in order to keep the computation runnable on our computers.

In [None]:
# Candidate values for each hyper-parameter explored by the grid search
losses, tols, Cs = (
    ['hinge', 'squared_hinge'],
    [1e-5, 1e-4, 1e-3],
    [0.1, 1, 10],
)

# Map each LinearSVC argument name to its candidate values
param_grid = dict(loss=losses, tol=tols, C=Cs)

# Search the grid on the local train split, starting from the baseline
# estimator `clf`. The literal 5 is presumably the number of cross-validation
# folds expected by the project helper `param_selection` -- confirm against
# its definition.
best_parameters = param_selection(x_train, y_train, 5, param_grid, clf)

In [None]:
# Pull the three tuned values out of the grid-search result, keyed by the
# same names used in the parameter grid.
loss_opt, tol_opt, C_opt = (
    best_parameters[key] for key in ('loss', 'tol', 'C')
)

In [None]:
# Report the hyper-parameters selected by the grid search
best_parameters_report = 'Best parameters:\nloss: {}\ntol: {}\nC: {}'.format(
    loss_opt, tol_opt, C_opt
)
print(best_parameters_report)

We construct a new LinearSVC classifier with the optimal parameters obtained with the grid search. After that we compute the new accuracy of the optimal classifier on the small dataset.

In [None]:
# Build a LinearSVC configured with the hyper-parameters found by the grid
# search (seeded like the baseline classifier).
clf_optimal = LinearSVC(
    loss=loss_opt,
    tol=tol_opt,
    C=C_opt,
    random_state=42,
)

# Train on the local train split. scikit-learn's fit() returns the estimator
# itself, so `model_optimal` and `clf_optimal` refer to the same object.
model_optimal = clf_optimal.fit(x_train, y_train)

# Predict the labels of the held-out split
y_pred_optimal = model_optimal.predict(x_test)

In [None]:
# Compute the optimal classifier's accuracy (LinearSVC) on the small dataset.
# accuracy_score's signature is (y_true, y_pred): the ground truth goes first.
accuracy_optimal = accuracy_score(y_test, y_pred_optimal)
print('Dataset: Small\nClassifier: LinearSVC\nParameters: Optimal\nAccuracy: {:2.2%}'.format(accuracy_optimal))

# 4. Get the predictions using optimal classifier on full dataset

We load the full dataset, vectorize its tweets by refitting the same TfidfVectorizer as above, and train the optimal LinearSVC classifier on the full dataset.

In [None]:
# Load the complete version of the dataset (full=True)
tweet_pos_full, tweet_neg_full, tweet_test_full = import_data(full=True)

# Assemble the labelled train set and the test set from the raw pieces,
# using the same helpers as for the small dataset.
tweet_TR_full = construct_train_set(tweet_pos_full, tweet_neg_full)
tweet_TE_full = construct_test_set(tweet_test_full)

In [None]:
# Targets of the full train set: column 1, cast to int
Y_full = tweet_TR_full.values[:, 1].astype(int)

# Features of the full train set: column 0 run through the TF-IDF vectorizer.
# fit_transform re-fits `vectorizer`, so from here on it carries the fit
# obtained on the full train set rather than the one from the small set.
X_full = vectorizer.fit_transform(tweet_TR_full.values[:, 0])

In [None]:
# Hold out 20% of the full train set for local accuracy computation.
# The split is seeded (same seed as the classifiers) so that the reported
# accuracy is reproducible across "Restart Kernel -> Run All" executions.
x_train_full, x_test_full, y_train_full, y_test_full = train_test_split(
    X_full, Y_full, test_size=0.2, random_state=42
)

In [None]:
# Train the tuned classifier on the train split of the full dataset.
# This re-fits the existing `clf_optimal` object in place; scikit-learn's
# fit() returns the estimator itself, so `model_optimal_full` is the same
# object as `clf_optimal` and the earlier small-data fit is overwritten.
model_optimal_full = clf_optimal.fit(
    x_train_full,
    y_train_full,
)

# Predict the labels of the held-out part of the full dataset
y_pred_full = model_optimal_full.predict(
    x_test_full,
)

In [None]:
# Compute the optimal classifier's accuracy (LinearSVC) on the full dataset.
# accuracy_score's signature is (y_true, y_pred): the ground truth goes first.
accuracy_optimal_full = accuracy_score(y_test_full, y_pred_full)
print('Dataset: Full\nClassifier: LinearSVC\nParameters: Optimal\nAccuracy: {:2.2%}'.format(accuracy_optimal_full))

# 5. Output the final predictions

We output the final predictions to be submitted on CrowdAi.

In [None]:
# Vectorize the test set with the vectorizer as last fitted (on the full train
# set), so the feature space matches the classifier trained on the full data.
# NOTE(review): `tweet_TE` comes from the small-dataset import while
# `tweet_TE_full` is built but never used -- confirm both hold the same test
# tweets, or switch to `tweet_TE_full`.
X_TE = vectorizer.transform(tweet_TE)

# Predict with the classifier trained on the full dataset. The name
# `model_optimal_full` is used explicitly instead of `model_optimal`: the
# latter only points at the full-data model because fit() returns the (re-fitted)
# estimator object itself, which is easy to misread as the small-data model.
y_pred_TE = model_optimal_full.predict(X_TE)

# Build submission to be submitted on CrowdAi
build_submission(y_pred_TE, 'final_submission_ipynb')