In [1]:
%matplotlib inline
import sklearn
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import math
import nltk

from nltk.corpus import stopwords
from tokenizer import tokenizer

from sklearn.kernel_approximation import Nystroem

from sklearn.linear_model import *
from sklearn.neural_network import MLPClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR, LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import *
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.kernel_approximation import RBFSampler
from sklearn.metrics import accuracy_score

from sklearn.model_selection import GridSearchCV

from functions import *
%load_ext autoreload
%autoreload 2

# Preprocess the data

In [2]:
# Load the positive, negative and test tweets. `full = False` presumably selects
# the reduced training set (the "full dataset" section further down passes
# full = True) -- confirm against import_data.
tweet_pos, tweet_neg, tweet_test = import_data(full = False)

# Construct train set: label positives 1 and negatives 0, then give both
# frames identical column names so they can be stacked.
tweet_pos['pred'] = 1
tweet_neg['pred'] = 0
tweet_pos.columns = ['tweet', 'pred']
tweet_neg.columns = ['tweet', 'pred']
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack the two frames (negatives first, then positives).
all_tweets = pd.concat([tweet_neg, tweet_pos])
# Replace the duplicated per-frame index with a fresh 0..n-1 index.
tweet_TR = all_tweets.reset_index(drop=True)

# Construct test set: clean the raw test tweets; flatten() already yields the
# 1-D array the vectorizer needs, so no separate reshape is required.
tweet_clean = clean_data(tweet_test.values)
tweet_TE = tweet_clean.flatten()

In [3]:
# Load the Twitter stop-word file and flatten it into a plain Python list.
# NOTE(review): read_csv infers a header from the first line by default, so the
# first line of the file does not end up in `stopw` -- confirm the file really
# starts with a header line.
stopwords_table = pd.read_csv('data/stopwords/twitter-stopwords-final.txt')
stopw = stopwords_table.values.flatten().tolist()

In [11]:
# Word-level TF-IDF over 1- to 4-grams, using the project tokenizer and the
# stop-word list; min_df=10 drops terms seen in fewer than 10 tweets.
tfidf_options = dict(analyzer='word', tokenizer=tokenize, stop_words=stopw, min_df=10, ngram_range=(1,4))
vectorizer = TfidfVectorizer(**tfidf_options)

# Column 0 of the training frame holds the tweet text, column 1 the label.
train_values = tweet_TR.values
X = vectorizer.fit_transform(train_values[:, 0])
Y = train_values[:, 1].astype(int)

In [12]:
# Hold out 20% of the samples for evaluation. The split is seeded (same seed as
# the classifiers below) so the reported accuracies survive a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
x_train.shape

(160000, 73967)

# Define classifiers

# Fit a classifier on the training split and score it on the held-out split
def classify(classifier, x_TR, y_TR, x_TE, y_TE):
    """Fit `classifier` on the training data and return its held-out accuracy.

    Parameters
    ----------
    classifier : estimator whose ``fit(X, y)`` returns the fitted model and
        whose fitted model exposes ``predict(X)``.
    x_TR, y_TR : training features and labels, passed to ``fit``.
    x_TE, y_TE : evaluation features and their true labels.

    Returns
    -------
    The value of ``accuracy_score(y_TE, predictions)``.
    """
    model = classifier.fit(x_TR, y_TR)
    y_pred = model.predict(x_TE)
    # accuracy_score is declared as (y_true, y_pred): true labels go first.
    return accuracy_score(y_TE, y_pred)

# Baseline accuracy of three linear models on the same train/test split
logreg_baseline = LogisticRegression(solver='lbfgs', max_iter=500, random_state=42)
classify(logreg_baseline, x_train, y_train, x_test, y_test)

sgd_baseline = SGDClassifier(tol=1e-3, max_iter=1000, random_state=42)
classify(sgd_baseline, x_train, y_train, x_test, y_test)

svc_baseline = LinearSVC(random_state=42)
classify(svc_baseline, x_train, y_train, x_test, y_test)

In [6]:
# Learning curve for the SGD baseline: refit on growing prefixes of the
# training split and score every fit on the whole held-out split.
clf = SGDClassifier(tol=1e-3, max_iter=1000, random_state=42)
is_ = [1000, 2000, 3000, 4000, 5000, 6000, 8000, 10000, 15000, 20000, 25000, 30000, 35000, 40000]
accs = []
for i in is_:
    # Train on the first i rows only
    clf.fit(x_train[:i], y_train[:i])
    # Compute the predictions of x_test
    y_pred = clf.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    print(i, acc)
    # Keep (training size, accuracy) pairs for later inspection
    accs.append([i, acc])

1000 0.726975
2000 0.754825
3000 0.764875
4000 0.769925
5000 0.77655
6000 0.783725
8000 0.7975
10000 0.803725
15000 0.813325
20000 0.81555
25000 0.8185
30000 0.819775
35000 0.820275
40000 0.8217


In [13]:
# Learning curve for LinearSVC: refit on growing prefixes of the training
# split and score every fit on the whole held-out split.
# The estimator is seeded like the other classifiers in this notebook so the
# curve is reproducible.
clf = LinearSVC(random_state=42)
is_ = [1000, 2000, 3000, 4000, 5000, 6000, 8000, 10000, 15000, 20000, 25000, 30000, 35000, 40000, 60000, 80000, 100000]
accs = []
for i in is_:
    # Train on the first i rows only
    clf.fit(x_train[:i, :], y_train[:i])
    # Compute the predictions of x_test
    y_pred = clf.predict(x_test)
    # accuracy_score is declared as (y_true, y_pred)
    acc = accuracy_score(y_test, y_pred)
    print(i, acc)
    # Keep (training size, accuracy) pairs for later inspection
    accs.append([i, acc])

1000 0.7429
2000 0.762025
3000 0.77415
4000 0.782725
5000 0.78875
6000 0.7918
8000 0.79715
10000 0.801525
15000 0.80965
20000 0.813225
25000 0.816875
30000 0.819675
35000 0.822125
40000 0.824375
60000 0.83015
80000 0.8312
100000 0.83585


In [10]:
# Fit LinearSVC on the first 100000 training rows and score it on the held-out
# split. Seeded like the other classifiers so the accuracy is reproducible.
clf = LinearSVC(random_state=42)
clf.fit(x_train[:100000, :], y_train[:100000])
# Compute the predictions of x_test
y_pred = clf.predict(x_test)
# accuracy_score is declared as (y_true, y_pred)
acc = accuracy_score(y_test, y_pred)
print(acc)


0.8238


In [8]:
# Display the current value of `acc` (last assigned by whichever evaluation
# cell ran most recently).
acc

0.8238

In [8]:
# Accuracy of the standard LinearSVC model: compares the `y_pred` currently in
# memory against the held-out labels.
accuracy_score(y_test, y_pred)

0.846725

# Find the best parameters using the small dataset

In [6]:
def param_selection(X, Y, nfolds, param_grid, classifier):
    """Grid-search `classifier` over `param_grid` and return the best setting.

    Parameters
    ----------
    X, Y : features and labels passed to ``GridSearchCV.fit``.
    nfolds : value forwarded as the ``cv`` argument of ``GridSearchCV``.
    param_grid : parameter grid forwarded to ``GridSearchCV``.
    classifier : estimator to tune.

    Returns
    -------
    The ``best_params_`` attribute of the fitted search.
    """
    grid_search = GridSearchCV(classifier, param_grid, cv = nfolds)
    grid_search.fit(X, Y)
    return grid_search.best_params_

In [10]:
# Define the list of parameters to test (solver / tol / C / random_state are
# LogisticRegression hyper-parameters)
solvers = ['lbfgs', 'sag', 'saga']
tols = [1e-5, 1e-4, 1e-3]
Cs = [0.1, 1, 10]
random_state = [42]

# Create the parameter grid
param_grid = {'solver': solvers, 'tol': tols, 'C': Cs, 'random_state': random_state}

# Search over an explicit LogisticRegression. The `clf` left over from the
# cells above is an SGDClassifier/LinearSVC, neither of which accepts a
# `solver` parameter, so passing it would make the grid search fail on a
# fresh top-to-bottom run.
logreg = LogisticRegression()

# Find the best parameters with 5-fold cross validation
best_parameters = param_selection(x_train, y_train, 5, param_grid, logreg)



In [11]:
# Print the best parameters
best_parameters

{'C': 1, 'random_state': 42, 'solver': 'lbfgs', 'tol': 1e-05}

In [8]:
# Hyper-parameters reported by the grid search above, written out literally so
# this cell does not need `best_parameters` to be in memory.
clf_optimal = LogisticRegression(solver='lbfgs', C=1, tol=1e-5, random_state=42)

# Train on the training split
model = clf_optimal.fit(x_train, y_train)

# Compute the predictions of x_test
y_pred = model.predict(x_test)



## Compute the optimal model's accuracy
accuracy_score(y_test, y_pred)

# Get the predictions using optimal parameters on full dataset

In [None]:
# Load the positive, negative and test tweets, this time with full = True
# (presumably the complete training set -- confirm against import_data).
tweet_pos, tweet_neg, tweet_test = import_data(full = True)

# Construct train set: label positives 1 and negatives 0, then give both
# frames identical column names so they can be stacked.
tweet_pos['pred'] = 1
tweet_neg['pred'] = 0
tweet_pos.columns = ['tweet', 'pred']
tweet_neg.columns = ['tweet', 'pred']
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack the two frames (negatives first, then positives).
all_tweets = pd.concat([tweet_neg, tweet_pos])
# Replace the duplicated per-frame index with a fresh 0..n-1 index.
tweet_TR = all_tweets.reset_index(drop=True)

# Construct test set: clean the raw test tweets; flatten() already yields the
# 1-D array the vectorizer needs, so no separate reshape is required.
tweet_clean = clean_data(tweet_test.values)
tweet_TE = tweet_clean.flatten()

In [None]:
# Same TF-IDF configuration as the one used while tuning on the small dataset,
# including min_df=10 (the submission written at the end is labelled "df_min").
# Without a document-frequency floor the 1- to 4-gram vocabulary is unbounded
# and the features no longer match the ones the hyper-parameters were chosen on.
vectorizer = TfidfVectorizer(analyzer='word', stop_words = stopw, tokenizer=tokenize, min_df=10, ngram_range=(1,4))
# Column 0 of the training frame holds the tweet text, column 1 the label.
X = vectorizer.fit_transform(tweet_TR.values[:, 0])
Y = tweet_TR.values[:, 1].astype(int)

In [None]:
# Hold out 20% of the full dataset for evaluation; seeded so the reported
# accuracy can be reproduced.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

In [None]:
# Refit the tuned logistic regression on the full-data training split.
# NOTE(review): this refits the same `clf_optimal` object that was trained on
# the small dataset; scikit-learn's fit conventionally returns the estimator
# itself, so `model_optimal` (and the earlier `model`) would alias it -- confirm
# that overwriting the small-data fit is intended.
model_optimal = clf_optimal.fit(x_train, y_train)

In [None]:
# Held-out accuracy of the model trained on the full dataset
y_pred = model_optimal.predict(x_test)
accuracy_score(y_test, y_pred)

# Output the real predictions

In [None]:
# Vectorize the cleaned test tweets with the vectorizer already fitted on the
# training tweets (transform only, no refit).
X_TE = vectorizer.transform(tweet_TE)

In [None]:
# Predict labels for the vectorized test tweets with the full-data model
y_pred_TE = model_optimal.predict(X_TE)

In [None]:
# Hand the test-set predictions to the project's build_submission helper.
# NOTE(review): presumably this writes a submission file named after the
# second argument -- confirm against its definition in `functions`.
build_submission(y_pred_TE, '17_LogisticRegression_optimal_3_updated_tokenizer_df_min')