In [1]:
%matplotlib inline
import sklearn
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import math
import nltk

from nltk.corpus import stopwords
from tokenizer import tokenizer
from sklearn.linear_model import *
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR, SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import *
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score
from functions import *

%load_ext autoreload
%autoreload 2

In [16]:
# Import the data
tweet_pos, tweet_neg, tweet_test = import_data(full = True)

# Construct train set
tweet_pos['pred'] = 1
tweet_neg['pred'] = 0
tweet_pos.columns = ['tweet', 'pred']
tweet_neg.columns = ['tweet', 'pred']
all_tweets = tweet_neg.append(tweet_pos)
tweet_TR = all_tweets.reset_index().drop(['index'], axis = 1)

# Construct test set
tweet_clean = clean_data(tweet_test.values)
np.reshape(tweet_clean, (10000,))
tweet_TE = tweet_clean.flatten()

In [17]:
# Import stopwords
stopw = pd.read_csv('data/stopwords/twitter-stopwords.txt').values.flatten().tolist()

In [18]:
def tokenize_(t):
    tweet_tok = TweetTokenizer()
    tokens = tweet_tok.tokenize(t)
    wnl = WordNetLemmatizer()
    stems = []
    for item in tokens:
        #stems.append(PorterStemmer().stem(wnl.lemmatize(item)))
        stems.append(wnl.lemmatize(item))
    return stems

In [19]:
vectorizer = TfidfVectorizer(analyzer='word', stop_words = stopw, tokenizer=tokenize, ngram_range=(1,2))
X = vectorizer.fit_transform(tweet_TR.values[:, 0])
Y = tweet_TR.values[:, 1].astype(int)

KeyboardInterrupt: 

In [None]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)

In [None]:
x_trainain.shape

In [7]:
x_train.shape

(160000, 873456)

# Define the classifier

In [14]:
# Define a standard classifier
clf = LinearSVC(random_state=42)
clf.fit(x_train, y_train)
# Compute the predicitions of x_test
y_pred = clf.predict(x_test)

In [15]:
# Compute the standard model's accuracy
accuracy_score(y_pred, y_test)

0.847525

In [10]:
# Compute the standard model's accuracy
accuracy_score(y_pred, y_test)

0.8493

# Find out the best parameters

In [40]:
def param_selection(X, Y, nfolds, param_grid, classifier):
    grid_search = GridSearchCV(classifier, param_grid, cv = nfolds)
    grid_search.fit(X, Y)
    grid_search.best_params_
    return grid_search.best_params_

In [None]:
# Define the list of parameters to test
gammas = [1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]
Cs = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15]
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
degrees = [2, 3, 4, 5, 6]

# Create the parameter grid
param_grid = {'kernel': kernels, 'gamma': gammas, 'C': cs, 'degree': degrees}

# Find the best parameters
best_parameters = param_selection(x_train, y_train, 5, param_grid, clf)

In [27]:
# Apply the best parameters
kernel_opt = best_parameters['kernel']
C_opt = best_parameters['C']
gamma_opt = best_parameters['gamma']
degree_opt = best_parameters['degree']

# Create a new classifier with the optimal parameters
clf_optimal = SVC(kernel=kerne_opt, C=C_opt, gamma=gamma_opt)
model_optimal = clf_optimal.fit(x_train, y_train)

# Compute the predictions of x_test
y_pred = model_optimal.predict(x_test)

Fitting LinearSVC()
Predicting LinearSVC()

Done


In [28]:
# Compute the optimal model's accuracy
accuracy_score(y_pred, y_test)

0.85165

# Output the real predicitons

In [49]:
X_TE = vectorizer.transform(tweet_TE)
y_pred_TE = model_optimal.predict(X_TE)
build_submission(y_pred_TE, '13_SVC_optimal_param')

# Other