In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from cleaner import TextCleanerTransformer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, NuSVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

from nltk.tokenize import WordPunctTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

from transformers import AutomatedTruncatedSVD

import pandas as pd
import numpy as np

import re

In [None]:
# Load the labelled training data and discard every row that has a missing value.
all_train = pd.read_csv('data/train.csv').dropna()

# Replace the raw `Label` column with integer codes. The fitted encoder `le`
# is kept so predictions can be mapped back to the original labels later.
le = LabelEncoder()
all_train = all_train.assign(Label=le.fit_transform(all_train['Label'].values))

# Shuffled 70/30 hold-out split with a fixed seed for reproducibility.
train, test = train_test_split(
    all_train,
    test_size=0.3,
    shuffle=True,
    random_state=10,
)

# Raw request text -> cleaned/stemmed text -> TF-IDF matrix -> reduced features.
text_preprocessing = Pipeline(steps=[
    (
        'stemm',
        # Project-local cleaner (see cleaner.py). The trailing empty list is
        # presumably an extra stop-word/token list -- TODO confirm.
        TextCleanerTransformer(
            WordPunctTokenizer(),
            SnowballStemmer("portuguese", ignore_stopwords=True),
            [],
        ),
    ),
    (
        'tfidf',
        TfidfVectorizer(strip_accents='unicode', lowercase=True),
    ),
    (
        'svd',
        # Project-local wrapper (see transformers.py). Presumably grows the
        # component count in steps of `incr` until `variance_threshold` of the
        # variance is explained -- verify against its implementation.
        AutomatedTruncatedSVD(variance_threshold=0.9, random_state=10, incr=2000),
    ),
])

# Fit on the training split only; the labels are passed through to every step.
text_preprocessing.fit(train['Request'], train['Label'].values)

In [None]:
import pickle

# Persist the fitted preprocessing pipeline so it can be reloaded without refitting.
with open("text_processing_model_3.pkl", mode="wb") as model_file:
    pickle.dump(text_preprocessing, model_file)

In [None]:
# Restore the previously saved preprocessing pipeline.
# NOTE: pickle.load can execute arbitrary code -- only load files you created yourself.
with open("text_processing_model_3.pkl", mode="rb") as model_file:
    text_preprocessing = pickle.load(model_file)

In [None]:
# Run the training requests through the fitted pipeline; labels are taken as-is.
X_train = text_preprocessing.transform(train['Request'])
y_train = train['Label']

In [None]:
# Linear SVM with default hyper-parameters and a fixed seed. The same `clf`
# object is later handed to the hyper-parameter search as the base estimator.
clf = LinearSVC(random_state=10)

clf.fit(X=X_train, y=y_train)

In [None]:
# Search space for the LinearSVC hyper-parameter search.
# penalty / loss / dual are pinned to a single choice; only the remaining
# four dimensions are actually explored.
params = dict(
    penalty=Categorical(['l2']),
    loss=Categorical(['squared_hinge']),
    dual=Categorical([False]),
    # Log-uniform priors: these ranges span several orders of magnitude.
    tol=Real(1e-8, 1e-2, prior='log-uniform'),
    C=Real(1e-3, 10, prior='log-uniform'),
    max_iter=Integer(100, 2000),
    fit_intercept=Categorical([False, True]),
)

In [None]:
# Bayesian hyper-parameter search over `params`: 50 sampled settings,
# each scored by 3-fold cross-validated accuracy, using all available cores.
bscv = BayesSearchCV(
    clf,
    params,
    n_iter=50,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    random_state=10,
    verbose=True,
)

In [None]:
# Run the search on the preprocessed training features.
bscv.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for data/test.csv and write the submission file.
real_test = pd.read_csv('data/test.csv')

# Reuse the already-fitted preprocessing pipeline (transform only, no refit).
# NOTE(review): unlike the training data, rows with a missing `Request` are
# not dropped here -- confirm the pipeline tolerates NaN input if any exist.
X_test = text_preprocessing.transform(real_test['Request'])

# Use the best model found by the Bayesian search. The fitted search object is
# `bscv`; the name `gcv` is not defined in the visible cells and raised a
# NameError on a fresh-kernel run.
real_test_y_pred = bscv.best_estimator_.predict(X_test)

# Map the integer predictions back to the original label values.
real_test['Label'] = le.inverse_transform(real_test_y_pred)

# The `submissions/` directory must already exist.
real_test.to_csv('submissions/model_3.csv', index=False)