In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
from cleaner import TextCleanerTransformer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

from nltk.tokenize import WordPunctTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

from transformers import AutomatedTruncatedSVD

import pandas as pd
import numpy as np

import re

In [4]:
all_train = pd.read_csv('data/train.csv')

all_train.dropna(inplace=True)

le = LabelEncoder()

le.fit(all_train['Label'].values)

all_train.Label = le.transform(all_train['Label'].values)

train, test = train_test_split(all_train, test_size=0.3, random_state=10, shuffle=True)

text_preprocessing = Pipeline([
    ('stemm', TextCleanerTransformer(
        WordPunctTokenizer(), 
        SnowballStemmer("portuguese", ignore_stopwords=True), 
        [])),
    ('tfidf', TfidfVectorizer(strip_accents='unicode', lowercase=True)), 
    ('svd', AutomatedTruncatedSVD(variance_threshold=0.9, random_state=10, incr=2000))
])

text_preprocessing.fit(train['Request'], train['Label'].values)

In [5]:
import pickle

with open("text_processing_model_3.pkl", "wb") as fd:
    pickle.dump(text_preprocessing, fd)

In [6]:
with open("text_processing_model_3.pkl", "rb") as fd:
    text_preprocessing = pickle.load(fd)

In [7]:
X_train, y_train = text_preprocessing.transform(train['Request']), train['Label']

In [9]:
clf = ExtraTreesClassifier(n_estimators=100, max_depth=10, random_state=10, n_jobs=-1)

In [10]:
clf.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=10, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=10, verbose=0, warm_start=False)

In [14]:
fit_params = {
    'class_weight': [None, 'balanced', 'balanced_subsample'], 
    'max_features': ['log2', 'sqrt'], 
    'n_estimators': [50, 100, 200], 
    'oob_score': [False, True], 
    'bootstrap': [True], 
    'max_depth': [5, 10, 30], 
    'min_samples_split': [2, 3, 5]
}

In [15]:
gcv = GridSearchCV(clf, fit_params, 
                   scoring='accuracy', 
                   cv=3, verbose=True, 
                   n_jobs=-1)

In [16]:
gcv.fit(X_train, y_train)

Fitting 3 folds for each of 324 candidates, totalling 972 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 26.6min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 48.1min
[Parallel(n_jobs=-1)]: Done 972 out of 972 | elapsed: 63.5min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=10, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=10, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'class_weight': [None, 'balanced', 'balanced_subsample'], 'max_features': ['log2', 'sqrt'], 'n_estimators': [50, 100, 200], 'oob_score': [False, True], 'bootstrap': [True], 'max_depth': [5, 10, 30], 'min_samples_split': [2, 3, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=True)

In [17]:
real_test = pd.read_csv('data/test.csv')

X_test = text_preprocessing.transform(real_test['Request'])

real_test_y_pred = gcv.best_estimator_.predict(X_test)

real_test['Label'] = le.inverse_transform(real_test_y_pred)

real_test.to_csv('submissions/model_3.csv', index=False)

  if diff:
