In [1]:
from collections import Counter
import json
import os
import re
import string

import joblib
import nltk
from nltk.corpus import stopwords
from sklearn.base import clone
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from unidecode import unidecode
import matplotlib.pyplot as plt
import numpy as np

import spacy

from common import Lematizator, cleaner, tokenizer, Dataset
from common import STOP_WORD_ES

nlp = spacy.load('es_core_news_sm')

%load_ext autoreload
%autoreload 2


# Load dataset

In [2]:
ds_path = '../data/ready/full_spanish_dataset.json'

with open(ds_path, 'r') as f:
    dataset_raw = json.load(f)

c = Counter([k['klass'] for k in dataset_raw])
c

Counter({'neutral': 89782, 'negative': 26272, 'positive': 107252})

In [3]:
dataset_raw[0]

{'text': '4.Denme una mano con RT para exigir q @lanacioncom retire la foto y respete el derecho a la intimidad de @MariaviicToriia su mamá y hermano.',
 'klass': 'neutral',
 'id_annotator': '87',
 'id': '332473940712751104'}

# Clean dataset

In [4]:
ds = Dataset()
corpus, target = ds.binary_class(dataset_raw, processed=False)
corpus_multi, target_multi = ds.multi_class(dataset_raw, processed=False)
corpus_multi_lema = Lematizator().fit_transform(corpus_multi)

Split dataset into classes

In [5]:
negative = [tweet for tweet, klass in zip(corpus_multi, target_multi) if klass==-1]
neutral = [tweet for tweet, klass in zip(corpus_multi, target_multi) if klass==0]
positive = [tweet for tweet, klass in zip(corpus_multi, target_multi) if klass==1]

Check cleaner function

In [6]:
[cleaner(k) for k in positive[100:120:2]]

['feria del libro | dolina se llevó el premio del lector con sus “cartas marcadas” ',
 'me encanto la idea de "cafe pendiente" que buena iniciativa para argentina, ojala todos los bares se sumen!!! ♥',
 'argentina renovó acuerdo que permitirá exportar a venezuela 10 mil automotores por año ',
 ' hola mi amorete, te extrañooooo',
 'el domingo 12 se lanza en la feria del libro de buenos aires la campaña de la onu el valiente no es violento,... ',
 'verdad?  quien quiere de verdad quiere en silencio, con hechos y nunca con palabras.',
 'argentina renovó acuerdo que permitirá exportar a venezuela 10 mil automotores por año ',
 'pablo migliore cerca de la libertad, rt si lo querés de vuelta defendiendo el arco del ',
 'unidos en la esperanza!.',
 '  que sea tierno y me abraze a todo momento.']

In [7]:
tweet = corpus[1].lower()
print(tweet)
print(tokenizer(tweet))
print([k for k in tokenizer(tweet) if k not in STOP_WORD_ES])

lo que me hizo reir gastón, no tiene nombre...
['lo', 'que', 'me', 'hizo', 'reir', 'gastón', 'no', 'tiene', 'nombre']
['hizo', 'reir', 'gastón', 'nombre']


In [8]:
print(tokenizer('en el 2001 los hombres irán a la luna'))


['en', 'el', 'los', 'hombres', 'irán', 'a', 'la', 'luna']


# EDA principal ADJETIVES

In [9]:
n_samples = 100
adj_nltk = Counter([token for k in corpus[:n_samples] for (token, pos) in nltk.pos_tag(tokenizer(k)) if pos in ['JJ']])

In [10]:
adj_nltk.most_common(10)

[('un', 15),
 ('una', 11),
 ('las', 4),
 ('nos', 3),
 ('se', 3),
 ('los', 3),
 ('lat', 2),
 ('por', 2),
 ('serán', 2),
 ('increíble', 2)]

In [11]:
# pos = ['ADJ', 'ADV', 'VERB']
pos = ['ADJ']
adj_spacy = Counter([k.lemma_ for tw in corpus[:100] for k in nlp(tw) if k.pos_ in pos])

In [12]:
adj_spacy.most_common(10)

[('Buen', 6),
 ('♥', 3),
 ('primario', 2),
 ('…', 2),
 ('retirar', 2),
 ('increíble', 2),
 ('insólito', 2),
 ('mejorar', 2),
 ('peor', 2),
 ('necesario', 2)]

In [13]:
def filter_by_pos(speech):
    pos = 'ADJ'
    filtered_speech = [
        ent.lemma_.lower() 
        for doc in nlp.pipe([cleaner(k) for k in positive],
                             disable=["parser", "ner", 'textcat'],
                             n_process=-1) 
        for ent in doc if ent.pos_==pos]
    
    return filtered_speech

adj_pos = filter_by_pos(positive)
adj_neutral = filter_by_pos(neutral)
adj_neg = filter_by_pos(negative)

In [14]:
voc = list({unidecode(k) for k,_ in Counter(adj_neg + adj_pos + adj_neutral).most_common(5000)})

In [15]:
voc[::100]

['',
 'amor!!gracias',
 'suculento',
 'intacto',
 'castulo',
 'paralelo',
 'adjunto',
 'tenir',
 '-no',
 'sentarse',
 'exactamente',
 'explotar',
 'incesante',
 'eeeeeeeeeeeeeeeeeenooooorme',
 'jordano',
 'informativo',
 'inteligente',
 'monedero',
 'desinformado',
 'indiscriminado',
 'chingonas',
 'irreductible',
 'celiaco',
 'matrimonial',
 'resacar',
 'informarse',
 'intubar',
 'revelador',
 'p.d',
 'resfriar',
 'tequiiero',
 'pillar',
 'artico',
 'ceramica',
 'tenebroso',
 'colapsar',
 'riguroso',
 'acojonar',
 'funcional',
 'blancanieves',
 'vecinal',
 'seco',
 'someter',
 'repetir',
 'hidrico',
 'tequiero',
 'ritmico',
 'oso',
 'jo']

# Modeling

## Text to num vector
BOW = Bag of word

In [55]:
def test_model(y_train, y_train_pred, y_test, y_test_pred):
    acc_train = accuracy_score(y_train, y_train_pred)
    acc_test = accuracy_score(y_test, y_test_pred)
    print(f'Accuracy train: {acc_train}')
    print(f'Accuracy test: {acc_test}')
    print(f'Confusion matrx train:\n{confusion_matrix(y_train, y_train_pred)}')
    print(f'Confusion matrx test:\n{confusion_matrix(y_test, y_test_pred)}')
    print(f'Report train:\n{classification_report(y_train, y_train_pred)}')
    print(f'Report test:\n{classification_report(y_test, y_test_pred)}')
    
def train_and_test_model(clf, x_train, y_train, x_test, y_test):
    clf.fit(x_train, y_train);
    y_train_pred = clf.predict(x_train)
    y_test_pred = clf.predict(x_test)
    
    test_model(y_train, y_train_pred, y_test, y_test_pred)
    
    return clf

In [17]:
hashing = HashingVectorizer(
    analyzer = "word",
    n_features=1000,
#     tokenizer=tokenizer_stemmer,
    preprocessor=None,
    #  stop_words=stopwords.words("spanish"),
    binary=False,
    strip_accents='unicode',
    encoding='utf-8',
    ngram_range=(1,3), )

vectorizer = CountVectorizer(  
    analyzer = 'word',
    tokenizer = tokenizer,
    strip_accents='unicode',
    preprocessor=cleaner,
    lowercase = True,
    stop_words = STOP_WORD_ES,
    max_features=5000,
#     min_df = 0.,
#     max_df = 1.9,
    ngram_range=(1, 3),
    binary=False,
)

vect_custom = CountVectorizer(  
    analyzer = 'word',
    tokenizer = tokenizer,
    strip_accents='unicode',
    preprocessor=cleaner,
    lowercase = True,
    stop_words = STOP_WORD_ES,
    max_features=5000,
#     min_df = 0.,
#     max_df = 1.9,
    ngram_range=(1, 3),
    binary=False,
    vocabulary=voc
)

In [18]:
vect_custom.fit(corpus[:10])
vect_custom.inverse_transform(corpus[0:2])

[array(['', 'posicional'], dtype='<U27')]

## Binary classification
### Logit

In [19]:
x_train_bin, x_test_bin, y_train_bin, y_test_bin =  \
    train_test_split(corpus, target, stratify=target, random_state=42, test_size=0.2)

In [20]:
steps=Pipeline(steps=[('processor', clone(vectorizer)),
                      ("clf",  LogisticRegressionCV(cv=3, class_weight='balanced', n_jobs=-1))])
logit_bin_pipeline = train_and_test_model(steps, x_train_bin, y_train_bin, x_test_bin, y_test_bin)

Accuracy train: 0.6883045151143523
Accuracy test: 0.6497659614304437
Confusion matrx train:
[[13826  7192]
 [26103 59698]]
Confusion matrx test:
[[ 2929  2325]
 [ 7028 14423]]
Report:
              precision    recall  f1-score   support

           0       0.35      0.66      0.45     21018
           1       0.89      0.70      0.78     85801

    accuracy                           0.69    106819
   macro avg       0.62      0.68      0.62    106819
weighted avg       0.79      0.69      0.72    106819



In [22]:
y_test_bin[:10]

[1, 1, 1, 1, 0, 0, 1, 1, 1, 1]

In [23]:
tweet = x_test_bin[2]
print(tweet)
print(f'probability: {logit_bin_pipeline.predict_proba([tweet])}, class: {logit_bin_pipeline.predict([tweet])}')

¡Vamos #APorLaDécima! Conoce la historia de las victorias del Real Madrid en la UCL: bit.ly/1i10lZRpic.twitter.com/omTDc1chu5
probability: [[0.01785559 0.98214441]], class: [1]


## Multiclass classification

Lamatizated dataset

In [24]:
x_train_raw, x_test_raw, y_train_raw, y_test_raw =  \
    train_test_split(corpus_multi, target_multi, stratify=target_multi, random_state=42, test_size=0.2)

Original dataset

In [25]:
x_train_lema, x_test_lema, y_train_lema, y_test_lema =  \
    train_test_split(corpus_multi_lema, target_multi, stratify=target_multi, random_state=42, test_size=0.2)

### Logit

balanced

In [49]:
steps = Pipeline(steps=[('processor', clone(vectorizer)),
                        ('clf', LogisticRegressionCV(cv=3,
                                                     class_weight='balanced',
                                                     n_jobs=-1,
                                                     solver='lbfgs',
                                                     multi_class='auto'))])
logit_multi_pipeline = train_and_test_model(steps, x_train_raw, y_train_raw, x_test_raw, y_test_raw)

Accuracy train: 0.5339558003627326
Accuracy test: 0.5053289149612646
Confusion matrx train:
[[11659  4103  5256]
 [13747 40254 17824]
 [21412 20914 43475]]
Confusion matrx test:
[[ 2514  1179  1561]
 [ 3536  9601  4820]
 [ 5454  5543 10454]]
Report:
              precision    recall  f1-score   support

          -1       0.25      0.55      0.34     21018
           0       0.62      0.56      0.59     71825
           1       0.65      0.51      0.57     85801

    accuracy                           0.53    178644
   macro avg       0.51      0.54      0.50    178644
weighted avg       0.59      0.53      0.55    178644



In [51]:
tweet = x_test_raw[3]
print(tweet)
print(f'probability: {logit_multi_pipeline.predict_proba([tweet])}, class: {logit_multi_pipeline.predict([tweet])}')

@SetteSettamen @Conpdepau ya veo ya.... 😡😡😡😡 a ti tambien sette que te tiro un botellin a la cabeza eh ajajjajaja
probability: [[0.6269805  0.0983468  0.27467269]], class: [-1]


**ATTENTION**: We need to pay mora atention to vocabulary to improve the performance. there are a lot of word with the same meaning, pluran and singular, etc

In [52]:
logit_multi_pipeline.steps[0][1].get_feature_names()[::300]

['abajo',
 'arte',
 'caminar',
 'conmigo',
 'dejas',
 'efe',
 'fav si',
 'hacerme',
 'israelí',
 'madrid',
 'muchas ganas',
 'palacio',
 'pongan',
 'ranking',
 'según',
 'terminal',
 'versión']

unbalanced

In [53]:
steps = Pipeline(steps=[('processor', clone(vectorizer)),
                        ('clf', LogisticRegressionCV(cv=3, class_weight=None, n_jobs=-1, solver='lbfgs', multi_class='auto'))])
unbalanced_logit_multi_pipeline = train_and_test_model(steps, x_train_raw, y_train_raw, x_test_raw, y_test_raw)

Accuracy train: 0.6011285013770403
Accuracy test: 0.5791276700550804
Confusion matrx train:
[[ 1074  5324 14620]
 [  374 39363 32088]
 [  486 18364 66951]]
Confusion matrx test:
[[  210  1346  3698]
 [   91  9371  8495]
 [  145  5022 16284]]
Report:
              precision    recall  f1-score   support

          -1       0.56      0.05      0.09     21018
           0       0.62      0.55      0.58     71825
           1       0.59      0.78      0.67     85801

    accuracy                           0.60    178644
   macro avg       0.59      0.46      0.45    178644
weighted avg       0.60      0.60      0.57    178644



In [31]:
# # Define the hiperparameters (Very simple because of memory issues)
# hiperparam_log_multi_pipeline = {  
#     "processor__max_features":[100, 1000, 3000, 5000],
#     "processor__ngram_range":[(1, 1), (1, 2), (1, 3),],
#     "processor__binary":[True, False],
# #     "clf__cv":[3, 5],
# #     "clf__class_weight":['balanced', None]",
#     }

# log_grid_search = GridSearchCV(estimator=log_multi_pipeline,
#                               param_grid=hiperparam_log_multi_pipeline,
#                               scoring="accuracy",
#                               cv=2,
#                               n_jobs=-1
#                              )
        

# #We train the model
# log_grid_search.fit(x_train, y_train)

# #This will take a very long time

# print(f'best score: {log_grid_search.best_score_}')

# print(f'score: {log_grid_search.score(x_test, y_test)}')

### MultinomialNB

In [32]:
steps_mnb = Pipeline(steps=[('processor', clone(vectorizer)),
                            ('clf', MultinomialNB())])
mnb_multi_pipeline = train_and_test_model(steps_mnb, x_train_lema, y_train_lema, x_test_lema, y_test_lema)

Accuracy train: 0.5930565818051544
Accuracy test: 0.5811651963637992
Confusion matrx train:
[[ 3474  5038 12506]
 [ 2691 40611 28523]
 [ 3479 20461 61861]]
Confusion matrx test:
[[  746  1258  3250]
 [  648 10000  7309]
 [  881  5360 15210]]
Report:
              precision    recall  f1-score   support

          -1       0.36      0.17      0.23     21018
           0       0.61      0.57      0.59     71825
           1       0.60      0.72      0.66     85801

    accuracy                           0.59    178644
   macro avg       0.53      0.48      0.49    178644
weighted avg       0.58      0.59      0.58    178644



In [33]:
steps_mnb_raw = Pipeline(steps=[('processor', clone(vectorizer)),
                            ('clf', MultinomialNB())])
mnb_multi_pipeline_raw = train_and_test_model(steps_mnb_raw, x_train_raw, y_train_raw, x_test_raw, y_test_raw)


Accuracy train: 0.59251920019704
Accuracy test: 0.5778290269132595
Confusion matrx train:
[[ 3056  5151 12811]
 [ 2134 40206 29485]
 [ 2843 20370 62588]]
Confusion matrx test:
[[  604  1278  3372]
 [  560  9779  7618]
 [  744  5283 15424]]
Report:
              precision    recall  f1-score   support

          -1       0.38      0.15      0.21     21018
           0       0.61      0.56      0.58     71825
           1       0.60      0.73      0.66     85801

    accuracy                           0.59    178644
   macro avg       0.53      0.48      0.48    178644
weighted avg       0.58      0.59      0.58    178644



In [34]:
mnb_multi_pipeline_raw.predict([k for k , p in zip(x_test_raw, y_test_raw) if p==-1][::100])

array([ 1,  1,  1, -1,  0,  1,  0,  1,  1,  1,  1,  1, -1,  1,  1,  0,  1,
        1,  1,  1,  1,  0, -1,  0,  0,  1,  1,  1,  1,  0,  1,  1,  1,  1,
        1,  1,  0,  1,  1,  1, -1,  0, -1,  1,  1,  1,  1,  1,  1,  1,  1,
        0,  1])

### BernulliNB

In [35]:
steps_bernulli = Pipeline(steps=[('processor', clone(vectorizer)),
                          ('clf', BernoulliNB())])
bernulli_multi_pipeline = train_and_test_model(steps_bernulli, x_train_lema, y_train_lema, x_test_lema, y_test_lema)

Accuracy train: 0.5906215713933858
Accuracy test: 0.5782544444942009
Confusion matrx train:
[[ 3696  4700 12622]
 [ 3039 38960 29826]
 [ 3979 18967 62855]]
Confusion matrx test:
[[  807  1182  3265]
 [  753  9565  7639]
 [ 1045  4952 15454]]
Report:
              precision    recall  f1-score   support

          -1       0.34      0.18      0.23     21018
           0       0.62      0.54      0.58     71825
           1       0.60      0.73      0.66     85801

    accuracy                           0.59    178644
   macro avg       0.52      0.48      0.49    178644
weighted avg       0.58      0.59      0.58    178644



### SVM

In [36]:
steps_svc = Pipeline(steps=[('processor', clone(vectorizer)),
                            ('clf', LinearSVC(class_weight='balanced'))])
svc_multi_pipeline = train_and_test_model(steps_svc, x_train_lema, y_train_lema, x_test_lema, y_test_lema)



Accuracy train: 0.5925863728980543
Accuracy test: 0.5596480229277686
Confusion matrx train:
[[ 7251  4468  9299]
 [ 6048 40913 24864]
 [ 8707 19396 57698]]
Confusion matrx test:
[[ 1468  1235  2551]
 [ 1565  9710  6682]
 [ 2388  5246 13817]]
Report:
              precision    recall  f1-score   support

          -1       0.33      0.34      0.34     21018
           0       0.63      0.57      0.60     71825
           1       0.63      0.67      0.65     85801

    accuracy                           0.59    178644
   macro avg       0.53      0.53      0.53    178644
weighted avg       0.59      0.59      0.59    178644



In [37]:
steps_svc_raw = Pipeline(steps=[('processor', clone(vectorizer)),
                            ('clf', LinearSVC(class_weight='balanced'))])
svc_multi_pipeline_raw = train_and_test_model(steps_svc_raw, x_train_raw, y_train_raw, x_test_raw, y_test_raw)

Accuracy train: 0.5886175858131255
Accuracy test: 0.5552371143253773
Confusion matrx train:
[[ 7000  4694  9324]
 [ 5817 40672 25336]
 [ 8573 19747 57481]]
Confusion matrx test:
[[ 1449  1270  2535]
 [ 1523  9577  6857]
 [ 2304  5375 13772]]
Report:
              precision    recall  f1-score   support

          -1       0.33      0.33      0.33     21018
           0       0.62      0.57      0.59     71825
           1       0.62      0.67      0.65     85801

    accuracy                           0.59    178644
   macro avg       0.53      0.52      0.52    178644
weighted avg       0.59      0.59      0.59    178644



In [38]:
arr = svc_multi_pipeline.steps[-1][1].coef_[0,:]
vocb = np.array(svc_multi_pipeline.steps[0][1].get_feature_names())
vocb[np.argsort(arr)[-50:]]


array(['social caer', 'seguridad social caer', 'potter', 'telefono',
       'formación bolsa', 'horapunta', 'zumbido', 'deprimir', 'ice',
       'eternoalfredo leyenda', 'voto perdóname claudio', 'keylor ser',
       'temporada youtu', 'temporada youtu be', 'registrar subir agostar',
       'dioooos', 'considerar comer', 'vuelve campeón', 'as ser',
       'respirar hondo', 'horrible', 'aburrimiento', 'personar agostar',
       'buenos día familia', 'cst', 'charlie sheen', 'mariló montero',
       'unir causar', 'luchar vencer', 'ardiente di stéfano', 'alcaraz',
       'casar real madrid', 'pobrecillo', 'campeón copa',
       'in mexico city', 'ocho mañana', 'reino unido',
       'parir subir agostar', 'mesar cárcel',
       'retar alsicebucketchallenge', 'play store', 'casar real', 'mun',
       'bucket challenge', 'prohibir miembro',
       'internacional ciudad méxico', 'casillas saber', 'retar haber',
       'falto respetar', 'plancha'], dtype='<U35')

## Ensembles 

In [39]:
voting = VotingClassifier(
    estimators=[('mnb', MultinomialNB()),
                ('bernulli', BernoulliNB()),
                ('lr', LogisticRegression(solver='lbfgs', multi_class='auto')),
                ('svc', LinearSVC())
               ],
    voting='hard',
    n_jobs=None, )
steps_voting = Pipeline(steps=[('processor', clone(vectorizer)),
                            ('voting', voting),
                            ])
ensamble_multi_pipeline = train_and_test_model(steps_voting, x_train_lema, y_train_lema, x_test_lema, y_test_lema)



Accuracy train: 0.5994883679272743
Accuracy test: 0.5814562715507591
Confusion matrx train:
[[ 3418  5502 12098]
 [ 2526 43089 26210]
 [ 3258 21955 60588]]
Confusion matrx test:
[[  723  1406  3125]
 [  617 10451  6889]
 [  844  5812 14795]]
Report:
              precision    recall  f1-score   support

          -1       0.37      0.16      0.23     21018
           0       0.61      0.60      0.61     71825
           1       0.61      0.71      0.66     85801

    accuracy                           0.60    178644
   macro avg       0.53      0.49      0.50    178644
weighted avg       0.58      0.60      0.59    178644



# Testing models with artificial tweets

In [40]:
tweet = ['la concha de la lora',
         'que mal dia que tuve hoy',
         'odio ir a la escuela los viernes',
         'las noticias informan',
         'que mal todo esto',
         'buen día!!! te deseo lo mejor',
         'mal mal mal mal',
         'los capitalistas son una basura',
         'los comunistas son lo mejor',
         'los medios de comunicación mienten'

        ]

In [41]:
logit_bin_pipeline.predict(Lematizator().fit_transform(tweet))

array([1, 0, 1, 1, 0, 1, 0, 0, 1, 0])

In [42]:
logit_multi_pipeline.predict(tweet)

array([ 1, -1, -1,  0, -1,  1, -1, -1,  1, -1])

In [43]:
svc_multi_pipeline.predict(tweet)

array([ 1,  1, -1,  1,  1,  1, -1,  1,  1,  0])

In [61]:
pred_pos = unbalanced_logit_multi_pipeline.predict(x_test_raw)
pred_nn = logit_bin_pipeline.predict(x_test_raw) - 1
result = np.where(pred_pos == 1, pred_pos, pred_nn)

In [62]:
print(classification_report(y_test_raw, result))

              precision    recall  f1-score   support

          -1       0.15      0.19      0.17      5254
           0       0.61      0.32      0.42     17957
           1       0.57      0.76      0.65     21451

    accuracy                           0.51     44662
   macro avg       0.44      0.42      0.41     44662
weighted avg       0.54      0.51      0.50     44662



In [63]:
print(classification_report(y_test_raw, pred_pos))

              precision    recall  f1-score   support

          -1       0.47      0.04      0.07      5254
           0       0.60      0.52      0.56     17957
           1       0.57      0.76      0.65     21451

    accuracy                           0.58     44662
   macro avg       0.55      0.44      0.43     44662
weighted avg       0.57      0.58      0.55     44662



# Save best model

In [64]:
model_path = '../models/linear/'

In [73]:
logit_binary_file = os.path.join(model_path, 'logit_bin_pipeline_raw.joblib')
joblib.dump(logit_bin_pipeline, logit_binary_file)

['../models/linear/logit_bin_pipeline_raw.joblib']

In [74]:
logit_multi_file = os.path.join(model_path, 'logit_multi_pipeline.joblib')
if not logit_multi_pipeline.steps[0][0]=='lematizator':
    logit_multi_pipeline.steps.insert(0, ('lematizator', Lematizator()))
joblib.dump(logit_multi_pipeline, logit_multi_file)

['../models/linear/logit_multi_pipeline.joblib']

In [75]:
logit_multi_raw_file = os.path.join(model_path, 'logit_multi_pipeline_raw.joblib')
joblib.dump(logit_multi_pipeline, logit_multi_file)

['../models/linear/logit_multi_pipeline.joblib']

In [76]:
mnb_multi_file = os.path.join(model_path, 'mnb_multi_pipeline.joblib')
if not mnb_multi_pipeline.steps[0][0]=='lematizator':
    mnb_multi_pipeline.steps.insert(0, ('lematizator', Lematizator()))
joblib.dump(mnb_multi_pipeline, mnb_multi_file)

['../models/linear/mnb_multi_pipeline.joblib']

In [77]:
bernulli_multi_file = os.path.join(model_path, 'bernulli_multi_pipeline.joblib')
if not bernulli_multi_pipeline.steps[0][0]=='lematizator':
    bernulli_multi_pipeline.steps.insert(0, ('lematizator', Lematizator()))
joblib.dump(bernulli_multi_pipeline, bernulli_multi_file)

['../models/linear/bernulli_multi_pipeline.joblib']

In [78]:
svc_multi_file = os.path.join(model_path, 'svc_multi_pipeline.joblib')
if not svc_multi_pipeline.steps[0][0]=='lematizator':
    svc_multi_pipeline.steps.insert(0, ('lematizator', Lematizator()))
joblib.dump(svc_multi_pipeline, svc_multi_file)

['../models/linear/svc_multi_pipeline.joblib']

In [79]:
svc_multi_file_raw = os.path.join(model_path, 'svc_multi_pipeline_raw.joblib')
joblib.dump(svc_multi_pipeline_raw, svc_multi_file_raw)

['../models/linear/svc_multi_pipeline_raw.joblib']

In [80]:
ensamble_multi_file = os.path.join(model_path, 'ensemble_multi_pipeline.joblib')
if not ensamble_multi_pipeline.steps[0][0]=='lematizator':
    ensamble_multi_pipeline.steps.insert(0, ('lematizator', Lematizator()))
joblib.dump(ensamble_multi_pipeline, ensamble_multi_file)

['../models/linear/ensemble_multi_pipeline.joblib']