In [1]:
import os
import pandas as pd
os.chdir("/usr/app/src")
import numpy as np
import data_sourcing, data_preprocessing, data_splitting, feature_extraction,modeling
from sklearn import metrics
import nltk
import eli5
from eli5.lime import TextExplainer
import joblib
from eli5 import show_weights
from eli5 import show_prediction

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
df = data_sourcing.get("/usr/app/examples/data_example.csv", ";", 0, ["text","cats"])
df.head()

Unnamed: 0,text,cats
0,"� @ Governador Valadares, Minas Gerais https:/...",Neutro
1,"�� @ Governador Valadares, Minas Gerais https:...",Neutro
2,��� https://t.co/BnDsO34qK0,Neutro
3,��� PSOL vai questionar aumento de vereadores ...,Negativo
4,""" bom é bandido morto""\nDeputado Cabo Júlio é ...",Neutro


In [3]:
df["clean_text"] = data_preprocessing.clean(df, text_col = "text")
df.head()

Unnamed: 0,text,cats,clean_text
0,"� @ Governador Valadares, Minas Gerais https:/...",Neutro,governador valadares minas gerais https co b3...
1,"�� @ Governador Valadares, Minas Gerais https:...",Neutro,governador valadares minas gerais https co dp...
2,��� https://t.co/BnDsO34qK0,Neutro,https co bndso34qk0
3,��� PSOL vai questionar aumento de vereadores ...,Negativo,psol vai questionar aumento de vereadores pre...
4,""" bom é bandido morto""\nDeputado Cabo Júlio é ...",Neutro,bom é bandido morto deputado cabo júlio é con...


In [4]:
df["tokens"] = df["clean_text"].apply(feature_extraction.tokenizer("multi_word"))
df.head()

Unnamed: 0,text,cats,clean_text,tokens
0,"� @ Governador Valadares, Minas Gerais https:/...",Neutro,governador valadares minas gerais https co b3...,"[governador, valadares, minas, gerais, https, ..."
1,"�� @ Governador Valadares, Minas Gerais https:...",Neutro,governador valadares minas gerais https co dp...,"[governador, valadares, minas, gerais, https, ..."
2,��� https://t.co/BnDsO34qK0,Neutro,https co bndso34qk0,"[https, co, bndso34qk0]"
3,��� PSOL vai questionar aumento de vereadores ...,Negativo,psol vai questionar aumento de vereadores pre...,"[psol, vai, questionar, aumento, de, vereadore..."
4,""" bom é bandido morto""\nDeputado Cabo Júlio é ...",Neutro,bom é bandido morto deputado cabo júlio é con...,"[bom, é, bandido, morto, deputado, cabo, júlio..."


In [5]:
df["tokens_wosw"] = data_preprocessing.stop_words(df, token_col = "tokens")
df.head()

Unnamed: 0,text,cats,clean_text,tokens,tokens_wosw
0,"� @ Governador Valadares, Minas Gerais https:/...",Neutro,governador valadares minas gerais https co b3...,"[governador, valadares, minas, gerais, https, ...","[governador, valadares, minas, gerais, https, ..."
1,"�� @ Governador Valadares, Minas Gerais https:...",Neutro,governador valadares minas gerais https co dp...,"[governador, valadares, minas, gerais, https, ...","[governador, valadares, minas, gerais, https, ..."
2,��� https://t.co/BnDsO34qK0,Neutro,https co bndso34qk0,"[https, co, bndso34qk0]","[https, co, bndso34qk0]"
3,��� PSOL vai questionar aumento de vereadores ...,Negativo,psol vai questionar aumento de vereadores pre...,"[psol, vai, questionar, aumento, de, vereadore...","[psol, vai, questionar, aumento, vereadores, p..."
4,""" bom é bandido morto""\nDeputado Cabo Júlio é ...",Neutro,bom é bandido morto deputado cabo júlio é con...,"[bom, é, bandido, morto, deputado, cabo, júlio...","[bom, bandido, morto, deputado, cabo, júlio, c..."


In [6]:
df["num_cat"] = data_preprocessing.label_encoding(df, y_col = "cats")
df.head()

Unnamed: 0,text,cats,clean_text,tokens,tokens_wosw,num_cat
0,"� @ Governador Valadares, Minas Gerais https:/...",Neutro,governador valadares minas gerais https co b3...,"[governador, valadares, minas, gerais, https, ...","[governador, valadares, minas, gerais, https, ...",1
1,"�� @ Governador Valadares, Minas Gerais https:...",Neutro,governador valadares minas gerais https co dp...,"[governador, valadares, minas, gerais, https, ...","[governador, valadares, minas, gerais, https, ...",1
2,��� https://t.co/BnDsO34qK0,Neutro,https co bndso34qk0,"[https, co, bndso34qk0]","[https, co, bndso34qk0]",1
3,��� PSOL vai questionar aumento de vereadores ...,Negativo,psol vai questionar aumento de vereadores pre...,"[psol, vai, questionar, aumento, de, vereadore...","[psol, vai, questionar, aumento, vereadores, p...",0
4,""" bom é bandido morto""\nDeputado Cabo Júlio é ...",Neutro,bom é bandido morto deputado cabo júlio é con...,"[bom, é, bandido, morto, deputado, cabo, júlio...","[bom, bandido, morto, deputado, cabo, júlio, c...",1


In [7]:
train, valid, test = data_splitting.split(df, train_ratio=0.8, valid_ratio=0.1, test_ratio=0.1)
print(len(df))
print(len(train))
print(len(valid))
print(len(test))

8127
6502
812
813


In [8]:
model = modeling.model("1")
vec = model.vectorizing("count", "split")
model_id = model.fit(train, "cats", "clean_text")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END model=RandomForestClassifier(random_state=1), model__max_depth=28.0; total time=   0.9s
[CV] END model=RandomForestClassifier(random_state=1), model__max_depth=28.0; total time=   0.9s
[CV] END model=RandomForestClassifier(random_state=1), model__max_depth=28.0; total time=   0.9s
[CV] END model=RandomForestClassifier(random_state=1), model__max_depth=28.0; total time=   0.9s
[CV] END model=RandomForestClassifier(random_state=1), model__max_depth=28.0; total time=   0.9s
[CV] END model=RandomForestClassifier(random_state=1), model__max_depth=4.0; total time=   0.4s
[CV] END model=RandomForestClassifier(random_state=1), model__max_depth=4.0; total time=   0.4s
[CV] END model=RandomForestClassifier(random_state=1), model__max_depth=4.0; total time=   0.4s
[CV] END model=RandomForestClassifier(random_state=1), model__max_depth=4.0; total time=   0.4s
[CV] END model=RandomForestClassifier(random_state=1), model__max_dept

In [19]:
model.eli5()

Weight,Feature
0.0583  ± 0.1565,estado
0.0504  ± 0.1836,helicópteros
0.0320  ± 0.1529,calamidade
0.0254  ± 0.1092,financeira
0.0224  ± 0.0903,governo
0.0196  ± 0.0924,compra
0.0189  ± 0.0720,rt
0.0188  ± 0.0836,drogas
0.0172  ± 0.0821,dois
0.0152  ± 0.0566,co


In [19]:
import spacy
import pandas as pd
import numpy as np

In [46]:
from spacy.lang.pt.stop_words import STOP_WORDS

In [21]:
nlp = spacy.load("pt_core_news_sm")

In [38]:
import string
punctuations = string.punctuation
punctuations

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [39]:
from spacy.lang.pt import Portuguese
parser = Portuguese()

In [47]:
stopwords = list(STOP_WORDS)

In [48]:
def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    return mytokens

In [49]:
# ex1 = "Felipe Castanhari esta no podcast do flow e o papo esta muito bom"
ex1 = "He was walking with the walker in the Wall he may had sat and run with the runner"

In [50]:
spacy_tokenizer(ex1)

[]

In [9]:
import joblib
from eli5.sklearn import explain_weights_sklearn
from eli5.formatters import format_as_dataframe, format_as_dataframes
te = TextExplainer(random_state=42)
loaded_model = joblib.load('/usr/app/models/1.joblib')

te.fit("dasd a",loaded_model.predict_proba)
# te.show_prediction(target_names=model.named_steps["vec"].get_feature_names())
format_as_dataframe(explain_weights_sklearn(loaded_model, feature_names=loaded_model.named_steps["vec"].get_feature_names(), top=10))

Unnamed: 0,feature,weight,std
0,estado,0.058291,0.078267
1,helicópteros,0.050356,0.091794
2,calamidade,0.032039,0.076448
3,financeira,0.025352,0.054609
4,governo,0.02235,0.045167
5,compra,0.019631,0.046214
6,rt,0.018914,0.035989
7,drogas,0.018822,0.041795
8,dois,0.017164,0.041051
9,co,0.015248,0.028294


In [10]:
te = TextExplainer(random_state=42)
te.fit("deu bom", loaded_model.predict_proba)
te.show_prediction(target_names=loaded_model.named_steps["vec"].get_feature_names())

Contribution?,Feature
-0.281,<BIAS>
-3.073,Highlighted in text (sum)

Contribution?,Feature
-0.024,<BIAS>
-0.073,Highlighted in text (sum)

Contribution?,Feature
-0.076,<BIAS>
-0.811,Highlighted in text (sum)
