In [2]:
import warnings
import re
import pickle

import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from nltk.stem.snowball import PortugueseStemmer
from sklearn.ensemble import StackingClassifier

warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv('../data/pt-br/dadosposanalise.csv')
df.head(2)

Unnamed: 0,title,body,label
0,É que Tite prestou homenagem em túmulo de Fi...,"Apesar da semelhança física, foto que circula ...",1
1,É que esposa do CEO da Pfizer morreu por com...,Myriam Bourla esteve em evento em 10 de novemb...,1


In [4]:
df.isna().sum()

title    0
body     1
label    0
dtype: int64

In [5]:
df = df.dropna()

In [6]:
df.isna().sum()

title    0
body     0
label    0
dtype: int64

In [7]:
df = df.reset_index()

In [9]:
X = df['title'].to_list()
y = df[['label']]

In [10]:
ps = PorterStemmer()
words = stopwords.words('portuguese')



In [11]:
stemmer = PortugueseStemmer()
analyzer = TfidfVectorizer().build_analyzer()

def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

stem_vectorizer = TfidfVectorizer(analyzer=stemmed_words)

In [12]:
transformer = Pipeline(
    steps= [
        ('stemmer_tfidf', stem_vectorizer)
    ]
)

In [13]:
decisiontree = DecisionTreeClassifier()

mlp = MLPClassifier(hidden_layer_sizes=(100,), 
                           activation='relu',
                           solver = 'adam',
                           alpha=0.001)

lr = LogisticRegression()

svc = SVC(probability=True)

models = {
    'decisiontree': decisiontree,
    'mlp': mlp,
    'lr': lr,
    'svc':svc
}

clf = StackingClassifier(
    estimators=[
        ('decisiontree', decisiontree),
        ('lr', lr),
        ('mlp', mlp),
        ('svc',svc)
    ],
    final_estimator=svc,
    cv=10
)


In [14]:
resultsacc = {model_name: None for model_name in models}
resultsf1 = {model_name: None for model_name in models}
for model_name, model in models.items():
    reg = Pipeline(
            steps=[
                ('preprocessor', transformer),
                ('regressor', model)
            ]
        )
        #scores = cross_val_score(reg, X, y, scoring='accuracy', cv=10)
    scores = cross_validate(reg, X, y, scoring=[ 'accuracy', 'f1'], cv=10)
    f1 = scores['test_f1'].mean()*100
    acc = scores['test_accuracy'].mean()*100
    resultsf1[model_name] = '{:.2f}'.format(f1)
    resultsacc[model_name] = '{:.2f}'.format(acc)

In [15]:
resultsacc

{'decisiontree': '84.31', 'mlp': '85.74', 'lr': '88.88', 'svc': '89.82'}

In [16]:
resultsf1

{'decisiontree': '84.04', 'mlp': '85.26', 'lr': '88.57', 'svc': '89.61'}

In [17]:
#best_clf = models[max(resultsacc)]

In [18]:
model = Pipeline(
        steps=[
            ('preprocessor', transformer),
            ('regressor', clf)
        ]
    )

model.fit(X, y)

Pipeline(steps=[('preprocessor',
                 Pipeline(steps=[('stemmer_tfidf',
                                  TfidfVectorizer(analyzer=<function stemmed_words at 0x0000014CFFE7FE50>))])),
                ('regressor',
                 StackingClassifier(cv=10,
                                    estimators=[('decisiontree',
                                                 DecisionTreeClassifier()),
                                                ('lr', LogisticRegression()),
                                                ('mlp',
                                                 MLPClassifier(alpha=0.001)),
                                                ('svc', SVC(probability=True))],
                                    final_estimator=SVC(probability=True)))])

In [19]:
pickle.dump(model, open('../models/modelofinal.joblib', 'wb'))