### Projeto Detector de Fake News - PyCaret ###

**Importação das bibliotecas**

In [1]:
import os
import warnings

import pandas as pd
import numpy as np
from pycaret.nlp import *
from nltk.corpus import stopwords
from minio import Minio

warnings.filterwarnings('ignore')

In [2]:
# MinIO connection settings are read from environment variables so that
# credentials do not have to be committed with the notebook. Each setting
# falls back to the value that was previously hard-coded here, so an
# unconfigured local run behaves as before.
client = Minio(
    os.environ.get("MINIO_ENDPOINT", "localhost:9000"),
    access_key=os.environ.get("MINIO_ACCESS_KEY", "minioadmin"),
    secret_key=os.environ.get("MINIO_SECRET_KEY", "minioadmin"),
    # Any value other than "true" (case-insensitive) keeps plain HTTP.
    secure=os.environ.get("MINIO_SECURE", "false").strip().lower() == "true",
)

**Baixa o arquivo do MinIO e lê com Pandas**

In [3]:
# Download the processed dataset from the "processado" bucket to a local file
# with the same relative path, then load that file into a DataFrame.
client.fget_object(
    bucket_name="processado",
    object_name="etl_fakenews.parquet",
    file_path="processado/etl_fakenews.parquet",
)

news = pd.read_parquet(path="processado/etl_fakenews.parquet")

In [6]:
# Preview the first five rows of the loaded dataset.
news.head(n=5)

Unnamed: 0,text,subject,date,Status
0,"As U.S. budget fight looms, Republicans flip t...",politicsNews,"December 31, 2017",True
1,U.S. military to accept transgender recruits o...,politicsNews,"December 29, 2017",True
2,Senior U.S. Republican senator: 'Let Mr. Muell...,politicsNews,"December 31, 2017",True
3,FBI Russia probe helped by Australian diplomat...,politicsNews,"December 30, 2017",True
4,Trump wants Postal Service to charge 'much mor...,politicsNews,"December 29, 2017",True


**Etapa de NLP (Natural Language Processing)**

**Modelo utilizando LDA**

In [7]:
# Initialise the PyCaret NLP experiment: 'text' is the column to process,
# the session seed is fixed at 42, and the run is logged under the given
# experiment name.
news_setup = setup(
    data=news,
    target='text',
    session_id=42,
    log_experiment=True,
    experiment_name='log_nlp_15_07_22',
)

Description,Value
session_id,42
Documents,44898
Vocab Size,69499
Custom Stopwords,False


In [8]:
# Train the 'lda' topic model with multi-core processing enabled.
model_lda = create_model(
    model='lda',
    multi_core=True,
)

In [9]:
# Attach the LDA topic weights to the dataset; the preview below shows the
# resulting Topic_* and dominant-topic columns.
lda_data = assign_model(model=model_lda)

In [10]:
# Preview the first five rows with the assigned topic columns.
lda_data.head(n=5)

Unnamed: 0,text,subject,date,Status,Topic_0,Topic_1,Topic_2,Topic_3,Dominant_Topic,Perc_Dominant_Topic
0,budget fight loom flip fiscal script conservat...,politicsNews,"December 31, 2017",True,0.955739,0.00089,0.000892,0.042479,Topic 0,0.96
1,military people allow first time enlist start ...,politicsNews,"December 29, 2017",True,0.572835,0.16247,0.263579,0.001116,Topic 0,0.57
2,senior republican senator let investigation li...,politicsNews,"December 31, 2017",True,0.20782,0.601269,0.001652,0.189259,Topic 1,0.6
3,help australian diplomat campaign tell austral...,politicsNews,"December 30, 2017",True,0.060445,0.917274,0.020272,0.002009,Topic 1,0.92
4,want postal_service charge much call charge mu...,politicsNews,"December 29, 2017",True,0.818576,0.136466,0.042884,0.002074,Topic 0,0.82


In [11]:
# Keep only the label ('Status') and the per-topic weights as classifier
# inputs. Reassigning instead of using inplace=True, together with
# errors="ignore", makes this cell safe to re-run: a second execution no
# longer raises KeyError for columns that were already removed.
lda_data = lda_data.drop(
    columns=['text', 'subject', 'date', 'Dominant_Topic', 'Perc_Dominant_Topic'],
    errors="ignore",
)

In [12]:
# Confirm that only 'Status' and the Topic_* columns remain.
lda_data.head(n=5)

Unnamed: 0,Status,Topic_0,Topic_1,Topic_2,Topic_3
0,True,0.955739,0.00089,0.000892,0.042479
1,True,0.572835,0.16247,0.263579,0.001116
2,True,0.20782,0.601269,0.001652,0.189259
3,True,0.060445,0.917274,0.020272,0.002009
4,True,0.818576,0.136466,0.042884,0.002074


**Etapa de Classificação**

In [13]:
# NOTE(review): pycaret.nlp was star-imported earlier in this notebook; after
# this star-import, shared names such as setup and create_model are expected
# to refer to the classification module. Cell order therefore matters.
from pycaret.classification import *

In [14]:
# Configure the classification experiment on the LDA topic weights:
# 'Status' is the label, 80% of the rows are used for training, seed 42.
cl_news = setup(
    data=lda_data,
    target='Status',
    train_size=0.8,
    session_id=42,
)

Unnamed: 0,Description,Value
0,session_id,42
1,Target,Status
2,Target Type,Binary
3,Label Encoded,"False: 0, True: 1"
4,Original Data,"(44898, 5)"
5,Missing Values,False
6,Numeric Features,4
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [15]:
# Train and rank the candidate classifiers with default settings; the score
# grid is shown below.
# NOTE(review): `models` is not referenced by any later cell visible here.
models = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.881,0.9431,0.8451,0.8987,0.871,0.7607,0.762,1.61
et,Extra Trees Classifier,0.8732,0.9471,0.8422,0.8856,0.8633,0.7451,0.746,0.717
xgboost,Extreme Gradient Boosting,0.8589,0.932,0.8436,0.8575,0.8505,0.717,0.7171,1.284
lightgbm,Light Gradient Boosting Machine,0.8555,0.9305,0.8423,0.8522,0.8472,0.7101,0.7102,0.349
gbc,Gradient Boosting Classifier,0.8525,0.9273,0.8402,0.8483,0.8442,0.7042,0.7042,1.333
ada,Ada Boost Classifier,0.8482,0.9233,0.8299,0.8479,0.8388,0.6955,0.6957,0.38
dt,Decision Tree Classifier,0.8429,0.8414,0.8123,0.8507,0.831,0.6843,0.685,0.086
lr,Logistic Regression,0.8412,0.9144,0.8637,0.8139,0.838,0.6825,0.6838,2.991
svm,SVM - Linear Kernel,0.838,0.0,0.8551,0.8148,0.8338,0.676,0.6779,0.047
knn,K Neighbors Classifier,0.8341,0.902,0.8074,0.8379,0.8224,0.6668,0.6672,0.089


In [16]:
# Random forest evaluated with 10-fold cross-validation.
rf_model = create_model(
    estimator='rf',
    fold=10,
)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8778,0.9378,0.839,0.8973,0.8672,0.7542,0.7557
1,0.8778,0.9438,0.8408,0.8959,0.8675,0.7543,0.7556
2,0.882,0.9447,0.8537,0.8934,0.8731,0.7629,0.7636
3,0.8742,0.9374,0.8332,0.895,0.863,0.7469,0.7486
4,0.8875,0.9467,0.8496,0.9081,0.8779,0.7739,0.7753
5,0.8853,0.9459,0.8467,0.9061,0.8754,0.7694,0.7709
6,0.8789,0.9391,0.8367,0.9016,0.868,0.7564,0.7582
7,0.8886,0.949,0.8625,0.8993,0.8805,0.7763,0.7769
8,0.8736,0.9381,0.8396,0.8885,0.8633,0.7459,0.7469
9,0.8842,0.9482,0.8489,0.9017,0.8745,0.7671,0.7683


In [17]:
# Hyperparameter search for the random forest, optimised for precision.
# choose_better=True makes tune_model return the input estimator when the
# search does not beat it: in the recorded run the tuned model's mean
# precision (~0.855) fell below the untuned model's (~0.899).
# NOTE(review): rf_tune is not referenced by any later cell visible here.
rf_tune = tune_model(
    rf_model,
    optimize='Precision',
    choose_better=True,
)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8547,0.9258,0.832,0.8581,0.8448,0.7082,0.7085
1,0.8552,0.9309,0.8367,0.8558,0.8462,0.7095,0.7096
2,0.8552,0.931,0.8467,0.8487,0.8477,0.7098,0.7098
3,0.8413,0.9218,0.8321,0.834,0.833,0.6818,0.6818
4,0.8619,0.9314,0.8479,0.8599,0.8539,0.723,0.7231
5,0.8558,0.9302,0.842,0.853,0.8475,0.7107,0.7108
6,0.8511,0.9255,0.828,0.8545,0.841,0.701,0.7013
7,0.8622,0.9365,0.8525,0.8571,0.8548,0.7237,0.7237
8,0.8571,0.9318,0.839,0.8576,0.8482,0.7133,0.7135
9,0.8677,0.9369,0.8489,0.8698,0.8593,0.7345,0.7347


In [18]:
# Extra-trees classifier, cross-validated with the experiment defaults.
et_model = create_model(estimator='et')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8653,0.9425,0.8308,0.8792,0.8543,0.7292,0.7302
1,0.8722,0.9464,0.8449,0.8816,0.8629,0.7433,0.7439
2,0.8742,0.9471,0.8525,0.8793,0.8657,0.7474,0.7477
3,0.8658,0.9427,0.8321,0.8794,0.8551,0.7303,0.7313
4,0.8769,0.9511,0.842,0.8932,0.8669,0.7527,0.7538
5,0.8769,0.9492,0.8484,0.8879,0.8677,0.7528,0.7535
6,0.8714,0.9471,0.8309,0.8914,0.8601,0.7413,0.7429
7,0.8842,0.9524,0.8631,0.8902,0.8764,0.7675,0.7678
8,0.8697,0.9411,0.8378,0.8822,0.8595,0.7381,0.739
9,0.875,0.9518,0.8396,0.8912,0.8646,0.7487,0.7498


In [19]:
# Combine the (untuned) random forest and extra-trees models into one blend.
model_blend = blend_models(
    estimator_list=[rf_model, et_model],
)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8719,0.9422,0.8372,0.8871,0.8614,0.7426,0.7437
1,0.8772,0.9467,0.8455,0.8909,0.8676,0.7533,0.7542
2,0.8842,0.948,0.8572,0.8949,0.8757,0.7674,0.768
3,0.8731,0.9422,0.8391,0.8879,0.8628,0.7449,0.7459
4,0.8845,0.9511,0.8537,0.8984,0.8755,0.7679,0.7687
5,0.8836,0.9495,0.8525,0.8977,0.8745,0.7662,0.7671
6,0.8783,0.9449,0.8362,0.901,0.8674,0.7553,0.7571
7,0.8834,0.9523,0.8584,0.8923,0.875,0.7657,0.7662
8,0.8764,0.9413,0.8431,0.8911,0.8664,0.7515,0.7525
9,0.8839,0.9519,0.8507,0.8997,0.8745,0.7666,0.7676


In [20]:
# Tune the blended model for precision. choose_better=True returns the
# untuned blend if the search does not improve on it, so a tuning regression
# cannot silently propagate to the evaluation step.
tune_blend = tune_model(
    model_blend,
    optimize="Precision",
    choose_better=True,
)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8783,0.9388,0.8413,0.8964,0.868,0.7554,0.7567
1,0.8781,0.9447,0.842,0.8955,0.8679,0.7549,0.7561
2,0.8831,0.9458,0.8555,0.8942,0.8744,0.7651,0.7658
3,0.8744,0.9386,0.835,0.8941,0.8635,0.7475,0.749
4,0.8881,0.9476,0.852,0.9072,0.8787,0.775,0.7763
5,0.8856,0.9467,0.8479,0.9056,0.8758,0.7699,0.7714
6,0.8797,0.9405,0.8391,0.9013,0.8691,0.7581,0.7598
7,0.8884,0.9499,0.8642,0.8973,0.8805,0.7758,0.7763
8,0.8752,0.9389,0.8419,0.8899,0.8652,0.7493,0.7503
9,0.8847,0.9491,0.8513,0.9009,0.8754,0.7683,0.7693


In [21]:
# Open the interactive evaluation widget for the tuned blend.
evaluate_model(estimator=tune_blend)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

**Etapa de NLP**

**Modelo utilizando NMF**

In [22]:
# NOTE(review): re-imported so that setup / create_model / assign_model
# presumably refer to the NLP module again (the classification star-import
# above uses the same names) before the NMF topic model is built.
from pycaret.nlp import *

In [23]:
# Initialise the NLP experiment again for the NMF run, with the same
# arguments as the earlier NLP setup (text column 'text', seed 42, same
# experiment name).
nmf_setup = setup(
    data=news,
    target='text',
    session_id=42,
    log_experiment=True,
    experiment_name='log_nlp_15_07_22',
)

Description,Value
session_id,42
Documents,44898
Vocab Size,69499
Custom Stopwords,False


In [24]:
# Train the 'nmf' topic model with default settings.
nmf_model = create_model(model='nmf')

In [26]:
# Attach the NMF topic weights to the dataset; the preview below shows the
# resulting Topic_* and dominant-topic columns.
nmf_data = assign_model(model=nmf_model)

In [27]:
# Preview the first five rows with the assigned NMF topic columns.
nmf_data.head(n=5)

Unnamed: 0,text,subject,date,Status,Topic_0,Topic_1,Topic_2,Topic_3,Dominant_Topic,Perc_Dominant_Topic
0,budget fight loom flip fiscal script conservat...,politicsNews,"December 31, 2017",True,0.010765,0.0,0.004218,0.004843,Topic 0,0.54
1,military people allow first time enlist start ...,politicsNews,"December 29, 2017",True,0.01045,0.0,0.008259,0.002196,Topic 0,0.5
2,senior republican senator let investigation li...,politicsNews,"December 31, 2017",True,0.009593,0.0,0.022831,0.001596,Topic 2,0.67
3,help australian diplomat campaign tell austral...,politicsNews,"December 30, 2017",True,0.011002,0.0,0.016178,0.0,Topic 2,0.6
4,want postal_service charge much call charge mu...,politicsNews,"December 29, 2017",True,0.006461,0.0,0.003981,0.004223,Topic 0,0.44


In [28]:
# Keep only the label ('Status') and the per-topic weights as classifier
# inputs. Reassigning instead of using inplace=True, together with
# errors="ignore", makes this cell safe to re-run: a second execution no
# longer raises KeyError for columns that were already removed.
nmf_data = nmf_data.drop(
    columns=['text', 'subject', 'date', 'Dominant_Topic', 'Perc_Dominant_Topic'],
    errors="ignore",
)

**Etapa de Classificação**

In [29]:
# NOTE(review): switches the shared star-imported names (setup, create_model,
# ...) back to the classification module after the NLP re-import above.
# Cell order therefore matters.
from pycaret.classification import *

In [30]:
# Configure the classification experiment on the NMF topic weights:
# 'Status' is the label, 80% of the rows are used for training, seed 42.
cl_news_nmf = setup(
    data=nmf_data,
    target='Status',
    train_size=0.8,
    session_id=42,
)

Unnamed: 0,Description,Value
0,session_id,42
1,Target,Status
2,Target Type,Binary
3,Label Encoded,"False: 0, True: 1"
4,Original Data,"(44898, 5)"
5,Missing Values,False
6,Numeric Features,4
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [31]:
# Rank the candidate classifiers on the NMF features. The return value is
# left unassigned, so the cell displays it after the score grid (the
# ExtraTreesClassifier repr below).
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9198,0.9738,0.9007,0.9286,0.9144,0.839,0.8394,0.611
rf,Random Forest Classifier,0.9185,0.969,0.8947,0.9313,0.9126,0.8363,0.8369,1.198
xgboost,Extreme Gradient Boosting,0.9029,0.9647,0.8942,0.901,0.8976,0.8053,0.8054,1.103
lightgbm,Light Gradient Boosting Machine,0.9015,0.9643,0.8941,0.8984,0.8962,0.8025,0.8025,0.68
gbc,Gradient Boosting Classifier,0.9004,0.9627,0.8949,0.8957,0.8953,0.8004,0.8005,1.075
ada,Ada Boost Classifier,0.897,0.9604,0.8947,0.8895,0.8921,0.7935,0.7936,0.318
dt,Decision Tree Classifier,0.8913,0.8902,0.8703,0.898,0.8839,0.7817,0.7821,0.049
knn,K Neighbors Classifier,0.8881,0.9443,0.8769,0.8866,0.8817,0.7755,0.7756,0.153
lda,Linear Discriminant Analysis,0.8816,0.9462,0.8615,0.8864,0.8738,0.7623,0.7626,0.027
ridge,Ridge Classifier,0.8766,0.0,0.8595,0.8785,0.8689,0.7524,0.7526,0.031


ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=42, verbose=0,
                     warm_start=False)

In [32]:
# Extra-trees classifier on the NMF features (top of the comparison above).
et_nmf_model = create_model(estimator='et')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9201,0.973,0.904,0.9262,0.915,0.8396,0.8399
1,0.917,0.9747,0.8923,0.9304,0.911,0.8334,0.834
2,0.9212,0.9726,0.9122,0.9214,0.9168,0.842,0.842
3,0.9173,0.9738,0.8912,0.9321,0.9112,0.8339,0.8346
4,0.9198,0.9735,0.9023,0.9272,0.9146,0.8391,0.8393
5,0.9168,0.9726,0.8923,0.9299,0.9107,0.8328,0.8334
6,0.919,0.9738,0.8958,0.9313,0.9132,0.8373,0.8379
7,0.9262,0.977,0.9116,0.9318,0.9216,0.852,0.8521
8,0.9179,0.9709,0.9022,0.9233,0.9126,0.8351,0.8353
9,0.9226,0.9764,0.9028,0.9323,0.9173,0.8446,0.845


In [33]:
# Random forest on the NMF features (second in the comparison above).
rf_nmf_model = create_model(estimator='rf')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9204,0.9701,0.8952,0.9346,0.9145,0.8401,0.8407
1,0.9173,0.9701,0.8912,0.9321,0.9112,0.8339,0.8346
2,0.9195,0.9687,0.9052,0.9241,0.9146,0.8386,0.8387
3,0.9179,0.9669,0.8888,0.9353,0.9115,0.835,0.8359
4,0.9193,0.9666,0.8953,0.9324,0.9134,0.8379,0.8385
5,0.9168,0.9664,0.8906,0.9315,0.9106,0.8328,0.8335
6,0.9198,0.9692,0.8947,0.934,0.9139,0.839,0.8396
7,0.9182,0.9724,0.897,0.9285,0.9125,0.8357,0.8361
8,0.9165,0.9677,0.897,0.9251,0.9108,0.8323,0.8326
9,0.9192,0.9719,0.8923,0.935,0.9131,0.8378,0.8386


In [34]:
# Combine the extra-trees and random-forest models into one blend.
blend_model_nmf = blend_models(
    estimator_list=[et_nmf_model, rf_nmf_model],
)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9193,0.9731,0.8981,0.9297,0.9136,0.8379,0.8383
1,0.917,0.9733,0.8923,0.9304,0.911,0.8334,0.834
2,0.9234,0.9716,0.9122,0.9258,0.919,0.8464,0.8465
3,0.9201,0.9719,0.8958,0.9335,0.9143,0.8395,0.8402
4,0.9204,0.9712,0.9005,0.9299,0.915,0.8401,0.8405
5,0.9195,0.97,0.8947,0.9335,0.9137,0.8384,0.8391
6,0.9209,0.9728,0.8976,0.9337,0.9153,0.8412,0.8418
7,0.9234,0.9756,0.9064,0.9309,0.9185,0.8463,0.8466
8,0.9156,0.9702,0.897,0.9234,0.91,0.8306,0.8309
9,0.9226,0.9753,0.9028,0.9323,0.9173,0.8446,0.845


In [35]:
# Tune the NMF blend for precision. This result is what gets finalized and
# saved below, so choose_better=True is set: if the search does not improve
# on the untuned blend, the untuned blend is returned instead of a worse
# model.
tune_nmf_model = tune_model(
    blend_model_nmf,
    optimize="Precision",
    choose_better=True,
)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9193,0.971,0.8952,0.9323,0.9134,0.8378,0.8384
1,0.9176,0.9708,0.8929,0.9311,0.9116,0.8345,0.8351
2,0.9198,0.9693,0.907,0.9232,0.915,0.8391,0.8393
3,0.9184,0.9683,0.8906,0.9349,0.9122,0.8361,0.837
4,0.9201,0.968,0.8976,0.932,0.9145,0.8396,0.8401
5,0.9168,0.9674,0.8917,0.9304,0.9107,0.8328,0.8335
6,0.9204,0.9703,0.8958,0.9341,0.9146,0.8401,0.8407
7,0.9201,0.9733,0.9011,0.9288,0.9148,0.8396,0.8399
8,0.9167,0.9684,0.8981,0.9247,0.9112,0.8329,0.8332
9,0.9201,0.9726,0.8952,0.934,0.9142,0.8395,0.8401


In [36]:
# Finalize the tuned NMF blend before it is saved.
final_model = finalize_model(estimator=tune_nmf_model)

In [38]:
# Persist the finalized pipeline locally; the upload cell below reads it back
# as "final_model.pkl".
# NOTE(review): only the classifier pipeline is saved. nmf_model, which
# produces the Topic_* features this classifier consumes, is not persisted in
# the cells visible here -- confirm how new text is featurized at inference.
save_model(
    model=final_model,
    model_name="final_model",
)

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='Status',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strat...
                                                                       max_leaf_nodes=None,
                                                                       max_samples=None,
                                                                       min_impurity_decrease=

In [39]:
# Upload the saved model file to the "modelo" bucket under the same object
# name. The call is the cell's last expression, so its result is displayed.
client.fput_object(
    bucket_name="modelo",
    object_name="final_model.pkl",
    file_path="final_model.pkl",
)

<minio.helpers.ObjectWriteResult at 0x14fa8341c40>