DETECÇÃO DE "FAKE NEWS" UTILIZANDO INTELIGENCIA ARTIFICIAL.<br>
César Augusto C. Filho1, Francisco Cláudio de Q. Nascimento2

Criando Base de Dados

In [None]:
import pandas as pd
import numpy as np

# Definindo categorias de notícias verdadeiras e falsas
true_news = [
    "Government announces new policies to boost economy.",
    "Scientists discover new species in the Amazon rainforest.",
    "Education reforms to improve school standards announced.",
    "Healthcare workers to receive bonuses for their hard work.",
    "Local community organizes charity event for the homeless."
]

fake_news = [
    "Celebrity caught in shocking scandal, shocking the world.",
    "Aliens have been found living among us, says expert.",
    "New miracle cure promises to cure all diseases instantly.",
    "Politician accused of being a secret agent for another country.",
    "Famous athlete retires to become a professional gamer."
]

# Expandir as amostras
np.random.seed(42)
true_news_expanded = np.random.choice(true_news, 500, replace=True)
fake_news_expanded = np.random.choice(fake_news, 500, replace=True)

# Criando os rótulos
true_labels = np.ones(500)  # 1 para notícias verdadeiras
fake_labels = np.zeros(500) # 0 para notícias falsas

# Combinando os dados
news_data = np.concatenate((true_news_expanded, fake_news_expanded))
labels = np.concatenate((true_labels, fake_labels))

# Criando o DataFrame
df = pd.DataFrame({
    'text': news_data,
    'label': labels
})

# Exibindo as primeiras linhas do dataset
print(df.head())

# Salvando o dataset em um arquivo CSV



In [None]:
df.to_csv('fake_news_dataset.csv', index=False)

Carregamento e Preparação dos Dados

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix
import joblib
import numpy as np

# Carregando o dataset do arquivo CSV
df = pd.read_csv('fake_news_dataset.csv')

# Dividindo os dados em recursos (X) e rótulos (y)
X = df['text']
y = df['label']


Extração de Características

In [None]:
# Bag-of-Words
bow_vectorizer = CountVectorizer()
X_bow = bow_vectorizer.fit_transform(X)

# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X)

# N-grams (bigramas, por exemplo)
ngram_vectorizer = CountVectorizer(ngram_range=(1, 2))
X_ngram = ngram_vectorizer.fit_transform(X)


Divisão dos Dados em Treinamento e Teste

In [None]:
# Dividindo os dados em treinamento e teste (usando Bag-of-Words como exemplo)
X_train, X_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)


Construção e Treinamento dos Classificadores

In [None]:
# Inicialização dos classificadores
nb_classifier = MultinomialNB()
rf_classifier = RandomForestClassifier(random_state=42)
lr_classifier = LogisticRegression(max_iter=1000, random_state=42)

# Treinamento dos classificadores
nb_classifier.fit(X_train, y_train)
rf_classifier.fit(X_train, y_train)
lr_classifier.fit(X_train, y_train)


Avaliação dos Modelos

In [None]:
# Previsões
nb_pred = nb_classifier.predict(X_test)
rf_pred = rf_classifier.predict(X_test)
lr_pred = lr_classifier.predict(X_test)

# Cálculo do F1-score
nb_f1 = f1_score(y_test, nb_pred)
rf_f1 = f1_score(y_test, rf_pred)
lr_f1 = f1_score(y_test, lr_pred)

# Exibição do F1-score
print(f'Naive Bayes F1 Score: {nb_f1}')
print(f'Random Forest F1 Score: {rf_f1}')
print(f'Logistic Regression F1 Score: {lr_f1}')

# Matrizes de Confusão
print('Naive Bayes Confusion Matrix:', confusion_matrix(y_test, nb_pred))
print('Random Forest Confusion Matrix:', confusion_matrix(y_test, rf_pred))
print('Logistic Regression Confusion Matrix:', confusion_matrix(y_test, lr_pred))


Ajuste de Parâmetros com GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

# Parâmetros para ajuste de Naive Bayes (exemplo)
nb_params = {'alpha': [0.5, 1.0, 2.0]}
grid_nb = GridSearchCV(nb_classifier, nb_params, cv=5, scoring='f1')
grid_nb.fit(X_train, y_train)

# Parâmetros para ajuste de Logistic Regression (exemplo)
lr_params = {'C': [0.1, 1, 10]}
grid_lr = GridSearchCV(lr_classifier, lr_params, cv=5, scoring='f1')
grid_lr.fit(X_train, y_train)

# Seleção dos melhores parâmetros
best_nb = grid_nb.best_estimator_
best_lr = grid_lr.best_estimator_

Classificação com o Melhor Modelo e Salvamento

In [None]:
# Supondo que Logistic Regression foi o melhor modelo
final_model = best_lr

# Salvando o modelo final
joblib.dump(final_model, 'fake_news_model.pkl')

# Carregando o modelo salvo para uso futuro
# loaded_model = joblib.load('fake_news_model.pkl')

# Previsão com probabilidade
pred_prob = final_model.predict_proba(X_test)


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
import joblib

# Passo 1: Carregar e Pré-processar a Base de Dados
df = pd.read_csv('WELFake_Dataset.csv')  # Substitua pelo caminho correto

# Remover linhas com valores nulos
df.dropna(inplace=True)

# Definir as características e rótulos
X = df['text']
y = df['label']

# Passo 2: Extração de Features
# Bag-of-Words
bow_vectorizer = CountVectorizer(max_features=10000, ngram_range=(1, 1))
X_bow = bow_vectorizer.fit_transform(X)

# TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_tfidf = tfidf_vectorizer.fit_transform(X)

# N-grams (usando TF-IDF)
X_ngrams = tfidf_vectorizer.fit_transform(X)

# Divisão dos Dados em Treino e Teste
X_train_bow, X_test_bow, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)
X_train_tfidf, X_test_tfidf, _, _ = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)
X_train_ngrams, X_test_ngrams, _, _ = train_test_split(X_ngrams, y, test_size=0.2, random_state=42)

# Passo 3: Treinamento dos Modelos de Classificação
models = {
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=200, random_state=42)
}

# Avaliar os Modelos
for model_name, model in models.items():
    model.fit(X_train_bow, y_train)
    y_pred = model.predict(X_test_bow)
    print(f"{model_name} F1 Score: {f1_score(y_test, y_pred, average='weighted')}")
    print(f"{model_name} Confusion Matrix:\n {confusion_matrix(y_test, y_pred)}\n")

# Passo 4: Ajuste de Parâmetros com GridSearchCV
param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs'],
}

grid_search = GridSearchCV(LogisticRegression(max_iter=200), param_grid, cv=5, scoring='f1_weighted')
grid_search.fit(X_train_bow, y_train)

print(f"Melhores Parâmetros: {grid_search.best_params_}")
print(f"Melhor F1 Score: {grid_search.best_score_}")

# Passo 5: Salvar o Melhor Modelo
best_model = grid_search.best_estimator_
joblib.dump(best_model, 'best_model.pkl')

# Passo 6: Carregar e Usar o Modelo
model = joblib.load('best_model.pkl')

# Exemplo de Classificação de Notícia
user_input = "Texto da notícia a ser classificada"
user_input_vectorized = tfidf_vectorizer.transform([user_input])
prediction = model.predict(user_input_vectorized)
print(f"Previsão: {'Fake' if prediction[0] == 1 else 'Real'}")


Naive Bayes F1 Score: 0.875804795174513
Naive Bayes Confusion Matrix:
 [[6199  882]
 [ 895 6332]]

Random Forest F1 Score: 0.9441406830433378
Random Forest Confusion Matrix:
 [[6590  491]
 [ 308 6919]]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression F1 Score: 0.9523274549027712
Logistic Regression Confusion Matrix:
 [[6683  398]
 [ 284 6943]]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Melhores Parâmetros: {'C': 0.1, 'solver': 'liblinear'}
Melhor F1 Score: 0.9558461734087811
Previsão: Fake


CORRIGIDO E VÁLIDO - FUNCIONANDO 100%

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
import joblib

# Passo 1: Carregar e Pré-processar a Base de Dados
df = pd.read_csv('WELFake_Dataset.csv')  # Substitua pelo caminho correto

# Remover linhas com valores nulos
df.dropna(inplace=True)

# Definir as características e rótulos
X = df['text']
y = df['label']

# Passo 2: Extração de Features
# Bag-of-Words
bow_vectorizer = CountVectorizer(max_features=10000, ngram_range=(1, 1))
X_bow = bow_vectorizer.fit_transform(X)

# TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_tfidf = tfidf_vectorizer.fit_transform(X)

# Divisão dos Dados em Treino e Teste para Bag-of-Words
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(X_bow, y, test_size=0.2, random_state=42)

# Divisão dos Dados em Treino e Teste para TF-IDF
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Passo 3: Treinamento dos Modelos de Classificação
models = {
    'Naive Bayes (Bag-of-Words)': MultinomialNB(),
    'Random Forest (Bag-of-Words)': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression (TF-IDF)': LogisticRegression(max_iter=200, random_state=42)
}

# Avaliar os Modelos
for model_name, model in models.items():
    if 'Logistic Regression' in model_name:
        # Treinamento e avaliação para TF-IDF
        model.fit(X_train_tfidf, y_train_tfidf)
        y_pred = model.predict(X_test_tfidf)
        print(f"{model_name} F1 Score: {f1_score(y_test_tfidf, y_pred, average='weighted')}")
        print(f"{model_name} Confusion Matrix:\n {confusion_matrix(y_test_tfidf, y_pred)}\n")
    else:
        # Treinamento e avaliação para Bag-of-Words
        model.fit(X_train_bow, y_train_bow)
        y_pred = model.predict(X_test_bow)
        print(f"{model_name} F1 Score: {f1_score(y_test_bow, y_pred, average='weighted')}")
        print(f"{model_name} Confusion Matrix:\n {confusion_matrix(y_test_bow, y_pred)}\n")

# Passo 4: Ajuste de Parâmetros com GridSearchCV
param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs'],
}

grid_search = GridSearchCV(LogisticRegression(max_iter=200), param_grid, cv=5, scoring='f1_weighted')
grid_search.fit(X_train_tfidf, y_train_tfidf)

print(f"Melhores Parâmetros: {grid_search.best_params_}")
print(f"Melhor F1 Score: {grid_search.best_score_}")

# Passo 5: Salvar o Melhor Modelo
best_model = grid_search.best_estimator_
joblib.dump(best_model, 'best_model.pkl')

# Passo 6: Carregar e Usar o Modelo
model = joblib.load('best_model.pkl')

# Exemplo de Classificação de Notícia
user_input = "Texto da notícia a ser classificada"
user_input_vectorized = tfidf_vectorizer.transform([user_input])
prediction = model.predict(user_input_vectorized)
print(f"Previsão: {'Fake' if prediction[0] == 1 else 'Real'}")


Naive Bayes (Bag-of-Words) F1 Score: 0.875804795174513
Naive Bayes (Bag-of-Words) Confusion Matrix:
 [[6199  882]
 [ 895 6332]]

Random Forest (Bag-of-Words) F1 Score: 0.9441406830433378
Random Forest (Bag-of-Words) Confusion Matrix:
 [[6590  491]
 [ 308 6919]]

Logistic Regression (TF-IDF) F1 Score: 0.9492516203440949
Logistic Regression (TF-IDF) Confusion Matrix:
 [[6660  421]
 [ 305 6922]]

Melhores Parâmetros: {'C': 10, 'solver': 'liblinear'}
Melhor F1 Score: 0.9601166680395744
Previsão: Fake


com inclusão do Recall, precisão e acuracia. além do f1 score e matriz de confusçao. Porem apresenta um erro devido o algoritmo de Logistic Regression não conseguiu convergir, correção possível: LogisticRegression(max_iter=500)

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, recall_score, precision_score
from sklearn.model_selection import GridSearchCV
import joblib

# Passo 1: Carregar e Pré-processar a Base de Dados
df = pd.read_csv('WELFake_Dataset.csv')  # Substitua pelo caminho correto

# Remover linhas com valores nulos
df.dropna(inplace=True)

# Definir as características e rótulos
X = df['text']
y = df['label']

# Passo 2: Extração de Features
# Bag-of-Words
bow_vectorizer = CountVectorizer(max_features=10000, ngram_range=(1, 1))
X_bow = bow_vectorizer.fit_transform(X)

# TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_tfidf = tfidf_vectorizer.fit_transform(X)

# N-grams (usando TF-IDF)
X_ngrams = tfidf_vectorizer.fit_transform(X)

# Divisão dos Dados em Treino e Teste
X_train_bow, X_test_bow, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)
X_train_tfidf, X_test_tfidf, _, _ = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)
X_train_ngrams, X_test_ngrams, _, _ = train_test_split(X_ngrams, y, test_size=0.2, random_state=42)

# Passo 3: Treinamento dos Modelos de Classificação
models = {
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=200, random_state=42)
}

# Avaliar os Modelos
for model_name, model in models.items():
    if model_name != 'Logistic Regression':
        model.fit(X_train_bow, y_train)
        y_pred = model.predict(X_test_bow)
    else:
        model.fit(X_train_tfidf, y_train)
        y_pred = model.predict(X_test_tfidf)

    # Calcular e exibir as métricas
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred)

    print(f"{model_name} Metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Confusion Matrix:\n{conf_matrix}\n")

# Passo 4: Ajuste de Parâmetros com GridSearchCV
param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs'],
}

grid_search = GridSearchCV(LogisticRegression(max_iter=200), param_grid, cv=5, scoring='f1_weighted')
grid_search.fit(X_train_bow, y_train)

print(f"Melhores Parâmetros: {grid_search.best_params_}")
print(f"Melhor F1 Score: {grid_search.best_score_}")

# Passo 5: Salvar o Melhor Modelo
best_model = grid_search.best_estimator_
joblib.dump(best_model, 'best_model.pkl')

# Passo 6: Carregar e Usar o Modelo
model = joblib.load('best_model.pkl')

# Exemplo de Classificação de Notícia
user_input = "Texto da notícia a ser classificada"
user_input_vectorized = tfidf_vectorizer.transform([user_input])
prediction = model.predict(user_input_vectorized)
print(f"Previsão: {'Fake' if prediction[0] == 1 else 'Real'}")

Naive Bayes Metrics:
Accuracy: 0.8758
Precision: 0.8758
Recall: 0.8758
F1 Score: 0.8758
Confusion Matrix:
[[6199  882]
 [ 895 6332]]

Random Forest Metrics:
Accuracy: 0.9442
Precision: 0.9444
Recall: 0.9442
F1 Score: 0.9441
Confusion Matrix:
[[6590  491]
 [ 308 6919]]

Logistic Regression Metrics:
Accuracy: 0.9493
Precision: 0.9494
Recall: 0.9493
F1 Score: 0.9493
Confusion Matrix:
[[6660  421]
 [ 305 6922]]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Melhores Parâmetros: {'C': 0.1, 'solver': 'liblinear'}
Melhor F1 Score: 0.9558461734087811
Previsão: Fake
