## Fake or real news

Esse notebook se propõe a fazer uma rede neural que detecta se uma notícia é falsa ou verdadeira.

## Import das dependências e bibliotecas necessárias

In [19]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer

# Widen numpy's printed lines so arrays wrap less in cell output.
np.set_printoptions(linewidth=140)

# Kaggle kernels export KAGGLE_KERNEL_RUN_TYPE; when it is set, the data is
# read from the kernel's ../input directory instead of being downloaded.
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    path = Path('../input/fake-and-real-news-dataset')
else:
    path = Path('fake-and-real-news-dataset')
    if not path.exists():
        # Imported lazily: both modules are only needed for the one-off
        # local download.
        import zipfile
        import kaggle
        # "fake-and-real-news-dataset" is a Kaggle *dataset*, not a
        # competition, so it has to be fetched through the dataset endpoint
        # using its "owner/slug" identifier.
        # NOTE(review): owner slug taken from the dataset's public URL —
        # confirm it still resolves.
        kaggle.api.dataset_download_cli(f'clmentbisaillon/{path.name}', path='.')
        # Context manager guarantees the archive handle is closed.
        with zipfile.ZipFile(f'{path}.zip') as archive:
            archive.extractall(path)

## Download e preparo dos dados de entrada

In [20]:
# Read each CSV and keep a random sample of 3000 rows.
# Sampling is used because there was not enough memory to work with the
# whole file.
df_true = pd.read_csv(path/'True.csv').sample(n=3000, random_state=42)
df_false = pd.read_csv(path/'Fake.csv').sample(n=3000, random_state=31)

# Expected-output column: 0 for fake news, 1 for real news
df_false['class'] = 0
df_true['class'] = 1

# 80/20 train/test split, done separately for the fake and the real frames
fake_train, fake_test = train_test_split(df_false, test_size=0.2, random_state=42)
true_train, true_test = train_test_split(df_true, test_size=0.2, random_state=91)

# Stack fake rows on top of real rows for each split
train_data = pd.concat([fake_train, true_train])
test_data = pd.concat([fake_test, true_test])

# Labels (y)
y_train = train_data['class']
y_test = test_data['class']

# One bag-of-words vectorizer per textual column
vectorizer_text, vectorizer_title, vectorizer_subject = (
    CountVectorizer() for _ in range(3)
)

# Article body: vocabulary is learned on the training split only
X_train_text = vectorizer_text.fit_transform(train_data['text']).toarray()
X_test_text = vectorizer_text.transform(test_data['text']).toarray()

# Headline
X_train_title = vectorizer_title.fit_transform(train_data['title']).toarray()
X_test_title = vectorizer_title.transform(test_data['title']).toarray()

# Subject
X_train_subject = vectorizer_subject.fit_transform(train_data['subject']).toarray()
X_test_subject = vectorizer_subject.transform(test_data['subject']).toarray()

# Final feature matrices: columns are laid out as (text, title, subject)
X_train = np.hstack((X_train_text, X_train_title, X_train_subject))
X_test = np.hstack((X_test_text, X_test_title, X_test_subject))

# Treinamento

## Funções

In [21]:
# Sigmoid activation function
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated without overflow.

    The textbook form overflows in ``exp(-x)`` for large negative ``x`` and
    emits RuntimeWarnings. Here the exponential is always taken of a
    non-positive number, so it stays within (0, 1].

    Args:
        x: A scalar or array-like of real values.

    Returns:
        A numpy float for scalar input, otherwise an ndarray with the same
        shape as ``x``; every value lies in [0, 1].
    """
    x = np.asarray(x, dtype=float)
    # exp(-|x|) is in (0, 1], so neither branch below can overflow.
    e = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) — same value.
    out = np.where(x >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Return scalars as scalars so element-wise callers (e.g. np.vectorize)
    # keep getting a plain numpy float.
    return out[()] if out.ndim == 0 else out

# Training function for the network
def train(X, y, num_epochs, learning_rate, seed=None):
    """Fit a single sigmoid unit with full-batch gradient descent.

    Minimizes binary cross-entropy and prints the cost every 10th epoch.

    Args:
        X: 2-D numpy array, one row per sample and one column per feature.
        y: 0/1 labels, one per row of ``X`` (list, ndarray or pandas Series).
        num_epochs: Epoch budget. The loop runs ``num_epochs + 1`` updates
            (epochs 0..num_epochs inclusive), matching the
            "n/num_epochs+1" progress lines it prints.
        learning_rate: Step size applied to both the weights and the bias.
        seed: Optional seed for the weight initialization. ``None`` (the
            default) draws from numpy's global RNG.

    Returns:
        Tuple ``(W, b)``: weight vector of shape ``(num_features,)`` and the
        scalar bias.

    Raises:
        ValueError: If ``X`` has no rows or ``y`` has a different length.
    """
    num_samples, num_features = X.shape

    # Convert the labels once; a flat float array also sidesteps pandas
    # index handling in the arithmetic below.
    labels = np.asarray(y, dtype=float).ravel()
    if num_samples == 0:
        raise ValueError("X must contain at least one sample")
    if labels.shape[0] != num_samples:
        raise ValueError(
            f"X has {num_samples} rows but y has {labels.shape[0]} labels"
        )

    # Initialize weights (standard normal) and bias
    if seed is None:
        W = np.random.randn(num_features)
    else:
        W = np.random.default_rng(seed).standard_normal(num_features)
    b = 0.1

    # Keeps log() away from zero when the sigmoid saturates
    epsilon = 1e-8

    # Training loop
    for epoch in range(num_epochs+1):
        # Forward propagation (sigmoid works on whole arrays; no
        # element-wise np.vectorize wrapper needed)
        Z = np.dot(X, W) + b
        A = sigmoid(Z)

        # The cost is only reported, never used for the update, so compute
        # it just on the epochs that print it.
        if epoch % 10 == 0:
            cost = -(1 / num_samples) * np.sum(labels * np.log(A + epsilon) + (1 - labels) * np.log(1 - A + epsilon))
            print(f"Época {epoch+1}/{num_epochs+1}, Custo: {cost}")

        # Backward propagation
        dZ = A - labels
        dW = (1 / num_samples) * np.dot(X.T, dZ)
        db = (1 / num_samples) * np.sum(dZ)

        # Gradient-descent update of weights and bias
        W -= learning_rate * dW
        b -= learning_rate * db

    return W, b

## Treino

In [23]:
# Hyperparameters for the training run
num_epochs = 200
learning_rate = 0.025

# train() draws its initial weights from numpy's global RNG; seeding it here
# makes the cost curve and the metrics reproducible on Restart & Run All.
np.random.seed(42)

# Fit the model on the training split
W, b = train(X_train, y_train, num_epochs, learning_rate)

Época 1/201, Custo: 5.53405629098931
Época 11/201, Custo: 5.1566042338089915
Época 21/201, Custo: 4.873214074172414
Época 31/201, Custo: 4.624952690435603
Época 41/201, Custo: 4.398728361519359
Época 51/201, Custo: 4.188230270655906
Época 61/201, Custo: 3.989724812778058
Época 71/201, Custo: 3.8078848926164053
Época 81/201, Custo: 3.645912715449007
Época 91/201, Custo: 3.5004429026112063
Época 101/201, Custo: 3.3693492599282857
Época 111/201, Custo: 3.246402901577488
Época 121/201, Custo: 3.1332036087520816
Época 131/201, Custo: 3.0299807345475083
Época 141/201, Custo: 2.9342727707732266
Época 151/201, Custo: 2.8435432540835235
Época 161/201, Custo: 2.7559933940330215
Época 171/201, Custo: 2.673133760198992
Época 181/201, Custo: 2.595243421012412
Época 191/201, Custo: 2.522283457400143
Época 201/201, Custo: 2.454113531531142


# Validação

In [24]:
# Prediction function
def predict(X, W, b, threshold=0.5):
    """Classify samples with a trained sigmoid unit.

    Args:
        X: 2-D array, one row per sample, with columns in the same order as
            the matrix the weights were trained on.
        W: Weight vector of shape ``(num_features,)``.
        b: Scalar bias.
        threshold: Probability strictly above which a sample is labelled 1.
            Defaults to 0.5.

    Returns:
        Integer array of 0/1 predictions, one per row of ``X``.
    """
    Z = np.dot(X, W) + b
    # sigmoid operates on whole arrays, so it is called directly; wrapping it
    # in np.vectorize looped in Python and raised on size-0 input.
    A = sigmoid(Z)
    return (A > threshold).astype(int)

# Evaluation function
def evaluate(y_true, y_pred):
    """Compute accuracy, precision, recall and F1 for binary 0/1 labels.

    Both inputs are converted to flat numpy arrays first, so comparison is
    positional even when pandas Series with different indexes are passed.

    Args:
        y_true: Ground-truth labels (0 or 1).
        y_pred: Predicted labels (0 or 1), same length as ``y_true``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1_score)``. Each ratio has
        1e-8 added to its denominator so it evaluates to 0 instead of
        dividing by zero when there are no positives.

    Raises:
        ValueError: If the two inputs do not have the same number of items.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true has {y_true.shape[0]} items but y_pred has {y_pred.shape[0]}"
        )

    num_samples = len(y_true)
    # Empty input: report 0 accuracy, in line with the other metrics.
    accuracy = np.sum(y_true == y_pred) / num_samples if num_samples else 0.0
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    precision = tp / (tp + fp + 1e-8)
    recall = tp / (tp + fn + 1e-8)
    f1_score = 2 * precision * recall / (precision + recall + 1e-8)
    return accuracy, precision, recall, f1_score

# Predict on the held-out split and score the result
predictions = predict(X_test, W, b)
accuracy, precision, recall, f1_score = evaluate(y_test, predictions)

# Report every metric as a percentage rounded to two decimals, one per line
print(
    f"Acurácia: {round(accuracy * 100, 2)}%",
    f"Precisão: {round(precision * 100, 2)}%",
    f"Recall: {round(recall * 100, 2)}%",
    f"F1-score: {round(f1_score * 100, 2)}%",
    sep="\n",
)

Acurácia: 75.33%
Precisão: 74.44%
Recall: 77.17%
F1-score: 75.78%


# Teste com valor novo

In [26]:
# Example row for testing: one article given as title, body text, subject and
# date, assigned to plain variables.
title = "As U.S. budget fight looms, Republicans flip their fiscal script"
# NOTE(review): the body literal below shows up across several lines in this
# view; presumably the notebook stores non-newline separators — confirm.
text = "WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportional increases for non-defense “discretionary” spending on programs that support education, scientific research, infrastructure, public health and environmental protection. “The (Trump) administration has already been willing to say: ‘We’re going to increase non-defense discretionary spending ... by about 7 percent,’” Meadows, chairman of the small but influential House Freedom Caucus, said on the program. “Now, Democrats are saying that’s not enough, we need to give the government a pay raise of 10 to 11 percent. For a fiscal conservative, I don’t see where the rationale is. ... Eventually you run out of other people’s money,” he said. Meadows was among Republicans who voted in late December for their party’s debt-financed tax overhaul, which is expected to balloon the federal budget deficit and add about $1.5 trillion over 10 years to the $20 trillion national debt. “It’s interesting to hear Mark talk about fiscal responsibility,” Democratic U.S. Representative Joseph Crowley said on CBS. 
Crowley said the Republican tax bill would require the  United States to borrow $1.5 trillion, to be paid off by future generations, to finance tax cuts for corporations and the rich. “This is one of the least ... fiscally responsible bills we’ve ever seen passed in the history of the House of Representatives. I think we’re going to be paying for this for many, many years to come,” Crowley said. Republicans insist the tax package, the biggest U.S. tax overhaul in more than 30 years,  will boost the economy and job growth. House Speaker Paul Ryan, who also supported the tax bill, recently went further than Meadows, making clear in a radio interview that welfare or “entitlement reform,” as the party often calls it, would be a top Republican priority in 2018. In Republican parlance, “entitlement” programs mean food stamps, housing assistance, Medicare and Medicaid health insurance for the elderly, poor and disabled, as well as other programs created by Washington to assist the needy. Democrats seized on Ryan’s early December remarks, saying they showed Republicans would try to pay for their tax overhaul by seeking spending cuts for social programs. But the goals of House Republicans may have to take a back seat to the Senate, where the votes of some Democrats will be needed to approve a budget and prevent a government shutdown. Democrats will use their leverage in the Senate, which Republicans narrowly control, to defend both discretionary non-defense programs and social spending, while tackling the issue of the “Dreamers,” people brought illegally to the country as children. Trump in September put a March 2018 expiration date on the Deferred Action for Childhood Arrivals, or DACA, program, which protects the young immigrants from deportation and provides them with work permits. The president has said in recent Twitter messages he wants funding for his proposed Mexican border wall and other immigration law changes in exchange for agreeing to help the Dreamers. 
Representative Debbie Dingell told CBS she did not favor linking that issue to other policy objectives, such as wall funding. “We need to do DACA clean,” she said.  On Wednesday, Trump aides will meet with congressional leaders to discuss those issues. That will be followed by a weekend of strategy sessions for Trump and Republican leaders on Jan. 6 and 7, the White House said. Trump was also scheduled to meet on Sunday with Florida Republican Governor Rick Scott, who wants more emergency aid. The House has passed an $81 billion aid package after hurricanes in Florida, Texas and Puerto Rico, and wildfires in California. The package far exceeded the $44 billion requested by the Trump administration. The Senate has not yet voted on the aid. "
subject = "politicsNews"
# `date` is assigned but not read by the vectorization/prediction code that
# follows in this cell.
date = "December 31, 2017"

# Pre-process the sample with the vectorizers fitted on the training split
X_title = vectorizer_title.transform([title]).toarray()
X_text = vectorizer_text.transform([text]).toarray()
X_subject = vectorizer_subject.transform([subject]).toarray()

# Concatenate the features in the SAME column order as the training matrix,
# which was built as (text, title, subject). Any other order lines the
# learned weights up against the wrong vocabulary columns.
X_input = np.concatenate((X_text, X_title, X_subject), axis=1)

# Run the prediction (an array holding a single 0/1 label)
prediction = predict(X_input, W, b)

# Print the predicted class
if prediction[0] == 0:
    print("Predição: Fake")
else:
    print("Predição: True")

Predição: Fake
