<a href="https://colab.research.google.com/github/chanphil2002/fake-news-detection/blob/main/FakeNews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from google.colab import drive

# Mount Google Drive so the CSV files below are reachable from the runtime.
drive.mount('/content/drive')

# Single place to repoint the notebook at another copy of the dataset.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/data'
fake_file_path = f'{DATA_DIR}/Fake.csv'
real_file_path = f'{DATA_DIR}/True.csv'
fake = pd.read_csv(fake_file_path)
real = pd.read_csv(real_file_path)

# Check the first few rows of the dataset
print(fake.head())
print(real.head())

# Add labels: 0 for fake, 1 for real
fake['label'] = 0
real['label'] = 1

# Combine datasets, then shuffle the rows with a fixed seed
data = pd.concat([fake, real], ignore_index=True)
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Merge title and text into a single field (missing values become empty strings)
data['content'] = data['title'].fillna('') + " " + data['text'].fillna('')
# Take an explicit copy: new columns are assigned onto `data` further down,
# and assigning into a column subset can raise SettingWithCopyWarning.
data = data[['content', 'label']].copy()

# Basic overview
print(data.head())
print(data['label'].value_counts())


In [None]:
# How many articles carry each label
sns.countplot(x='label', data=data)
plt.xticks(ticks=[0, 1], labels=['Fake', 'Real'])
plt.title("Class Distribution")
plt.show()

# Word count per article (whitespace-separated tokens)
data['content_len'] = [len(article.split()) for article in data['content']]

# Length histogram, one colour per label, with a KDE overlay
sns.histplot(x='content_len', hue='label', data=data, bins=50, kde=True)
plt.title("Article Length Distribution by Class")
plt.show()

In [None]:
from wordcloud import WordCloud

# One long string per class, built from every article with that label
fake_text = " ".join(data.loc[data['label'] == 0, 'content'])
real_text = " ".join(data.loc[data['label'] == 1, 'content'])

# Side-by-side word clouds: fake on the left, real on the right
plt.figure(figsize=(14, 6))
for panel, (corpus, heading) in enumerate(
    [(fake_text, "Fake News WordCloud"), (real_text, "Real News WordCloud")],
    start=1,
):
    plt.subplot(1, 2, panel)
    cloud = WordCloud(width=800, height=400, background_color='white').generate(corpus)
    plt.imshow(cloud)
    plt.title(heading)
    plt.axis('off')

plt.tight_layout()
plt.show()


In [None]:
!pip install nltk
!pip install keras-tuner

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Fetch the NLTK data packages this notebook relies on: the stop-word lists and
# the punkt tokenizer data (presumably what word_tokenize needs; both the
# 'punkt' and 'punkt_tab' resource names are requested).
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Set membership makes the per-token stop-word test in clean_text cheap.
stop_words = set(stopwords.words('english'))

# Text cleaning function
def clean_text(text):
    """Normalise one article into a space-joined string of filtered tokens.

    The text is lower-cased, then bracketed spans, URLs, HTML tags and every
    character that is not a lowercase letter or whitespace are deleted, in
    that order. The remainder is tokenised with ``word_tokenize``; tokens that
    are stop words or have two characters or fewer are dropped.

    Args:
        text: The raw article string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    cleaned = text.lower()
    # Order matters: brackets, URLs and tags must go before the final pattern
    # strips the punctuation that delimits them.
    for pattern in (
        r'\[.*?\]',         # text in square brackets
        r'http\S+|www\S+',  # URLs
        r'<.*?>+',          # HTML tags
        r'[^a-z\s]',        # punctuation, digits, anything non a-z
    ):
        cleaned = re.sub(pattern, '', cleaned)

    kept = []
    for token in word_tokenize(cleaned):
        if len(token) > 2 and token not in stop_words:
            kept.append(token)
    return " ".join(kept)

# Clean every article once and keep the result alongside the raw text
data['clean_content'] = data['content'].map(clean_text)

# Parameters
MAX_WORDS = 15000  # vocabulary cap given to the Tokenizer (also the Embedding input_dim)
MAX_LEN = 300      # every sequence is padded/truncated to this many tokens

# Tokenizer
# NOTE(review): the vocabulary is fitted on all rows, before the train/test
# split in the next cell, so test-set text influences it. Consider fitting on
# the training rows only.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(data['clean_content'])

# Texts -> integer sequences -> fixed-length matrix
X = pad_sequences(tokenizer.texts_to_sequences(data['clean_content']), maxlen=MAX_LEN)

# Labels as a plain array (0 = fake, 1 = real)
y = data['label'].values

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the label ratio
# the same in both parts, and the fixed seed makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [None]:
import keras_tuner as kt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Conv1D, GlobalMaxPooling1D, Bidirectional, SpatialDropout1D
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import EarlyStopping

# Notebook-wide training defaults: embedding width, epoch count, batch size.
EMBEDDING_DIM, EPOCHS, BATCH_SIZE = 128, 10, 64

In [None]:
def build_lstm_model(hp):
    """Build and compile a stacked bidirectional-LSTM binary classifier.

    Layers, in order: Embedding -> SpatialDropout1D -> Bidirectional LSTM
    (returning sequences) -> Bidirectional LSTM -> two Dense(relu) + Dropout
    pairs -> Dense(1, sigmoid). The model is compiled with RMSprop, binary
    cross-entropy loss and the accuracy metric.

    Args:
        hp: Hyperparameter container handed in by the tuner; every tunable
            value is registered on it through ``Choice``/``Int``/``Float``.

    Returns:
        The compiled ``Sequential`` model.
    """
    # Hyperparameters are registered up front, in the same order as the layers
    # that consume them.
    embedding_dim = hp.Choice('embedding_dim', [64, 128, 256])
    spatial_rate = hp.Float('spatial_dropout', 0.1, 0.5, step=0.1)

    lstm_units_1 = hp.Int('lstm_units_1', 32, 128, step=32)
    lstm_drop_1 = hp.Float('dropout_1', 0.2, 0.5, step=0.1)
    lstm_rec_drop_1 = hp.Float('recurrent_dropout_1', 0.2, 0.5, step=0.1)

    lstm_units_2 = hp.Int('lstm_units_2', 32, 128, step=32)
    lstm_drop_2 = hp.Float('dropout_2', 0.2, 0.5, step=0.1)
    lstm_rec_drop_2 = hp.Float('recurrent_dropout_2', 0.2, 0.5, step=0.1)

    dense_units_1 = hp.Int('dense_units_1', 64, 256, step=64)
    dense_rate_1 = hp.Float('dense_dropout_1', 0.2, 0.5, step=0.1)
    dense_units_2 = hp.Int('dense_units_2', 64, 256, step=64)
    dense_rate_2 = hp.Float('dense_dropout_2', 0.2, 0.5, step=0.1)

    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])

    model = Sequential()
    model.add(Embedding(input_dim=MAX_WORDS,
                        output_dim=embedding_dim,
                        input_length=MAX_LEN))
    model.add(SpatialDropout1D(spatial_rate))

    # First recurrent layer keeps the full sequence so the second one can read it.
    first_lstm = LSTM(units=lstm_units_1,
                      return_sequences=True,
                      dropout=lstm_drop_1,
                      recurrent_dropout=lstm_rec_drop_1)
    model.add(Bidirectional(first_lstm))

    second_lstm = LSTM(units=lstm_units_2,
                       dropout=lstm_drop_2,
                       recurrent_dropout=lstm_rec_drop_2)
    model.add(Bidirectional(second_lstm))

    # Fully connected head ending in a single sigmoid unit.
    model.add(Dense(dense_units_1, activation='relu'))
    model.add(Dropout(dense_rate_1))
    model.add(Dense(dense_units_2, activation='relu'))
    model.add(Dropout(dense_rate_2))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer=RMSprop(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Hyperband search over build_lstm_model, ranked by validation accuracy.
lstm_tuner = kt.Hyperband(
    build_lstm_model,
    objective='val_accuracy',
    max_epochs=5,
    factor=3,
    directory='hyperband_logs',
    project_name='lstm_hyperband'
)

# Use a smaller subset of training data for faster tuning
X_tune = X_train[:8000]
y_tune = y_train[:8000]

# Stop a trial once validation loss has not improved for two epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=2)

lstm_tuner.search(X_tune, y_tune,
             epochs=5,
             validation_split=0.2,
             callbacks=[early_stop])

best_lstm_model = lstm_tuner.get_best_models(num_models=1)[0]

# Retrain best model on the full training set.
# The result is bound to `history_lstm` because the plotting cell reads that
# name (mirroring `history_cnn`); it was previously stored only as `history`,
# which made the plot call fail with NameError.
history_lstm = best_lstm_model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.2,
    callbacks=[EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)]
)
history = history_lstm  # backward-compatible alias for the old name

best_lstm_model.summary()

In [None]:
def build_cnn_model(hp):
    """Build and compile a 1-D convolutional binary classifier.

    Layers, in order: Embedding -> two Conv1D(kernel_size=3, relu) ->
    GlobalMaxPooling1D -> two Dense(relu) + Dropout pairs -> Dense(1, sigmoid).
    The model is compiled with RMSprop, binary cross-entropy loss and the
    accuracy metric.

    Args:
        hp: Hyperparameter container handed in by the tuner; every tunable
            value is registered on it through ``Choice``/``Int``/``Float``.

    Returns:
        The compiled ``Sequential`` model.
    """
    # Hyperparameters are registered up front, in the same order as the layers
    # that consume them.
    embedding_dim = hp.Choice('embedding_dim', [64, 128, 256])
    conv_filters_1 = hp.Int('filters_1', 64, 256, step=64)
    conv_filters_2 = hp.Int('filters_2', 64, 256, step=64)
    dense_units_1 = hp.Int('dense_units_1', 64, 256, step=64)
    drop_rate_1 = hp.Float('dropout_1', 0.2, 0.5, step=0.1)
    dense_units_2 = hp.Int('dense_units_2', 64, 256, step=64)
    drop_rate_2 = hp.Float('dropout_2', 0.2, 0.5, step=0.1)
    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])

    model = Sequential()
    model.add(Embedding(input_dim=MAX_WORDS,
                        output_dim=embedding_dim,
                        input_length=MAX_LEN))

    # Two stacked width-3 convolutions, then the max over the sequence axis.
    model.add(Conv1D(filters=conv_filters_1, kernel_size=3, activation='relu'))
    model.add(Conv1D(filters=conv_filters_2, kernel_size=3, activation='relu'))
    model.add(GlobalMaxPooling1D())

    # Fully connected head ending in a single sigmoid unit.
    model.add(Dense(dense_units_1, activation='relu'))
    model.add(Dropout(drop_rate_1))
    model.add(Dense(dense_units_2, activation='relu'))
    model.add(Dropout(drop_rate_2))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer=RMSprop(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model

# Hyperband search over build_cnn_model, ranked by validation accuracy.
cnn_tuner = kt.Hyperband(
    build_cnn_model,
    objective='val_accuracy',
    max_epochs=10,
    factor=3,
    directory='hyperband_logs',
    project_name='cnn_hyperband',
)

# Unlike the LSTM search, this one runs on the full training set.
search_stopper = EarlyStopping(monitor='val_loss', patience=2)
cnn_tuner.search(
    X_train, y_train,
    epochs=10,
    validation_split=0.2,
    callbacks=[search_stopper],
)

best_cnn_model = cnn_tuner.get_best_models(num_models=1)[0]
best_cnn_model.summary()

# Retrain best model; restore_best_weights=True is passed so early stopping
# hands back the best-epoch weights.
retrain_stopper = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
history_cnn = best_cnn_model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.2,
    callbacks=[retrain_stopper],
)


In [None]:
# Held-out performance of the tuned LSTM (evaluate returns loss, then accuracy)
lstm_metrics = best_lstm_model.evaluate(X_test, y_test, verbose=0)
loss_lstm, accuracy_lstm = lstm_metrics
print(f"LSTM Test Accuracy: {accuracy_lstm:.4f}")

# Held-out performance of the tuned CNN
cnn_metrics = best_cnn_model.evaluate(X_test, y_test, verbose=0)
loss_cnn, accuracy_cnn = cnn_metrics
print(f"CNN Test Accuracy: {accuracy_cnn:.4f}")

import matplotlib.pyplot as plt

def plot_history(history, title):
    """Draw training vs. validation curves for accuracy and loss side by side.

    Args:
        history: Object whose ``history`` attribute is a mapping containing the
            keys ``accuracy``, ``val_accuracy``, ``loss`` and ``val_loss``.
        title: Prefix placed before " - Accuracy" / " - Loss" in panel titles.
    """
    curves = history.history
    plt.figure(figsize=(12, 4))

    # Left panel: accuracy. Right panel: loss.
    for position, (metric, label) in enumerate(
        (('accuracy', 'Accuracy'), ('loss', 'Loss')), start=1
    ):
        plt.subplot(1, 2, position)
        plt.plot(curves[metric], label=f'Train {label}')
        plt.plot(curves[f'val_{metric}'], label=f'Val {label}')
        plt.title(f'{title} - {label}')
        plt.xlabel('Epochs')
        plt.ylabel(label)
        plt.legend()

    plt.tight_layout()
    plt.show()

# Training curves for each model: LSTM first, then CNN.
# NOTE(review): confirm the LSTM training cell binds its fit() result to
# `history_lstm`, the name read here.
for run_history, model_name in ((history_lstm, 'LSTM'), (history_cnn, 'CNN')):
    plot_history(run_history, model_name)
