In [2]:
#cs22b2037 chanti babu sambangi
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import re
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, Bidirectional, LSTM, Dropout, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score, roc_curve

# Load dataset: one CSV of real articles, one CSV of fake articles
true_df = pd.read_csv("/kaggle/input/fake-news-data/True.csv")
fake_df = pd.read_csv("/kaggle/input/fake-news-data/Fake.csv")

# Create labels (0 for real, 1 for fake)
true_df['label'] = 0
fake_df['label'] = 1

# Combine datasets into a single frame with a fresh 0..n-1 index
df = pd.concat([true_df, fake_df], axis=0, ignore_index=True)

# Shuffle the data. The seed is fixed (same value as the train/test split
# below) so that Restart & Run All reproduces the same row order, and hence
# the same train/validation/test partitions.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Combine title and text. Missing values are replaced by empty strings first,
# otherwise `str + NaN` yields NaN and the text cleaning step would fail on it.
# NOTE(review): the label-0 rows shown in the preview start with a
# "<CITY> (Reuters) -" dateline; this may let the model separate the classes
# by source rather than content -- consider stripping it. TODO confirm.
df['full_text'] = df['title'].fillna('') + ' ' + df['text'].fillna('')
df.head()


Unnamed: 0,title,text,subject,date,label,full_text
0,Only 'miracles' can move Brexit talks forward ...,TALLINN (Reuters) - Only miracles can move B...,worldnews,"September 29, 2017",0,Only 'miracles' can move Brexit talks forward ...
1,RETIRED COP PENS Gut-Wrenching VIRAL Letter To...,It s easy to see why this letter went viral. B...,left-news,"Aug 31, 2016",1,RETIRED COP PENS Gut-Wrenching VIRAL Letter To...
2,Obama says hopes Trump can stand up to Russia ...,BERLIN (Reuters) - U.S. President Barack Obama...,politicsNews,"November 17, 2016",0,Obama says hopes Trump can stand up to Russia ...
3,URGENT! 24 HOURS UNTIL SURRENDER OF INTERNET B...,This will be IRREVERSIBLE! Call your Congressm...,Government News,"Sep 28, 2016",1,URGENT! 24 HOURS UNTIL SURRENDER OF INTERNET B...
4,Stampede in India's financial capital kills at...,MUMBAI (Reuters) - A rush hour stampede killed...,worldnews,"September 29, 2017",0,Stampede in India's financial capital kills at...


In [4]:

# Text preprocessing
# Patterns are compiled once so the per-row `apply` reuses them.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')  # anything that is not a word char or whitespace
_DIGITS_RE = re.compile(r'\d+')           # runs of digits


def preprocess_text(text):
    """Normalise one document for tokenisation.

    Steps, in order: lowercase, delete every character that is neither a
    word character nor whitespace, then delete all digits. Whitespace is
    left as-is (it is not collapsed or stripped).

    Args:
        text: The raw document. Missing values (``None`` / ``NaN`` /
            ``pd.NA``) are treated as an empty document; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string ('' for missing input).
    """
    if not isinstance(text, str):
        # pandas hands missing cells to `apply` as NaN/None; calling
        # `.lower()` on those would raise AttributeError.
        if pd.isna(text):
            return ''
        text = str(text)

    text = text.lower()
    text = _PUNCTUATION_RE.sub('', text)
    return _DIGITS_RE.sub('', text)

# Vocabulary cap for the tokenizer and fixed length of every padded sequence
max_words = 10000
max_len = 200

# Clean every article and keep the result alongside the raw text
df['processed_text'] = df['full_text'].apply(preprocess_text)

# Features / target, then an 80/20 hold-out split with a fixed seed
X, y = df['processed_text'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The vocabulary is learned from the training texts only; words outside it
# (or beyond the `max_words` cap) map to the "<OOV>" token.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Texts -> integer id sequences
X_train_seq, X_test_seq = [
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
]

# Id sequences -> fixed-width arrays of length `max_len`
X_train_pad, X_test_pad = [
    pad_sequences(sequences, maxlen=max_len)
    for sequences in (X_train_seq, X_test_seq)
]

# Displayed as the cell result: (number of train docs, number of test docs)
(len(X_train_seq), len(X_test_seq))

(35918, 8980)

In [5]:

# Model architecture: Embedding -> Conv1D -> stacked BiLSTM -> dense classifier
embedding_dim = 128

model = Sequential([
    # Explicit input spec: sequences of `max_len` integer token ids.
    # This replaces the `input_length=` argument of Embedding, which is
    # deprecated in Keras 3 and only produces a warning there.
    tf.keras.Input(shape=(max_len,), dtype='int32'),

    # Embedding layer converts word ids to dense vectors of size `embedding_dim`
    Embedding(input_dim=max_words, output_dim=embedding_dim),

    # CNN for local feature extraction (128 filters, window of 5 tokens)
    Conv1D(128, 5, activation='relu'),

    # LSTM for sequence modeling; the first layer returns the full sequence
    # so that the second BiLSTM can consume it
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(32)),

    # Dense layers for classification; a single sigmoid unit outputs the
    # probability of label 1
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

# Compile model: binary cross-entropy matches the single sigmoid output
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc'), tf.keras.metrics.Precision(name='precision')]
)




In [6]:

# Callbacks: stop after 3 epochs without val_loss improvement, and keep only
# the best checkpoint on disk in the native .keras format
callbacks = [
    EarlyStopping(monitor='val_loss', patience=3),
    ModelCheckpoint(filepath='best_model.keras', save_best_only=True),
]

# Train the model; 20% of the training data is held out for validation
history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=64,
    epochs=15,
    validation_split=0.2,
    callbacks=callbacks,
)

# Swap the in-memory model for the best checkpoint written during training
model = tf.keras.models.load_model('best_model.keras')

# Predicted probabilities on the test set, then hard labels at a 0.5 cut-off
y_pred = model.predict(X_test_pad)
y_pred_class = np.greater(y_pred, 0.5).astype(int)


Epoch 1/15
[1m449/449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 36ms/step - accuracy: 0.8765 - auc: 0.9425 - loss: 0.2557 - precision: 0.8620 - val_accuracy: 0.9825 - val_auc: 0.9975 - val_loss: 0.0546 - val_precision: 0.9936
Epoch 2/15
[1m449/449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 33ms/step - accuracy: 0.9849 - auc: 0.9980 - loss: 0.0443 - precision: 0.9864 - val_accuracy: 0.9854 - val_auc: 0.9983 - val_loss: 0.0458 - val_precision: 0.9798
Epoch 3/15
[1m449/449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 34ms/step - accuracy: 0.9941 - auc: 0.9996 - loss: 0.0180 - precision: 0.9957 - val_accuracy: 0.9840 - val_auc: 0.9980 - val_loss: 0.0456 - val_precision: 0.9802
Epoch 4/15
[1m449/449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 33ms/step - accuracy: 0.9962 - auc: 0.9996 - loss: 0.0122 - precision: 0.9968 - val_accuracy: 0.9872 - val_auc: 0.9960 - val_loss: 0.0532 - val_precision: 0.9952
Epoch 5/15
[1m449/449[0m [32m━━━━━━━━

In [7]:

# Threshold-based metrics use the hard labels; AUC uses the raw probabilities
test_accuracy = accuracy_score(y_test, y_pred_class)
print(f"Accuracy: {test_accuracy}")

test_precision = precision_score(y_test, y_pred_class)
print(f"Precision: {test_precision}")

test_auc = roc_auc_score(y_test, y_pred)
print(f"AUC: {test_auc}")

# Equal error rate: locate the ROC point where the false-negative rate is
# closest to the false-positive rate, and report the FPR at that point
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
fnr = 1 - tpr
eer_index = np.nanargmin(np.absolute(fnr - fpr))
eer_threshold = thresholds[eer_index]
eer = fpr[eer_index]
print(f"EER: {eer}")

Accuracy: 0.9879732739420936
Precision: 0.9890470974808324
AUC: 0.9985719893470077
EER: 0.012253233492171545
