In [1]:
from google.colab import files
uploaded = files.upload()

Saving news.csv to news.csv


In [13]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re
import string

In [14]:
df = pd.read_csv("news.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  6335 non-null   int64 
 1   title       6335 non-null   object
 2   text        6335 non-null   object
 3   label       6335 non-null   object
dtypes: int64(1), object(3)
memory usage: 198.1+ KB


In [16]:
df.describe()

Unnamed: 0.1,Unnamed: 0
count,6335.0
mean,5280.415627
std,3038.503953
min,2.0
25%,2674.5
50%,5271.0
75%,7901.0
max,10557.0


Pre-processing

In [17]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
stop_words = set(stopwords.words('english'))

In [19]:
def clean_text(text):
    text = text.lower()
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)  # remove punctuation
    text = re.sub(r'\d+', '', text)  # remove numbers
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return " ".join(words)

df['clean_text'] = df['text'].apply(clean_text)

In [20]:
# Encode labels: FAKE=0, REAL=1
df['label'] = df['label'].map({'FAKE': 0, 'REAL': 1})

In [21]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['label'], test_size=0.2, random_state=42)

 Generate Word Embeddings

In [22]:
vocab_size = 10000
max_length = 200
embedding_dim = 100

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')

test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')


Model Architecture

In [23]:
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()




Train Model

In [24]:
history = model.fit(train_padded, y_train, epochs=10, batch_size=64,
                    validation_split=0.1, verbose=2)


Epoch 1/10
72/72 - 26s - 356ms/step - accuracy: 0.6393 - loss: 0.6566 - val_accuracy: 0.6529 - val_loss: 0.6513
Epoch 2/10
72/72 - 26s - 361ms/step - accuracy: 0.7590 - loss: 0.5409 - val_accuracy: 0.8264 - val_loss: 0.4198
Epoch 3/10
72/72 - 38s - 528ms/step - accuracy: 0.8595 - loss: 0.3390 - val_accuracy: 0.8383 - val_loss: 0.3844
Epoch 4/10
72/72 - 36s - 499ms/step - accuracy: 0.8937 - loss: 0.2621 - val_accuracy: 0.8363 - val_loss: 0.4637
Epoch 5/10
72/72 - 20s - 273ms/step - accuracy: 0.8950 - loss: 0.2376 - val_accuracy: 0.7712 - val_loss: 0.5340
Epoch 6/10
72/72 - 16s - 223ms/step - accuracy: 0.9057 - loss: 0.2021 - val_accuracy: 0.8363 - val_loss: 0.5240
Epoch 7/10
72/72 - 22s - 301ms/step - accuracy: 0.8913 - loss: 0.2434 - val_accuracy: 0.8185 - val_loss: 0.4774
Epoch 8/10
72/72 - 20s - 273ms/step - accuracy: 0.9042 - loss: 0.2080 - val_accuracy: 0.7929 - val_loss: 0.5297
Epoch 9/10
72/72 - 21s - 287ms/step - accuracy: 0.9077 - loss: 0.2048 - val_accuracy: 0.8166 - val_loss:

Evaluate Model

In [25]:
y_pred_prob = model.predict(test_padded)
y_pred = (y_pred_prob > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")


[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step
Accuracy: 0.8193
Precision: 0.8584
Recall: 0.7684
F1-score: 0.8109


Predict New News Article

In [26]:
def predict_news(text):
    cleaned = clean_text(text)
    seq = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(seq, maxlen=max_length, padding='post', truncating='post')
    pred_prob = model.predict(padded)[0][0]
    label = "REAL" if pred_prob > 0.5 else "FAKE"
    return label, pred_prob

sample_news = "The government has announced a new policy for education reform."
label, prob = predict_news(sample_news)
print(f"Prediction: {label} with confidence {prob:.4f}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
Prediction: FAKE with confidence 0.4288


Conclusion

* This model helps identify fake news with high accuracy using deep learning.
* Future improvements can include attention mechanisms or transformers
(BERT) for better performance.