In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [3]:
df_fake = pd.read_csv('fake.csv')
df_true = pd.read_csv('true.csv')
df_true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [4]:
df_true.isnull().sum()
df_fake.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [5]:
print(df_fake.shape)
print(df_true.shape)

(23481, 4)
(21417, 4)


In [6]:
print("--- DataFrames BEFORE Removing Duplicates ---")
print(f"Fake News Count: {len(df_fake)}")
print(f"True News Count: {len(df_true)}")
print("-" * 40)

# 1. Remove duplicates from df_fake
df_fake.drop_duplicates(inplace=True)

# 2. Remove duplicates from df_true
df_true.drop_duplicates(inplace=True)

print("--- DataFrames AFTER Removing Duplicates ---")
print(f"Fake News Count: {len(df_fake)}")
print(f"True News Count: {len(df_true)}")

--- DataFrames BEFORE Removing Duplicates ---
Fake News Count: 23481
True News Count: 21417
----------------------------------------
--- DataFrames AFTER Removing Duplicates ---
Fake News Count: 23478
True News Count: 21211


In [7]:
print(df_fake.value_counts().sum)
print(df_true.value_counts().sum)

<bound method Series.sum of title                                                                                                                                   text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

In [8]:
df_fake['class'] = 0
df_true['class'] = 1

In [9]:
df_true_manual = df_true.tail(10)
df_fake_manual = df_fake.tail(10)

for i in range(21211, 21201,-1):
    df_true.drop([i], axis=0, inplace=True)

for j in range(23478, 23468,-1):
    df_fake.drop([j], axis=0, inplace=True)

In [10]:
df_fake_manual["class"] = 0
df_true_manual["class"] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fake_manual["class"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_true_manual["class"] = 1


In [11]:
df_manual = pd.concat([df_fake_manual, df_true_manual], axis=0)
df_manual.to_csv("manual_testing.csv")

In [12]:
df_merge = pd.concat([df_fake, df_true], axis=0)
df_merge.head(10)

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
5,Racist Alabama Cops Brutalize Black Boy While...,The number of cases of cops brutalizing and ki...,News,"December 25, 2017",0
6,"Fresh Off The Golf Course, Trump Lashes Out A...",Donald Trump spent a good portion of his day a...,News,"December 23, 2017",0
7,Trump Said Some INSANELY Racist Stuff Inside ...,In the wake of yet another court decision that...,News,"December 23, 2017",0
8,Former CIA Director Slams Trump Over UN Bully...,Many people have raised the alarm regarding th...,News,"December 22, 2017",0
9,WATCH: Brand-New Pro-Trump Ad Features So Muc...,Just when you might have thought we d get a br...,News,"December 21, 2017",0


In [13]:
df = df_merge.drop(["title", "subject","date"], axis = 1)


In [14]:
df.isnull().sum()

text     0
class    0
dtype: int64

In [15]:
df = df.sample(frac = 1)
df.head()

Unnamed: 0,text,class
16136,KUWAIT (Reuters) - Kuwait s ruling emir accept...,1
6600,"WASHINGTON (Reuters) - Kellyanne Conway, a Rep...",1
16868,BRUSSELS (Reuters) - British Prime Minister Th...,1
23321,21st Century Wire says What a twisted world we...,0
15724,Nothing to see here just Obama evening the pla...,0


In [16]:
df.reset_index(inplace = True)
df.drop(["index"], axis = 1, inplace = True)

In [17]:
import sys
!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install gensim



In [18]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)   # required by word_tokenize
STOP_WORDS = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize
import re
import gensim 

In [19]:
def preprocess(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        tokens = word_tokenize(text)
        tokens = [w for w in tokens if w not in STOP_WORDS]
        return tokens
    return []

In [20]:
print(df_merge['text'])
df_merge['processed_text'] = df_merge['text'].apply(preprocess)
df_merge[['text', 'processed_text']].head()

0        Donald Trump just couldn t wish all Americans ...
1        House Intelligence Committee Chairman Devin Nu...
2        On Friday, it was revealed that former Milwauk...
3        On Christmas day, Donald Trump announced that ...
4        Pope Francis used his annual Christmas Day mes...
                               ...                        
21412    BRUSSELS (Reuters) - NATO allies on Tuesday we...
21413    LONDON (Reuters) - LexisNexis, a provider of l...
21414    MINSK (Reuters) - In the shadow of disused Sov...
21415    MOSCOW (Reuters) - Vatican Secretary of State ...
21416    JAKARTA (Reuters) - Indonesia will buy 11 Sukh...
Name: text, Length: 44669, dtype: object


Unnamed: 0,text,processed_text
0,Donald Trump just couldn t wish all Americans ...,"[donald, trump, wish, americans, happy, new, y..."
1,House Intelligence Committee Chairman Devin Nu...,"[house, intelligence, committee, chairman, dev..."
2,"On Friday, it was revealed that former Milwauk...","[friday, revealed, former, milwaukee, sheriff,..."
3,"On Christmas day, Donald Trump announced that ...","[christmas, day, donald, trump, announced, wou..."
4,Pope Francis used his annual Christmas Day mes...,"[pope, francis, used, annual, christmas, day, ..."


In [21]:
from gensim.models import Word2Vec
EMBEDDING_DIM = 100
WINDOW_SIZE = 10
MIN_WORD_COUNT = 5
SG_MODEL = 1

corpus = df_merge['processed_text'].tolist()
word2vec_model = Word2Vec(
    sentences=corpus, 
    vector_size=EMBEDDING_DIM,
    window=WINDOW_SIZE,        
    min_count=MIN_WORD_COUNT,  
    sg=SG_MODEL,
    workers=4  # Set based on your CPU cores for faster training
)
def get_document_vector(word_list, model, vector_size):
    """Calculates the average Word2Vec vector for a document."""
    
    # Initialize a zero vector
    vector = np.zeros(vector_size)
    word_count = 0
    
    # Get the WordVector (wv) component for fast lookups
    wv = model.wv

    # Sum vectors for all words in the document that are in the model's vocabulary
    for word in word_list:
        if word in wv:
            vector += wv[word]
            word_count += 1
    
    # Calculate the average vector
    if word_count > 0:
        vector = vector / word_count
    
    return vector

# Apply the function to create the new feature column
print("Generating document vectors...")
df_merge['document_vector'] = df_merge['processed_text'].apply(
    lambda x: get_document_vector(x, word2vec_model, EMBEDDING_DIM)
)

print("Document vector generation complete.")
print(df_merge['document_vector'])
first_vector = df_merge['document_vector'].iloc[0]
print("\n--- Data Type Check ---")
print(f"Data type of the first entry: {type(first_vector)}")

Generating document vectors...
Document vector generation complete.
0        [0.07583403320242546, 0.04187698843753267, 0.0...
1        [0.04695818875510202, 0.13540242544137115, 0.0...
2        [0.004362303456679608, 0.08245044836541637, 0....
3        [0.08746129379236722, 0.15563484383046597, 0.0...
4        [-0.0026560557163542225, 0.025955445346854893,...
                               ...                        
21412    [0.15280790730502528, 0.1473945954979292, 0.04...
21413    [0.042486393289338495, 0.13626204154393604, 0....
21414    [-0.018041047070062523, 0.060838847229200785, ...
21415    [0.016266909195110202, 0.07864857869621335, 0....
21416    [0.04427956897082902, 0.1890925298945246, 0.03...
Name: document_vector, Length: 44669, dtype: object

--- Data Type Check ---
Data type of the first entry: <class 'numpy.ndarray'>


In [30]:
word2vec_model.save("word2vec_model.model")


In [22]:
# Create the feature matrix X by vertically stacking the document vectors
# np.vstack ensures all document vectors form a single NumPy array
X = np.vstack(df_merge['document_vector'].values)

# Assume your binary label is in a column named 'label' (0 for fake, 1 for true)
Y = df_merge['class'].values

# Check the final shapes
print(f"Shape of Feature Matrix (X): {X.shape}") 
print(f"Shape of Target Vector (Y): {Y.shape}")

Shape of Feature Matrix (X): (44669, 100)
Shape of Target Vector (Y): (44669,)


In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(
    X, 
    Y, 
    test_size=0.2,   # Use 20% of the data for testing
    random_state=42, # Ensures the split is the same every time for reproducibility
    stratify=Y       # Highly recommended for classification: keeps the ratio of 0s and 1s the same in both sets
)

print(f"Total training samples: {len(X_train)}")
print(f"Total testing samples: {len(X_test)}")

Total training samples: 35735
Total testing samples: 8934


In [24]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Define the number of input features (the size of your Word2Vec vector)
# Assuming EMBEDDING_DIM is defined from your Word2Vec training
input_dim = EMBEDDING_DIM 

# Initialize and define all layers inside the Sequential constructor
model = Sequential([
    # Input Layer & First Hidden Layer
    Dense(units=128, input_dim=input_dim, activation='relu'),
    
    # Dropout Layer (Regularization)
    Dropout(0.5),
    
    # Second Hidden Layer
    Dense(units=64, activation='relu'),
    
    # Output Layer (Binary Classification)
    Dense(units=1, activation='sigmoid')
])

# Display the model structure
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [25]:
# 1. Compile the model
model.compile(
    optimizer='adam', # The standard, efficient optimizer for deep learning
    loss='binary_crossentropy', # Standard loss function for binary classification
    metrics=['accuracy']
)

# 2. Train the model
# You can use a validation split to monitor performance on unseen data during training
history = model.fit(
    X_train, 
    Y_train,
    epochs=10,         # Number of full passes through the training data
    batch_size=32,     # Number of samples per gradient update
    validation_split=0.1, # Use 10% of the training data to monitor validation loss
    verbose=1
)

print("\nModel training complete.")

Epoch 1/10
[1m1006/1006[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.9267 - loss: 0.1902 - val_accuracy: 0.9524 - val_loss: 0.1256
Epoch 2/10
[1m1006/1006[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.9557 - loss: 0.1203 - val_accuracy: 0.9597 - val_loss: 0.1076
Epoch 3/10
[1m1006/1006[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9604 - loss: 0.1087 - val_accuracy: 0.9625 - val_loss: 0.1040
Epoch 4/10
[1m1006/1006[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9623 - loss: 0.1040 - val_accuracy: 0.9631 - val_loss: 0.1037
Epoch 5/10
[1m1006/1006[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.9640 - loss: 0.1003 - val_accuracy: 0.9611 - val_loss: 0.1031
Epoch 6/10
[1m1006/1006[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.9635 - loss: 0.0961 - val_accuracy: 0.9611 - val_loss: 0.1100
Epoch 7/10
[1m1

In [31]:
model.save("fake_news_classifier.h5")



In [26]:
# Evaluate the model on the test data
loss, accuracy = model.evaluate(X_test, Y_test, verbose=0)

print("\n--- Final Model Evaluation ---")
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")


--- Final Model Evaluation ---
Test Loss: 0.0696
Test Accuracy: 0.9750


In [27]:
from sklearn.metrics import confusion_matrix, classification_report

# Generate probabilities on the test set
# .predict() outputs a probability for each test sample
Y_pred_proba = model.predict(X_test, verbose=0)

# Convert probabilities to class labels (0 or 1)
# We use a threshold of 0.5: if probability > 0.5, predict 1 (True), otherwise 0 (Fake)
Y_pred = (Y_pred_proba > 0.5).astype("int32")

# Flatten Y_test and Y_pred to ensure they are 1D arrays for scikit-learn metrics
Y_test_flat = Y_test.flatten()
Y_pred_flat = Y_pred.flatten()

In [28]:
# Calculate the confusion matrix
cm = confusion_matrix(Y_test_flat, Y_pred_flat)

print("\n--- Confusion Matrix ---")
print(cm)


--- Confusion Matrix ---
[[4561  133]
 [  90 4150]]


In [29]:
print("\n--- Classification Report ---")
# Labels: 0 (Fake) and 1 (True)
print(classification_report(Y_test_flat, Y_pred_flat, target_names=['Fake News (0)', 'True News (1)']))


--- Classification Report ---
               precision    recall  f1-score   support

Fake News (0)       0.98      0.97      0.98      4694
True News (1)       0.97      0.98      0.97      4240

     accuracy                           0.98      8934
    macro avg       0.97      0.98      0.97      8934
 weighted avg       0.98      0.98      0.98      8934



In [None]:
# After training Word2Vec:
word2vec_model.save("word2vec_model.model")

# After training Keras model:
model.save("fake_news_classifier.h5") # or .keras format