In [1]:
import pandas as pd
import numpy as np

print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
# Read the two source CSVs: fabricated articles first, then genuine ones
fake_df, true_df = (pd.read_csv(csv_name) for csv_name in ('Fake.csv', 'True.csv'))

# Preview the first five rows of each frame under its own heading
print("Fake News DataFrame Head:")
display(fake_df.head(5))
print("\nTrue News DataFrame Head:")
display(true_df.head(5))

Fake News DataFrame Head:


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"



True News DataFrame Head:


Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [3]:
# Label every row by its origin: 1 marks fake articles, 0 marks true articles.
# The scalar on the right is broadcast to every row of the matching frame.
fake_df['target'], true_df['target'] = 1, 0

print("Target columns added.")

Target columns added.


In [4]:
# Stack both labelled frames, then shuffle all rows so fake and real news are
# interleaved. The fixed random_state keeps the ordering reproducible, and the
# final reset_index discards the pre-shuffle row labels.
df = (
    pd.concat([fake_df, true_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print("DataFrames combined and shuffled. Here's the new DataFrame head:")
display(df.head(5))
print(f"\nTotal rows in the combined dataset: {df.shape[0]}")

DataFrames combined and shuffled. Here's the new DataFrame head:


Unnamed: 0,title,text,subject,date,target
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",1
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",0
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",0
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",1
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",0



Total rows in the combined dataset: 44898


In [5]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Download necessary NLTK data if you haven't already
nltk.download('punkt', quiet=True)
# Newer NLTK releases load the word tokenizer from 'punkt_tab' rather than 'punkt';
# requesting both keeps word_tokenize working across versions.
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

ps = PorterStemmer()

# Build the stopword lookup once. Calling stopwords.words('english') for every
# token rebuilds the list each time and scans it linearly, which dominates the
# runtime when preprocessing tens of thousands of articles.
STOP_WORDS = frozenset(stopwords.words('english'))


def transform_text(text):
    """Normalise one raw article into a space-separated string of word stems.

    Steps, in order:
      1. lowercase the text,
      2. split it into tokens with ``nltk.word_tokenize``,
      3. keep only purely alphanumeric tokens (this already drops punctuation),
      4. drop English stopwords,
      5. reduce each remaining token to its Porter stem.

    Parameters
    ----------
    text : str
        Raw article text. Any non-string value (for example the float NaN that
        pandas uses for a missing cell) is treated as an empty document.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; ``""`` when nothing survives
        the filtering or when ``text`` is not a string.
    """
    # Missing cells arrive as non-string values and have no .lower(); map them to
    # an empty document instead of aborting the whole .apply() run.
    if not isinstance(text, str):
        return ""

    tokens = nltk.word_tokenize(text.lower())

    # isalnum() rejects punctuation-only tokens, so no separate punctuation
    # check is needed before the stopword filter.
    stems = [
        ps.stem(token)
        for token in tokens
        if token.isalnum() and token not in STOP_WORDS
    ]

    return " ".join(stems)

print("Text transformation function is ready.")

Text transformation function is ready.


In [6]:
# Run every article through transform_text and keep the result in a separate
# column, leaving the raw 'text' column available for comparison
df['transformed_text'] = df['text'].map(transform_text)

print("Text preprocessing complete. Here's a sample:")
display(df.loc[:, ['text', 'transformed_text']].head(5))

Text preprocessing complete. Here's a sample:


Unnamed: 0,text,transformed_text
0,"21st Century Wire says Ben Stein, reputable pr...",21st centuri wire say ben stein reput professo...
1,WASHINGTON (Reuters) - U.S. President Donald T...,washington reuter presid donald trump remov ch...
2,(Reuters) - Puerto Rico Governor Ricardo Rosse...,reuter puerto rico governor ricardo rossello s...
3,"On Monday, Donald Trump once again embarrassed...",monday donald trump embarrass countri accident...
4,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",glasgow scotland reuter presidenti candid go a...


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score

# Features are the preprocessed article text; the label is the 'target' column
X, y = df['transformed_text'], df['target']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Data split into training and testing sets.")
print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

Data split into training and testing sets.
Training set size: 35918
Testing set size: 8980


In [8]:
# Cap the TF-IDF vocabulary at this many features
vocab_size = 5000
tfidf = TfidfVectorizer(max_features=vocab_size)

# The vocabulary and IDF weights are learned from the training split alone ...
X_train_tfidf = tfidf.fit_transform(X_train)

# ... and the test split is only projected onto that already-fitted vocabulary
X_test_tfidf = tfidf.transform(X_test)

print("Text has been vectorized successfully.")

Text has been vectorized successfully.


In [9]:
# Build the Logistic Regression classifier and fit it on the vectorized
# training data in one step (fit returns the estimator itself)
model = LogisticRegression(solver='liblinear', random_state=42).fit(X_train_tfidf, y_train)

print("Model training complete!")

# Score the held-out test set
y_pred = model.predict(X_test_tfidf)
accuracy, precision = (
    accuracy_score(y_test, y_pred),
    precision_score(y_test, y_pred),
)

print(f"\nModel Accuracy on Test Set: {accuracy:.4f}")
print(f"Model Precision on Test Set: {precision:.4f}")

Model training complete!

Model Accuracy on Test Set: 0.9841
Model Precision on Test Set: 0.9876


In [10]:
import pickle

# Persist the fitted artifacts. Each file is opened in a `with` block so the
# handle is flushed and closed as soon as the dump finishes, even if pickling
# raises, instead of being left open until garbage collection.
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# these artifacts from a trusted location.

# Save the fitted TF-IDF vectorizer
with open('fakenews_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

# Save the trained Logistic Regression model
with open('fakenews_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

print("Successfully saved 'fakenews_vectorizer.pkl' and 'fakenews_model.pkl' to your project folder!")

Successfully saved 'fakenews_vectorizer.pkl' and 'fakenews_model.pkl' to your project folder!
