About the Dataset:

1. id: unique id for a news article
2. title: the title of a news article
3. author: author of the news article
4. text: the text of the article; could be incomplete
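5. label: a label that marks whether the news article is real or fake:
           1: Fake news
           0: Real news

Note: the files actually loaded in this notebook (True.csv / Fake.csv) have the columns `title`, `text`, `subject` and `date`, and the label is assigned here as the string "True" or "Fake" rather than 0/1.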





In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
import nltk
# Fetch the NLTK stop-word corpus that `stopwords.words('english')` reads in the following cells.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# printing the stopwords in English
# (these are the words that `stemming` removes from each article)
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# -------------------------------
# Configuration
# -------------------------------
DATA_DIR = "./data"            # folder containing True.csv and Fake.csv
OUTPUT_DIR = "./processed"     # where to save train/test/valid splits
RANDOM_STATE = 42              # single seed shared by the shuffle and both splits
TEST_FRACTION = 0.2            # share of all rows held out as the test set
VALID_FRACTION = 0.1           # share of the remaining rows held out for validation

# -------------------------------
# Load Data
# -------------------------------
true_path = os.path.join(DATA_DIR, "True.csv")
fake_path = os.path.join(DATA_DIR, "Fake.csv")

# Fail fast and report every missing input at once, instead of stopping on
# the first unreadable file with a bare path.
missing_inputs = [path for path in (true_path, fake_path) if not os.path.isfile(path)]
if missing_inputs:
    raise FileNotFoundError(
        f"Missing input file(s): {missing_inputs}. "
        f"Place True.csv and Fake.csv in '{DATA_DIR}' or update DATA_DIR."
    )

true_df = pd.read_csv(true_path)
fake_df = pd.read_csv(fake_path)

# -------------------------------
# Labeling
# -------------------------------
# String labels; the later cells compare predictions against 'True'.
true_df["label"] = "True"
fake_df["label"] = "Fake"

# Combine and shuffle
df = pd.concat([true_df, fake_df], axis=0)
df = df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

# -------------------------------
# Split Data
# -------------------------------
# First carve off the test set, then take the validation set out of what is
# left; both splits are stratified so each part keeps the True/Fake ratio.
train_df, test_df = train_test_split(
    df, test_size=TEST_FRACTION, random_state=RANDOM_STATE, stratify=df["label"]
)

train_df, valid_df = train_test_split(
    train_df, test_size=VALID_FRACTION, random_state=RANDOM_STATE, stratify=train_df["label"]
)

# The three parts must account for every row exactly once.
assert len(train_df) + len(valid_df) + len(test_df) == len(df)

# -------------------------------
# Save Files
# -------------------------------
# Create the output folder only once the inputs have loaded successfully.
os.makedirs(OUTPUT_DIR, exist_ok=True)
train_df.to_csv(os.path.join(OUTPUT_DIR, "train.csv"), index=False)
test_df.to_csv(os.path.join(OUTPUT_DIR, "test.csv"), index=False)
valid_df.to_csv(os.path.join(OUTPUT_DIR, "valid.csv"), index=False)

print(f"✅ Dataset successfully prepared and saved to '{OUTPUT_DIR}' folder.")

Mounted at /content/drive


FileNotFoundError: [Errno 2] No such file or directory: '/content/True.csv'

In [None]:
import pandas as pd
import os

# -------------------------------
# Configuration
# -------------------------------
DATA_PATH = "./processed/train.csv"   # Path to your processed training CSV

# -------------------------------
# Load Dataset
# -------------------------------
# Stop here when the file is missing: only printing a message would leave
# `news_dataset` undefined (or stale from an earlier run) for every cell that
# follows, and the real cause would surface later as an unrelated error.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(
        f"❌ Error: File not found at {DATA_PATH}. "
        "Please ensure 'train.csv' exists inside the 'processed' folder."
    )

news_dataset = pd.read_csv(DATA_PATH)
print("✅ Dataset loaded successfully!")
print(news_dataset.head())

✅ Dataset loaded successfully!
                                               title  \
0  THE OBAMA LEGACY: Worst Economic Growth Of All...   
1   Oops: Ammosexual Playing With Gun During Beng...   
2  Biden, Ukraine's Poroshenko to meet Thursday: ...   
3  U.S.-backed SDF attacks Islamic State in Syria...   
4  Zimbabwe's Mnangagwa opens amnesty window for ...   

                                                text       subject  \
0   Last week the Commerce Department released it...      politics   
1  Thinking about going to the movies? You might ...          News   
2  WASHINGTON (Reuters) - U.S. Vice President Joe...  politicsNews   
3  BEIRUT (Reuters) - The U.S.-backed Syrian Demo...     worldnews   
4  HARARE (Reuters) - Zimbabwe s new president, E...     worldnews   

                 date label  
0         Apr 2, 2017  Fake  
1    January 22, 2016  Fake  
2     March 30, 2016   True  
3  September 9, 2017   True  
4  November 28, 2017   True  


In [6]:
# (rows, columns) of the loaded training CSV
news_dataset.shape

(32326, 5)

In [7]:
# preview the first rows of the loaded frame
news_dataset.head()

Unnamed: 0,title,text,subject,date,label
0,THE OBAMA LEGACY: Worst Economic Growth Of All...,Last week the Commerce Department released it...,politics,"Apr 2, 2017",Fake
1,Oops: Ammosexual Playing With Gun During Beng...,Thinking about going to the movies? You might ...,News,"January 22, 2016",Fake
2,"Biden, Ukraine's Poroshenko to meet Thursday: ...",WASHINGTON (Reuters) - U.S. Vice President Joe...,politicsNews,"March 30, 2016",True
3,U.S.-backed SDF attacks Islamic State in Syria...,BEIRUT (Reuters) - The U.S.-backed Syrian Demo...,worldnews,"September 9, 2017",True
4,Zimbabwe's Mnangagwa opens amnesty window for ...,"HARARE (Reuters) - Zimbabwe s new president, E...",worldnews,"November 28, 2017",True


In [8]:
# counting the number of missing values in each column of the dataset
news_dataset.isnull().sum()

Unnamed: 0,0
title,0
text,0
subject,0
date,0
label,0


In [9]:
# show the article text as loaded, before any preprocessing is applied
print(news_dataset['text'])

0         Last week the Commerce Department released it...
1        Thinking about going to the movies? You might ...
2        WASHINGTON (Reuters) - U.S. Vice President Joe...
3        BEIRUT (Reuters) - The U.S.-backed Syrian Demo...
4        HARARE (Reuters) - Zimbabwe s new president, E...
                               ...                        
32321    President Trump was lambasted by Democrats and...
32322    JERUSALEM (Reuters) - Israel s defense ministe...
32323    Thinking before you speak is basically a requi...
32324    The video below is a much watch! A young Donal...
32325    PARIS (Reuters) - The mayor of Paris, where a ...
Name: text, Length: 32326, dtype: object


In [10]:
# split the frame into the target column (Y) and everything else (X)
Y = news_dataset['label']
X = news_dataset.drop(columns=['label'])

In [11]:
# inspect the feature frame and the label series produced by the cell above
print(X)
print(Y)

                                                   title  \
0      THE OBAMA LEGACY: Worst Economic Growth Of All...   
1       Oops: Ammosexual Playing With Gun During Beng...   
2      Biden, Ukraine's Poroshenko to meet Thursday: ...   
3      U.S.-backed SDF attacks Islamic State in Syria...   
4      Zimbabwe's Mnangagwa opens amnesty window for ...   
...                                                  ...   
32321  WOW! TRUMP UNDERESTIMATED ILLEGAL VOTE: New St...   
32322  Israel sees Assad winning Syria war, urges mor...   
32323   Afghanistan War Vet Humiliates Republican Opp...   
32324  A YOUNG DONALD J TRUMP: The 34-Year Old Shares...   
32325  Paris mayor says Trump climate withdrawal 'a m...   

                                                    text       subject  \
0       Last week the Commerce Department released it...      politics   
1      Thinking about going to the movies? You might ...          News   
2      WASHINGTON (Reuters) - U.S. Vice President Joe... 

In [12]:
# one shared Porter stemmer instance; `stemming` calls port_stem.stem() on each token
port_stem = PorterStemmer()

In [13]:
# Build the stop-word lookup once. Calling stopwords.words('english') for every
# token rebuilds the list each time, and membership in a list is a linear scan;
# a frozenset makes the per-token check constant time.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalize one document to a space-joined string of Porter stems.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, English stop words are
    dropped, and each remaining token is stemmed with ``port_stem``.

    Args:
        content: Raw text of one article. A non-string value (for example
            the NaN pandas uses for an empty cell) is treated as empty text.

    Returns:
        str: The stemmed tokens joined by single spaces; ``''`` when no
        token survives.
    """
    # re.sub raises TypeError on non-strings; treat them as "no text".
    if not isinstance(content, str):
        return ''
    tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(
        port_stem.stem(word) for word in tokens if word not in ENGLISH_STOPWORDS
    )

In [17]:
# Stem every article. NOTE: this overwrites the `text` column in place, so the
# raw text is no longer available afterwards and re-running this cell would
# feed already-stemmed text through `stemming` a second time.
news_dataset['text'] = news_dataset['text'].apply(stemming)

In [18]:
# check the result of the stemming step on the `text` column
print(news_dataset['text'])

0        last week commerc depart releas third revis fo...
1        think go movi might want avoid hour secret sol...
2        washington reuter u vice presid joe biden ukra...
3        beirut reuter u back syrian democrat forc alli...
4        harar reuter zimbabw new presid emmerson mnang...
                               ...                        
32321    presid trump lambast democrat alli media sugge...
32322    jerusalem reuter israel defens minist said tue...
32323    think speak basic requir skill want lawmak mis...
32324    video much watch young donald j trump speak le...
32325    pari reuter mayor pari landmark global climat ...
Name: text, Length: 32326, dtype: object


In [19]:
#separating the data and label
# X: the (stemmed) article text, Y: the 'True'/'Fake' labels — both taken as arrays via .values
X = news_dataset['text'].values
Y = news_dataset['label'].values

In [20]:
# inspect the array of stemmed article text
print(X)

['last week commerc depart releas third revis fourth quarter gross domest product number came paltri percent mean growth presid obama final year offic end error hope land big thud percent low water mark put obama presid last place among post world war ii presid come econom growth post wwii presid begin harri truman disadvantag begin aftermath war economi contract percent four time contract neg year sinc even best obama econom record truman moder democrat also post two best year growth record percent percent zero percent interest rate gin economi back thirteenth presid mild distinct obama eight full year enact growth polici mani predecessor never two complet term georg h w bush jimmi carter four year gerald ford less three year richard nixon five least bit surpris obama economi failur chronicl seven year column averag growth rate presid johnson percent kennedi percent clinton percent reagan percent carter percent eisenhow percent nixon percent ford percent g h w bush percent g w bush pe

In [21]:
# inspect the array of labels
print(Y)

['Fake' 'Fake' 'True' ... 'Fake' 'Fake' 'True']


In [22]:
# number of labels; should equal the number of rows in the dataset
Y.shape

(32326,)

In [23]:
# converting the textual data to numerical data
# The text is read from the DataFrame rather than from `X` so that this cell
# can be re-run: it rebinds `X` to the TF-IDF matrix, and a second run that fed
# that matrix back into the vectorizer would no longer be working on raw text.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split that follows, so held-out rows influence the vocabulary and IDF weights.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(news_dataset['text'].values)

In [24]:
# inspect the TF-IDF matrix that now replaces the text in X
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4889546 stored elements and shape (32326, 77507)>
  Coords	Values
  (0, 1001)	0.06605132804204407
  (0, 1984)	0.021694138693682308
  (0, 2320)	0.03695704582043224
  (0, 4269)	0.05468683976980557
  (0, 4607)	0.026402041861559554
  (0, 5848)	0.0848384861485373
  (0, 6316)	0.08112681339753572
  (0, 6565)	0.03899239909404576
  (0, 6826)	0.050454640303167785
  (0, 9110)	0.1444029832613104
  (0, 9627)	0.03651537483090922
  (0, 10085)	0.13046390485089426
  (0, 11638)	0.08055015297130033
  (0, 12183)	0.03220446469138074
  (0, 12717)	0.07242026732584675
  (0, 12748)	0.027397891507127407
  (0, 12817)	0.05997161012659732
  (0, 12907)	0.039579076589588584
  (0, 13310)	0.11316946391847633
  (0, 15904)	0.026396858039882606
  (0, 16022)	0.03446985908949706
  (0, 16753)	0.07871275924808185
  (0, 16972)	0.07147617811309895
  (0, 17365)	0.05052801437496483
  (0, 18682)	0.0796015913604259
  :	:
  (32325, 37061)	0.15172549713037115
  (32325, 41

In [25]:
# Hold out 20% of the rows for testing, stratified on Y so both parts keep the
# same label ratio; random_state fixes the shuffle for reproducibility.
# NOTE(review): X was TF-IDF-vectorized on all rows before this split — confirm
# that fitting the vectorizer on rows that end up in the test part is acceptable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=2)

In [26]:
# logistic-regression classifier, constructed with its default settings
model = LogisticRegression()

In [27]:
# train the classifier on the training part of the split
model.fit(X_train, Y_train)

In [28]:
# accuracy score on the training data
X_train_prediction = model.predict(X_train)
# accuracy_score is declared as (y_true, y_pred). Passing the arguments by name
# keeps the call correct if it is later swapped for a metric where order matters.
training_data_accuracy = accuracy_score(y_true=Y_train, y_pred=X_train_prediction)

In [29]:
# report the accuracy measured on the rows the model was trained on
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9894431554524362


In [30]:
# accuracy score on the test data
X_test_prediction = model.predict(X_test)
# accuracy_score is declared as (y_true, y_pred). Passing the arguments by name
# keeps the call correct if it is later swapped for a metric where order matters.
test_data_accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_prediction)

In [31]:
# report the accuracy measured on the held-out test rows
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.9857717290442314


In [33]:
# Classify one held-out article: row 3 of the test matrix.
X_new = X_test[3]

prediction = model.predict(X_new)
print(prediction)

# The classifier returns the string labels it was trained on ('True' / 'Fake').
verdict = 'The news is Real' if prediction[0] == 'True' else 'The news is Fake'
print(verdict)

['True']
The news is Real


In [34]:
# ground-truth label of the same test row (index 3), to compare with the prediction
print(Y_test[3])

True
