<a href="https://colab.research.google.com/github/gouravchauhan712/Leetcode/blob/main/fake_news_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [2]:
## Read the raw CSV into a DataFrame and preview it
dataset_path = "/content/fake_news_dataset.csv"
df = pd.read_csv(dataset_path)
df

Unnamed: 0,title,text,date,source,author,category,label
0,Foreign Democrat final.,more tax development both store agreement lawy...,2023-03-10,NY Times,Paula George,Politics,real
1,To offer down resource great point.,probably guess western behind likely next inve...,2022-05-25,Fox News,Joseph Hill,Politics,fake
2,Himself church myself carry.,them identify forward present success risk sev...,2022-09-01,CNN,Julia Robinson,Business,fake
3,You unit its should.,phone which item yard Republican safe where po...,2023-02-07,Reuters,Mr. David Foster DDS,Science,fake
4,Billion believe employee summer how.,wonder myself fact difficult course forget exa...,2023-04-03,CNN,Austin Walker,Technology,fake
...,...,...,...,...,...,...,...
19995,House party born.,hit and television I change very our happy doo...,2024-12-04,BBC,Gary Miles,Entertainment,fake
19996,Though nation people maybe price box.,fear most meet rock even sea value design stan...,2024-05-26,Daily News,Maria Mcbride,Entertainment,real
19997,Yet exist with experience unit.,activity loss very provide eye west create wha...,2023-04-17,BBC,Kristen Franklin,Entertainment,real
19998,School wide itself item.,term point general common training watch respo...,2024-06-30,Reuters,David Wise,Health,fake


In [4]:
## Dimensions of the loaded frame as (rows, columns)
df.shape

(20000, 7)

In [8]:
## Column labels available in the loaded frame
df.columns

Index(['title', 'text', 'date', 'source', 'author', 'category', 'label'], dtype='object')

In [5]:
## Per-column summary (count / unique / top / freq for these object columns)
df.describe()

Unnamed: 0,title,text,date,source,author,category,label
count,20000,20000,20000,19000,19000,20000,20000
unique,20000,20000,1096,8,17051,7,2
top,Turn present write decision town human personal.,suffer tree increase prevent organization easy...,2023-08-31,Daily News,Michael Smith,Health,fake
freq,1,1,32,2439,12,2922,10056


In [6]:
## Non-null counts and dtypes for every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     20000 non-null  object
 1   text      20000 non-null  object
 2   date      20000 non-null  object
 3   source    19000 non-null  object
 4   author    19000 non-null  object
 5   category  20000 non-null  object
 6   label     20000 non-null  object
dtypes: object(7)
memory usage: 1.1+ MB


In [7]:
## Tally the null entries in each column and print them under a heading
missing_values = df.isna().sum()
print('Missing values by column:', missing_values, sep='\n')

Missing values by column:
title          0
text           0
date           0
source      1000
author      1000
category       0
label          0
dtype: int64


Data Preprocessing

In [9]:
## Convert Date Column
## errors='coerce' turns unparseable values into NaT instead of raising, so
## count the non-missing dates that failed to parse and report them rather
## than letting them disappear silently.
parsed_dates = pd.to_datetime(df['date'], errors='coerce')
unparsed_count = int((parsed_dates.isna() & df['date'].notna()).sum())
if unparsed_count:
    print(f"Warning: {unparsed_count} date value(s) could not be parsed and were set to NaT")
df['date'] = parsed_dates
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   title     20000 non-null  object        
 1   text      20000 non-null  object        
 2   date      20000 non-null  datetime64[ns]
 3   source    19000 non-null  object        
 4   author    19000 non-null  object        
 5   category  20000 non-null  object        
 6   label     20000 non-null  object        
dtypes: datetime64[ns](1), object(6)
memory usage: 1.1+ MB


In [10]:
## Drop rows with missing labels or text, then renumber the index so it stays
## contiguous (0..n-1) even when rows are actually removed.
df = df.dropna(subset=['label', 'text']).reset_index(drop=True)

In [11]:
## Re-check row count and non-null counts after dropping incomplete rows
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   title     20000 non-null  object        
 1   text      20000 non-null  object        
 2   date      20000 non-null  datetime64[ns]
 3   source    19000 non-null  object        
 4   author    19000 non-null  object        
 5   category  20000 non-null  object        
 6   label     20000 non-null  object        
dtypes: datetime64[ns](1), object(6)
memory usage: 1.1+ MB


In [12]:
## Replace missing 'source' and 'author' entries with the placeholder 'Unknown'
df[['source', 'author']] = df[['source', 'author']].fillna('Unknown')

In [13]:
## Confirm 'source' and 'author' no longer contain nulls
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   title     20000 non-null  object        
 1   text      20000 non-null  object        
 2   date      20000 non-null  datetime64[ns]
 3   source    20000 non-null  object        
 4   author    20000 non-null  object        
 5   category  20000 non-null  object        
 6   label     20000 non-null  object        
dtypes: datetime64[ns](1), object(6)
memory usage: 1.1+ MB


Data Cleaning

In [23]:
import re, nltk
## Fetch the NLTK resources required by the imports and cleaning code that follow
for resource_name in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource_name)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [18]:
## Build the shared lemmatizer and the English stopword lookup
lemmatizer = WordNetLemmatizer()
## Stored as a set so the per-token membership test in clean_text is a hash lookup
stop_words = {*stopwords.words('english')}

def clean_text(text):
    """Normalize a raw text value into a space-separated string of lemmas.

    Steps: lowercase, replace every character that is not a-z or whitespace
    with a space, tokenize with ``word_tokenize``, drop tokens found in
    ``stop_words``, and lemmatize the remaining tokens with ``lemmatizer``.

    Args:
        text: Value to clean. Missing values (as detected by ``pd.isnull``)
            yield ``""``; any other non-string value is converted with
            ``str`` before cleaning.

    Returns:
        str: The cleaned tokens joined by single spaces (possibly empty).
    """
    if pd.isnull(text):
        return ""

    ## Lowercase (coerce first so numeric or other non-string cells do not
    ## fail on .lower())
    text = str(text).lower()

    ## Replace punctuation and non-alphabetic characters with a space rather
    ## than deleting them, so words separated only by punctuation
    ## (e.g. "well-known", "end.Next") are not fused into a single token
    text = re.sub(r'[^a-z\s]', ' ', text)

    ## Tokenize
    tokens = word_tokenize(text)

    ## Remove stopwords and lemmatize
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    ## Join back into a string
    return ' '.join(cleaned_tokens)

In [24]:
## Run clean_text over the 'text' and 'title' columns, storing each result
## in its own new column
for cleaned_col, raw_col in (('cleaned_text', 'text'), ('cleaned_title', 'title')):
    df[cleaned_col] = df[raw_col].apply(clean_text)

In [25]:
## Dimensions after adding the two cleaned columns
df.shape

(20000, 9)

In [26]:
## Column labels, now including 'cleaned_text' and 'cleaned_title'
df.columns

Index(['title', 'text', 'date', 'source', 'author', 'category', 'label',
       'cleaned_text', 'cleaned_title'],
      dtype='object')