## Dataset Exploration and Understanding

In [25]:
from google.colab import files
uploaded = files.upload()


Saving archive.zip to archive (1).zip


In [26]:
import zipfile
import os
with zipfile.ZipFile("archive.zip", 'r') as zip_ref:
    zip_ref.extractall("unzipped_data")

print("Files extracted successfully!")

Files extracted successfully!


In [27]:
os.listdir("unzipped_data")

['Fake.csv', 'True.csv']

In [28]:
import pandas as pd

fake_df = pd.read_csv("unzipped_data/Fake.csv")
true_df = pd.read_csv("unzipped_data/True.csv")

print("Fake News Dataset:", fake_df.shape)
print("True News Dataset:", true_df.shape)

fake_df.head()

Fake News Dataset: (23481, 4)
True News Dataset: (21417, 4)


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [29]:
true_df.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [30]:
#merge and label

#Add a label column
fake_df["label"] = "FAKE"
true_df["label"] = "TRUE"

#Merge into one dataset
data = pd.concat([fake_df, true_df], ignore_index = True)

#Shuffle the rows so FAKE and TRUE are mixed
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

#Check the structure
print(data.shape)
print(data["label"].value_counts())
data.head()

(44898, 5)
label
FAKE    23481
TRUE    21417
Name: count, dtype: int64


Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",FAKE
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",TRUE
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",TRUE
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",FAKE
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",TRUE


In [31]:
#Basic overview of the merged data

#structure
data.info()

#Number of articles in each class
data['label'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  object
dtypes: object(5)
memory usage: 1.7+ MB


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
FAKE,23481
TRUE,21417


In [32]:
#Check missing values
data.isnull().sum()

Unnamed: 0,0
title,0
text,0
subject,0
date,0
label,0


In [33]:
#Text length analysis

#Add a column for word count
data['word_count'] = data['text'].apply(lambda x: len(str(x).split()))

#Summary statistics
data['word_count'].describe()

#Compare FAKE vs TRUE word counts
data.groupby('label')['word_count'].mean()

Unnamed: 0_level_0,word_count
label,Unnamed: 1_level_1
FAKE,423.197905
TRUE,385.640099


In [34]:
#Build Clean Text Function

import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

#Downloading the resources once

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab') # Added this line


stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # Convert to lowercase
    text = text.lower()

    #Remove URL's
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    #Remove numbers
    text = re.sub(r'\d+', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join tokens back into a single string
    cleaned_text = ' '.join(tokens)

    return cleaned_text

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [35]:
#Apply to the data
data['cleaned_text'] = data['text'].apply(clean_text)

In [36]:
#Sample test
data[['text', 'cleaned_text']].head()

Unnamed: 0,text,cleaned_text
0,"21st Century Wire says Ben Stein, reputable pr...",st century wire say ben stein reputable profes...
1,WASHINGTON (Reuters) - U.S. President Donald T...,washington reuters u president donald trump re...
2,(Reuters) - Puerto Rico Governor Ricardo Rosse...,reuters puerto rico governor ricardo rossello ...
3,"On Monday, Donald Trump once again embarrassed...",monday donald trump embarrassed country accide...
4,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",glasgow scotland reuters u presidential candid...


## MY PART

In [42]:
#Train Test Split

from sklearn.model_selection import train_test_split

X = data['cleaned_text']
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [46]:
# Check for duplicates

duplicates = data.duplicated(subset=['cleaned_text']).sum()
print("Number of duplicate texts:", duplicates)

Number of duplicate texts: 6315


In [47]:
# Drop duplicates

data = data.drop_duplicates(subset=['cleaned_text']).reset_index(drop=True)

In [48]:
#Train Test Split

from sklearn.model_selection import train_test_split

X = data['cleaned_text']
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [49]:
# Check the shape of the splits

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (30866,)
Shape of X_test: (7717,)
Shape of y_train: (30866,)
Shape of y_test: (7717,)


In [50]:
#Check stratification

print ("Train label distribution:\n", y_train.value_counts(normalize = True))
print ("Test label distribution:\n", y_test.value_counts(normalize = True))

Train label distribution:
 label
TRUE    0.54918
FAKE    0.45082
Name: proportion, dtype: float64
Test label distribution:
 label
TRUE    0.549177
FAKE    0.450823
Name: proportion, dtype: float64


In [51]:
#Check for any leakages

overlap = set(X_train).intersection(set(X_test))
print("Overlap:", len(overlap))

Overlap: 0
