In [None]:
# Import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import os

# This makes the graphs display nicely in the notebook
%matplotlib inline

In [None]:
# Unpack the downloaded archive next to the notebook, then list what came out
archive_path = "../data/archive.zip"
extract_dir = "unzipped_data"

with zipfile.ZipFile(archive_path, mode='r') as zip_ref:
    zip_ref.extractall(extract_dir)

print("Files extracted successfully!")

# Last expression of the cell: the extracted file names are shown as output
os.listdir(extract_dir)

In [None]:
# Load the dataset: one CSV per class
df_fake = pd.read_csv("unzipped_data/Fake.csv")
df_true = pd.read_csv("unzipped_data/True.csv")

# Add labels, encoded as integers: 0 = fake, 1 = real.
# The analysis cells below filter with `label == 0` / `label == 1`; string
# labels such as "FAKE"/"TRUE" would make every one of those filters match
# nothing (zero counts, empty histograms, empty subject tables).
df_fake["label"] = 0
df_true["label"] = 1

# Combine into one dataset with a fresh 0..n-1 index
df = pd.concat([df_fake, df_true], axis=0, ignore_index=True)

In [None]:
# Sanity check: report the combined frame's dimensions, then preview it
print("Dataset loaded successfully!")
total_shape = df.shape  # should be (44898, 5) if the dataset hasn't changed
print(f"Total shape: {total_shape}")
df.head(5)  # first five rows, rendered as the cell output

In [None]:
# Dataset overview: size, class balance and schema.
# The class filters expect `label` to be encoded as 0 (fake) / 1 (real).
is_fake = df['label'] == 0
is_real = df['label'] == 1
n_rows, n_cols = df.shape

print("=== 1. DATASET OVERVIEW ===")
print(f"Total Number of Articles: {n_rows}")
print(f"Number of FAKE articles (label 0): {int(is_fake.sum())}")
print(f"Number of REAL articles (label 1): {int(is_real.sum())}")
print(f"Number of Columns/Features: {n_cols}")
print("\nColumn Names and Data Types:")
print(df.dtypes)

In [None]:
# Data quality check: missing values and repeated articles.
# `duplicated` flags every repeat after the first occurrence, so each sum is
# the number of rows that would be dropped by de-duplicating on that column.
missing_per_column = df.isnull().sum()
duplicate_texts = df.duplicated(subset='text').sum()
duplicate_titles = df.duplicated(subset='title').sum()

print("=== 2. DATA QUALITY ===")
print("\nMissing Values per Column:")
print(missing_per_column)

print(f"\nTotal Duplicate Articles (based on text): {duplicate_texts}")
print(f"Total Duplicate Titles: {duplicate_titles}")

In [None]:
# Text analysis: article length measured in whitespace-separated words
def _count_words(text):
    """Return the number of whitespace-separated tokens in ``text``.

    The value is passed through ``str`` first, so a non-string entry is
    counted by its string form instead of raising.
    """
    return len(str(text).split())


# Adds a `word_count` column to the shared frame; later cells plot it
df['word_count'] = df['text'].apply(_count_words)

print("=== 3. TEXT ANALYSIS ===")
print("\nAverage Article Length (in words):")
mean_length_by_label = df.groupby('label')['word_count'].mean().round(2)
print(mean_length_by_label)


In [None]:
# Overlay the word-count distributions of the two classes on one figure
plt.figure(figsize=(10, 5))

# (label value, bar colour, legend entry) -- drawn in this order
class_styles = [
    (0, 'red', 'Fake'),
    (1, 'green', 'Real'),
]
for label_value, colour, legend_name in class_styles:
    sns.histplot(
        data=df[df['label'] == label_value],
        x='word_count',
        color=colour,
        label=legend_name,
        bins=50,
        alpha=0.6,
        kde=True,
        stat='density',  # normalised heights, so the two classes share a y-scale
    )

plt.title('Distribution of Article Length by Label')
plt.xlabel('Word Count')
plt.ylabel('Density')
plt.xlim(0, 20000)  # fixed x-range for the view
plt.legend()
plt.show()

In [None]:
# Subject analysis: per-class breakdown of the `subject` column
print("=== 4. SUBJECT ANALYSIS ===")
if 'subject' in df.columns:
    # (heading, label value) pairs, printed in this order
    sections = [
        ("\nSubjects for FAKE news:", 0),
        ("\nSubjects for REAL news:", 1),
    ]
    for heading, label_value in sections:
        print(heading)
        print(df.loc[df['label'] == label_value, 'subject'].value_counts())


In [None]:
    # Bar chart of the overall number of articles per subject (both classes)
    fig, ax = plt.subplots(figsize=(12, 5))
    subject_counts = df['subject'].value_counts()
    subject_counts.plot(kind='bar', ax=ax)
    ax.set_title('Overall Count of Articles by Subject')
    plt.xticks(rotation=45)
    plt.tight_layout()  # keeps the rotated labels from being cut off
    plt.show()

In [None]:
    # A more advanced plot: subject distribution by label, as grouped bars.
    # Cross-tabulate first: one row per subject, one column per label value.
    subject_by_label = pd.crosstab(df['subject'], df['label'])

    # Create the figure in the same cell that draws on it and hand its axes to
    # pandas. Calling plt.figure() on its own would only render a blank figure,
    # because DataFrame.plot() without `ax` opens a figure of its own.
    fig, ax = plt.subplots(figsize=(12, 5))
    subject_by_label.plot(kind='bar', ax=ax)
    ax.set_title('Subject Distribution by News Label')
    ax.set_xlabel('Subject')
    ax.set_ylabel('Count')
    plt.xticks(rotation=45)
    # Legend entries are positional: crosstab sorts its columns, so the
    # lower-sorting label value is the first series.
    ax.legend(['Fake', 'Real'])
    plt.tight_layout()
    plt.show()


