<a href="https://colab.research.google.com/github/mohamedseklani/DLI/blob/main/1_data_cleaning_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!git config --global user.email "mohamedseklani8@gmail.com"
!git config --global user.name "mohamedseklani"

!git clone https://github.com/mohamedseklani/DLI
%cd DLI

fatal: destination path 'DLI' already exists and is not an empty directory.
/content/DLI


In [4]:
# --- Commit 1: Initial load and null cleanup ---

# 1) Setup
import pandas as pd
import numpy as np
import csv, sys

# Increase limit for very long email bodies
csv.field_size_limit(sys.maxsize)

# Pandas display settings
pd.set_option('display.max_columns', None)

# 2) Data Loading (robust)
# Using on_bad_lines='skip' instead of deprecated error_bad_lines
df = pd.read_csv(
    "/content/DLI/TREC-05.csv",  # <-- adjust path to your file
    engine="python",
    on_bad_lines="skip",
    sep=None,  # autodetect delimiter
    encoding="utf-8",
    encoding_errors="replace",  # handle weird characters
    dtype=str  # read all as string first
)

print("Initial Shape:", df.shape)
print(df.isnull().sum())

# 3) Preprocessing - Remove nulls in key columns
# Convert label and urls to numeric (coerce errors to NaN)
if "label" in df.columns:
    df["label"] = pd.to_numeric(df["label"], errors="coerce")
if "urls" in df.columns:
    df["urls"] = pd.to_numeric(df["urls"], errors="coerce")

# Drop rows where body, subject, or label is missing
required_cols = [c for c in ["body", "subject", "label"] if c in df.columns]
df.dropna(subset=required_cols, inplace=True)

# Fill missing urls with 0 and ensure integer type
if "urls" in df.columns:
    df["urls"] = df["urls"].fillna(0)
    df["urls"] = np.maximum(df["urls"].astype(float), 0).astype(int)

# Strip extra whitespace from text fields
for col in ["subject", "body"]:
    if col in df.columns:
        df[col] = df[col].astype(str).str.replace(r"\s+", " ", regex=True).str.strip()

df.reset_index(drop=True, inplace=True)
print("After cleaning Shape:", df.shape)

Initial Shape: (53875, 7)
sender        13
receiver    2627
date        2376
subject     2472
body        1214
label       1341
urls        1342
dtype: int64
After cleaning Shape: (51264, 7)


In [10]:
import os

os.makedirs("data", exist_ok=True)
os.makedirs("notebooks", exist_ok=True)

# Save cleaned CSV inside data/
df.to_csv("data/cleaned_data.csv", index=False)

# Optional: Save notebook (if working inside Colab manually, skip this)

In [13]:
!git status

On branch main
Your branch is up to date with 'origin/main'.

Changes to be committed:
  (use "git restore --staged <file>..." to unstage)
	[32mnew file:   data/cleaned_data.csv[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mTREC-05.csv[m



In [16]:
# Save notebook in Colab manually

from google.colab import files

files.download("1_data_cleaning.ipynb")

FileNotFoundError: Cannot find file: 1_data_cleaning.ipynb

In [15]:
!mkdir -p notebooks
!mv 1_data_cleaning.ipynb notebooks/

mv: cannot stat '1_data_cleaning.ipynb': No such file or directory


In [14]:
# Make sure he is inside his forked repo directory in Colab or terminal
!git add data/cleaned_data.csv notebooks/1_data_cleaning.ipynb
!git commit -m "Commit 1: Cleaned and saved dataset"
!git push origin main  # Push to his fork's main branch

fatal: pathspec 'notebooks/1_data_cleaning.ipynb' did not match any files
[main 2374209] Commit 1: Cleaned and saved dataset
 1 file changed, 51265 insertions(+)
 create mode 100644 data/cleaned_data.csv
fatal: could not read Username for 'https://github.com': No such device or address


In [None]:
# --- Commit 2: Class distribution and word cloud ---

# Class Distribution
label_counts = df['label'].value_counts()
plt.figure(figsize=(6, 4))
sns.barplot(x=label_counts.index, y=label_counts.values)
plt.xticks([0, 1], ['Legitimate (0)', 'Phishing (1)'])
plt.title("Class Distribution")
plt.xlabel("Email Type")
plt.ylabel("Count")
plt.tight_layout()
plt.savefig("results/class_distribution.png")
plt.show()

# Word Clouds
phish_text = ' '.join(df[df['label'] == 1]['body'].dropna().values)
legit_text = ' '.join(df[df['label'] == 0]['body'].dropna().values)

phish_wc = WordCloud(width=800, height=400, background_color='white').generate(phish_text)
legit_wc = WordCloud(width=800, height=400, background_color='white').generate(legit_text)

plt.figure(figsize=(10, 5))
plt.imshow(phish_wc, interpolation='bilinear')
plt.axis("off")
plt.title("Word Cloud - Phishing Emails")
plt.tight_layout()
plt.savefig("results/phishing_wordcloud.png")
plt.show()

plt.figure(figsize=(10, 5))
plt.imshow(legit_wc, interpolation='bilinear')
plt.axis("off")
plt.title("Word Cloud - Legitimate Emails")
plt.tight_layout()
plt.savefig("results/legit_wordcloud.png")
plt.show()

In [None]:
!git add results/class_distribution.png results/phishing_wordcloud.png results/legit_wordcloud.png
!git commit -m "Commit 2: Added class distribution and word cloud visualizations"
!git push origin main

In [None]:
# --- Commit 3: Saved cleaned dataset for next stage ---

# Save Cleaned Data
df.to_csv("data/cleaned_data.csv", index=False)
print("Cleaned data saved to 'data/cleaned_data.csv'.")

# Summary
print("Final Class Counts:")
print(df['label'].value_counts())
print("Preview:")
print(df[['subject', 'label']].head())

In [None]:
!git add data/cleaned_data.csv notebooks/1_data_cleaning.ipynb
!git commit -m "Commit 3: Final cleaned dataset export and stats summary"
!git push origin main