<a href="https://colab.research.google.com/github/shihab1/DLI/blob/main/2_feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git config --global user.email "almnaryb139@gmail.com"
!git config --global user.name "shihab1"

!git clone https://github.com/shihab1/DLI.git
%cd DLI

Cloning into 'DLI'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 6 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (6/6), 8.30 KiB | 1.19 MiB/s, done.
/content/DLI


In [3]:
# --- Commit 1: Text cleaning and preprocessing ---

# 1. Setup
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

# 2. Data Loading
df = pd.read_csv("/content/data/cleaned_data.csv")

# 3. Preprocessing - Merge subject and body
df['text'] = df['subject'].fillna('') + ' ' + df['body'].fillna('')

# Clean text
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

df['clean_text'] = df['text'].apply(clean_text)
print(df[['text', 'clean_text']].head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                text  \
0  FW: June 29 -- BNA, Inc. Daily Labor Report Us...   
1  NGX failover plan. Hi Chris, Tonight we are ro...   
2  RE: Intranet Site Rika r these new? -----Origi...   
3  FW: ENA Upstream Company information John/Gera...   
4  New Master Physical Gerald and Stacy - Attache...   

                                          clean_text  
0  fw june bna inc daily labor report user id enr...  
1  ngx failover plan hi chris tonight rolling new...  
2  intranet site rika r new original message thom...  
3  fw ena upstream company information johngerald...  
4  new master physical gerald stacy attached work...  


In [4]:
# --- Commit 2: Tokenization and sequence preparation ---

# 4. Tokenization and Padding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import os

# Ensure target directory exists
os.makedirs("/content/data", exist_ok=True)

# Tokenization
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['clean_text'])

sequences = tokenizer.texts_to_sequences(df['clean_text'])
X = pad_sequences(sequences, maxlen=200)

# Save tokenizer to /content/data
with open("/content/data/tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

print("Tokenizer saved to /content/data/tokenizer.pkl")
print("Sample tokenized sequence:", sequences[0])
print("Padded shape:", X.shape)

Tokenizer saved to /content/data/tokenizer.pkl
Sample tokenized sequence: [62, 593, 4153, 29, 261, 1960, 32, 703, 359, 1, 1, 1, 10, 3, 4153, 2814, 8, 83, 593, 18, 4153, 2814, 7, 593, 4153, 29, 261, 1960, 32, 261, 1960, 32, 2814, 985, 1476, 593, 7893, 621, 226, 1069, 159, 297, 1337, 2237, 375, 2070, 389, 5313, 16, 2769, 2939, 2793, 36, 226, 1973, 88, 1, 47, 4153, 290, 1744, 1850, 1750, 18, 1298, 2814, 1398, 4017, 1, 7435, 668, 8926, 2606, 2799, 207, 930, 56, 8926, 2799, 8806, 1, 8754, 1170, 1, 1472, 448, 3098, 7070, 1960, 1898, 1834, 2628, 4017, 770, 1, 377, 1960, 1744, 330, 668, 1, 9821, 1, 1, 1911, 29, 503, 7435, 1398, 6209, 330, 574, 8926, 2606, 2799, 1, 9821, 1, 1277, 1911, 29, 1, 312, 2965, 221, 377, 1960, 1744, 262, 330, 3322, 420, 4367, 574, 1485, 461, 1, 312, 1, 3002, 718, 4971, 1, 1, 210, 1834, 1, 1203, 207, 3052, 545, 4636, 8020, 3993, 903, 1, 5010, 138, 9, 4367, 93, 138, 545, 9541, 3721, 348, 7550, 1, 8754, 1170, 7070, 1960, 488, 4348, 461, 1353, 1, 312, 2965, 221, 1435, 1, 4

In [4]:
# --- Commit 3: Label preparation and saving features ---

# 5. Label preparation
y = df['label'].astype(int).values

# 6. Train-Test Split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# 7. Save features
np.save("data/X_train.npy", X_train)
np.save("data/X_test.npy", X_test)
np.save("data/y_train.npy", y_train)
np.save("data/y_test.npy", y_test)

print("Saved X_train.npy, X_test.npy, y_train.npy, y_test.npy")
print("X_train shape:", X_train.shape)
print("y_train distribution:", np.bincount(y_train))


Saved X_train.npy, X_test.npy, y_train.npy, y_test.npy
X_train shape: (41011, 200)
y_train distribution: [24340 16671]
