##### Data Collection

##### Load Dataset to Pandas Dataframe

In [9]:
import pandas as pd

df__spam_dataset = pd.read_csv('spam_dataset.csv')
df__spam_dataset.head()

print(f'Dataframe shape: {df__spam_dataset.shape}')

KeyboardInterrupt: 

##### Data Preprocessing

In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from bs4 import BeautifulSoup # For HTML parsing
from bs4 import MarkupResemblesLocatorWarning # Import the specific warning
import unicodedata # For character normalization
from collections import Counter # For rare word filtering
import os # For file path operations
import warnings # For warning management
import emoji # For emoji handling
import contractions # Import the contractions library


# --- Suppress BeautifulSoup warnings ---
# Warnings related to MarkupResemblesLocator are ignored.
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

# --- Global Configurations and NLTK Downloads ---

# Define the path for the preprocessed CSV file.
PREPROCESSED_FILE_PATH = 'spam_dataset_preprocessed.csv'

# A function is defined to safely manage NLTK data downloads.
# It checks if a resource is already available before initiating a download,
# preventing unnecessary re-downloads and handling potential errors.
def safe_nltk_download(resource_path, download_name):
    try:
        nltk.data.find(resource_path)
        print(f"'{download_name}' already downloaded.")
    except LookupError:
        print(f"Downloading '{download_name}'...")
        nltk.download(download_name)
    except Exception as e:
        print(f"An unexpected error occurred while checking/downloading '{download_name}': {e}")

# NLTK data packages are downloaded, which are essential for text processing.
# 'punkt' and 'punkt_tab' are used for tokenization, 'stopwords' provides a list of common words to filter,
# 'wordnet' supports lemmatization, and 'omw-1.4' is a dependency for WordNet.
safe_nltk_download('tokenizers/punkt', 'punkt')
safe_nltk_download('tokenizers/punkt_tab', 'punkt_tab')
safe_nltk_download('corpora/stopwords', 'stopwords')
safe_nltk_download('corpora/wordnet', 'wordnet')
safe_nltk_download('corpora/omw-1.4', 'omw-1.4')

# NLTK components for text processing are initialized globally.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# A regex pattern for common emoticons is compiled globally.
EMOTICON_PATTERN = re.compile(r'(?:[:;=X][-o*]?[DPpS\|/\)\(])|(?:\<3)')

# A threshold for rare word filtering is defined globally.
RARE_WORD_THRESHOLD = 5

# The core text preprocessing function is defined.
def preprocess_text(text):
    # HTML tags are removed using BeautifulSoup.
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Emojis are converted to their textual descriptions.
    text = emoji.demojize(text, delimiters=(" ", " "))

    # Common emoticons are replaced with a descriptive token.
    text = EMOTICON_PATTERN.sub(r' EMOTICON ', text)

    # Text is converted to lowercase.
    text = text.lower()

    # Contractions are expanded to their full forms using the 'contractions' library.
    text = contractions.fix(text)

    # URLs within the text are removed.
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Unicode characters are normalized to their closest ASCII equivalents (NFKD form)
    # and then filtered to remove non-ASCII characters, standardizing character representation.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

    # Escape characters (like newlines, tabs, carriage returns) are replaced with a single space.
    # Multiple spaces are also condensed into a single space.
    text = re.sub(r'\s+', ' ', text).strip()

    # Punctuation marks are eliminated from the text.
    text = text.translate(str.maketrans('', '', string.punctuation))

    # All numerical digits are removed from the text.
    text = re.sub(r'\d+', '', text)

    # The text is broken down into individual words (tokens).
    tokens = word_tokenize(text)

    # Stop words are filtered out, and remaining words are lemmatized.
    cleaned_tokens = []
    for word in tokens:
        if word not in stop_words: # Accessing global stop_words
            lemma = lemmatizer.lemmatize(word) # Accessing global lemmatizer
            cleaned_tokens.append(lemma)

    # Cleaned tokens are rejoined into a single string.
    return ' '.join(cleaned_tokens)

# --- Functions for Preprocessing Stages ---

def perform_initial_data_checks(df):
    # Initial DataFrame information and missing values are displayed.
    print("--- Initial DataFrame Info & Missing Values (before NA/Duplicates) ---")
    print(df.info())
    print("\nMissing values:\n", df.isnull().sum())
    print("\nOriginal label distribution:\n", df['label'].value_counts())
    return df

def handle_missing_and_duplicate_values(df):
    # Rows with missing values in 'text' or 'label' are handled by dropping them.
    initial_rows_before_na = df.shape[0]
    df.dropna(subset=['text', 'label'], inplace=True)
    if df.shape[0] < initial_rows_before_na:
        print(f"\n{initial_rows_before_na - df.shape[0]} rows were dropped due to missing values. 🗑️")

    # Duplicate rows are identified based on the 'text' column, and duplicates are removed,
    # retaining the first occurrence of each unique text message.
    print("\n--- Handling Duplicate Values ---")
    initial_rows_before_duplicates = df.shape[0]
    df.drop_duplicates(subset=['text'], inplace=True)
    if df.shape[0] < initial_rows_before_duplicates:
        print(f"\n{initial_rows_before_duplicates - df.shape[0]} duplicate rows were removed. 🗑️")
    else:
        print("\nNo duplicate rows were found based on the 'text' column. 🗑️")

    print(f"DataFrame shape after NA and duplicate removal: {df.shape}")
    print("\nUpdated label distribution after duplicate removal:\n", df['label'].value_counts())
    return df

def apply_text_normalization_pipeline(df):
    # The advanced text preprocessing function is applied to the 'text' column,
    # now using standard .apply() for sequential processing.
    print("\n--- Applying Advanced Text Normalization (Sequential Processing) ---")
    df['cleaned_text'] = df['text'].apply(preprocess_text)

    print("\nSample original vs. cleaned text (after initial advanced normalization):") # Ensure we don't go out of bounds for small DFs
    for i in range(min(5, len(df))):
        print(f"Original: {df['text'].iloc[i]}")
        print(f"Cleaned:  {df['cleaned_text'].iloc[i]}\n")
    
    # All words from the 'cleaned_text' column are collected and tokenized.
    all_words = []
    for text in df['cleaned_text']:
        all_words.extend(text.split())

    # Word frequencies across the entire corpus are calculated.
    word_counts = Counter(all_words)

    # Rare words are identified based on a predefined threshold.
    rare_words = {word for word, count in word_counts.items() if count < RARE_WORD_THRESHOLD}
    print(f"\n--- Applying Rare Word Filtering ---")
    print(f"Identified {len(rare_words)} rare words (appearing less than {RARE_WORD_THRESHOLD} times).")

    # Rare words are removed from the 'cleaned_text' column.
    df['cleaned_text_filtered'] = df['cleaned_text'].apply(
        lambda text: ' '.join([word for word in text.split() if word not in rare_words])
    )

    print("\nSample cleaned vs. filtered text (after rare word removal):") # Ensure we don't go out of bounds
    for i in range(min(5, len(df))):
        print(f"Cleaned (before filter):  {df['cleaned_text'].iloc[i]}")
        print(f"Filtered (after filter): {df['cleaned_text_filtered'].iloc[i]}\n")

    # The main 'cleaned_text' column is updated with the filtered text, and the temporary column is dropped.
    df['cleaned_text'] = df['cleaned_text_filtered']
    df.drop(columns=['cleaned_text_filtered'], inplace=True)

    # --- Handle empty cleaned text data ---
    initial_rows_before_empty_check = df.shape[0]
    # Filter out rows where 'cleaned_text' is empty or contains only whitespace
    df['cleaned_text'].replace('', np.nan, inplace=True) # Replace empty strings with NaN
    df.dropna(subset=['cleaned_text'], inplace=True) # Drop rows where 'cleaned_text' is NaN
    
    if df.shape[0] < initial_rows_before_empty_check:
        print(f"\n{initial_rows_before_empty_check - df.shape[0]} rows were dropped because 'cleaned_text' became empty after preprocessing. 🗑️")
    else:
        print("\nNo rows had empty 'cleaned_text' after preprocessing. ✅")

    # A note on spelling correction is provided, indicating its optional and resource-intensive nature.
    print("\n--- Spelling Correction/Normalization (Optional) ---")
    print("Spelling correction is a computationally intensive step and can significantly increase processing time,")
    print("especially for large datasets. It is often omitted or applied selectively.")
    print("If desired, the 'spellchecker' library can be used, but it's not included in the main pipeline by default.")
    return df

def apply_label_encoding(df):
    # A LabelEncoder is instantiated for categorical label conversion.
    le = LabelEncoder()
    # Categorical labels in the 'label' column are transformed into numerical representations.
    df['label_encoded'] = le.fit_transform(df['label'])

    # The mapping from original labels to encoded numerical values is displayed.
    label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    print(f"\n--- Applying Label Encoding ---")
    print(f"Label mapping: {label_mapping} 🏷️")
    return df

# --- Main Execution Logic ---

# The script checks for the existence of a preprocessed CSV file.
if os.path.exists(PREPROCESSED_FILE_PATH):
    # If the file is found, the preprocessed data is loaded directly.
    print(f"--- Loading preprocessed data from {PREPROCESSED_FILE_PATH} ---")
    df__spam_dataset = pd.read_csv(PREPROCESSED_FILE_PATH)
    print("Preprocessed data loaded successfully. ✅")
    print(f"DataFrame shape: {df__spam_dataset.shape}")
    print("Columns available:", df__spam_dataset.columns.tolist())
    print("Sample data:")
    print(df__spam_dataset[['text', 'cleaned_text', 'label', 'label_encoded']].head())

else:
    # If the preprocessed file is not found, the preprocessing pipeline is executed.
    # It is assumed that df__spam_dataset is already loaded from 'spam_dataset.csv'
    # in a preceding cell, containing 'text' and 'label' columns.
    try:
        # This checks if 'df__spam_dataset' is defined, e.g., from a previous cell where it was loaded.
        _ = df__spam_dataset.head()
        print("--- Preprocessed file not found. Starting preprocessing on existing df__spam_dataset. ---")
    except NameError:
        print("Error: 'df__spam_dataset' is not defined. Please ensure the raw dataset is loaded (e.g., pd.read_csv('spam_dataset.csv')) before running this block. ❌")
        # If df__spam_dataset isn't loaded, you might uncomment a line like this for testing:
        # df__spam_dataset = pd.read_csv('spam_dataset.csv') # Make sure this file exists in your environment
        raise # Stop execution if the DataFrame isn't available.


    df__spam_dataset = perform_initial_data_checks(df__spam_dataset)
    df__spam_dataset = handle_missing_and_duplicate_values(df__spam_dataset)
    df__spam_dataset = apply_text_normalization_pipeline(df__spam_dataset)
    df__spam_dataset = apply_label_encoding(df__spam_dataset)

    print("\n--- Preprocessing Complete! 🎉 ---")
    print("Updated DataFrame Info:")
    print(df__spam_dataset.info())
    print("\nSample of DataFrame after preprocessing:")
    print(df__spam_dataset[['text', 'cleaned_text', 'label', 'label_encoded']].head())
    print("\nFinal label distribution (encoded):\n", df__spam_dataset['label_encoded'].value_counts())

    # The newly preprocessed DataFrame is saved to a CSV file for faster future loading.
    print(f"\n--- Saving preprocessed data to {PREPROCESSED_FILE_PATH} ---")
    try:
        df__spam_dataset.to_csv(PREPROCESSED_FILE_PATH, index=False)
        print("Preprocessed data saved successfully for faster future loading. 💾")
    except Exception as e:
        print(f"Error saving preprocessed data: {e} ❌")

'punkt' already downloaded.
'punkt_tab' already downloaded.
'stopwords' already downloaded.
Downloading 'wordnet'...


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dipan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\dipan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Downloading 'omw-1.4'...
--- Loading preprocessed data from spam_dataset_preprocessed.csv ---
Preprocessed data loaded successfully. ✅
DataFrame shape: (92367, 4)
Columns available: ['label', 'text', 'cleaned_text', 'label_encoded']
Sample data:
                                                text  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                        cleaned_text label  label_encoded  
0  go point crazy available bugis n great world l...   ham              0  
1                              ok lar joking wif oni   ham              0  
2  free entry wkly comp win fa cup final st may t...  spam              1  
3                    dun say early hor c already say   ham              0  
4                nah think go usf lif

In [None]:
df__spam_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92367 entries, 0 to 92366
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   label          92367 non-null  object
 1   text           92367 non-null  object
 2   cleaned_text   92367 non-null  object
 3   label_encoded  92367 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 2.8+ MB


##### Feature Extraction

In [None]:
import pandas as pd
import numpy as np
import joblib # For saving/loading models and feature matrices
import os # For checking file existence

from sklearn.feature_extraction.text import CountVectorizer

# --- Assume df__spam_dataset is available from previous preprocessing steps ---
# It is assumed that df__spam_dataset contains 'cleaned_text' and 'label_encoded' columns.
# If running this cell independently, ensure df__spam_dataset is loaded first.
# Example: df__spam_dataset = pd.read_csv('spam_dataset_preprocessed.csv')

# --- Define paths for saving/loading ---
# A directory for storing feature extraction models and vectors is defined.
FEATURE_EXTRACTION_DIR = 'feature_extraction_models'
os.makedirs(FEATURE_EXTRACTION_DIR, exist_ok=True)

# File paths for the CountVectorizer model and its generated feature vectors are specified.
COUNT_VECTORIZER_MODEL_PATH = os.path.join(FEATURE_EXTRACTION_DIR, 'count_vectorizer.joblib')
COUNT_VECTORS_PATH = os.path.join(FEATURE_EXTRACTION_DIR, 'count_feature_vectors.joblib')

# A variable to hold the generated CountVectorizer features is initialized.
count_vectors = None
# A variable to hold the fitted CountVectorizer model is initialized.
count_vectorizer_model = None

# -----------------------------------------------------------------------------
## Feature Extraction: Bag-of-Words (CountVectorizer)
# -----------------------------------------------------------------------------
print("\n--- Feature Extraction: Bag-of-Words (CountVectorizer) ---")

# The script checks if pre-existing CountVectorizer features and model are available.
if os.path.exists(COUNT_VECTORS_PATH) and os.path.exists(COUNT_VECTORIZER_MODEL_PATH):
    try:
        # If found, the feature vectors and the fitted model are loaded.
        count_vectors = joblib.load(COUNT_VECTORS_PATH)
        count_vectorizer_model = joblib.load(COUNT_VECTORIZER_MODEL_PATH)
        print(f"CountVectorizer features loaded from: {COUNT_VECTORS_PATH}")
        print(f"CountVectorizer model loaded from: {COUNT_VECTORIZER_MODEL_PATH}")
        print(f"CountVectorizer features shape: {count_vectors.shape}")
        print(f"CountVectorizer vocabulary size: {len(count_vectorizer_model.vocabulary_)}")
    except Exception as e:
        # An error message is displayed if loading fails, and regeneration is initiated.
        print(f"Error loading CountVectorizer features or model: {e}. Regeneration will be performed.")
        count_vectors = None # Reset to None to trigger regeneration
        count_vectorizer_model = None
else:
    # A message indicating that features were not found is displayed, and generation is initiated.
    print("CountVectorizer features or model not found. Generation will be performed.")

# If features were not loaded (either not found or an error occurred during loading), they are generated.
if count_vectors is None:
    # A CountVectorizer is instantiated.
    # 'max_features' is set to limit the vocabulary size, focusing on the most frequent words.
    # 'ngram_range' is set to (1,3) to consider individual words (trigrams).
    count_vectorizer_model = CountVectorizer(max_features=5000, ngram_range=(1,3))

    # The vectorizer is fitted to the 'cleaned_text' data and the text is transformed into feature vectors.
    count_vectors = count_vectorizer_model.fit_transform(df__spam_dataset['cleaned_text'])

    print(f"CountVectorizer features shape: {count_vectors.shape}")
    print(f"CountVectorizer vocabulary size: {len(count_vectorizer_model.vocabulary_)}")

    try:
        # The generated feature vectors and the fitted model are saved for future use.
        joblib.dump(count_vectors, COUNT_VECTORS_PATH)
        joblib.dump(count_vectorizer_model, COUNT_VECTORIZER_MODEL_PATH)
        print(f"CountVectorizer features generated and saved to: {COUNT_VECTORS_PATH}")
        print(f"CountVectorizer model saved to: {COUNT_VECTORIZER_MODEL_PATH}")
    except Exception as e:
        # An error message is displayed if saving fails.
        print(f"Error saving CountVectorizer features or model: {e}")
else:
    # A message indicating that generation was skipped due to successful loading is displayed.
    print("CountVectorizer generation was skipped as features and model were loaded.")

print("\nBag-of-Words Feature Extraction complete.")


--- Feature Extraction: Bag-of-Words (CountVectorizer) ---
CountVectorizer features or model not found. Generation will be performed.
CountVectorizer features shape: (92367, 5000)
CountVectorizer vocabulary size: 5000
CountVectorizer features generated and saved to: feature_extraction_models\count_feature_vectors.joblib
CountVectorizer model saved to: feature_extraction_models\count_vectorizer.joblib

Bag-of-Words Feature Extraction complete.


##### Splitting the Data into Training and Testing Sets

In [None]:
import pandas as pd
import numpy as np
import joblib # For loading the feature vectors
import os # For path operations

from sklearn.model_selection import train_test_split

# --- Assume df__spam_dataset and count_vectors are available ---
# If running this cell independently, ensure they are loaded/defined first.

# Define paths (consistent with your previous feature extraction block)
FEATURE_EXTRACTION_DIR = 'feature_extraction_models'
COUNT_VECTORS_PATH = os.path.join(FEATURE_EXTRACTION_DIR, 'count_feature_vectors.joblib')

# Load the preprocessed DataFrame if not already in memory
try:
    # Assuming the preprocessed CSV path from earlier
    PREPROCESSED_FILE_PATH = 'spam_dataset_preprocessed.csv'
    df__spam_dataset = pd.read_csv(PREPROCESSED_FILE_PATH)
    print(f"DataFrame loaded from {PREPROCESSED_FILE_PATH}")
except FileNotFoundError:
    print("Preprocessed DataFrame not found. Please ensure 'spam_dataset_preprocessed.csv' exists.")
    # Exit or handle error appropriately if df__spam_dataset is critical for next steps

# Load the count_vectors if not already in memory
count_vectors = None
if os.path.exists(COUNT_VECTORS_PATH):
    try:
        count_vectors = joblib.load(COUNT_VECTORS_PATH)
        print(f"CountVectorizer features loaded from: {COUNT_VECTORS_PATH}")
        print(f"Shape of loaded features: {count_vectors.shape}")
    except Exception as e:
        print(f"Error loading CountVectorizer features: {e}. Please ensure the feature extraction step was successful.")
else:
    print("CountVectorizer features not found. Please run the feature extraction step first.")


# --- Data Splitting ---
print("\n--- Splitting Data into Training and Testing Sets ---")

# Define features (X) and target (y)
# X is your feature matrix (sparse matrix from CountVectorizer)
X = count_vectors

# y is your target variable (encoded labels from the DataFrame)
y = df__spam_dataset['label_encoded']

# Perform the train-test split
# test_size=0.20 means 20% of data for testing, 80% for training
# random_state for reproducibility: ensures the same split every time the code is run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y) # Stratify to maintain class distribution

print(f"Original features shape: {X.shape}")
print(f"Original labels shape: {y.shape}")
print(f"Training features (X_train) shape: {X_train.shape}")
print(f"Testing features (X_test) shape: {X_test.shape}")
print(f"Training labels (y_train) shape: {y_train.shape}")
print(f"Testing labels (y_test) shape: {y_test.shape}")

print("\nData splitting complete! Your data is now ready for model training. 🎉")

DataFrame loaded from spam_dataset_preprocessed.csv
CountVectorizer features loaded from: feature_extraction_models\count_feature_vectors.joblib
Shape of loaded features: (92367, 5000)

--- Splitting Data into Training and Testing Sets ---
Original features shape: (92367, 5000)
Original labels shape: (92367,)
Training features (X_train) shape: (73893, 5000)
Testing features (X_test) shape: (18474, 5000)
Training labels (y_train) shape: (73893,)
Testing labels (y_test) shape: (18474,)

Data splitting complete! Your data is now ready for model training. 🎉


##### Model Selection & Training

In [None]:
import pandas as pd
import numpy as np
import joblib
import os

# Base ML Algorithms
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier # Added SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Ensemble Learning Algorithms
from sklearn.ensemble import (
    RandomForestClassifier,  # Already there, but a bagging example
    GradientBoostingClassifier, # Boosting
    AdaBoostClassifier,       # Boosting
    BaggingClassifier,        # General Bagging
    VotingClassifier,         # Voting
    StackingClassifier        # Stacking
)

# For splitting data (needed for Stacking meta-learner)
from sklearn.model_selection import train_test_split

# Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# For parallel processing
from joblib import Parallel, delayed
import multiprocessing

# --- IMPORTANT ASSUMPTION ---
# This code block assumes that the following variables are already defined and available
# from previous execution cells:
# X_train, X_test, y_train, y_test
# These should be the result of your train_test_split from the previous dedicated step.
# If not, please ensure the "Splitting the Data into Training and Testing Sets" cell is run first.

# Define paths for saving model results (consistent with previous steps)
FEATURE_EXTRACTION_DIR = 'feature_extraction_models'
os.makedirs(FEATURE_EXTRACTION_DIR, exist_ok=True)
MODEL_RESULTS_PATH = os.path.join(FEATURE_EXTRACTION_DIR, 'all_model_results.joblib')

# --- Safeguard for X_train, X_test, y_train, y_test (if running standalone) ---
# This part is a safety net. In a typical Colab flow, if the previous cells ran,
# these variables would already be in memory.

# Load df__spam_dataset if not already loaded (needed for label_encoded for splitting)
try:
    PREPROCESSED_FILE_PATH = 'spam_dataset_preprocessed.csv'
    df__spam_dataset = pd.read_csv(PREPROCESSED_FILE_PATH)
except FileNotFoundError:
    raise FileNotFoundError("Preprocessed DataFrame not found. Please ensure 'spam_dataset_preprocessed.csv' exists and run the preprocessing step.")

# Load count_vectors if not already loaded (needed for X)
COUNT_VECTORS_PATH = os.path.join(FEATURE_EXTRACTION_DIR, 'count_feature_vectors.joblib')
count_vectors = None
if os.path.exists(COUNT_VECTORS_PATH):
    try:
        count_vectors = joblib.load(COUNT_VECTORS_PATH)
    except Exception as e:
        raise RuntimeError(f"Error loading CountVectorizer features: {e}. Please ensure the feature extraction step was successful.")
else:
    raise FileNotFoundError("CountVectorizer features not found. Please run the feature extraction step first.")

# Re-perform train-test split to ensure X_train, X_test, y_train, y_test are available
X = count_vectors
y = df__spam_dataset['label_encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)
print("Data split verified for parallel model training.")


# -----------------------------------------------------------------------------
## Parallel Model Training and Evaluation 🚀
# -----------------------------------------------------------------------------
print("\n--- Parallel Model Training and Evaluation (Including Ensembles) ---")

model_results = {}

# Define Base Classifiers for Ensembles (to be used within Voting/Stacking)
# These models are chosen for their diversity and common use in ensembles.
base_clf1 = MultinomialNB()
base_clf2 = LogisticRegression(random_state=42, solver='liblinear', max_iter=1000)
base_clf3 = LinearSVC(random_state=42, dual=False, max_iter=1000)
base_clf4 = RandomForestClassifier(random_state=42, n_estimators=100) # Added n_estimators

# Define all models to be trained and evaluated
models = {
    # --- Individual Classifiers ---
    'Multinomial Naive Bayes': base_clf1,
    'Logistic Regression': base_clf2,
    'Linear SVM': base_clf3,
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': base_clf4,
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
    'Stochastic Gradient Descent (SGD)': SGDClassifier(loss='log_loss', random_state=42, max_iter=1000), # loss='log_loss' for logistic regression (probability estimates)
    'Gradient Boosting Classifier': GradientBoostingClassifier(random_state=42, n_estimators=100),

    # --- Ensemble Methods ---
    # Bagging: BaggingClassifier with Decision Tree base
    'Bagging (Decision Tree)': BaggingClassifier(
        estimator=DecisionTreeClassifier(random_state=42),
        n_estimators=100, # Number of base estimators
        random_state=42,
        n_jobs=-1 # Use all cores for bagging itself if possible
    ),
    # Boosting: AdaBoost with Decision Tree base
    'AdaBoost (Decision Tree)': AdaBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=1, random_state=42), # Use shallow trees (weak learners)
        n_estimators=100, # Number of boosting stages
        random_state=42
    ),
    # Voting Classifier: Combines diverse models
    'Voting Classifier (Soft)': VotingClassifier(
        estimators=[
            ('mnb', base_clf1),
            ('lr', base_clf2),
            ('lsvc', LinearSVC(random_state=42, dual=False, max_iter=1000)) # Re-instantiate if needed, or use base_clf3
        ],
        voting='soft', # Use predicted probabilities
        # weights=[0.3, 0.4, 0.3], # Example weights, can be tuned
        n_jobs=-1 # Use all cores for voting if possible
    ),
    # Stacking Classifier: Meta-learner combines base predictions
    'Stacking Classifier': StackingClassifier(
        estimators=[
            ('mnb', base_clf1),
            ('lr', base_clf2),
            ('rf', base_clf4)
        ],
        final_estimator=LogisticRegression(random_state=42, solver='liblinear', max_iter=1000), # Meta-learner
        cv=5, # Cross-validation folds for base learners' predictions
        n_jobs=-1 # Use all cores if possible
    )
}

# Determine the number of CPU cores to use for joblib.Parallel.
# Using -1 means using all available CPU cores.
# Note: Some sklearn models (like BaggingClassifier, VotingClassifier, StackingClassifier)
# have their own `n_jobs` parameter. Setting it to -1 in their constructor handles
# their internal parallelization. For `joblib.Parallel` here, `num_cores` controls
# the parallel execution of the *different* models.
num_cores = multiprocessing.cpu_count()
print(f"Attempting to train {len(models)} models in parallel using {num_cores} cores via joblib...")

# Define the function to train and evaluate a single model.
def train_and_evaluate_single_model(name, model, X_train_data, X_test_data, y_train_data, y_test_data):
    """
    Trains a given model and evaluates its performance on the test set.
    """
    print(f"  [Parallel] Training {name}...")
    try:
        model.fit(X_train_data, y_train_data)
        y_pred = model.predict(X_test_data)

        # Calculate metrics, ensuring zero_division handles cases where a class is not predicted/present
        accuracy = accuracy_score(y_test_data, y_pred)
        precision = precision_score(y_test_data, y_pred, pos_label=1, zero_division=0)
        recall = recall_score(y_test_data, y_pred, pos_label=1, zero_division=0)
        f1 = f1_score(y_test_data, y_pred, pos_label=1, zero_division=0)
        cm = confusion_matrix(y_test_data, y_pred, labels=np.unique(y_test_data))

        return {
            'name': name,
            'model': model,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'confusion_matrix': cm
        }
    except Exception as e:
        print(f"  [Parallel] Error training {name}: {e}")
        return {
            'name': name,
            'model': None,
            'accuracy': None, 'precision': None, 'recall': None, 'f1_score': None,
            'confusion_matrix': None,
            'error': str(e)
        }

# Execute training and evaluation in parallel
parallel_results = Parallel(n_jobs=num_cores, verbose=10)(
    delayed(train_and_evaluate_single_model)(name, model, X_train, X_test, y_train, y_test)
    for name, model in models.items()
)

# Populate model_results dictionary
for res in parallel_results:
    if res['model'] is not None:
        model_results[res['name']] = {
            'model': res['model'],
            'accuracy': res['accuracy'],
            'precision': res['precision'],
            'recall': res['recall'],
            'f1_score': res['f1_score'],
            'confusion_matrix': res['confusion_matrix']
        }
    else:
        print(f"Skipping results for {res['name']} due to training error.")

print("\nAll models trained and evaluated in parallel! 🎉")

# -----------------------------------------------------------------------------
## Reviewing All Model Results
# -----------------------------------------------------------------------------
print("\n--- Reviewing All Model Results ---")

if not model_results:
    print("No models were successfully trained or evaluated.")
else:
    # Convert results to a DataFrame for easier comparison
    results_df = pd.DataFrame([
        {
            'Model': name,
            'Accuracy': results['accuracy'],
            'Precision': results['precision'],
            'Recall': results['recall'],
            'F1-Score': results['f1_score']
        }
        for name, results in model_results.items() if results['accuracy'] is not None
    ])
    # Sort by F1-Score for better comparison in text classification
    results_df = results_df.sort_values(by='F1-Score', ascending=False)

    print("\nPerformance Summary (Sorted by F1-Score):")
    print(results_df.to_string(index=False)) # Use to_string to avoid truncation

    # Print detailed metrics for each model
    for name, results in model_results.items():
        print(f"\n--- {name} Performance ---")
        if results['accuracy'] is not None:
            print(f"  Accuracy: {results['accuracy']:.4f}")
            print(f"  Precision: {results['precision']:.4f}")
            print(f"  Recall: {results['recall']:.4f}")
            print(f"  F1-Score: {results['f1_score']:.4f}")
            print("  Confusion Matrix:")
            unique_labels = np.unique(y_test)
            if len(unique_labels) == 2:
                cm = results['confusion_matrix']
                print(f"    True Negative (Actual: {unique_labels[0]}, Pred: {unique_labels[0]}): {cm[0, 0]}")
                print(f"    False Positive (Actual: {unique_labels[0]}, Pred: {unique_labels[1]}): {cm[0, 1]}")
                print(f"    False Negative (Actual: {unique_labels[1]}, Pred: {unique_labels[0]}): {cm[1, 0]}")
                print(f"    True Positive (Actual: {unique_labels[1]}, Pred: {unique_labels[1]}): {cm[1, 1]}")
            else:
                print(results['confusion_matrix'])
        else:
            print("  Metrics not available due to training error.")


# Save the entire model_results dictionary for later analysis
try:
    joblib.dump(model_results, MODEL_RESULTS_PATH)
    print(f"\nAll model results saved to: {MODEL_RESULTS_PATH}")
except Exception as e:
    print(f"Error saving all model results: {e}")

Data split verified for parallel model training.

--- Parallel Model Training and Evaluation (Including Ensembles) ---
Attempting to train 1 models in parallel using 16 cores via joblib...


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   1 tasks      | elapsed:  1.5min


Skipping results for Voting Classifier (Soft) due to training error.

All models trained and evaluated in parallel! 🎉

--- Reviewing All Model Results ---
No models were successfully trained or evaluated.

All model results saved to: feature_extraction_models\all_model_results.joblib


##### Hyper Parameter Tuning

In [11]:
import pandas as pd
import numpy as np
import joblib
import os

from sklearn.model_selection import GridSearchCV, StratifiedKFold # Added StratifiedKFold for cross-validation
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# --- IMPORTANT ASSUMPTION ---
# This code block assumes that X_train, X_test, y_train, y_test are already defined
# and available from previous execution cells (the data splitting step).
# It also assumes 'model_results' from the previous model selection step is available
# to help identify top performing models.

# Define paths (consistent with your previous steps)
FEATURE_EXTRACTION_DIR = 'feature_extraction_models'
os.makedirs(FEATURE_EXTRACTION_DIR, exist_ok=True)
TUNED_MODELS_DIR = os.path.join(FEATURE_EXTRACTION_DIR, 'tuned_models')
os.makedirs(TUNED_MODELS_DIR, exist_ok=True)


# If model_results is not available, you might want to load it from where it was saved.
# Example:
# try:
#     MODEL_RESULTS_PATH = os.path.join(FEATURE_EXTRACTION_DIR, 'all_model_results.joblib')
#     model_results = joblib.load(MODEL_RESULTS_PATH)
#     print(f"Loaded previous model results from {MODEL_RESULTS_PATH}")
# except FileNotFoundError:
#     print("Previous model results not found. Please run model selection step first.")
#     # Or re-run the model selection step if you need to proceed

# -----------------------------------------------------------------------------
## Hyperparameter Tuning with GridSearchCV ⚙️
# -----------------------------------------------------------------------------
print("\n--- Hyperparameter Tuning with GridSearchCV ---")

# Select a few top-performing models for tuning based on previous F1-scores
# It's recommended to choose 2-3 models that showed promising results.
# For demonstration, let's select Logistic Regression and Linear SVM as common strong performers for text.
# You can add or change these based on the actual results from the previous step.
models_to_tune = {
    'Logistic Regression': LogisticRegression(random_state=42, solver='liblinear', max_iter=1000),
    'Linear SVM': LinearSVC(random_state=42, dual=False, max_iter=1000),
    # 'Random Forest': RandomForestClassifier(random_state=42), # Can also be tuned, but takes longer
    # 'Multinomial Naive Bayes': MultinomialNB(), # Often less parameters to tune
}

# Define hyperparameter grids for each selected model
param_grids = {
    'Logistic Regression': {
        'C': [0.1, 1, 10, 100],  # Inverse of regularization strength
        'penalty': ['l1', 'l2']  # Regularization type
    },
    'Linear SVM': {
        'C': [0.1, 1, 10],      # Regularization parameter
        # 'loss': ['hinge', 'squared_hinge'] # Can also be tuned, but default is usually good
    },
    # 'Random Forest': {
    #     'n_estimators': [50, 100, 200], # Number of trees in the forest
    #     'max_depth': [10, 20, None],   # Maximum depth of the tree
    #     'min_samples_split': [2, 5]    # Minimum number of samples required to split an internal node
    # }
}

tuned_model_results = {}
# Using StratifiedKFold for robust cross-validation, especially important for imbalanced data
cv_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models_to_tune.items():
    if name in param_grids:
        print(f"\n--- Tuning {name} ---")
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grids[name],
            cv=cv_folds, # Use stratified cross-validation
            scoring='f1', # Optimize for F1-score (good balance of precision/recall)
            verbose=2, # Print detailed progress
            n_jobs=-1 # Use all available CPU cores for grid search
        )

        # Fit GridSearchCV on the training data
        grid_search.fit(X_train, y_train)

        # Get the best estimator found by GridSearchCV
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        best_score = grid_search.best_score_

        print(f"\nBest parameters for {name}: {best_params}")
        print(f"Best F1-score on training data (cross-validation): {best_score:.4f}")

        # Evaluate the best model on the unseen test set
        y_pred = best_model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
        recall = recall_score(y_test, y_pred, pos_label=1, zero_division=0)
        f1 = f1_score(y_test, y_pred, pos_label=1, zero_division=0)
        cm = confusion_matrix(y_test, y_pred, labels=np.unique(y_test))

        print(f"\n{name} Performance on Test Set (after tuning):")
        print(f"  Accuracy: {accuracy:.4f}")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall: {recall:.4f}")
        print(f"  F1-Score: {f1:.4f}")
        print(f"  Confusion Matrix:\n{cm}")

        # Store results
        tuned_model_results[name] = {
            'model': best_model,
            'best_params': best_params,
            'cv_f1_score': best_score,
            'test_accuracy': accuracy,
            'test_precision': precision,
            'test_recall': recall,
            'test_f1_score': f1,
            'test_confusion_matrix': cm
        }

        # Save the best model
        model_filename = os.path.join(TUNED_MODELS_DIR, f'{name.lower().replace(" ", "_")}_tuned_model.joblib')
        try:
            joblib.dump(best_model, model_filename)
            print(f"Tuned {name} model saved to: {model_filename}")
        except Exception as e:
            print(f"Error saving tuned {name} model: {e}")

    else:
        print(f"No hyperparameter grid defined for {name}. Skipping tuning for this model.")

print("\nHyperparameter tuning complete! 🎉")

# -----------------------------------------------------------------------------
## Summary of Tuned Model Results
# -----------------------------------------------------------------------------
print("\n--- Summary of Tuned Model Results ---")
if not tuned_model_results:
    print("No models were tuned or evaluated.")
else:
    tuned_results_df = pd.DataFrame([
        {
            'Model': name,
            'Best Params': str(results['best_params']),
            'CV F1-Score (Train)': results['cv_f1_score'],
            'Test Accuracy': results['test_accuracy'],
            'Test Precision': results['test_precision'],
            'Test Recall': results['test_recall'],
            'Test F1-Score': results['test_f1_score']
        }
        for name, results in tuned_model_results.items()
    ])
    tuned_results_df = tuned_results_df.sort_values(by='Test F1-Score', ascending=False)
    print(tuned_results_df.to_string(index=False))

    # Save all tuned model results summary
    TUNED_RESULTS_SUMMARY_PATH = os.path.join(TUNED_MODELS_DIR, 'tuned_model_summary.joblib')
    try:
        joblib.dump(tuned_model_results, TUNED_RESULTS_SUMMARY_PATH)
        print(f"\nSummary of tuned model results saved to: {TUNED_RESULTS_SUMMARY_PATH}")
    except Exception as e:
        print(f"Error saving tuned model results summary: {e}")


--- Hyperparameter Tuning with GridSearchCV ---

--- Tuning Logistic Regression ---
Fitting 5 folds for each of 8 candidates, totalling 40 fits


KeyboardInterrupt: 