<small>

Author: Felipe Garaycochea Lozada   
Student Code: 22500219

</small>

# 📚 Libraries

In [615]:
import numpy as np
import pandas as pd
import unicodedata
import re
import pickle

from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report


# 🗂️ Dataset Load

In [616]:
# Load the raw dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("dataset/Dataset-Mental-Disorders.csv")
# Preview the first rows.
data.head()

Unnamed: 0,Patient Number,Sadness,Euphoric,Exhausted,Sleep dissorder,Mood Swing,Suicidal thoughts,Anorxia,Authority Respect,Try-Explanation,Aggressive Response,Ignore & Move-On,Nervous Break-down,Admit Mistakes,Overthinking,Sexual Activity,Concentration,Optimisim,Expert Diagnose
0,Patiant-01,Usually,Seldom,Sometimes,Sometimes,YES,YES,NO,NO,YES,NO,NO,YES,YES,YES,3 From 10,3 From 10,4 From 10,Bipolar Type-2
1,Patiant-02,Usually,Seldom,Usually,Sometimes,NO,YES,NO,NO,NO,NO,NO,NO,NO,NO,4 From 10,2 From 10,5 From 10,Depression
2,Patiant-03,Sometimes,Most-Often,Sometimes,Sometimes,YES,NO,NO,NO,YES,YES,NO,YES,YES,NO,6 From 10,5 From 10,7 From 10,Bipolar Type-1
3,Patiant-04,Usually,Seldom,Usually,Most-Often,YES,YES,YES,NO,YES,NO,NO,NO,NO,NO,3 From 10,2 From 10,2 From 10,Bipolar Type-2
4,Patiant-05,Usually,Usually,Sometimes,Sometimes,NO,NO,NO,NO,NO,NO,NO,YES,YES,YES,5 From 10,5 From 10,6 From 10,Normal


# 🧱 Pipeline Construction

## 🧹 Data Cleaning and Preprocessing Pipeline

### 🔧 Normalizing Column Names

In [617]:
def normalize_columns(d):
    """Return a copy of `d` whose column labels are lower-case snake_case.

    Each label is lower-cased, stripped of combining accent marks, has every
    run of non-word characters collapsed to a single underscore, and loses
    any leading/trailing underscores. The input frame is left untouched.
    """

    def _to_snake(label):
        # NFD decomposition separates base letters from their accent marks,
        # which are then filtered out by Unicode category ("Mn").
        decomposed = unicodedata.normalize("NFD", label.lower())
        without_marks = "".join(
            ch for ch in decomposed if unicodedata.category(ch) != "Mn"
        )
        return re.sub(r"[^\w]+", "_", without_marks).strip("_")

    renamed = d.copy()
    renamed.columns = list(map(_to_snake, renamed.columns))
    print("Normalized columns")
    return renamed


normalize_columns_transformer = FunctionTransformer(normalize_columns, validate=False)

In [618]:
# Stage 1: column-name normalization only.
pipeline_F1 = Pipeline(
    [
        ("normalize_col_names", normalize_columns_transformer),
    ]
)
pipeline_F1

In [619]:
data_F1 = pipeline_F1.fit_transform(data)
data_F1.columns

Normalized columns


Index(['patient_number', 'sadness', 'euphoric', 'exhausted', 'sleep_dissorder',
       'mood_swing', 'suicidal_thoughts', 'anorxia', 'authority_respect',
       'try_explanation', 'aggressive_response', 'ignore_move_on',
       'nervous_break_down', 'admit_mistakes', 'overthinking',
       'sexual_activity', 'concentration', 'optimisim', 'expert_diagnose'],
      dtype='object')

### 🗑️ Deleting irrelevant columns

In [620]:
def drop_patient_number_column(d):
    """Return `d` without its `patient_number` column (input is not modified)."""
    print("Column patient number dropped")
    remaining = d.drop(columns=["patient_number"])
    return remaining


drop_patient_number_transformer = FunctionTransformer(
    drop_patient_number_column, validate=False
)

In [621]:
# Stage 2: stage 1 plus removal of the `patient_number` column.
# The same transformer instances are reused by the later stage pipelines.
pipeline_F2 = Pipeline(
    steps=[
        ("normalize_columns", normalize_columns_transformer),
        ("drop_patient_number", drop_patient_number_transformer),
    ]
)
pipeline_F2

In [622]:
data_F2 = pipeline_F2.fit_transform(data)
data_F2.columns

Normalized columns
Column patient number dropped


Index(['sadness', 'euphoric', 'exhausted', 'sleep_dissorder', 'mood_swing',
       'suicidal_thoughts', 'anorxia', 'authority_respect', 'try_explanation',
       'aggressive_response', 'ignore_move_on', 'nervous_break_down',
       'admit_mistakes', 'overthinking', 'sexual_activity', 'concentration',
       'optimisim', 'expert_diagnose'],
      dtype='object')

### 🗑️ Deleting Duplicate Records

In [623]:
def drop_duplicate_rows(d):
    """Return `d` without fully duplicated rows.

    The number of duplicated rows is printed before and after the removal.
    A new frame is returned and the caller's frame is left untouched
    (the previous version dropped the rows in place on its argument).
    Surviving rows keep their original index labels.
    """
    print("Duplicate Records (before):", int(d.duplicated().sum()))
    deduplicated = d.drop_duplicates()
    print("Duplicate Records (after):", int(deduplicated.duplicated().sum()))
    return deduplicated


drop_duplicates_transformer = FunctionTransformer(drop_duplicate_rows, validate=False)

In [624]:
# Stage 3: stage 2 plus removal of duplicated rows.
pipeline_F3 = Pipeline(
    steps=[
        ("normalize_columns", normalize_columns_transformer),
        ("drop_patient_number", drop_patient_number_transformer),
        ("drop_duplicates", drop_duplicates_transformer),
    ]
)
pipeline_F3

In [625]:
data_F3 = pipeline_F3.fit_transform(data)

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0


### 🗑️ Deleting Null Data

In [626]:
def drop_na_rows(d):
    """Return `d` without rows that contain at least one null value.

    Prints how many rows were removed. The input frame is not modified and
    surviving rows keep their original index labels.
    """
    cleaned = d.dropna()
    rows_removed = len(d) - len(cleaned)
    # The message previously carried invisible zero-width-space characters
    # between "values" and "were"; they are removed here.
    print(f"{rows_removed} rows with null values were removed.")
    return cleaned


drop_na_transformer = FunctionTransformer(drop_na_rows, validate=False)

In [627]:
# Stage 4: stage 3 plus removal of rows containing null values.
pipeline_F4 = Pipeline(
    steps=[
        ("normalize_columns", normalize_columns_transformer),
        ("drop_patient_number", drop_patient_number_transformer),
        ("drop_duplicates", drop_duplicates_transformer),
        ("drop_na", drop_na_transformer),
    ]
)

In [628]:
data_F4 = pipeline_F4.fit_transform(data)

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.


### 🧹 Cleaning Suicidal Thoughts Column

In [629]:
def clean_suicidal_thoughts(d):
    """Return a copy of `d` with every space removed from `suicidal_thoughts`.

    The remaining columns and the column order are unchanged.
    """
    without_spaces = d["suicidal_thoughts"].str.replace(" ", "")
    return d.assign(suicidal_thoughts=without_spaces)


clean_suicidal_thoughts_transformer = FunctionTransformer(clean_suicidal_thoughts)

In [630]:
# Stage 5: stage 4 plus whitespace cleanup of the `suicidal_thoughts` column.
pipeline_F5 = Pipeline(
    [
        ("normalize_columns", normalize_columns_transformer),
        ("drop_patient_number", drop_patient_number_transformer),
        ("drop_duplicates", drop_duplicates_transformer),
        ("drop_na", drop_na_transformer),
        ("clean_suicidal_thoughts", clean_suicidal_thoughts_transformer),
    ]
)

In [631]:
data_F5 = pipeline_F5.fit_transform(data)
data_F5["suicidal_thoughts"].unique()

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.


array(['YES', 'NO'], dtype=object)

### 🔄 Converting Text to Numeric Values

In [632]:
def print_unique_values(data, cols):
    """Print the distinct values and the dtype of each column named in `cols`."""
    for name in cols:
        column = data[name]
        print(f"{name} → {column.unique()} | dtype: {column.dtype}")

In [633]:
# Frequency-style answers that are converted to an ordinal 1-4 scale.
ordinal_cat_cols = ["sadness", "euphoric", "exhausted", "sleep_dissorder"]

def map_ordinal_columns(d):
    """Return a copy of `d` with the `ordinal_cat_cols` columns mapped to 1-4.

    Each value is lower-cased, has hyphens turned into spaces and is stripped,
    so spelling variants of "most often" collapse to one key before the
    lookup. Values without an entry in the mapping become NaN.
    The resulting unique values are printed for inspection.
    """
    ordinal_map = {"seldom": 1, "sometimes": 2, "usually": 3, "most-often": 4}
    mapped = d.copy()

    for name in ordinal_cat_cols:
        normalized = mapped[name].str.lower().str.replace("-", " ").str.strip()
        canonical = normalized.replace({"most often": "most-often"})
        mapped[name] = canonical.map(ordinal_map)

    print_unique_values(mapped, ordinal_cat_cols)

    return mapped


ordinal_map_transformer = FunctionTransformer(map_ordinal_columns, validate=False)

In [634]:
# Stage 6: stage 5 plus the ordinal mapping of the frequency columns.
pipeline_F6 = Pipeline(
    steps=[
        ("normalize_columns", normalize_columns_transformer),
        ("drop_patient_number", drop_patient_number_transformer),
        ("drop_duplicates", drop_duplicates_transformer),
        ("drop_na", drop_na_transformer),
        ("clean_suicidal_thoughts", clean_suicidal_thoughts_transformer),
        ("ordinal_mapping", ordinal_map_transformer),
    ]
)
pipeline_F6

In [635]:
data_F6 = pipeline_F6.fit_transform(data)
data_F6[ordinal_cat_cols].head()

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.
sadness → [3 2 1 4] | dtype: int64
euphoric → [1 4 3 2] | dtype: int64
exhausted → [2 3 1 4] | dtype: int64
sleep_dissorder → [2 4 3 1] | dtype: int64


Unnamed: 0,sadness,euphoric,exhausted,sleep_dissorder
0,3,1,2,2
1,3,1,3,2
2,2,4,2,2
3,3,1,3,4
4,3,3,2,2


In [636]:
# Columns holding scores written as text such as "3 From 10".
ordinal_num_cols = ["sexual_activity", "concentration", "optimisim"]

def ordinal_score(d):
    """Return a copy of `d` with the `ordinal_num_cols` columns as integers.

    For string values the first whitespace-separated token is converted to
    int; non-string values are passed through unchanged. The resulting
    unique values are printed for inspection.
    """

    def _leading_int(value):
        if not isinstance(value, str):
            return value
        return int(value.split()[0])

    scored = d.copy()
    for name in ordinal_num_cols:
        scored[name] = scored[name].apply(_leading_int)

    print_unique_values(scored, ordinal_num_cols)

    return scored


ordinal_score_transformer = FunctionTransformer(ordinal_score, validate=False)

In [637]:
# Stage 7: complete cleaning pipeline (stage 6 plus numeric score extraction).
# Later stage pipelines embed this object as their "preprocessing" step.
pipeline_F7 = Pipeline(
    steps=[
        ("normalize_columns", normalize_columns_transformer),
        ("drop_patient_number", drop_patient_number_transformer),
        ("drop_duplicates", drop_duplicates_transformer),
        ("drop_na", drop_na_transformer),
        ("clean_suicidal_thoughts", clean_suicidal_thoughts_transformer),
        ("ordinal_mapping", ordinal_map_transformer),
        ("ordinal_scores", ordinal_score_transformer),
    ]
)
pipeline_F7

In [638]:
data_F7 = pipeline_F7.fit_transform(data)
data_F7[ordinal_num_cols].head()

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.
sadness → [3 2 1 4] | dtype: int64
euphoric → [1 4 3 2] | dtype: int64
exhausted → [2 3 1 4] | dtype: int64
sleep_dissorder → [2 4 3 1] | dtype: int64
sexual_activity → [3 4 6 5 7 8 9 2 1] | dtype: int64
concentration → [3 2 5 4 7 6 1 8] | dtype: int64
optimisim → [4 5 7 2 6 9 3 8 1] | dtype: int64


Unnamed: 0,sexual_activity,concentration,optimisim
0,3,3,4
1,4,2,5
2,6,5,7
3,3,2,2
4,5,5,6


## 🛠️ Feature Engineering Pipeline

### 🎯 Variable Selection

In [639]:
# `target` is read as a module-level global by functions defined further down
# (get_dummies_transform, select_features, drop_target_col, scaling_features).
target = "expert_diagnose"
# Split the cleaned frame into predictors and label.
# X and y are reassigned later in the model-training section.
X = data_F7.drop(columns=target)
y = data_F7[target]

### 🧩 One-Hot Encoding Transformation

In [640]:
def get_dummies_transform(d):
    """One-hot encode the predictor columns and re-attach the target column.

    Every column except the one named by the global `target` is passed to
    `pd.get_dummies` with `drop_first=True` and integer output; the target
    column is appended unchanged as the last column.

    NOTE(review): with `drop_first=True`, a column that shows only one
    category in the incoming data produces no dummy column at all, so the
    `<column>_YES` columns expected downstream may be missing for small
    inference batches — confirm before using this on new data.
    """
    frame = pd.DataFrame(d).copy()
    predictors = frame.drop(columns=target)
    encoded = pd.get_dummies(predictors, drop_first=True, dtype=int)
    return pd.concat([encoded, frame[target]], axis=1)


get_dummies_transformer = FunctionTransformer(get_dummies_transform)

In [641]:
# Stage 8: cleaning pipeline (pipeline_F7) followed by one-hot encoding.
pipeline_F8 = Pipeline(
    steps=[("preprocessing", pipeline_F7), ("onehot_encoding", get_dummies_transformer)]
)
pipeline_F8

In [642]:
data_F8 = pipeline_F8.fit_transform(data)
data_F8.head()

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.
sadness → [3 2 1 4] | dtype: int64
euphoric → [1 4 3 2] | dtype: int64
exhausted → [2 3 1 4] | dtype: int64
sleep_dissorder → [2 4 3 1] | dtype: int64
sexual_activity → [3 4 6 5 7 8 9 2 1] | dtype: int64
concentration → [3 2 5 4 7 6 1 8] | dtype: int64
optimisim → [4 5 7 2 6 9 3 8 1] | dtype: int64


Unnamed: 0,sadness,euphoric,exhausted,sleep_dissorder,sexual_activity,concentration,optimisim,mood_swing_YES,suicidal_thoughts_YES,anorxia_YES,authority_respect_YES,try_explanation_YES,aggressive_response_YES,ignore_move_on_YES,nervous_break_down_YES,admit_mistakes_YES,overthinking_YES,expert_diagnose
0,3,1,2,2,3,3,4,1,1,0,0,1,0,0,1,1,1,Bipolar Type-2
1,3,1,3,2,4,2,5,0,1,0,0,0,0,0,0,0,0,Depression
2,2,4,2,2,6,5,7,1,0,0,0,1,1,0,1,1,0,Bipolar Type-1
3,3,1,3,4,3,2,2,1,1,1,0,1,0,0,0,0,0,Bipolar Type-2
4,3,3,2,2,5,5,6,0,0,0,0,0,0,0,1,1,1,Normal


### 🧹 Drop Highly Correlated Features

In [643]:
def drop_high_corr_cols(d, threshold=0.9, target_col="expert_diagnose"):
    """Drop predictors that are highly correlated with an earlier predictor.

    Parameters
    ----------
    d : array-like or DataFrame
        Data containing the predictors and the `target_col` column.
    threshold : float
        A column is dropped when its absolute correlation with any column
        positioned before it is strictly greater than this value.
    target_col : str
        Column excluded from the correlation analysis and appended last.

    Returns
    -------
    DataFrame
        The surviving predictors followed by `target_col`.
    """
    frame = pd.DataFrame(d).copy()
    features = frame.drop(columns=target_col)

    abs_corr = features.corr().abs()

    # Only the strict upper triangle is kept so each pair is inspected once
    # and a column is never compared with itself.
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(above_diagonal)

    to_drop = []
    for name in upper.columns:
        if any(upper[name] > threshold):
            to_drop.append(name)

    print("Columns Eliminated for High Collinearity:", to_drop)

    kept = features.drop(columns=to_drop, errors="ignore")
    return pd.concat([kept, frame[target_col]], axis=1)


drop_high_corr_transformer = FunctionTransformer(drop_high_corr_cols)

In [644]:
# Stage 9: stage 8 plus removal of highly correlated predictors.
pipeline_F9 = Pipeline(
    steps=[
        ("preprocessing", pipeline_F7),
        ("onehot_encoding", get_dummies_transformer),
        ("drop_high_corr", drop_high_corr_transformer),
    ]
)
pipeline_F9

In [645]:
data_F9 = pipeline_F9.fit_transform(data)
data_F9.columns

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.
sadness → [3 2 1 4] | dtype: int64
euphoric → [1 4 3 2] | dtype: int64
exhausted → [2 3 1 4] | dtype: int64
sleep_dissorder → [2 4 3 1] | dtype: int64
sexual_activity → [3 4 6 5 7 8 9 2 1] | dtype: int64
concentration → [3 2 5 4 7 6 1 8] | dtype: int64
optimisim → [4 5 7 2 6 9 3 8 1] | dtype: int64
Columns Eliminated for High Collinearity: []


Index(['sadness', 'euphoric', 'exhausted', 'sleep_dissorder',
       'sexual_activity', 'concentration', 'optimisim', 'mood_swing_YES',
       'suicidal_thoughts_YES', 'anorxia_YES', 'authority_respect_YES',
       'try_explanation_YES', 'aggressive_response_YES', 'ignore_move_on_YES',
       'nervous_break_down_YES', 'admit_mistakes_YES', 'overthinking_YES',
       'expert_diagnose'],
      dtype='object')

### 🧹 Drop Zero Variance Features


In [646]:
def drop_zero_variance_cols(d):
    """Drop every column that has at most one distinct non-null value.

    The names of all removed columns are printed. This now includes columns
    with no non-null value at all, which were already dropped before but were
    missing from the printed list. The check applies to every column of `d`,
    so a target column with a single class is removed as well.

    Returns a new DataFrame with the remaining columns in their original order.
    """
    frame = pd.DataFrame(d).copy()
    # Count distinct values once and derive both the report and the result
    # from the same mask so they can never disagree.
    keep_mask = frame.nunique() > 1
    deleted = frame.columns[~keep_mask].tolist()
    print("Columns eliminated by zero variance:", deleted)
    return frame.loc[:, keep_mask]


drop_zero_var_transformer = FunctionTransformer(drop_zero_variance_cols)

In [647]:
# Stage 10: stage 9 plus removal of columns with a single distinct value.
pipeline_F10 = Pipeline(
    steps=[
        ("preprocessing", pipeline_F7),
        ("onehot_encoding", get_dummies_transformer),
        ("drop_high_corr", drop_high_corr_transformer),
        ("drop_zero_variance", drop_zero_var_transformer),
    ]
)
pipeline_F10

In [648]:
data_F10 = pipeline_F10.fit_transform(data)
data_F10.head()

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.
sadness → [3 2 1 4] | dtype: int64
euphoric → [1 4 3 2] | dtype: int64
exhausted → [2 3 1 4] | dtype: int64
sleep_dissorder → [2 4 3 1] | dtype: int64
sexual_activity → [3 4 6 5 7 8 9 2 1] | dtype: int64
concentration → [3 2 5 4 7 6 1 8] | dtype: int64
optimisim → [4 5 7 2 6 9 3 8 1] | dtype: int64
Columns Eliminated for High Collinearity: []
Columns eliminated by zero variance: []


Unnamed: 0,sadness,euphoric,exhausted,sleep_dissorder,sexual_activity,concentration,optimisim,mood_swing_YES,suicidal_thoughts_YES,anorxia_YES,authority_respect_YES,try_explanation_YES,aggressive_response_YES,ignore_move_on_YES,nervous_break_down_YES,admit_mistakes_YES,overthinking_YES,expert_diagnose
0,3,1,2,2,3,3,4,1,1,0,0,1,0,0,1,1,1,Bipolar Type-2
1,3,1,3,2,4,2,5,0,1,0,0,0,0,0,0,0,0,Depression
2,2,4,2,2,6,5,7,1,0,0,0,1,1,0,1,1,0,Bipolar Type-1
3,3,1,3,4,3,2,2,1,1,1,0,1,0,0,0,0,0,Bipolar Type-2
4,3,3,2,2,5,5,6,0,0,0,0,0,0,0,1,1,1,Normal


### 🌳 Feature Selection via Decision Tree

In [649]:
from sklearn.tree import DecisionTreeClassifier


def select_features(d, feature_list=None, n_features=11):
    """Keep a subset of predictor columns plus the target column.

    Parameters
    ----------
    d : DataFrame
        Data containing the predictors and the column named by the global
        `target`.
    feature_list : sequence of str, optional
        When given, exactly these columns are kept (in this order) and no
        model is fitted.
    n_features : int, default 11
        Number of top-ranked features to keep when `feature_list` is None.
        The default matches the value that used to be hard-coded.

    Returns
    -------
    DataFrame
        The selected predictor columns followed by the target column.
    """
    if feature_list is not None:
        print("Using predefined features:", feature_list)
        # list() lets callers pass a tuple or an Index as well as a list.
        return d[list(feature_list) + [target]]

    X = d.drop(columns=target)
    y = d[target]

    # Rank the predictors by the impurity-based importances of one decision
    # tree fitted on all rows of `d`.
    dtc = DecisionTreeClassifier(random_state=42)
    dtc.fit(X, y)

    dtc_importances = pd.Series(dtc.feature_importances_, index=X.columns)
    features = (
        dtc_importances.sort_values(ascending=False).head(n_features).index.tolist()
    )
    print("Best features according to DecisionTreeClassifier:", features)

    return d[features + [target]]


select_features_transformer = FunctionTransformer(
    select_features, kw_args={"feature_list": None}
)

In [650]:
# Stage 11: stage 10 plus tree-based feature selection.
pipeline_F11 = Pipeline(
    steps=[
        ("preprocessing", pipeline_F7),
        ("onehot_encoding", get_dummies_transformer),
        ("drop_high_corr", drop_high_corr_transformer),
        ("drop_zero_variance", drop_zero_var_transformer),
        ("select_features", select_features_transformer),
    ]
)
pipeline_F11

In [651]:
data_F11 = pipeline_F11.fit_transform(data)
data_F11.head()

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.
sadness → [3 2 1 4] | dtype: int64
euphoric → [1 4 3 2] | dtype: int64
exhausted → [2 3 1 4] | dtype: int64
sleep_dissorder → [2 4 3 1] | dtype: int64
sexual_activity → [3 4 6 5 7 8 9 2 1] | dtype: int64
concentration → [3 2 5 4 7 6 1 8] | dtype: int64
optimisim → [4 5 7 2 6 9 3 8 1] | dtype: int64
Columns Eliminated for High Collinearity: []
Columns eliminated by zero variance: []
Best features according to DecisionTreeClassifier: ['mood_swing_YES', 'sadness', 'sexual_activity', 'euphoric', 'optimisim', 'suicidal_thoughts_YES', 'exhausted', 'concentration', 'sleep_dissorder', 'anorxia_YES', 'nervous_break_down_YES']


Unnamed: 0,mood_swing_YES,sadness,sexual_activity,euphoric,optimisim,suicidal_thoughts_YES,exhausted,concentration,sleep_dissorder,anorxia_YES,nervous_break_down_YES,expert_diagnose
0,1,3,3,1,4,1,2,3,2,0,1,Bipolar Type-2
1,0,3,4,1,5,1,3,2,2,0,0,Depression
2,1,2,6,4,7,0,2,5,2,0,1,Bipolar Type-1
3,1,3,3,1,2,1,3,2,4,1,0,Bipolar Type-2
4,0,3,5,3,6,0,2,5,2,0,1,Normal


### 🎯 Drop Target Column


In [652]:
def drop_target_col(d):
    """Return `d` without the column named by the global `target`."""
    features_only = d.drop(columns=[target])
    return features_only


drop_target_col_transformer = FunctionTransformer(drop_target_col)

In [653]:
# Stage 12: stage 11 plus removal of the target column, so the output
# contains only the selected predictors.
pipeline_F12 = Pipeline(
    [
        ("preprocessing", pipeline_F7),
        ("onehot_encoding", get_dummies_transformer),
        ("drop_high_corr", drop_high_corr_transformer),
        ("drop_zero_variance", drop_zero_var_transformer),
        ("select_features", select_features_transformer),
        ("drop_target_value", drop_target_col_transformer),
    ]
)
pipeline_F12

In [654]:
data_F12 = pipeline_F12.fit_transform(data)
data_F12.columns

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.
sadness → [3 2 1 4] | dtype: int64
euphoric → [1 4 3 2] | dtype: int64
exhausted → [2 3 1 4] | dtype: int64
sleep_dissorder → [2 4 3 1] | dtype: int64
sexual_activity → [3 4 6 5 7 8 9 2 1] | dtype: int64
concentration → [3 2 5 4 7 6 1 8] | dtype: int64
optimisim → [4 5 7 2 6 9 3 8 1] | dtype: int64
Columns Eliminated for High Collinearity: []
Columns eliminated by zero variance: []
Best features according to DecisionTreeClassifier: ['mood_swing_YES', 'sadness', 'sexual_activity', 'euphoric', 'optimisim', 'suicidal_thoughts_YES', 'exhausted', 'concentration', 'sleep_dissorder', 'anorxia_YES', 'nervous_break_down_YES']


Index(['mood_swing_YES', 'sadness', 'sexual_activity', 'euphoric', 'optimisim',
       'suicidal_thoughts_YES', 'exhausted', 'concentration',
       'sleep_dissorder', 'anorxia_YES', 'nervous_break_down_YES'],
      dtype='object')

# 🔗 Assemble Final Pipelines

In this section, we assemble the final preprocessing pipelines. One pipeline is used to extract the training features (`X`) from the full dataset, while the other is designed to process new incoming data with the same structure, ensuring consistency during inference.


## 🔄 Pipeline to Process the Data (Extract Training Features `X`)

In [655]:
# Wrapper that exposes pipeline_F12 under a descriptive name.
data_processing_pipeline = Pipeline(
    [
        ("data_processing", pipeline_F12),
    ]
)
data_processing_pipeline

## 🆕 Pipeline to Process Data from a new Dataset (Same data structure)

<small>

📝 Technical note (future improvement):

The functions used in the `inference_pipeline` were mainly designed for exploratory data analysis (EDA), and they expect the target variable (`y`) to be present in the input.

This is not appropriate for real inference, because the transformations operate on the complete dataset (predictors + target) and are not prepared to run on new, unlabeled instances.

🔧 Recommended improvement:
- Create a new set of functions and a dedicated prediction pipeline that:

- Focuses exclusively on the predictor variables (X).

- Does not depend on the presence of the target variable (y).

- Accepts new unlabeled data and applies the necessary transformations before making the prediction.

</small>

In [656]:
# Fixed feature list for inference. It repeats the "Best features according to
# DecisionTreeClassifier" list printed by the training pipeline above.
best_features = [
    "mood_swing_YES",
    "sadness",
    "sexual_activity",
    "euphoric",
    "optimisim",
    "suicidal_thoughts_YES",
    "exhausted",
    "concentration",
    "sleep_dissorder",
    "anorxia_YES",
    "nervous_break_down_YES",
]

# Same select_features function, but with the feature list pinned so that no
# decision tree is fitted when the pipeline runs.
select_features_custom_list_transformer = FunctionTransformer(
    select_features, kw_args={"feature_list": best_features}
)

In [657]:
def scaling_features(d, scaler=None, target_col=None):
    """Standardize the predictor columns and re-attach the target column.

    Parameters
    ----------
    d : array-like or DataFrame
        Data containing the predictors and the target column.
    scaler : object with a ``transform`` method, optional
        An already fitted scaler (for example the one fitted at training
        time). When given, only ``scaler.transform`` is applied. When omitted,
        a new ``StandardScaler`` is fitted on `d` itself, which is the
        previous behaviour; in that case the output depends on the statistics
        of the incoming batch rather than on the training data.
    target_col : str, optional
        Name of the target column. Defaults to the global `target`.

    Returns
    -------
    DataFrame
        Scaled predictors followed by the target column, carrying the same
        index as the input.
    """
    frame = pd.DataFrame(d).copy()
    label = target if target_col is None else target_col
    features = frame.drop(columns=label)

    if scaler is None:
        scaled = StandardScaler().fit_transform(features)
    else:
        scaled = scaler.transform(features)

    # Reuse the input index: with a fresh RangeIndex the concat below aligns
    # on labels and yields NaN rows whenever the input index is not 0..n-1.
    scaled_frame = pd.DataFrame(
        scaled, columns=features.columns, index=features.index
    )
    return pd.concat([scaled_frame, frame[label]], axis=1)

scaling_features_transformer = FunctionTransformer(scaling_features, feature_names_out='one-to-one')

In [658]:
# Pipeline for data with the same structure as the training file.
# It has no duplicate/null removal steps, and it uses the pinned feature list
# instead of fitting a decision tree.
# The one-hot, feature-selection and scaling steps all read the column named
# by `target`, so the input must still contain that column; it is removed
# only by the last step.
inference_pipeline = Pipeline(
    [
        ("normalize_columns", normalize_columns_transformer),
        ("drop_patient_number_column", drop_patient_number_transformer),
        ("clean_suicidal_thoughts", clean_suicidal_thoughts_transformer),
        ("ordinal_mapping", ordinal_map_transformer),
        ("ordinal_scores", ordinal_score_transformer),
        ("onehot_encoding", get_dummies_transformer),
        ("select_features_custom_list", select_features_custom_list_transformer),
        ("scaling_features", scaling_features_transformer),
        ("drop_target_value", drop_target_col_transformer),
    ]
)

inference_pipeline

### 💾 Save Pipeline for new data

In [678]:
# Persist the inference pipeline to disk.
# NOTE(review): the pipeline steps wrap functions defined in this notebook.
# pickle normally stores functions by reference (module and name), so loading
# this file presumably requires the same function definitions to be available
# in the loading process — confirm in the consuming environment.
with open("inference_pipeline.pkl", "wb") as f:
    pickle.dump(inference_pipeline, f)

# 🤖 Model Training (Using `data_processing_pipeline`)

## 🎯 Variable Selection

In [659]:
X = data_processing_pipeline.fit_transform(data)
# The pipeline can drop rows (duplicates, nulls) but keeps the original index
# labels, so pick the labels for exactly the rows that survived. Reading the
# whole raw column would leave X and y with different lengths as soon as a
# single row is removed.
y = data.loc[X.index, "Expert Diagnose"]

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.
sadness → [3 2 1 4] | dtype: int64
euphoric → [1 4 3 2] | dtype: int64
exhausted → [2 3 1 4] | dtype: int64
sleep_dissorder → [2 4 3 1] | dtype: int64
sexual_activity → [3 4 6 5 7 8 9 2 1] | dtype: int64
concentration → [3 2 5 4 7 6 1 8] | dtype: int64
optimisim → [4 5 7 2 6 9 3 8 1] | dtype: int64
Columns Eliminated for High Collinearity: []
Columns eliminated by zero variance: []
Best features according to DecisionTreeClassifier: ['mood_swing_YES', 'sadness', 'sexual_activity', 'euphoric', 'optimisim', 'suicidal_thoughts_YES', 'exhausted', 'concentration', 'sleep_dissorder', 'anorxia_YES', 'nervous_break_down_YES']


In [660]:
X.head()

Unnamed: 0,mood_swing_YES,sadness,sexual_activity,euphoric,optimisim,suicidal_thoughts_YES,exhausted,concentration,sleep_dissorder,anorxia_YES,nervous_break_down_YES
0,1,3,3,1,4,1,2,3,2,0,1
1,0,3,4,1,5,1,3,2,2,0,0
2,1,2,6,4,7,0,2,5,2,0,1
3,1,3,3,1,2,1,3,2,4,1,0
4,0,3,5,3,6,0,2,5,2,0,1


## 🧪 Train SVC with Scaled Features

In [661]:
# Standardize the selected features. This fitted scaler is saved together
# with the model in the next section.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Linear-kernel SVC with balanced class weights.
# NOTE(review): the origin of C=1.178769e-2 is not shown in this notebook —
# presumably the result of a hyperparameter search; confirm and document.
clf_svc_model = SVC(
    C=1.178769e-2, random_state=42, kernel="linear", class_weight="balanced"
)
clf_svc_model.fit(X_scaled, y)

## 💾 Save Trained SVC Model

In [662]:
# Store the classifier and its fitted scaler together as a (model, scaler)
# tuple, so whoever loads the file must unpack two objects.
with open("clf_svc_model.pkl", "wb") as f:
    pickle.dump((clf_svc_model, scaler), f)

print("✅ Trained SVC model and scaler saved as 'clf_svc_model.pkl'")

✅ Trained SVC model and scaler saved as 'clf_svc_model.pkl'


# 🧪 Testing `inference_pipeline`

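To verify that the `inference_pipeline` is correctly built, we apply it to the original dataset. This simulates how new data would be processed, even though the rows are the same ones used for training. The scores below therefore only confirm that the pipeline and the model are wired together correctly; they are not an estimate of performance on unseen data.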


In [666]:
# Run the inference pipeline on the training file as a smoke test.
# NOTE(review): scaling_features fits a new StandardScaler on whatever data it
# receives, so the values match the training-time scaling here only because
# the same rows are used.
simulated_new_x_to_predict = inference_pipeline.fit_transform(data)
simulated_new_x_to_predict.head()


Normalized columns
Column patient number dropped
sadness → [3 2 1 4] | dtype: int64
euphoric → [1 4 3 2] | dtype: int64
exhausted → [2 3 1 4] | dtype: int64
sleep_dissorder → [2 4 3 1] | dtype: int64
sexual_activity → [3 4 6 5 7 8 9 2 1] | dtype: int64
concentration → [3 2 5 4 7 6 1 8] | dtype: int64
optimisim → [4 5 7 2 6 9 3 8 1] | dtype: int64
Using predefined features: ['mood_swing_YES', 'sadness', 'sexual_activity', 'euphoric', 'optimisim', 'suicidal_thoughts_YES', 'exhausted', 'concentration', 'sleep_dissorder', 'anorxia_YES', 'nervous_break_down_YES']


Unnamed: 0,mood_swing_YES,sadness,sexual_activity,euphoric,optimisim,suicidal_thoughts_YES,exhausted,concentration,sleep_dissorder,anorxia_YES,nervous_break_down_YES
0,1.051315,0.488813,-0.869935,-1.014999,-0.235336,1.051315,-0.62337,-0.698317,-0.47067,-0.78843,0.967204
1,-0.95119,0.488813,-0.370451,-1.014999,0.268955,1.051315,0.360898,-1.25697,-0.47067,-0.78843,-1.033908
2,1.051315,-0.597438,0.628518,2.247498,1.277536,-0.95119,-0.62337,0.41899,-0.47067,-0.78843,0.967204
3,1.051315,0.488813,-0.869935,-1.014999,-1.243917,1.051315,0.360898,-1.25697,1.583164,1.268344,-1.033908
4,-0.95119,0.488813,0.129033,1.159999,0.773246,-0.95119,-0.62337,0.41899,-0.47067,-0.78843,0.967204


In [668]:
# Predict on the pipeline output and compare with the known labels.
# These are the same rows the model was fitted on, so the report describes
# the fit on the training data, not generalization.
y_pred = clf_svc_model.predict(simulated_new_x_to_predict)
print(classification_report(y, y_pred))

                precision    recall  f1-score   support

Bipolar Type-1       0.92      0.82      0.87        28
Bipolar Type-2       0.97      0.97      0.97        31
    Depression       0.97      0.90      0.93        31
        Normal       0.83      0.97      0.89        30

      accuracy                           0.92       120
     macro avg       0.92      0.91      0.92       120
  weighted avg       0.92      0.92      0.92       120



In [677]:
# Pair every true label with its prediction.
results_df = pd.DataFrame({"y_true": y, "y_pred": y_pred})

# Attach the unscaled feature values. X keeps the index labels of `data`, so
# joining on those labels pairs each row with its own features; resetting the
# index first would pair rows by position and mismatch them whenever the
# cleaning steps have removed a row.
results_df = results_df.join(X[best_features])

# Show the first correctly classified record.
correct_preds = results_df[results_df["y_true"] == results_df["y_pred"]]
correct_preds.iloc[0]

y_true                    Bipolar Type-2
y_pred                    Bipolar Type-2
mood_swing_YES                         1
sadness                                3
sexual_activity                        3
euphoric                               1
optimisim                              4
suicidal_thoughts_YES                  1
exhausted                              2
concentration                          3
sleep_dissorder                        2
anorxia_YES                            0
nervous_break_down_YES                 1
Name: 0, dtype: object