<small>

Author: Felipe Garaycochea Lozada   
Student Code: 22500219

</small>

# 📚 Libraries

In [455]:
import numpy as np
import pandas as pd
import unicodedata
import re

from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin


# 🗂️ Dataset Load

In [456]:
data = pd.read_csv("dataset/Dataset-Mental-Disorders.csv")
data.head()

Unnamed: 0,Patient Number,Sadness,Euphoric,Exhausted,Sleep dissorder,Mood Swing,Suicidal thoughts,Anorxia,Authority Respect,Try-Explanation,Aggressive Response,Ignore & Move-On,Nervous Break-down,Admit Mistakes,Overthinking,Sexual Activity,Concentration,Optimisim,Expert Diagnose
0,Patiant-01,Usually,Seldom,Sometimes,Sometimes,YES,YES,NO,NO,YES,NO,NO,YES,YES,YES,3 From 10,3 From 10,4 From 10,Bipolar Type-2
1,Patiant-02,Usually,Seldom,Usually,Sometimes,NO,YES,NO,NO,NO,NO,NO,NO,NO,NO,4 From 10,2 From 10,5 From 10,Depression
2,Patiant-03,Sometimes,Most-Often,Sometimes,Sometimes,YES,NO,NO,NO,YES,YES,NO,YES,YES,NO,6 From 10,5 From 10,7 From 10,Bipolar Type-1
3,Patiant-04,Usually,Seldom,Usually,Most-Often,YES,YES,YES,NO,YES,NO,NO,NO,NO,NO,3 From 10,2 From 10,2 From 10,Bipolar Type-2
4,Patiant-05,Usually,Usually,Sometimes,Sometimes,NO,NO,NO,NO,NO,NO,NO,YES,YES,YES,5 From 10,5 From 10,6 From 10,Normal


# Pipeline construction

## 🧹 Data Cleaning and Preprocessing

### 🔧 Normalizing Column Names

In [457]:
class NormalizeColumnNames(BaseEstimator, TransformerMixin):
    """Stateless transformer that rewrites DataFrame column labels as snake_case.

    ``fit`` learns nothing; ``transform`` returns a copy of the input whose
    column labels have been passed through :meth:`normalize`.
    """

    def fit(self, d, y=None):
        """Nothing to learn: return the transformer itself."""
        return self

    def transform(self, d):
        """Return a copy of ``d`` with every column label normalized."""
        renamed = d.copy()
        renamed.columns = list(map(self.normalize, renamed.columns))
        return renamed

    def normalize(self, col):
        """Turn one column label into a lower-case, accent-free, underscore-separated name."""
        # NFD splits accented letters into base letter + combining mark ('Mn'),
        # so dropping the marks leaves the plain letter behind.
        decomposed = unicodedata.normalize('NFD', col.lower())
        without_marks = ''.join(
            ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
        )
        # Every run of non-word characters becomes one underscore; underscores
        # left at either end are trimmed.
        return re.sub(r'[^\w]+', '_', without_marks).strip('_')


In [458]:
def normalize_columns(d):
    """Return a copy of ``d`` with snake_case, accent-free column labels.

    Each label is converted to text, lower-cased, stripped of combining accent
    marks, has every run of non-word characters replaced by a single
    underscore, and has leading/trailing underscores removed.

    Labels that are not strings (for example integer labels) are converted
    with ``str`` first instead of raising ``AttributeError``.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame whose column labels should be normalized. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``d`` with the normalized labels.
    """
    def normalize_name(col):
        # NFD splits accented letters into base letter + combining mark ('Mn'),
        # so filtering the marks out leaves the plain letter.
        text = unicodedata.normalize('NFD', str(col).lower())
        text = ''.join(c for c in text if unicodedata.category(c) != 'Mn')
        return re.sub(r'[^\w]+', '_', text).strip('_')

    d = d.copy()
    d.columns = [normalize_name(c) for c in d.columns]
    print("Normalized columns")
    return d

normalize_columns_transformer  = FunctionTransformer(normalize_columns, validate=False)


In [459]:
pipeline_F1 = Pipeline([
    ('normalize_col_names', normalize_columns_transformer),
])
pipeline_F1

In [460]:
data_F1 = pipeline_F1.fit_transform(data)
data_F1.columns

Normalized columns


Index(['patient_number', 'sadness', 'euphoric', 'exhausted', 'sleep_dissorder',
       'mood_swing', 'suicidal_thoughts', 'anorxia', 'authority_respect',
       'try_explanation', 'aggressive_response', 'ignore_move_on',
       'nervous_break_down', 'admit_mistakes', 'overthinking',
       'sexual_activity', 'concentration', 'optimisim', 'expert_diagnose'],
      dtype='object')

### 🗑️ Deleting irrelevant columns

In [461]:
def drop_patient_number_column(d):
    """Return ``d`` without its ``patient_number`` column.

    The input frame is left untouched; a ``KeyError`` is raised by pandas when
    the column is not present.
    """
    # The message is emitted before the drop is attempted.
    print("Column patient number dropped")
    without_identifier = d.drop(labels='patient_number', axis='columns')
    return without_identifier
drop_patient_number_transformer  = FunctionTransformer(drop_patient_number_column, validate=False)

In [462]:
pipeline_F2 = Pipeline(steps=[
    ('normalize_columns', normalize_columns_transformer),
    ('drop_patient_number', drop_patient_number_transformer ),
])
pipeline_F2

In [463]:
data_F2 = pipeline_F2.fit_transform(data)
data_F2.columns

Normalized columns
Column patient number dropped


Index(['sadness', 'euphoric', 'exhausted', 'sleep_dissorder', 'mood_swing',
       'suicidal_thoughts', 'anorxia', 'authority_respect', 'try_explanation',
       'aggressive_response', 'ignore_move_on', 'nervous_break_down',
       'admit_mistakes', 'overthinking', 'sexual_activity', 'concentration',
       'optimisim', 'expert_diagnose'],
      dtype='object')

In [464]:
def drop_duplicate_rows(d):
    """Return ``d`` without fully duplicated rows, reporting the counts.

    The first occurrence of each duplicated row is kept and the original
    index labels are preserved. The input frame is NOT modified: the previous
    ``drop_duplicates(inplace=True)`` silently changed the caller's frame,
    which makes re-running a notebook cell non-idempotent.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame to de-duplicate.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the first occurrence of every row.
    """
    print("Duplicate Records (before):", int(d.duplicated().sum()))
    deduplicated = d.drop_duplicates()
    print("Duplicate Records (after):", int(deduplicated.duplicated().sum()))
    return deduplicated

drop_duplicates_transformer = FunctionTransformer(drop_duplicate_rows, validate=False)

In [465]:
pipeline_F3 = Pipeline(steps=[
    ('normalize_columns', normalize_columns_transformer),
    ('drop_patient_number', drop_patient_number_transformer),
    ('drop_duplicates', drop_duplicates_transformer),
])
pipeline_F3

In [466]:
data_F3 = pipeline_F3.fit_transform(data)

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0


In [467]:
def drop_na_rows(d):
    """Return ``d`` without rows that contain at least one null value.

    Prints how many rows were removed. The printed message previously carried
    invisible zero-width characters between "values" and "were" (they also
    show up in the captured cell output), which breaks text search on the
    log; the message is now plain text.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame to filter. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows of ``d`` that have no null values, with their original index.
    """
    original_rows = len(d)
    d = d.dropna()
    rows_removed = original_rows - len(d)
    print(f"{rows_removed} rows with null values were removed.")
    return d
  
drop_na_transformer = FunctionTransformer(drop_na_rows, validate=False)

In [468]:
pipeline_F4 = Pipeline(steps=[
    ('normalize_columns', normalize_columns_transformer),
    ('drop_patient_number', drop_patient_number_transformer),
    ('drop_duplicates', drop_duplicates_transformer),
    ('drop_na', drop_na_transformer),
])

In [469]:
data_F4 = pipeline_F4.fit_transform(data)

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.


In [470]:
def clean_suicidal_thoughts(d):
    """Return a copy of ``d`` whose ``suicidal_thoughts`` values contain no spaces.

    Every space character is removed from the string values of that column;
    the input frame is left unchanged.
    """
    answers_without_spaces = d['suicidal_thoughts'].str.replace(" ", "")
    # assign() builds a new frame and keeps the column in its original position.
    return d.assign(suicidal_thoughts=answers_without_spaces)

clean_suicidal_thoughts_transformer = FunctionTransformer(clean_suicidal_thoughts)


In [471]:
pipeline_F5 = Pipeline([
    ('normalize_columns', normalize_columns_transformer),
    ('drop_patient_number', drop_patient_number_transformer),
    ('drop_duplicates', drop_duplicates_transformer),
    ('drop_na', drop_na_transformer),
    ('clean_suicidal_thoughts', clean_suicidal_thoughts_transformer),
])

In [472]:
data_F5 = pipeline_F5.fit_transform(data)
data_F5["suicidal_thoughts"].unique()

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.


array(['YES', 'NO'], dtype=object)

### 🔄 Converting Text to Numeric Values

In [473]:
def print_unique_values(data, cols):
    """Print, for each column named in ``cols``, its distinct values and dtype.

    One line is written per column, in the order given by ``cols``.
    Nothing is returned.
    """
    for name in cols:
        column = data[name]
        print(f"{name} → {column.unique()} | dtype: {column.dtype}")

In [474]:
def map_ordinal_columns(d):
    """Encode the four frequency columns as ordinal integers on a copy of ``d``.

    The text in ``sadness``, ``euphoric``, ``exhausted`` and
    ``sleep_dissorder`` is lower-cased and trimmed, then translated with
    seldom=1, sometimes=2, usually=3, most-often=4. Values outside that
    vocabulary become NaN (behaviour of ``Series.map``). The resulting unique
    values are printed through ``print_unique_values``.
    """
    frequency_rank = {
        'seldom': 1,
        'sometimes': 2,
        'usually': 3,
        'most-often': 4
    }
    ordinal_cat_cols = ['sadness', 'euphoric', 'exhausted', 'sleep_dissorder']

    encoded = d.copy()
    for name in ordinal_cat_cols:
        # Hyphens are turned into spaces before trimming, then the single
        # two-word label is restored to the hyphenated key used by the map.
        tidy = encoded[name].str.lower().str.replace('-', ' ').str.strip()
        encoded[name] = tidy.replace({'most often': 'most-often'}).map(frequency_rank)

    print_unique_values(encoded, ordinal_cat_cols)

    return encoded

ordinal_map_transformer = FunctionTransformer(map_ordinal_columns, validate=False)

In [475]:
pipeline_F6 = Pipeline(steps=[
    ('normalize_columns', normalize_columns_transformer),
    ('drop_patient_number', drop_patient_number_transformer),
    ('drop_duplicates', drop_duplicates_transformer),
    ('drop_na', drop_na_transformer),
    ('clean_suicidal_thoughts', clean_suicidal_thoughts_transformer),
    ('ordinal_mapping', ordinal_map_transformer),
])
pipeline_F6

In [476]:
data_F6 = pipeline_F6.fit_transform(data)

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.
sadness → [3 2 1 4] | dtype: int64
euphoric → [1 4 3 2] | dtype: int64
exhausted → [2 3 1 4] | dtype: int64
sleep_dissorder → [2 4 3 1] | dtype: int64


In [477]:
data_F6

Unnamed: 0,sadness,euphoric,exhausted,sleep_dissorder,mood_swing,suicidal_thoughts,anorxia,authority_respect,try_explanation,aggressive_response,ignore_move_on,nervous_break_down,admit_mistakes,overthinking,sexual_activity,concentration,optimisim,expert_diagnose
0,3,1,2,2,YES,YES,NO,NO,YES,NO,NO,YES,YES,YES,3 From 10,3 From 10,4 From 10,Bipolar Type-2
1,3,1,3,2,NO,YES,NO,NO,NO,NO,NO,NO,NO,NO,4 From 10,2 From 10,5 From 10,Depression
2,2,4,2,2,YES,NO,NO,NO,YES,YES,NO,YES,YES,NO,6 From 10,5 From 10,7 From 10,Bipolar Type-1
3,3,1,3,4,YES,YES,YES,NO,YES,NO,NO,NO,NO,NO,3 From 10,2 From 10,2 From 10,Bipolar Type-2
4,3,3,2,2,NO,NO,NO,NO,NO,NO,NO,YES,YES,YES,5 From 10,5 From 10,6 From 10,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,4,1,3,2,NO,YES,NO,NO,YES,NO,YES,NO,NO,YES,2 From 10,5 From 10,3 From 10,Depression
116,2,2,2,1,YES,NO,NO,NO,NO,YES,NO,NO,NO,YES,6 From 10,7 From 10,8 From 10,Bipolar Type-1
117,3,2,3,2,YES,NO,YES,YES,NO,NO,NO,YES,NO,YES,1 From 10,5 From 10,3 From 10,Bipolar Type-2
118,3,2,1,1,NO,YES,YES,NO,YES,YES,YES,NO,YES,YES,7 From 10,7 From 10,7 From 10,Depression


In [478]:
def ordinal_score(d):
    """Convert "<n> From 10" style answers into the integer ``n`` on a copy of ``d``.

    Applies to ``sexual_activity``, ``concentration`` and ``optimisim``.
    String cells are split on whitespace and their first token is parsed with
    ``int``; cells that are not strings are passed through unchanged. The
    resulting unique values are printed through ``print_unique_values``.
    """
    ordinal_num_cols = ["sexual_activity", "concentration", "optimisim"]

    def leading_number(cell):
        if not isinstance(cell, str):
            return cell
        first_token = cell.split()[0]
        return int(first_token)

    scored = d.copy()
    for name in ordinal_num_cols:
        scored[name] = scored[name].apply(leading_number)

    print_unique_values(scored, ordinal_num_cols)

    return scored

ordinal_score_transformer = FunctionTransformer(ordinal_score, validate=False)



In [479]:
pipeline_F7 = Pipeline(steps=[
    ('normalize_columns', normalize_columns_transformer),
    ('drop_patient_number', drop_patient_number_transformer),
    ('drop_duplicates', drop_duplicates_transformer),
    ('drop_na', drop_na_transformer),
    ('clean_suicidal_thoughts', clean_suicidal_thoughts_transformer),
    ('ordinal_mapping', ordinal_map_transformer),
    ('ordinal_scores', ordinal_score_transformer),
])
pipeline_F7

In [480]:
data_F7 = pipeline_F7.fit_transform(data)

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.
sadness → [3 2 1 4] | dtype: int64
euphoric → [1 4 3 2] | dtype: int64
exhausted → [2 3 1 4] | dtype: int64
sleep_dissorder → [2 4 3 1] | dtype: int64
sexual_activity → [3 4 6 5 7 8 9 2 1] | dtype: int64
concentration → [3 2 5 4 7 6 1 8] | dtype: int64
optimisim → [4 5 7 2 6 9 3 8 1] | dtype: int64


## 🛠️ Feature Engineering

In [481]:
# Name of the label column once the column names have been normalized.
target = "expert_diagnose"
# Split the cleaned frame into the label vector and the feature matrix.
y = data_F7[target]
X = data_F7.drop(columns=[target])

In [482]:
def get_dummies_transform(d, target_col='expert_diagnose'):
    """One-hot encode the feature columns of ``d`` and re-attach the label column.

    The label column name is now an explicit parameter (same name and default
    as in ``drop_high_corr_cols``) instead of being read from a notebook-level
    global, so the function no longer depends on which cells ran before it.

    Parameters
    ----------
    d : pandas.DataFrame or array-like
        Data holding the features and the label column.
    target_col : str, default 'expert_diagnose'
        Name of the label column; it is excluded from the encoding and placed
        last in the result.

    Returns
    -------
    pandas.DataFrame
        Encoded features (integer 0/1 dummies) followed by the label column.

    Notes
    -----
    ``drop_first=True`` removes the first category of every encoded column.
    A column that holds a single distinct value therefore produces no dummy
    column at all, so very small batches can lack a column such as
    ``mood_swing_YES``.
    """
    frame = pd.DataFrame(d).copy()
    features = pd.get_dummies(frame.drop(columns=target_col), drop_first=True, dtype=int)
    return pd.concat([features, frame[target_col]], axis=1)


get_dummies_transformer = FunctionTransformer(get_dummies_transform)


In [483]:
pipeline_F8 = Pipeline(steps=[
    ('preprocessing', pipeline_F7),
    ('onehot_encoding', get_dummies_transformer)

])
pipeline_F8

In [484]:
data_F8 = pipeline_F8.fit_transform(data)
data_F8.head()

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.
sadness → [3 2 1 4] | dtype: int64
euphoric → [1 4 3 2] | dtype: int64
exhausted → [2 3 1 4] | dtype: int64
sleep_dissorder → [2 4 3 1] | dtype: int64
sexual_activity → [3 4 6 5 7 8 9 2 1] | dtype: int64
concentration → [3 2 5 4 7 6 1 8] | dtype: int64
optimisim → [4 5 7 2 6 9 3 8 1] | dtype: int64


Unnamed: 0,sadness,euphoric,exhausted,sleep_dissorder,sexual_activity,concentration,optimisim,mood_swing_YES,suicidal_thoughts_YES,anorxia_YES,authority_respect_YES,try_explanation_YES,aggressive_response_YES,ignore_move_on_YES,nervous_break_down_YES,admit_mistakes_YES,overthinking_YES,expert_diagnose
0,3,1,2,2,3,3,4,1,1,0,0,1,0,0,1,1,1,Bipolar Type-2
1,3,1,3,2,4,2,5,0,1,0,0,0,0,0,0,0,0,Depression
2,2,4,2,2,6,5,7,1,0,0,0,1,1,0,1,1,0,Bipolar Type-1
3,3,1,3,4,3,2,2,1,1,1,0,1,0,0,0,0,0,Bipolar Type-2
4,3,3,2,2,5,5,6,0,0,0,0,0,0,0,1,1,1,Normal


In [485]:
def drop_high_corr_cols(d, threshold=0.9, target_col='expert_diagnose'):
    """Remove feature columns that are highly correlated with an earlier column.

    The absolute correlation matrix of the features (everything except
    ``target_col``) is computed. A column is removed when its absolute
    correlation with any column positioned before it exceeds ``threshold``.
    The removed names are printed and the label column is re-attached last.
    """
    frame = pd.DataFrame(d).copy()
    features = frame.drop(columns=target_col)

    abs_corr = features.corr().abs()

    # Mask selecting the strictly upper triangle, so every pair is looked at
    # once and the diagonal (self-correlation) is ignored.
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(above_diagonal)

    to_drop = []
    for name in upper.columns:
        if (upper[name] > threshold).any():
            to_drop.append(name)

    print("Columns Eliminated for High Collinearity:", to_drop)

    kept = features.drop(columns=to_drop, errors='ignore')
    return pd.concat([kept, frame[target_col]], axis=1)

drop_high_corr_transformer = FunctionTransformer(drop_high_corr_cols)



In [486]:
pipeline_F9 = Pipeline(steps=[
    ('preprocessing', pipeline_F7),
    ('onehot_encoding', get_dummies_transformer),
    ('drop_high_corr', drop_high_corr_transformer),

])
pipeline_F9

In [487]:
data_F9 = pipeline_F9.fit_transform(data)
data_F9.head()

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.
sadness → [3 2 1 4] | dtype: int64
euphoric → [1 4 3 2] | dtype: int64
exhausted → [2 3 1 4] | dtype: int64
sleep_dissorder → [2 4 3 1] | dtype: int64
sexual_activity → [3 4 6 5 7 8 9 2 1] | dtype: int64
concentration → [3 2 5 4 7 6 1 8] | dtype: int64
optimisim → [4 5 7 2 6 9 3 8 1] | dtype: int64
Columns Eliminated for High Collinearity: []


Unnamed: 0,sadness,euphoric,exhausted,sleep_dissorder,sexual_activity,concentration,optimisim,mood_swing_YES,suicidal_thoughts_YES,anorxia_YES,authority_respect_YES,try_explanation_YES,aggressive_response_YES,ignore_move_on_YES,nervous_break_down_YES,admit_mistakes_YES,overthinking_YES,expert_diagnose
0,3,1,2,2,3,3,4,1,1,0,0,1,0,0,1,1,1,Bipolar Type-2
1,3,1,3,2,4,2,5,0,1,0,0,0,0,0,0,0,0,Depression
2,2,4,2,2,6,5,7,1,0,0,0,1,1,0,1,1,0,Bipolar Type-1
3,3,1,3,4,3,2,2,1,1,1,0,1,0,0,0,0,0,Bipolar Type-2
4,3,3,2,2,5,5,6,0,0,0,0,0,0,0,1,1,1,Normal


In [488]:
def drop_zero_variance_cols(d):
    """Remove columns that hold at most one distinct non-null value.

    The previous version reported columns with exactly one distinct value but
    kept only columns with more than one, so an all-null column (zero distinct
    values) was removed without being listed. The report and the filter now
    use the same mask, which is computed once.

    The check covers every column of ``d``, including the label column.

    Parameters
    ----------
    d : pandas.DataFrame or array-like
        Data to filter. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The columns of ``d`` that have more than one distinct value.
    """
    frame = pd.DataFrame(d).copy()
    # nunique() ignores nulls, so an all-null column counts 0 distinct values.
    is_constant = frame.nunique() <= 1
    deleted = frame.columns[is_constant.to_numpy()].tolist()
    print("Columns eliminated by zero variance:", deleted)
    return frame.loc[:, ~is_constant]

drop_zero_var_transformer = FunctionTransformer(drop_zero_variance_cols)

In [489]:
pipeline_F10 = Pipeline(steps=[
    ('preprocessing', pipeline_F7),
    ('onehot_encoding', get_dummies_transformer),
    ('drop_high_corr', drop_high_corr_transformer),
    ('drop_zero_variance', drop_zero_var_transformer),
])
pipeline_F10

In [490]:
data_F10 = pipeline_F10.fit_transform(data)
data_F10.head()

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.
sadness → [3 2 1 4] | dtype: int64
euphoric → [1 4 3 2] | dtype: int64
exhausted → [2 3 1 4] | dtype: int64
sleep_dissorder → [2 4 3 1] | dtype: int64
sexual_activity → [3 4 6 5 7 8 9 2 1] | dtype: int64
concentration → [3 2 5 4 7 6 1 8] | dtype: int64
optimisim → [4 5 7 2 6 9 3 8 1] | dtype: int64
Columns Eliminated for High Collinearity: []
Columns eliminated by zero variance: []


Unnamed: 0,sadness,euphoric,exhausted,sleep_dissorder,sexual_activity,concentration,optimisim,mood_swing_YES,suicidal_thoughts_YES,anorxia_YES,authority_respect_YES,try_explanation_YES,aggressive_response_YES,ignore_move_on_YES,nervous_break_down_YES,admit_mistakes_YES,overthinking_YES,expert_diagnose
0,3,1,2,2,3,3,4,1,1,0,0,1,0,0,1,1,1,Bipolar Type-2
1,3,1,3,2,4,2,5,0,1,0,0,0,0,0,0,0,0,Depression
2,2,4,2,2,6,5,7,1,0,0,0,1,1,0,1,1,0,Bipolar Type-1
3,3,1,3,4,3,2,2,1,1,1,0,1,0,0,0,0,0,Bipolar Type-2
4,3,3,2,2,5,5,6,0,0,0,0,0,0,0,1,1,1,Normal


In [491]:
from sklearn.tree import DecisionTreeClassifier

def select_features(d, feature_list=None, n_features=11, target_col='expert_diagnose'):
    """Keep a subset of feature columns plus the label column.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame holding numeric features and the label column.
    feature_list : sequence of str, optional
        When given, exactly these columns are kept (in this order) and no
        model is fitted. Any sequence is accepted, not only a list.
    n_features : int, default 11
        Number of top-ranked features to keep when ``feature_list`` is None.
        Previously hard-coded as 11.
    target_col : str, default 'expert_diagnose'
        Name of the label column; previously read from a notebook-level
        global.

    Returns
    -------
    pandas.DataFrame
        The selected feature columns followed by the label column.

    Notes
    -----
    Without ``feature_list`` a ``DecisionTreeClassifier(random_state=42)`` is
    fitted on every row of ``d`` and the features are ranked by its
    ``feature_importances_``.
    """
    if feature_list is not None:
        chosen = list(feature_list)
        print("Using predefined features:", chosen)
        return d[chosen + [target_col]]

    X = d.drop(columns=target_col)
    y = d[target_col]

    dtc = DecisionTreeClassifier(random_state=42)
    dtc.fit(X, y)

    dtc_importances = pd.Series(dtc.feature_importances_, index=X.columns)
    features = dtc_importances.sort_values(ascending=False).head(n_features).index.tolist()
    print("Best features according to DecisionTreeClassifier:", features)

    return d[features + [target_col]]

  
select_features_transformer = FunctionTransformer(
    select_features,
    kw_args={'feature_list': None}
)



In [492]:
pipeline_F11 = Pipeline(steps=[
    ('preprocessing', pipeline_F7),
    ('onehot_encoding', get_dummies_transformer),
    ('drop_high_corr', drop_high_corr_transformer),
    ('drop_zero_variance', drop_zero_var_transformer),
    ('select_features', select_features_transformer),
])
pipeline_F11

In [493]:
data_F11 = pipeline_F11.fit_transform(data)
data_F11.head()

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.
sadness → [3 2 1 4] | dtype: int64
euphoric → [1 4 3 2] | dtype: int64
exhausted → [2 3 1 4] | dtype: int64
sleep_dissorder → [2 4 3 1] | dtype: int64
sexual_activity → [3 4 6 5 7 8 9 2 1] | dtype: int64
concentration → [3 2 5 4 7 6 1 8] | dtype: int64
optimisim → [4 5 7 2 6 9 3 8 1] | dtype: int64
Columns Eliminated for High Collinearity: []
Columns eliminated by zero variance: []
Best features according to DecisionTreeClassifier: ['mood_swing_YES', 'sadness', 'sexual_activity', 'euphoric', 'optimisim', 'suicidal_thoughts_YES', 'exhausted', 'concentration', 'sleep_dissorder', 'anorxia_YES', 'nervous_break_down_YES']


Unnamed: 0,mood_swing_YES,sadness,sexual_activity,euphoric,optimisim,suicidal_thoughts_YES,exhausted,concentration,sleep_dissorder,anorxia_YES,nervous_break_down_YES,expert_diagnose
0,1,3,3,1,4,1,2,3,2,0,1,Bipolar Type-2
1,0,3,4,1,5,1,3,2,2,0,0,Depression
2,1,2,6,4,7,0,2,5,2,0,1,Bipolar Type-1
3,1,3,3,1,2,1,3,2,4,1,0,Bipolar Type-2
4,0,3,5,3,6,0,2,5,2,0,1,Normal


## Scaling 

In [None]:
def scaling_features(d, target_col='expert_diagnose'):
    """Standardize the feature columns of ``d`` and re-attach the label column.

    Bug fix: the scaled frame used to be built with a fresh 0..n-1 index while
    the label column kept the index of ``d``. ``pd.concat(axis=1)`` aligns on
    index labels, so whenever earlier steps had removed rows (duplicates,
    nulls) the result contained misaligned rows filled with NaN. The scaled
    frame now reuses the index of the features.

    Parameters
    ----------
    d : pandas.DataFrame or array-like
        Data holding numeric features and the label column.
    target_col : str, default 'expert_diagnose'
        Name of the label column; it is left unscaled and placed last.

    Returns
    -------
    pandas.DataFrame
        Standardized features followed by the label column, indexed like ``d``.

    Notes
    -----
    A new ``StandardScaler`` is fitted on the data passed in on every call, so
    the mean and variance always come from that batch and are not kept.
    """
    frame = pd.DataFrame(d).copy()
    features = frame.drop(columns=target_col)
    labels = frame[target_col]
    scaler = StandardScaler()
    scaled = pd.DataFrame(
        scaler.fit_transform(features),
        columns=features.columns,
        index=features.index,
    )
    return pd.concat([scaled, labels], axis=1)

scaling_features_transformer = FunctionTransformer(scaling_features, feature_names_out='one-to-one')


In [495]:
# Training-time preprocessing pipeline. Its active steps are the same as those
# of pipeline_F11. The scaling step is commented out, so the frame it returns
# holds unscaled features plus the target column.
# NOTE(review): scaling is applied later in the notebook with a standalone
# StandardScaler before the SVC is fitted — confirm that this is the intended
# place for it.
pipeline_12 = Pipeline(steps=[
    ('preprocessing', pipeline_F7),
    ('onehot_encoding', get_dummies_transformer),
    ('drop_high_corr', drop_high_corr_transformer),
    ('drop_zero_variance', drop_zero_var_transformer),
    ('select_features', select_features_transformer),
    #('scaling', scaling_features_transformer)
])
pipeline_12

In [496]:
data_F12 = pipeline_12.fit_transform(data)
data_F12.head()

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.
sadness → [3 2 1 4] | dtype: int64
euphoric → [1 4 3 2] | dtype: int64
exhausted → [2 3 1 4] | dtype: int64
sleep_dissorder → [2 4 3 1] | dtype: int64
sexual_activity → [3 4 6 5 7 8 9 2 1] | dtype: int64
concentration → [3 2 5 4 7 6 1 8] | dtype: int64
optimisim → [4 5 7 2 6 9 3 8 1] | dtype: int64
Columns Eliminated for High Collinearity: []
Columns eliminated by zero variance: []
Best features according to DecisionTreeClassifier: ['mood_swing_YES', 'sadness', 'sexual_activity', 'euphoric', 'optimisim', 'suicidal_thoughts_YES', 'exhausted', 'concentration', 'sleep_dissorder', 'anorxia_YES', 'nervous_break_down_YES']


Unnamed: 0,mood_swing_YES,sadness,sexual_activity,euphoric,optimisim,suicidal_thoughts_YES,exhausted,concentration,sleep_dissorder,anorxia_YES,nervous_break_down_YES,expert_diagnose
0,1,3,3,1,4,1,2,3,2,0,1,Bipolar Type-2
1,0,3,4,1,5,1,3,2,2,0,0,Depression
2,1,2,6,4,7,0,2,5,2,0,1,Bipolar Type-1
3,1,3,3,1,2,1,3,2,4,1,0,Bipolar Type-2
4,0,3,5,3,6,0,2,5,2,0,1,Normal


In [497]:
def drop_target_col(d, target_col='expert_diagnose'):
    """Return ``d`` without its label column.

    The label column name is an explicit parameter (same name and default as
    in ``drop_high_corr_cols``) instead of a notebook-level global, so the
    function does not depend on which cells were executed before it.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame that contains the label column. It is not modified.
    target_col : str, default 'expert_diagnose'
        Name of the column to remove; pandas raises ``KeyError`` if absent.

    Returns
    -------
    pandas.DataFrame
        The feature columns only.
    """
    return d.drop(columns=target_col)

drop_target_col_transformer = FunctionTransformer(drop_target_col)

  

In [498]:
pipeline_data_processing = Pipeline(
    [
        ("pipeline_data_processing", pipeline_12),
        ("drop_target_value", drop_target_col_transformer),
    ]
)
pipeline_data_processing

In [None]:
X = pipeline_data_processing.fit_transform(data)
X.head()

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.
sadness → [3 2 1 4] | dtype: int64
euphoric → [1 4 3 2] | dtype: int64
exhausted → [2 3 1 4] | dtype: int64
sleep_dissorder → [2 4 3 1] | dtype: int64
sexual_activity → [3 4 6 5 7 8 9 2 1] | dtype: int64
concentration → [3 2 5 4 7 6 1 8] | dtype: int64
optimisim → [4 5 7 2 6 9 3 8 1] | dtype: int64
Columns Eliminated for High Collinearity: []
Columns eliminated by zero variance: []
Best features according to DecisionTreeClassifier: ['mood_swing_YES', 'sadness', 'sexual_activity', 'euphoric', 'optimisim', 'suicidal_thoughts_YES', 'exhausted', 'concentration', 'sleep_dissorder', 'anorxia_YES', 'nervous_break_down_YES']


Unnamed: 0,mood_swing_YES,sadness,sexual_activity,euphoric,optimisim,suicidal_thoughts_YES,exhausted,concentration,sleep_dissorder,anorxia_YES,nervous_break_down_YES
0,1,3,3,1,4,1,2,3,2,0,1
1,0,3,4,1,5,1,3,2,2,0,0
2,1,2,6,4,7,0,2,5,2,0,1
3,1,3,3,1,2,1,3,2,4,1,0
4,0,3,5,3,6,0,2,5,2,0,1


In [500]:
# Take the labels from the raw frame, restricted to the rows that survived
# preprocessing. The pipeline steps keep the original index labels, so
# selecting by X.index keeps y aligned with X row-for-row even when duplicate
# or incomplete records were dropped (the unrestricted column would then be
# longer than X).
y = data.loc[X.index, "Expert Diagnose"]
y.head()

0    Bipolar Type-2
1        Depression
2    Bipolar Type-1
3    Bipolar Type-2
4            Normal
Name: Expert Diagnose, dtype: object

In [501]:
X.head()

Unnamed: 0,mood_swing_YES,sadness,sexual_activity,euphoric,optimisim,suicidal_thoughts_YES,exhausted,concentration,sleep_dissorder,anorxia_YES,nervous_break_down_YES
0,1,3,3,1,4,1,2,3,2,0,1
1,0,3,4,1,5,1,3,2,2,0,0
2,1,2,6,4,7,0,2,5,2,0,1
3,1,3,3,1,2,1,3,2,4,1,0
4,0,3,5,3,6,0,2,5,2,0,1


In [502]:
# Fixed list of feature columns. Passing it to select_features makes that
# function return these columns directly instead of fitting a
# DecisionTreeClassifier to rank the features.
# NOTE(review): the names match the "Best features" printed by the training
# pipeline run above; presumably this list has to be updated by hand if that
# selection changes — confirm.
best_features = [
    "mood_swing_YES",
    "sadness",
    "sexual_activity",
    "euphoric",
    "optimisim",
    "suicidal_thoughts_YES",
    "exhausted",
    "concentration",
    "sleep_dissorder",
    "anorxia_YES",
    "nervous_break_down_YES",
]

# select_features bound to the fixed list above through kw_args.
select_features_custom_list_transformer = FunctionTransformer(
    select_features, kw_args={"feature_list": best_features}
)

## Pipeline for new values

In [503]:
# Preprocessing for records that are to be classified. Compared with
# pipeline_12 it has no drop_duplicates / drop_na steps, it skips the
# high-correlation and zero-variance filters, it takes its columns from
# best_features instead of fitting a tree, and it includes the scaling step.
# NOTE(review): the one-hot, feature-selection, scaling and drop-target steps
# all access the target column, so the input presumably still has to contain
# "Expert Diagnose" — confirm how unlabeled records are meant to be handled.
# NOTE(review): scaling_features fits a new StandardScaler on whatever frame
# it receives, so the statistics used here come from the data passed in, not
# from the scaler that is pickled together with the model — confirm.
pipeline_for_new_values = Pipeline([
  ('normalize_columns', normalize_columns_transformer),
  ('drop_patient_number_column', drop_patient_number_transformer),
  ('clean_suicidal_thoughts', clean_suicidal_thoughts_transformer),
  ('ordinal_mapping', ordinal_map_transformer),
  ('ordinal_scores', ordinal_score_transformer),
  ('onehot_encoding', get_dummies_transformer), 
  ('select_features_custom_list', select_features_custom_list_transformer),
  ('scaling_features', scaling_features_transformer),
  ('drop_target_value', drop_target_col_transformer)])

pipeline_for_new_values

In [None]:
import pickle
from sklearn.svm import SVC

# Standardize the selected features. The fitted scaler is kept because it is
# stored next to the model below.
scaler = StandardScaler() 
# X_scaled gets a fresh 0..n-1 index; fit() pairs X_scaled and y by position,
# so y must have the same length and row order as X.
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns) 

# NOTE(review): the origin of C=0.01178769 is not shown in this notebook —
# presumably the result of a hyper-parameter search; confirm and document.
clf_svc_model = SVC(C=0.01178769, random_state=42, kernel='linear', class_weight='balanced')
clf_svc_model.fit(X_scaled, y)

# Persist the model and its scaler together as one (model, scaler) tuple.
# Pickle files can execute arbitrary code when loaded: only load this file
# from a trusted location.
with open("clf_svc_model.pkl", "wb") as f:
    pickle.dump((clf_svc_model, scaler), f)

clf_svc_model


In [527]:
X_to_predict = pipeline_for_new_values.fit_transform(data)

Normalized columns
Column patient number dropped
sadness → [3 2 1 4] | dtype: int64
euphoric → [1 4 3 2] | dtype: int64
exhausted → [2 3 1 4] | dtype: int64
sleep_dissorder → [2 4 3 1] | dtype: int64
sexual_activity → [3 4 6 5 7 8 9 2 1] | dtype: int64
concentration → [3 2 5 4 7 6 1 8] | dtype: int64
optimisim → [4 5 7 2 6 9 3 8 1] | dtype: int64
Using predefined features: ['mood_swing_YES', 'sadness', 'sexual_activity', 'euphoric', 'optimisim', 'suicidal_thoughts_YES', 'exhausted', 'concentration', 'sleep_dissorder', 'anorxia_YES', 'nervous_break_down_YES']


In [None]:
# Predict on the frame produced by pipeline_for_new_values.
# NOTE(review): X_to_predict was built from the same `data` the model was
# trained on, so this is a sanity check rather than a held-out evaluation.
y_pred = clf_svc_model.predict(X_to_predict)

# Side-by-side table of true and predicted labels (indexed like y).
results_df = pd.DataFrame({
    "y_true": y,
    "y_pred": y_pred
})

# Attach the unscaled feature values for inspection.
# NOTE(review): join() aligns on index labels; X is re-indexed to 0..n-1 here
# while results_df keeps y's index, so rows only line up when y already has a
# 0..n-1 index — confirm if rows are ever dropped during preprocessing.
results_df = results_df.join(X[best_features].reset_index(drop=True))

# Keep the rows where the prediction matches the label and show the first one.
correct_preds = results_df[results_df["y_true"] == results_df["y_pred"]]
# depression_correct = correct_preds[correct_preds["y_true"] == "Depression"]
correct_preds.iloc[0]

y_true                    Bipolar Type-2
y_pred                    Bipolar Type-2
mood_swing_YES                         1
sadness                                3
sexual_activity                        3
euphoric                               1
optimisim                              4
suicidal_thoughts_YES                  1
exhausted                              2
concentration                          3
sleep_dissorder                        2
anorxia_YES                            0
nervous_break_down_YES                 1
Name: 0, dtype: object