<small>

Author: Felipe Garaycochea Lozada   
Student Code: 22500219

</small>

# 📚 Libraries

In [1]:
import numpy as np
import pandas as pd
import unicodedata
import re

from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin


# 🗂️ Dataset Load

In [2]:
# Load the raw survey data; the path is relative to the notebook's working directory.
data = pd.read_csv("dataset/Dataset-Mental-Disorders.csv")
# Preview the first rows to check the column names and raw value formats.
data.head()

Unnamed: 0,Patient Number,Sadness,Euphoric,Exhausted,Sleep dissorder,Mood Swing,Suicidal thoughts,Anorxia,Authority Respect,Try-Explanation,Aggressive Response,Ignore & Move-On,Nervous Break-down,Admit Mistakes,Overthinking,Sexual Activity,Concentration,Optimisim,Expert Diagnose
0,Patiant-01,Usually,Seldom,Sometimes,Sometimes,YES,YES,NO,NO,YES,NO,NO,YES,YES,YES,3 From 10,3 From 10,4 From 10,Bipolar Type-2
1,Patiant-02,Usually,Seldom,Usually,Sometimes,NO,YES,NO,NO,NO,NO,NO,NO,NO,NO,4 From 10,2 From 10,5 From 10,Depression
2,Patiant-03,Sometimes,Most-Often,Sometimes,Sometimes,YES,NO,NO,NO,YES,YES,NO,YES,YES,NO,6 From 10,5 From 10,7 From 10,Bipolar Type-1
3,Patiant-04,Usually,Seldom,Usually,Most-Often,YES,YES,YES,NO,YES,NO,NO,NO,NO,NO,3 From 10,2 From 10,2 From 10,Bipolar Type-2
4,Patiant-05,Usually,Usually,Sometimes,Sometimes,NO,NO,NO,NO,NO,NO,NO,YES,YES,YES,5 From 10,5 From 10,6 From 10,Normal


# Pipeline construction

## 🧹 Data Cleaning and Preprocessing

### 🔧 Normalizing Column Names

In [3]:
class NormalizeColumnNames(BaseEstimator, TransformerMixin):
    """Stateless transformer that rewrites DataFrame column labels to snake_case."""

    def fit(self, d, y=None):
        """Nothing is learned from the data; return the transformer unchanged."""
        return self

    def transform(self, d):
        """Return a copy of ``d`` whose column labels have been normalized."""
        renamed = d.copy()
        renamed.columns = list(map(self.normalize, renamed.columns))
        return renamed

    def normalize(self, col):
        """Lower-case ``col``, strip accents and collapse non-word runs into ``_``.

        Leading and trailing underscores are removed from the result.
        """
        # NFD splits accented characters into base letter + combining mark ('Mn'),
        # so filtering out 'Mn' removes the accents and keeps the letters.
        decomposed = unicodedata.normalize('NFD', col.lower())
        without_accents = ''.join(
            ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
        )
        return re.sub(r'[^\w]+', '_', without_accents).strip('_')


In [4]:
def normalize_columns(d):
    """Return a copy of ``d`` with snake_case, accent-free column labels.

    Each label is lower-cased, stripped of combining accent marks, has every
    run of non-word characters replaced by a single ``_`` and finally loses
    any leading/trailing underscores. Prints a confirmation line.
    """
    def normalize_name(col):
        # NFD separates base letters from their combining marks (category 'Mn').
        decomposed = unicodedata.normalize('NFD', col.lower())
        without_accents = ''.join(
            ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
        )
        return re.sub(r'[^\w]+', '_', without_accents).strip('_')

    renamed = d.copy()
    renamed.columns = list(map(normalize_name, renamed.columns))
    print("Normalized columns")
    return renamed

# Wrap normalize_columns so it can be used as a scikit-learn Pipeline step.
normalize_columns_transformer  = FunctionTransformer(normalize_columns, validate=False)


In [5]:
# Stage 1 pipeline: column-name normalization only.
pipeline_F1 = Pipeline([
    ('normalize_col_names', normalize_columns_transformer),
])
pipeline_F1

In [6]:
# Run stage 1 on the raw frame and inspect the normalized column labels.
data_F1 = pipeline_F1.fit_transform(data)
data_F1.columns

Normalized columns


Index(['patient_number', 'sadness', 'euphoric', 'exhausted', 'sleep_dissorder',
       'mood_swing', 'suicidal_thoughts', 'anorxia', 'authority_respect',
       'try_explanation', 'aggressive_response', 'ignore_move_on',
       'nervous_break_down', 'admit_mistakes', 'overthinking',
       'sexual_activity', 'concentration', 'optimisim', 'expert_diagnose'],
      dtype='object')

### 🗑️ Deleting irrelevant columns

In [7]:
def drop_patient_number_column(d):
    """Return ``d`` without its ``patient_number`` column.

    A confirmation line is printed before the drop is attempted; the input
    frame itself is left untouched because ``DataFrame.drop`` returns a new
    object.
    """
    print("Column patient number dropped")
    without_id = d.drop(labels='patient_number', axis='columns')
    return without_id
# Wrap drop_patient_number_column so it can be used as a Pipeline step.
drop_patient_number_transformer  = FunctionTransformer(drop_patient_number_column, validate=False)

In [8]:
# Stage 2 pipeline: normalize column names, then drop the patient identifier column.
pipeline_F2 = Pipeline(steps=[
    ('normalize_columns', normalize_columns_transformer),
    ('drop_patient_number', drop_patient_number_transformer ),
])
pipeline_F2

In [9]:
# Run stage 2 on the raw frame and confirm patient_number is gone.
data_F2 = pipeline_F2.fit_transform(data)
data_F2.columns

Normalized columns
Column patient number dropped


Index(['sadness', 'euphoric', 'exhausted', 'sleep_dissorder', 'mood_swing',
       'suicidal_thoughts', 'anorxia', 'authority_respect', 'try_explanation',
       'aggressive_response', 'ignore_move_on', 'nervous_break_down',
       'admit_mistakes', 'overthinking', 'sexual_activity', 'concentration',
       'optimisim', 'expert_diagnose'],
      dtype='object')

In [10]:
def drop_duplicate_rows(d):
    """Return ``d`` with fully duplicated rows removed (first occurrence kept).

    The number of duplicated rows is printed before and after the removal.
    The input frame is NOT modified: a new DataFrame is returned instead of
    dropping in place, so callers that still hold a reference to ``d`` keep
    seeing their original data.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame to de-duplicate.

    Returns
    -------
    pandas.DataFrame
        De-duplicated frame; surviving rows keep their original index labels.
    """
    print("Duplicate Records (before):", int(d.duplicated().sum()))
    # Non-inplace drop: avoids mutating the caller's object (and pandas'
    # SettingWithCopyWarning when ``d`` is a slice of another frame).
    deduplicated = d.drop_duplicates()
    print("Duplicate Records (after):", int(deduplicated.duplicated().sum()))
    return deduplicated

# Wrap drop_duplicate_rows so it can be used as a Pipeline step.
drop_duplicates_transformer = FunctionTransformer(drop_duplicate_rows, validate=False)

In [11]:
# Stage 3 pipeline: stage 2 plus removal of duplicated rows.
pipeline_F3 = Pipeline(steps=[
    ('normalize_columns', normalize_columns_transformer),
    ('drop_patient_number', drop_patient_number_transformer),
    ('drop_duplicates', drop_duplicates_transformer),
])
pipeline_F3

In [12]:
# Run stage 3 on the raw frame; the step prints the duplicate counts.
data_F3 = pipeline_F3.fit_transform(data)

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0


In [13]:
def drop_na_rows(d):
    """Return ``d`` without the rows that contain at least one null value.

    Prints how many rows were removed. The input frame is not modified.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame to filter.

    Returns
    -------
    pandas.DataFrame
        Rows of ``d`` that have no missing values.
    """
    original_rows = len(d)
    cleaned = d.dropna()
    rows_removed = original_rows - len(cleaned)
    # The message is plain ASCII: the previous literal carried invisible
    # zero-width characters between "values" and "were", which polluted logs.
    print(f"{rows_removed} rows with null values were removed.")
    return cleaned
  
# Wrap drop_na_rows so it can be used as a Pipeline step.
drop_na_transformer = FunctionTransformer(drop_na_rows, validate=False)

In [14]:
# Stage 4 pipeline: stage 3 plus removal of rows containing null values.
pipeline_F4 = Pipeline(steps=[
    ('normalize_columns', normalize_columns_transformer),
    ('drop_patient_number', drop_patient_number_transformer),
    ('drop_duplicates', drop_duplicates_transformer),
    ('drop_na', drop_na_transformer),
])

In [15]:
# Run stage 4 on the raw frame; the step prints how many rows were removed.
data_F4 = pipeline_F4.fit_transform(data)

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.


In [16]:
def clean_suicidal_thoughts(d):
    """Return a copy of ``d`` with every space removed from ``suicidal_thoughts``.

    All space characters are stripped (not only leading/trailing ones), which
    collapses variants of the same answer that differ only by spaces.
    """
    answers_without_spaces = d['suicidal_thoughts'].str.replace(" ", "")
    # ``assign`` builds a new frame, so the caller's object is left untouched.
    return d.assign(suicidal_thoughts=answers_without_spaces)

# Wrap clean_suicidal_thoughts so it can be used as a Pipeline step.
clean_suicidal_thoughts_transformer = FunctionTransformer(clean_suicidal_thoughts)


In [17]:
# Stage 5 pipeline: stage 4 plus space cleanup of the suicidal_thoughts answers.
pipeline_F5 = Pipeline([
    ('normalize_columns', normalize_columns_transformer),
    ('drop_patient_number', drop_patient_number_transformer),
    ('drop_duplicates', drop_duplicates_transformer),
    ('drop_na', drop_na_transformer),
    ('clean_suicidal_thoughts', clean_suicidal_thoughts_transformer),
])

In [18]:
# Run stage 5 and list the distinct suicidal_thoughts answers left after cleanup.
data_F5 = pipeline_F5.fit_transform(data)
data_F5["suicidal_thoughts"].unique()

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.


array(['YES', 'NO'], dtype=object)

### 🔄 Converting Text to Numeric Values

In [19]:
def print_unique_values(data, cols):
    """Print one line per column in ``cols`` with its distinct values and dtype."""
    for name in cols:
        series = data[name]
        print(f"{name} → {series.unique()} | dtype: {series.dtype}")

In [20]:
def map_ordinal_columns(d):
    """Return a copy of ``d`` with the four frequency columns encoded as integers.

    The columns ``sadness``, ``euphoric``, ``exhausted`` and ``sleep_dissorder``
    are lower-cased, have hyphens turned into spaces and are stripped, so that
    spelling variants of the same answer collapse; the cleaned label is then
    mapped to 1 (seldom) .. 4 (most-often). Labels missing from the mapping
    become NaN. The resulting unique values are printed for inspection.
    """
    ordinal_cat_cols = ['sadness', 'euphoric', 'exhausted', 'sleep_dissorder']
    ordinal_map = {
        'seldom': 1,
        'sometimes': 2,
        'usually': 3,
        'most-often': 4
    }

    encoded = d.copy()
    for name in ordinal_cat_cols:
        labels = encoded[name].str.lower().str.replace('-', ' ').str.strip()
        # Restore the hyphenated spelling that the mapping keys use.
        labels = labels.replace({'most often': 'most-often'})
        encoded[name] = labels.map(ordinal_map)

    print_unique_values(encoded, ordinal_cat_cols)

    return encoded

# Wrap map_ordinal_columns so it can be used as a Pipeline step.
ordinal_map_transformer = FunctionTransformer(map_ordinal_columns, validate=False)

In [21]:
# Stage 6 pipeline: stage 5 plus integer encoding of the four frequency columns.
pipeline_F6 = Pipeline(steps=[
    ('normalize_columns', normalize_columns_transformer),
    ('drop_patient_number', drop_patient_number_transformer),
    ('drop_duplicates', drop_duplicates_transformer),
    ('drop_na', drop_na_transformer),
    ('clean_suicidal_thoughts', clean_suicidal_thoughts_transformer),
    ('ordinal_mapping', ordinal_map_transformer),
])
pipeline_F6

In [22]:
# Run stage 6; the mapping step prints the unique encoded values per column.
data_F6 = pipeline_F6.fit_transform(data)

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.
sadness → [3 2 1 4] | dtype: int64
euphoric → [1 4 3 2] | dtype: int64
exhausted → [2 3 1 4] | dtype: int64
sleep_dissorder → [2 4 3 1] | dtype: int64


In [23]:
# Display the frame after ordinal mapping (the score columns are still 'N From 10' strings here).
data_F6

Unnamed: 0,sadness,euphoric,exhausted,sleep_dissorder,mood_swing,suicidal_thoughts,anorxia,authority_respect,try_explanation,aggressive_response,ignore_move_on,nervous_break_down,admit_mistakes,overthinking,sexual_activity,concentration,optimisim,expert_diagnose
0,3,1,2,2,YES,YES,NO,NO,YES,NO,NO,YES,YES,YES,3 From 10,3 From 10,4 From 10,Bipolar Type-2
1,3,1,3,2,NO,YES,NO,NO,NO,NO,NO,NO,NO,NO,4 From 10,2 From 10,5 From 10,Depression
2,2,4,2,2,YES,NO,NO,NO,YES,YES,NO,YES,YES,NO,6 From 10,5 From 10,7 From 10,Bipolar Type-1
3,3,1,3,4,YES,YES,YES,NO,YES,NO,NO,NO,NO,NO,3 From 10,2 From 10,2 From 10,Bipolar Type-2
4,3,3,2,2,NO,NO,NO,NO,NO,NO,NO,YES,YES,YES,5 From 10,5 From 10,6 From 10,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,4,1,3,2,NO,YES,NO,NO,YES,NO,YES,NO,NO,YES,2 From 10,5 From 10,3 From 10,Depression
116,2,2,2,1,YES,NO,NO,NO,NO,YES,NO,NO,NO,YES,6 From 10,7 From 10,8 From 10,Bipolar Type-1
117,3,2,3,2,YES,NO,YES,YES,NO,NO,NO,YES,NO,YES,1 From 10,5 From 10,3 From 10,Bipolar Type-2
118,3,2,1,1,NO,YES,YES,NO,YES,YES,YES,NO,YES,YES,7 From 10,7 From 10,7 From 10,Depression


In [24]:
def ordinal_score(d):
    """Return a copy of ``d`` with the 'N From 10' score columns as integers.

    For ``sexual_activity``, ``concentration`` and ``optimisim`` every string
    value is replaced by the integer formed from its first whitespace-separated
    token; non-string values pass through unchanged. The resulting unique
    values are printed for inspection.
    """
    ordinal_num_cols = ["sexual_activity", "concentration", "optimisim"]

    def extract_score(value):
        # Only strings carry the "<score> From 10" layout; leave anything else as is.
        if not isinstance(value, str):
            return value
        return int(value.split()[0])

    scored = d.copy()
    for name in ordinal_num_cols:
        scored[name] = scored[name].apply(extract_score)

    print_unique_values(scored, ordinal_num_cols)

    return scored

# Wrap ordinal_score so it can be used as a Pipeline step.
ordinal_score_transformer = FunctionTransformer(ordinal_score, validate=False)



In [25]:
# Stage 7 pipeline: stage 6 plus extraction of the integer score from the 'N From 10' columns.
# This is the complete cleaning pipeline; later stages nest it as a single step.
pipeline_F7 = Pipeline(steps=[
    ('normalize_columns', normalize_columns_transformer),
    ('drop_patient_number', drop_patient_number_transformer),
    ('drop_duplicates', drop_duplicates_transformer),
    ('drop_na', drop_na_transformer),
    ('clean_suicidal_thoughts', clean_suicidal_thoughts_transformer),
    ('ordinal_mapping', ordinal_map_transformer),
    ('ordinal_scores', ordinal_score_transformer),
])
pipeline_F7

In [26]:
# Run the complete cleaning pipeline; data_F7 is the input of the feature-engineering section.
data_F7 = pipeline_F7.fit_transform(data)

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.
sadness → [3 2 1 4] | dtype: int64
euphoric → [1 4 3 2] | dtype: int64
exhausted → [2 3 1 4] | dtype: int64
sleep_dissorder → [2 4 3 1] | dtype: int64
sexual_activity → [3 4 6 5 7 8 9 2 1] | dtype: int64
concentration → [3 2 5 4 7 6 1 8] | dtype: int64
optimisim → [4 5 7 2 6 9 3 8 1] | dtype: int64


## 🛠️ Feature Engineering

In [27]:
# Name of the label column. The feature-engineering helpers defined below
# (get_dummies_transform, select_features, scaling_features) read it as a global.
target = "expert_diagnose"
# Feature matrix and label vector of the cleaned frame.
# NOTE(review): X and y are not referenced again in the visible cells — confirm they are still needed.
X = data_F7.drop(columns=target)
y = data_F7[target]

In [28]:
def get_dummies_transform(d):
    """One-hot encode the categorical feature columns, leaving the target as is.

    Every column except the global ``target`` goes through ``pd.get_dummies``
    with ``drop_first=True`` (one level per categorical column is dropped) and
    integer indicator columns; numeric columns pass through unchanged. The
    target column is appended as the last column of the returned frame.
    """
    frame = pd.DataFrame(d).copy()
    feature_part = frame.drop(columns=target)
    encoded = pd.get_dummies(feature_part, drop_first=True, dtype=int)
    label_part = frame[target]
    return pd.concat([encoded, label_part], axis=1)


# Wrap get_dummies_transform so it can be used as a Pipeline step.
get_dummies_transformer = FunctionTransformer(get_dummies_transform)


In [29]:
# One-hot encoding as a standalone pipeline (no cleaning steps in front of it).
# NOTE(review): this pipeline is not fitted anywhere in the visible cells — presumably meant for
# frames that are already cleaned, such as data_F7; confirm or remove.
pipeline2_F8 = Pipeline(steps=[
    ('onehot_encoding', get_dummies_transformer)

])
pipeline2_F8

In [30]:
# Stage 8 pipeline: the complete cleaning pipeline (pipeline_F7) nested as one step,
# followed by one-hot encoding of the remaining categorical columns.
pipeline_F8 = Pipeline(steps=[
    ('preprocessing', pipeline_F7),
    ('onehot_encoding', get_dummies_transformer)

])
pipeline_F8

In [31]:
# Run stage 8 on the raw frame and display the encoded result.
data_F8 = pipeline_F8.fit_transform(data)
data_F8

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.
sadness → [3 2 1 4] | dtype: int64
euphoric → [1 4 3 2] | dtype: int64
exhausted → [2 3 1 4] | dtype: int64
sleep_dissorder → [2 4 3 1] | dtype: int64
sexual_activity → [3 4 6 5 7 8 9 2 1] | dtype: int64
concentration → [3 2 5 4 7 6 1 8] | dtype: int64
optimisim → [4 5 7 2 6 9 3 8 1] | dtype: int64


Unnamed: 0,sadness,euphoric,exhausted,sleep_dissorder,sexual_activity,concentration,optimisim,mood_swing_YES,suicidal_thoughts_YES,anorxia_YES,authority_respect_YES,try_explanation_YES,aggressive_response_YES,ignore_move_on_YES,nervous_break_down_YES,admit_mistakes_YES,overthinking_YES,expert_diagnose
0,3,1,2,2,3,3,4,1,1,0,0,1,0,0,1,1,1,Bipolar Type-2
1,3,1,3,2,4,2,5,0,1,0,0,0,0,0,0,0,0,Depression
2,2,4,2,2,6,5,7,1,0,0,0,1,1,0,1,1,0,Bipolar Type-1
3,3,1,3,4,3,2,2,1,1,1,0,1,0,0,0,0,0,Bipolar Type-2
4,3,3,2,2,5,5,6,0,0,0,0,0,0,0,1,1,1,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,4,1,3,2,2,5,3,0,1,0,0,1,0,1,0,0,1,Depression
116,2,2,2,1,6,7,8,1,0,0,0,0,1,0,0,0,1,Bipolar Type-1
117,3,2,3,2,1,5,3,1,0,1,1,0,0,0,1,0,1,Bipolar Type-2
118,3,2,1,1,7,7,7,0,1,1,0,1,1,1,0,1,1,Depression


In [32]:
def drop_high_corr_cols(d, threshold=0.9, target_col='expert_diagnose'):
    """Drop feature columns that are highly correlated with an earlier column.

    The absolute pairwise correlation of all columns except ``target_col`` is
    computed; a column is removed when its correlation with any column placed
    before it exceeds ``threshold``. The removed names are printed and the
    target column is re-attached as the last column of the result.

    Parameters
    ----------
    d : DataFrame-like
        Data containing the feature columns and ``target_col``.
    threshold : float, default 0.9
        Absolute correlation above which the later column of a pair is dropped.
    target_col : str, default 'expert_diagnose'
        Column excluded from the correlation analysis.
    """
    frame = pd.DataFrame(d).copy()
    features = frame.drop(columns=target_col)

    abs_corr = features.corr().abs()

    # Keep only the cells strictly above the diagonal so each pair is looked at
    # once and a column is never compared with itself.
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(above_diagonal)

    to_drop = []
    for col in upper.columns:
        if (upper[col] > threshold).any():
            to_drop.append(col)

    print("Columns Eliminated for High Collinearity:", to_drop)

    kept_features = features.drop(columns=to_drop, errors='ignore')
    return pd.concat([kept_features, frame[target_col]], axis=1)

# Wrap drop_high_corr_cols (with its default threshold and target column) as a Pipeline step.
drop_high_corr_transformer = FunctionTransformer(drop_high_corr_cols)



In [33]:
# Stage 9 pipeline: stage 8 plus removal of highly correlated feature columns.
pipeline_F9 = Pipeline(steps=[
    ('preprocessing', pipeline_F7),
    ('onehot_encoding', get_dummies_transformer),
    ('drop_high_corr', drop_high_corr_transformer),

])
pipeline_F9

In [34]:
# Run stage 9 on the raw frame; the correlation step prints the columns it removed.
data_F9 = pipeline_F9.fit_transform(data)
data_F9

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.
sadness → [3 2 1 4] | dtype: int64
euphoric → [1 4 3 2] | dtype: int64
exhausted → [2 3 1 4] | dtype: int64
sleep_dissorder → [2 4 3 1] | dtype: int64
sexual_activity → [3 4 6 5 7 8 9 2 1] | dtype: int64
concentration → [3 2 5 4 7 6 1 8] | dtype: int64
optimisim → [4 5 7 2 6 9 3 8 1] | dtype: int64
Columns Eliminated for High Collinearity: []


Unnamed: 0,sadness,euphoric,exhausted,sleep_dissorder,sexual_activity,concentration,optimisim,mood_swing_YES,suicidal_thoughts_YES,anorxia_YES,authority_respect_YES,try_explanation_YES,aggressive_response_YES,ignore_move_on_YES,nervous_break_down_YES,admit_mistakes_YES,overthinking_YES,expert_diagnose
0,3,1,2,2,3,3,4,1,1,0,0,1,0,0,1,1,1,Bipolar Type-2
1,3,1,3,2,4,2,5,0,1,0,0,0,0,0,0,0,0,Depression
2,2,4,2,2,6,5,7,1,0,0,0,1,1,0,1,1,0,Bipolar Type-1
3,3,1,3,4,3,2,2,1,1,1,0,1,0,0,0,0,0,Bipolar Type-2
4,3,3,2,2,5,5,6,0,0,0,0,0,0,0,1,1,1,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,4,1,3,2,2,5,3,0,1,0,0,1,0,1,0,0,1,Depression
116,2,2,2,1,6,7,8,1,0,0,0,0,1,0,0,0,1,Bipolar Type-1
117,3,2,3,2,1,5,3,1,0,1,1,0,0,0,1,0,1,Bipolar Type-2
118,3,2,1,1,7,7,7,0,1,1,0,1,1,1,0,1,1,Depression


In [35]:
def drop_zero_variance_cols(d, target_col='expert_diagnose'):
    """Drop the feature columns that carry no information (at most one distinct value).

    A column is removed when it has zero or one distinct non-null value. The
    printed list names exactly the columns that are removed. The target column
    is always kept, even if it holds a single class, so downstream steps that
    split features from the target keep working.

    Parameters
    ----------
    d : DataFrame-like
        Data to filter.
    target_col : str, default 'expert_diagnose'
        Column that is never dropped; ignored when it is not present in ``d``.

    Returns
    -------
    pandas.DataFrame
        Copy of the input without the constant / all-null feature columns.
    """
    frame = pd.DataFrame(d).copy()

    # Count distinct values once and reuse the result for both the report and
    # the selection, so the two can never disagree.
    is_constant = frame.nunique() <= 1
    if target_col in is_constant.index:
        is_constant[target_col] = False

    deleted = is_constant[is_constant].index.tolist()
    print("Columns eliminated by zero variance:", deleted)

    # Positional boolean mask: avoids label alignment between mask and columns.
    return frame.loc[:, (~is_constant).to_numpy()]

# Wrap drop_zero_variance_cols so it can be used as a Pipeline step.
drop_zero_var_transformer = FunctionTransformer(drop_zero_variance_cols)

In [36]:
# Stage 10 pipeline: stage 9 plus removal of columns with a single distinct value.
pipeline_F10 = Pipeline(steps=[
    ('preprocessing', pipeline_F7),
    ('onehot_encoding', get_dummies_transformer),
    ('drop_high_corr', drop_high_corr_transformer),
    ('drop_zero_variance', drop_zero_var_transformer),
])
pipeline_F10

In [37]:
# Run stage 10 on the raw frame; the variance step prints the columns it removed.
data_F10 = pipeline_F10.fit_transform(data)
data_F10

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.
sadness → [3 2 1 4] | dtype: int64
euphoric → [1 4 3 2] | dtype: int64
exhausted → [2 3 1 4] | dtype: int64
sleep_dissorder → [2 4 3 1] | dtype: int64
sexual_activity → [3 4 6 5 7 8 9 2 1] | dtype: int64
concentration → [3 2 5 4 7 6 1 8] | dtype: int64
optimisim → [4 5 7 2 6 9 3 8 1] | dtype: int64
Columns Eliminated for High Collinearity: []
Columns eliminated by zero variance: []


Unnamed: 0,sadness,euphoric,exhausted,sleep_dissorder,sexual_activity,concentration,optimisim,mood_swing_YES,suicidal_thoughts_YES,anorxia_YES,authority_respect_YES,try_explanation_YES,aggressive_response_YES,ignore_move_on_YES,nervous_break_down_YES,admit_mistakes_YES,overthinking_YES,expert_diagnose
0,3,1,2,2,3,3,4,1,1,0,0,1,0,0,1,1,1,Bipolar Type-2
1,3,1,3,2,4,2,5,0,1,0,0,0,0,0,0,0,0,Depression
2,2,4,2,2,6,5,7,1,0,0,0,1,1,0,1,1,0,Bipolar Type-1
3,3,1,3,4,3,2,2,1,1,1,0,1,0,0,0,0,0,Bipolar Type-2
4,3,3,2,2,5,5,6,0,0,0,0,0,0,0,1,1,1,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,4,1,3,2,2,5,3,0,1,0,0,1,0,1,0,0,1,Depression
116,2,2,2,1,6,7,8,1,0,0,0,0,1,0,0,0,1,Bipolar Type-1
117,3,2,3,2,1,5,3,1,0,1,1,0,0,0,1,0,1,Bipolar Type-2
118,3,2,1,1,7,7,7,0,1,1,0,1,1,1,0,1,1,Depression


In [38]:
from sklearn.tree import DecisionTreeClassifier

def select_features(d, n_features=11, random_state=42):
    """Keep the features a decision tree ranks as most important, plus the target.

    A ``DecisionTreeClassifier`` is fitted on every column except the global
    ``target``; the ``n_features`` columns with the highest impurity-based
    importance are kept, in descending order of importance, followed by the
    target column. The selected names are printed.

    Parameters
    ----------
    d : DataFrame-like
        Numeric feature columns plus the ``target`` column.
    n_features : int, default 11
        Maximum number of feature columns to keep (fewer are returned when the
        data has fewer feature columns).
    random_state : int, default 42
        Seed of the decision tree, fixed so the ranking is reproducible.

    Returns
    -------
    pandas.DataFrame
        The selected feature columns followed by the target column.
    """
    # NOTE(review): the ranking uses the labels of every row passed in; if a
    # hold-out evaluation is added later, fit this step on the training split only.
    frame = pd.DataFrame(d)
    X = frame.drop(columns=target)
    y = frame[target]

    dtc = DecisionTreeClassifier(random_state=random_state)
    dtc.fit(X, y)

    dtc_importances = pd.Series(dtc.feature_importances_, index=X.columns)
    features = dtc_importances.sort_values(ascending=False).head(n_features).index.tolist()
    print("Best features according to DecisionTreeClassifier:", features)

    return frame[features + [target]]
  
# Wrap select_features so it can be used as a Pipeline step.
select_features_transformer = FunctionTransformer(select_features)



In [39]:
# Stage 11 pipeline: stage 10 plus decision-tree based feature selection.
pipeline_F11 = Pipeline(steps=[
    ('preprocessing', pipeline_F7),
    ('onehot_encoding', get_dummies_transformer),
    ('drop_high_corr', drop_high_corr_transformer),
    ('drop_zero_variance', drop_zero_var_transformer),
    ('select_features', select_features_transformer),
])
pipeline_F11

In [40]:
# Run stage 11 on the raw frame; the selection step prints the features it kept.
data_F11 = pipeline_F11.fit_transform(data)
data_F11

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.
sadness → [3 2 1 4] | dtype: int64
euphoric → [1 4 3 2] | dtype: int64
exhausted → [2 3 1 4] | dtype: int64
sleep_dissorder → [2 4 3 1] | dtype: int64
sexual_activity → [3 4 6 5 7 8 9 2 1] | dtype: int64
concentration → [3 2 5 4 7 6 1 8] | dtype: int64
optimisim → [4 5 7 2 6 9 3 8 1] | dtype: int64
Columns Eliminated for High Collinearity: []
Columns eliminated by zero variance: []
Best features according to DecisionTreeClassifier: ['mood_swing_YES', 'sadness', 'sexual_activity', 'euphoric', 'optimisim', 'suicidal_thoughts_YES', 'exhausted', 'concentration', 'sleep_dissorder', 'anorxia_YES', 'nervous_break_down_YES']


Unnamed: 0,mood_swing_YES,sadness,sexual_activity,euphoric,optimisim,suicidal_thoughts_YES,exhausted,concentration,sleep_dissorder,anorxia_YES,nervous_break_down_YES,expert_diagnose
0,1,3,3,1,4,1,2,3,2,0,1,Bipolar Type-2
1,0,3,4,1,5,1,3,2,2,0,0,Depression
2,1,2,6,4,7,0,2,5,2,0,1,Bipolar Type-1
3,1,3,3,1,2,1,3,2,4,1,0,Bipolar Type-2
4,0,3,5,3,6,0,2,5,2,0,1,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...
115,0,4,2,1,3,1,3,5,2,0,0,Depression
116,1,2,6,2,8,0,2,7,1,0,0,Bipolar Type-1
117,1,3,1,2,3,0,3,5,2,1,1,Bipolar Type-2
118,0,3,7,2,7,1,1,7,1,1,0,Depression


## Scaling 

In [41]:
def scaling_features(d):
    """Standardize every feature column and re-attach the untouched target column.

    All columns except the global ``target`` are passed through a freshly
    fitted ``StandardScaler``; the target column is appended unchanged as the
    last column.

    Parameters
    ----------
    d : DataFrame-like
        Numeric feature columns plus the ``target`` column.

    Returns
    -------
    pandas.DataFrame
        Scaled features followed by the target, carrying the same row index
        as the input.
    """
    frame = pd.DataFrame(d).copy()
    X = frame.drop(columns=target)
    y_ = frame[target]
    scaled = StandardScaler().fit_transform(X)
    # The scaler returns a bare array, so the row labels must be restored
    # explicitly. Without ``index=X.index`` the scaled frame would get a fresh
    # 0..n-1 index and ``concat(axis=1)`` would misalign features and target
    # (producing NaN rows) whenever earlier steps removed rows.
    X_scaled = pd.DataFrame(scaled, columns=X.columns, index=X.index)
    return pd.concat([X_scaled, y_], axis=1)

# Wrap scaling_features as a Pipeline step; feature_names_out='one-to-one' declares that
# the output feature names are the same as the input feature names.
scaling_features_transformer = FunctionTransformer(scaling_features, feature_names_out='one-to-one')


In [42]:
# Final pipeline: cleaning, one-hot encoding, correlation and variance filters,
# feature selection and standardization of the selected features.
pipeline_final = Pipeline(steps=[
    ('preprocessing', pipeline_F7),
    ('onehot_encoding', get_dummies_transformer),
    ('drop_high_corr', drop_high_corr_transformer),
    ('drop_zero_variance', drop_zero_var_transformer),
    ('select_features', select_features_transformer),
    ('scaling', scaling_features_transformer)
])
pipeline_final

In [43]:
# Run the final pipeline on the raw frame and display the scaled, model-ready result.
data_F12 = pipeline_final.fit_transform(data)
data_F12

Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.
sadness → [3 2 1 4] | dtype: int64
euphoric → [1 4 3 2] | dtype: int64
exhausted → [2 3 1 4] | dtype: int64
sleep_dissorder → [2 4 3 1] | dtype: int64
sexual_activity → [3 4 6 5 7 8 9 2 1] | dtype: int64
concentration → [3 2 5 4 7 6 1 8] | dtype: int64
optimisim → [4 5 7 2 6 9 3 8 1] | dtype: int64
Columns Eliminated for High Collinearity: []
Columns eliminated by zero variance: []
Best features according to DecisionTreeClassifier: ['mood_swing_YES', 'sadness', 'sexual_activity', 'euphoric', 'optimisim', 'suicidal_thoughts_YES', 'exhausted', 'concentration', 'sleep_dissorder', 'anorxia_YES', 'nervous_break_down_YES']


Unnamed: 0,mood_swing_YES,sadness,sexual_activity,euphoric,optimisim,suicidal_thoughts_YES,exhausted,concentration,sleep_dissorder,anorxia_YES,nervous_break_down_YES,expert_diagnose
0,1.051315,0.488813,-0.869935,-1.014999,-0.235336,1.051315,-0.623370,-0.698317,-0.470670,-0.788430,0.967204,Bipolar Type-2
1,-0.951190,0.488813,-0.370451,-1.014999,0.268955,1.051315,0.360898,-1.256970,-0.470670,-0.788430,-1.033908,Depression
2,1.051315,-0.597438,0.628518,2.247498,1.277536,-0.951190,-0.623370,0.418990,-0.470670,-0.788430,0.967204,Bipolar Type-1
3,1.051315,0.488813,-0.869935,-1.014999,-1.243917,1.051315,0.360898,-1.256970,1.583164,1.268344,-1.033908,Bipolar Type-2
4,-0.951190,0.488813,0.129033,1.159999,0.773246,-0.951190,-0.623370,0.418990,-0.470670,-0.788430,0.967204,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...
115,-0.951190,1.575064,-1.369419,-1.014999,-0.739626,1.051315,0.360898,0.418990,-0.470670,-0.788430,-1.033908,Depression
116,1.051315,-0.597438,0.628518,0.072500,1.781827,-0.951190,-0.623370,1.536297,-1.497587,-0.788430,-1.033908,Bipolar Type-1
117,1.051315,0.488813,-1.868904,0.072500,-0.739626,-0.951190,0.360898,0.418990,-0.470670,1.268344,0.967204,Bipolar Type-2
118,-0.951190,0.488813,1.128002,0.072500,1.277536,1.051315,-1.607638,1.536297,-1.497587,1.268344,-1.033908,Depression


In [45]:

# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell at the top.
from sklearn.svm import SVC

# Re-runs the whole pipeline on the raw frame (the same call that produced data_F12 above).
data_final = pipeline_final.fit_transform(data)

# Split the transformed frame into the feature matrix and the label column.
X_final = data_final.drop(columns=target)
y_final = data_final[target]

# Linear-kernel SVM with balanced class weights and a fixed seed.
# NOTE(review): the C value is presumably the result of a hyper-parameter search not shown here — confirm.
# NOTE(review): the model is fitted on every row; no train/test split or evaluation is visible in this notebook.
clf_svc = SVC(C=0.01178769, random_state=42, kernel='linear', class_weight='balanced')
clf_svc.fit(X_final, y_final)


Normalized columns
Column patient number dropped
Duplicate Records (before): 0
Duplicate Records (after): 0
0 rows with null values ​​were removed.
sadness → [3 2 1 4] | dtype: int64
euphoric → [1 4 3 2] | dtype: int64
exhausted → [2 3 1 4] | dtype: int64
sleep_dissorder → [2 4 3 1] | dtype: int64
sexual_activity → [3 4 6 5 7 8 9 2 1] | dtype: int64
concentration → [3 2 5 4 7 6 1 8] | dtype: int64
optimisim → [4 5 7 2 6 9 3 8 1] | dtype: int64
Columns Eliminated for High Collinearity: []
Columns eliminated by zero variance: []
Best features according to DecisionTreeClassifier: ['mood_swing_YES', 'sadness', 'sexual_activity', 'euphoric', 'optimisim', 'suicidal_thoughts_YES', 'exhausted', 'concentration', 'sleep_dissorder', 'anorxia_YES', 'nervous_break_down_YES']
