In [1]:
import numpy as np 
import pandas as pd 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
import os
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.express as px

In [2]:
df = pd.read_csv('b_depressed.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1429 entries, 0 to 1428
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Survey_id              1429 non-null   int64  
 1   Ville_id               1429 non-null   int64  
 2   sex                    1429 non-null   int64  
 3   Age                    1429 non-null   int64  
 4   Married                1429 non-null   int64  
 5   Number_children        1429 non-null   int64  
 6   education_level        1429 non-null   int64  
 7   total_members          1429 non-null   int64  
 8   gained_asset           1429 non-null   int64  
 9   durable_asset          1429 non-null   int64  
 10  save_asset             1429 non-null   int64  
 11  living_expenses        1429 non-null   int64  
 12  other_expenses         1429 non-null   int64  
 13  incoming_salary        1429 non-null   int64  
 14  incoming_own_farm      1429 non-null   int64  
 15  inco

In [3]:
df.drop(['Survey_id', 'Ville_id'], axis=1, inplace=True)

In [4]:
df.isna().sum()

sex                       0
Age                       0
Married                   0
Number_children           0
education_level           0
total_members             0
gained_asset              0
durable_asset             0
save_asset                0
living_expenses           0
other_expenses            0
incoming_salary           0
incoming_own_farm         0
incoming_business         0
incoming_no_business      0
incoming_agricultural     0
farm_expenses             0
labor_primary             0
lasting_investment        0
no_lasting_investmen     20
depressed                 0
dtype: int64

In [5]:
df.dropna(axis=0, inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1409 entries, 0 to 1428
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   sex                    1409 non-null   int64  
 1   Age                    1409 non-null   int64  
 2   Married                1409 non-null   int64  
 3   Number_children        1409 non-null   int64  
 4   education_level        1409 non-null   int64  
 5   total_members          1409 non-null   int64  
 6   gained_asset           1409 non-null   int64  
 7   durable_asset          1409 non-null   int64  
 8   save_asset             1409 non-null   int64  
 9   living_expenses        1409 non-null   int64  
 10  other_expenses         1409 non-null   int64  
 11  incoming_salary        1409 non-null   int64  
 12  incoming_own_farm      1409 non-null   int64  
 13  incoming_business      1409 non-null   int64  
 14  incoming_no_business   1409 non-null   int64  
 15  inco

In [7]:
data = df.copy(deep=True)

## Prepare Data for EDA Dashboard

In [8]:
df['depressed'] = df['depressed'].astype('category')
df['depressed'] = df['depressed'].cat.rename_categories({1: 'Depressed', 0: 'Not'})
df['Married'] = df['Married'].astype('category')
df['Married'] = df['Married'].cat.rename_categories({1: 'Married', 0: 'Unmarried'})
df['sex'] = df['sex'].astype('category')
df['sex'] = df['sex'].cat.rename_categories({1: 'Man', 0: 'Woman'})
df['incoming_salary'] = df['incoming_salary'].astype('category')
df['incoming_salary'] = df['incoming_salary'].cat.rename_categories({1: 'Yes', 0: 'No'})
df['incoming_own_farm'] = df['incoming_own_farm'].astype('category')
df['incoming_own_farm'] = df['incoming_own_farm'].cat.rename_categories({1: 'Yes', 0: 'No'})
df['incoming_business'] = df['incoming_business'].astype('category')
df['incoming_business'] = df['incoming_business'].cat.rename_categories({1: 'Yes', 0: 'No'})
df['incoming_no_business'] = df['incoming_no_business'].astype('category')
df['incoming_no_business'] = df['incoming_no_business'].cat.rename_categories({1: 'Yes', 0: 'No'})
df['labor_primary'] = df['labor_primary'].astype('category')
df['labor_primary'] = df['labor_primary'].cat.rename_categories({1: 'Yes', 0: 'No'})


In [9]:
df.apply(lambda x: x.nunique())

sex                        2
Age                       69
Married                    2
Number_children           12
education_level           18
total_members             12
gained_asset             440
durable_asset            584
save_asset               266
living_expenses          408
other_expenses           608
incoming_salary            2
incoming_own_farm          2
incoming_business          2
incoming_no_business       2
incoming_agricultural    330
farm_expenses            694
labor_primary              2
lasting_investment       965
no_lasting_investmen     939
depressed                  2
dtype: int64

Save data frame to EDA dashboard

In [10]:
df.to_pickle("data/cleaned_data.pkl")

### Imbalanced Dataset

To deal with imbalanced dataset we well use Synthetic Minority Oversampling Technique or SMOTE to handdle it.

In [11]:
data.depressed.value_counts()

0    1174
1     235
Name: depressed, dtype: int64

In [12]:
from imblearn.over_sampling import SMOTE
# Resampling the minority class. The strategy can be changed as required.
sm = SMOTE(sampling_strategy='minority', random_state=42)
# Fit the model to generate the data.
oversampled_X, oversampled_Y = sm.fit_resample(data.drop('depressed', axis=1), data['depressed'])
oversampled = pd.concat([pd.DataFrame(oversampled_Y), pd.DataFrame(oversampled_X)], axis=1)

In [13]:
oversampled['depressed'].value_counts()

0    1174
1    1174
Name: depressed, dtype: int64

In [14]:
x = oversampled.drop('depressed',axis=1)
y = oversampled['depressed']

## What is the Causes of Depression?

We will dive into the factors that most significantly influence an individual's likelihood of experiencing depression. Through the application of various machine learning models on provided datasets, we seek to uncover patterns based on the available features. By analyzing the models' outcomes, we can pinpoint the most influential features, which are the likely culprits in causing depression.

In [15]:
x

Unnamed: 0,sex,Age,Married,Number_children,education_level,total_members,gained_asset,durable_asset,save_asset,living_expenses,other_expenses,incoming_salary,incoming_own_farm,incoming_business,incoming_no_business,incoming_agricultural,farm_expenses,labor_primary,lasting_investment,no_lasting_investmen
0,1,28,1,4,10,5,28912201,22861940,23399979,26692283,28203066,0,0,0,0,30028818,31363432,0,28411718,2.829271e+07
1,1,23,1,3,8,5,28912201,22861940,23399979,26692283,28203066,0,0,0,0,30028818,31363432,0,28411718,2.829271e+07
2,1,22,1,3,9,5,28912201,22861940,23399979,26692283,28203066,0,0,0,0,30028818,31363432,0,28411718,2.829271e+07
3,1,27,1,2,10,4,52667108,19698904,49647648,397715,44042267,0,1,0,1,22288055,18751329,0,7781123,6.921976e+07
4,0,59,0,4,10,6,82606287,17352654,23399979,80877619,74503502,1,0,0,0,53384566,20731006,1,20100562,4.341945e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2343,1,58,0,0,7,5,28912201,22861940,23399979,26692283,28203066,0,0,0,0,30028818,31363432,0,28411718,2.829271e+07
2344,1,40,1,1,12,3,33405614,46909658,86025095,37995297,3170122,0,0,0,0,10867332,19908852,0,50560308,3.121627e+07
2345,1,27,1,2,5,5,28912201,22861940,23399979,26692283,28203066,0,0,0,0,30028818,31363432,0,28411718,2.829271e+07
2346,1,29,1,2,13,4,61213160,34822879,26454490,9756410,34782737,0,0,0,0,18330894,89866949,0,34578304,2.145360e+07


In [16]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2,random_state = 0)

In [17]:
x_train.head()

Unnamed: 0,sex,Age,Married,Number_children,education_level,total_members,gained_asset,durable_asset,save_asset,living_expenses,other_expenses,incoming_salary,incoming_own_farm,incoming_business,incoming_no_business,incoming_agricultural,farm_expenses,labor_primary,lasting_investment,no_lasting_investmen
215,1,72,0,5,5,9,20651573,22085196,23399979,70067245,94490681,0,1,0,0,24023054,20019212,0,25976105,24289980.0
1764,1,28,1,3,11,5,29769820,26730450,17118822,84725463,45519863,0,0,0,0,51871952,47826498,0,49527430,49538560.0
1010,1,34,1,4,8,5,28912201,22861940,23399979,26692283,28203066,0,0,0,0,30028818,31363432,0,28411718,28292710.0
704,1,19,1,1,10,3,28912201,19218443,23399979,20019212,49727722,0,1,0,0,30028818,40038424,0,23230739,44843040.0
1474,1,31,1,2,9,5,25326483,9888077,15695813,10490609,17730705,0,1,0,0,9484571,1684705,0,40460048,24587270.0


### **Model 1: XGBoost**

In [18]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm, tree
import sklearn
import xgboost as xgb
from xgboost import XGBClassifier
import optuna


In [19]:
def objective(trial):

    param = {
        "objective": "binary:logistic",
        "n_estimators" : trial.suggest_int('n_estimators', 0, 1000), 
        "max_depth" : trial.suggest_int('max_depth', 3, 10),
        "gamma" : trial.suggest_float('gamma', 0.0, 0.9),
        "learning_rate" : trial.suggest_float('learning_rate', 0.0, 0.9)
    }
    
    model = XGBClassifier(**param)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    predictions = np.rint(y_pred)
    accuracy = accuracy_score(y_test, predictions)
    
    return accuracy

In [20]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1000)
print(study.best_trial)
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2023-10-18 01:39:54,481] A new study created in memory with name: no-name-3632bf72-f55c-43c7-a6de-66d66c9ccbdc
[I 2023-10-18 01:39:55,344] Trial 0 finished with value: 0.8340425531914893 and parameters: {'n_estimators': 312, 'max_depth': 10, 'gamma': 0.7349313711318972, 'learning_rate': 0.3481900815902065}. Best is trial 0 with value: 0.8340425531914893.
[I 2023-10-18 01:39:56,048] Trial 1 finished with value: 0.7936170212765957 and parameters: {'n_estimators': 796, 'max_depth': 8, 'gamma': 0.6817634161284081, 'learning_rate': 0.8471890070369724}. Best is trial 0 with value: 0.8340425531914893.
[I 2023-10-18 01:39:56,119] Trial 2 finished with value: 0.7808510638297872 and parameters: {'n_estimators': 16, 'max_depth': 5, 'gamma': 0.4737503005983182, 'learning_rate': 0.3376560976770683}. Best is trial 0 with value: 0.8340425531914893.
[I 2023-10-18 01:39:56,749] Trial 3 finished with value: 0.8042553191489362 and parameters: {'n_estimators': 415, 'max_depth': 3, 'gamma': 0.0014998514

FrozenTrial(number=844, state=TrialState.COMPLETE, values=[0.8595744680851064], datetime_start=datetime.datetime(2023, 10, 18, 1, 54, 8, 489663), datetime_complete=datetime.datetime(2023, 10, 18, 1, 54, 9, 620087), params={'n_estimators': 924, 'max_depth': 10, 'gamma': 0.16466141355212782, 'learning_rate': 0.19862025463541558}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=1000, log=False, low=0, step=1), 'max_depth': IntDistribution(high=10, log=False, low=3, step=1), 'gamma': FloatDistribution(high=0.9, log=False, low=0.0, step=None), 'learning_rate': FloatDistribution(high=0.9, log=False, low=0.0, step=None)}, trial_id=844, value=None)
Number of finished trials:  1000
Best trial:
  Value: 0.8595744680851064
  Params: 
    n_estimators: 924
    max_depth: 10
    gamma: 0.16466141355212782
    learning_rate: 0.19862025463541558


In [21]:
param = study.best_params

model = XGBClassifier(**param)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
predictions = np.rint(y_pred)
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test,predictions)
print(accuracy)
print(f1)

0.8595744680851064
0.8571428571428572


In [22]:
# feature importance
Feature_importance = pd.DataFrame({'Feature': x_train.columns, 'Importance': model.feature_importances_})
Feature_importance = Feature_importance.sort_values(by='Importance')
Feature_importance

fig = px.bar(Feature_importance, x='Importance', y='Feature')
# Menampilkan plot
fig.show()

In [23]:
score = pd.DataFrame({'accuracy': [accuracy], 'f1': [f1]})
score.to_pickle("data/score_xgb.pkl")
Feature_importance.to_pickle("data/features_xgb.pkl")

### **Model 2: LightGBM**

In [24]:
# build the lightgbm model
import lightgbm as lgb
from lightgbm import LGBMClassifier

In [25]:
def objective(trial):

    param = {
        "objective": "binary",
        "learning_rate" : trial.suggest_float("learning_rate", 0, 1),
        "boosting_type" : trial.suggest_categorical("boosting_type", ["gbdt", "dart"]),
        "num_leaves" : trial.suggest_int("num_leaves", 20, 256),
        "min_child_samples" : trial.suggest_int("min_child_samples", 0, 80)
    }
    
    model = LGBMClassifier(**param)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    predictions = np.rint(y_pred)
    accuracy = accuracy_score(y_test, predictions)
    
    return accuracy

In [26]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1000)
print(study.best_trial)
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2023-10-18 01:57:11,871] A new study created in memory with name: no-name-3d694e20-2357-4c94-9760-2f57e898873e
[I 2023-10-18 01:57:12,088] Trial 0 finished with value: 0.823404255319149 and parameters: {'learning_rate': 0.19442237544555463, 'boosting_type': 'gbdt', 'num_leaves': 78, 'min_child_samples': 59}. Best is trial 0 with value: 0.823404255319149.
[I 2023-10-18 01:57:12,683] Trial 1 finished with value: 0.8361702127659575 and parameters: {'learning_rate': 0.7945040141236696, 'boosting_type': 'dart', 'num_leaves': 90, 'min_child_samples': 10}. Best is trial 1 with value: 0.8361702127659575.
[I 2023-10-18 01:57:13,126] Trial 2 finished with value: 0.8361702127659575 and parameters: {'learning_rate': 0.24387836359269588, 'boosting_type': 'dart', 'num_leaves': 58, 'min_child_samples': 25}. Best is trial 1 with value: 0.8361702127659575.
[I 2023-10-18 01:57:13,285] Trial 3 finished with value: 0.8042553191489362 and parameters: {'learning_rate': 0.8978384962361743, 'boosting_type'

FrozenTrial(number=656, state=TrialState.COMPLETE, values=[0.8680851063829788], datetime_start=datetime.datetime(2023, 10, 18, 2, 6, 29, 874263), datetime_complete=datetime.datetime(2023, 10, 18, 2, 6, 31, 243550), params={'learning_rate': 0.17300571537198278, 'boosting_type': 'gbdt', 'num_leaves': 231, 'min_child_samples': 5}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=1.0, log=False, low=0.0, step=None), 'boosting_type': CategoricalDistribution(choices=('gbdt', 'dart')), 'num_leaves': IntDistribution(high=256, log=False, low=20, step=1), 'min_child_samples': IntDistribution(high=80, log=False, low=0, step=1)}, trial_id=656, value=None)
Number of finished trials:  1000
Best trial:
  Value: 0.8680851063829788
  Params: 
    learning_rate: 0.17300571537198278
    boosting_type: gbdt
    num_leaves: 231
    min_child_samples: 5


In [27]:
param = study.best_params

model = LGBMClassifier(**param)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
predictions = np.rint(y_pred)
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test,predictions)
print(accuracy)
print(f1)

0.8680851063829788
0.8640350877192983


In [28]:
# feature importance
Feature_importance = pd.DataFrame({'Feature': x_train.columns, 'Importance': model.feature_importances_})
Feature_importance = Feature_importance.sort_values(by='Importance')
Feature_importance

fig = px.bar(Feature_importance, x='Importance', y='Feature')
# Menampilkan plot
fig.show()

In [29]:
score = pd.DataFrame({'accuracy': [accuracy], 'f1': [f1]})
score.to_pickle("data/score_lgbm.pkl")
Feature_importance.to_pickle("data/features_lgbm.pkl")

### **Model 3: Logistic Regression**

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler


In [31]:
scaler = StandardScaler()
x_train_log = scaler.fit_transform(x_train)
x_test_log = scaler.transform(x_test)

In [43]:
def objective(trial):

    param = {
        "solver": trial.suggest_categorical("solver", ["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"]),
        "C" : trial.suggest_float("C", 0, 10000, step= 1e-4),
        "max_iter": 5000,
        "verbose": 0,
    }
    
    model = LogisticRegression(**param)
    model.fit(x_train_log, y_train)
    y_pred = model.predict(x_test_log)
    predictions = np.rint(y_pred)
    accuracy = accuracy_score(y_test, predictions)
    
    return accuracy

In [44]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1000)
print(study.best_trial)
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2023-10-18 02:25:57,138] A new study created in memory with name: no-name-0c7d46d7-93fc-4093-b92c-f60466a1ca2b
[I 2023-10-18 02:25:57,165] Trial 0 finished with value: 0.6680851063829787 and parameters: {'solver': 'newton-cg', 'C': 1590.8272000000002}. Best is trial 0 with value: 0.6680851063829787.
[I 2023-10-18 02:25:57,175] Trial 1 finished with value: 0.6680851063829787 and parameters: {'solver': 'liblinear', 'C': 9703.375600000001}. Best is trial 0 with value: 0.6680851063829787.
[I 2023-10-18 02:25:57,214] Trial 2 finished with value: 0.6680851063829787 and parameters: {'solver': 'sag', 'C': 8965.132300000001}. Best is trial 0 with value: 0.6680851063829787.
[I 2023-10-18 02:25:57,252] Trial 3 finished with value: 0.6680851063829787 and parameters: {'solver': 'sag', 'C': 5130.3334}. Best is trial 0 with value: 0.6680851063829787.
[I 2023-10-18 02:25:57,263] Trial 4 finished with value: 0.6680851063829787 and parameters: {'solver': 'liblinear', 'C': 8660.766300000001}. Best is 

FrozenTrial(number=0, state=TrialState.COMPLETE, values=[0.6680851063829787], datetime_start=datetime.datetime(2023, 10, 18, 2, 25, 57, 142131), datetime_complete=datetime.datetime(2023, 10, 18, 2, 25, 57, 164069), params={'solver': 'newton-cg', 'C': 1590.8272000000002}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'solver': CategoricalDistribution(choices=('lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga')), 'C': FloatDistribution(high=10000.0, log=False, low=0.0, step=0.0001)}, trial_id=0, value=None)
Number of finished trials:  1000
Best trial:
  Value: 0.6680851063829787
  Params: 
    solver: newton-cg
    C: 1590.8272000000002


In [45]:
param = study.best_params

model = LogisticRegression(**param)
model.fit(x_train_log, y_train)
y_pred = model.predict(x_test_log)
predictions = np.rint(y_pred)
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test,predictions)
print(accuracy)
print(f1)

0.6680851063829787
0.6829268292682926


In [35]:
coeff = model.coef_[0]
# feature importance
Feature_importance = pd.DataFrame({'Feature': x_train.columns, 'Importance': np.abs(coeff)})
Feature_importance = Feature_importance.sort_values(by='Importance')
Feature_importance

fig = px.bar(Feature_importance, x='Importance', y='Feature')
# Menampilkan plot
fig.show()

In [46]:
score = pd.DataFrame({'accuracy': [accuracy], 'f1': [f1]})
score.to_pickle("data/score_log_reg.pkl")
Feature_importance.to_pickle("data/features_log_reg.pkl")

### **Model 4: KNeighborsClassifier**

In [37]:
from sklearn.neighbors import KNeighborsClassifier

In [38]:
def objective(trial):

    param = {
        "n_neighbors": trial.suggest_int("n_neighbors", 1, 20),
        "weights" : trial.suggest_categorical('weights',['uniform', 'distance']), 
    }
    
    model = KNeighborsClassifier(**param)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test.values)
    predictions = np.rint(y_pred)
    accuracy = accuracy_score(y_test, predictions)
    
    return accuracy

In [39]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1000)
print(study.best_trial)
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2023-10-18 02:14:11,840] A new study created in memory with name: no-name-de9fd9e3-4ef5-4c79-8e6d-9f29a10334d6

X does not have valid feature names, but KNeighborsClassifier was fitted with feature names

[I 2023-10-18 02:14:12,021] Trial 0 finished with value: 0.7319148936170212 and parameters: {'n_neighbors': 20, 'weights': 'distance'}. Best is trial 0 with value: 0.7319148936170212.

X does not have valid feature names, but KNeighborsClassifier was fitted with feature names

[I 2023-10-18 02:14:12,254] Trial 1 finished with value: 0.6617021276595745 and parameters: {'n_neighbors': 17, 'weights': 'uniform'}. Best is trial 0 with value: 0.7319148936170212.

X does not have valid feature names, but KNeighborsClassifier was fitted with feature names

[I 2023-10-18 02:14:12,419] Trial 2 finished with value: 0.8170212765957446 and parameters: {'n_neighbors': 4, 'weights': 'distance'}. Best is trial 2 with value: 0.8170212765957446.

X does not have valid feature names, but KNeighborsCl

FrozenTrial(number=128, state=TrialState.COMPLETE, values=[0.8404255319148937], datetime_start=datetime.datetime(2023, 10, 18, 2, 14, 30, 816261), datetime_complete=datetime.datetime(2023, 10, 18, 2, 14, 31, 5218), params={'n_neighbors': 2, 'weights': 'uniform'}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_neighbors': IntDistribution(high=20, log=False, low=1, step=1), 'weights': CategoricalDistribution(choices=('uniform', 'distance'))}, trial_id=128, value=None)
Number of finished trials:  1000
Best trial:
  Value: 0.8404255319148937
  Params: 
    n_neighbors: 2
    weights: uniform


In [40]:
param = study.best_params

model = KNeighborsClassifier(**param)
model.fit(x_train, y_train)
y_pred = model.predict(x_test.values)
predictions = np.rint(y_pred)
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test,predictions)
print(accuracy)
print(f1)

0.8404255319148937
0.8366013071895424



X does not have valid feature names, but KNeighborsClassifier was fitted with feature names



In [41]:
from sklearn.inspection import permutation_importance
# feature importance
results = permutation_importance(model, x_train.values, y_train, scoring='neg_mean_squared_error')
results = results.importances_mean
Feature_importance = pd.DataFrame({'Feature': x_train.columns, 'Importance': results})
Feature_importance = Feature_importance.sort_values(by='Importance')
Feature_importance

fig = px.bar(Feature_importance, x='Importance', y='Feature')
# Menampilkan plot
fig.show()


X does not have valid feature names, but KNeighborsClassifier was fitted with feature names


X does not have valid feature names, but KNeighborsClassifier was fitted with feature names


X does not have valid feature names, but KNeighborsClassifier was fitted with feature names


X does not have valid feature names, but KNeighborsClassifier was fitted with feature names


X does not have valid feature names, but KNeighborsClassifier was fitted with feature names


X does not have valid feature names, but KNeighborsClassifier was fitted with feature names


X does not have valid feature names, but KNeighborsClassifier was fitted with feature names


X does not have valid feature names, but KNeighborsClassifier was fitted with feature names


X does not have valid feature names, but KNeighborsClassifier was fitted with feature names


X does not have valid feature names, but KNeighborsClassifier was fitted with feature names


X does not have valid feature names, but KNeighborsClassifi

In [42]:
score = pd.DataFrame({'accuracy': [accuracy], 'f1': [f1]})
score.to_pickle("data/score_kn.pkl")
Feature_importance.to_pickle("data/features_kneighbor.pkl")