In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import joblib
import sys
import os

sys.path.append(os.path.abspath('../utils'))
from risk_profile_rules import assign_risk_profile
from preprocessing import clean_and_encode_data
# Load the bank dataset from the comma-separated CSV file.
df = pd.read_csv("../data/bank.csv", sep=',')

# Derive the target column by calling assign_risk_profile once per row
# (axis=1), then show how many rows landed in each class.
df['risk_profile'] = df.apply(assign_risk_profile, axis=1)
print(
    "Risk profile distribution before encoding:",
    df['risk_profile'].value_counts(),
    sep="\n",
)

# clean_and_encode_data returns the processed frame together with an
# `encoders` object; the latter is persisted below next to the model.
# NOTE(review): the encoded class ids in the report presumably map back to
# the profile names through `encoders` -- confirm in utils/preprocessing.
df_encoded, encoders = clean_and_encode_data(df)

# Target is the 'risk_profile' column; features are every other column.
y = df_encoded['risk_profile']
X = df_encoded.drop(columns='risk_profile')

# Hold out 20% of the rows for evaluation, stratified on the target so the
# class proportions match in both partitions; fixed seed for repeatability.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Depth-limited decision tree; fit() returns the estimator itself, so the
# fitted model can be bound to `clf` in a single expression.
clf = DecisionTreeClassifier(random_state=42, max_depth=5).fit(X_train, y_train)

# Evaluate on the held-out rows and print per-class metrics.
y_pred = clf.predict(X_test)
print(
    "\nClassification report:",
    classification_report(y_test, y_pred),
    sep="\n",
)

# Persist the fitted model first, then the encoders, under ../app/.
for artifact, destination in (
    (clf, '../app/model.pkl'),
    (encoders, '../app/encoders.pkl'),
):
    joblib.dump(artifact, destination)
print("Model and encoders saved to app/")


Risk profile distribution before encoding:
risk_profile
Conservative    5759
Moderate        4030
Aggresive       1373
Name: count, dtype: int64

Classification report:
              precision    recall  f1-score   support

           0       0.92      0.95      0.93       275
           1       0.93      0.92      0.92      1152
           2       0.86      0.87      0.86       806

    accuracy                           0.90      2233
   macro avg       0.90      0.91      0.91      2233
weighted avg       0.90      0.90      0.90      2233

Model and encoders saved to app/
