In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MaxAbsScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import pickle


In [3]:
# Read train.csv (path is relative to the current working directory) into a DataFrame.
df = pd.read_csv('train.csv')

In [4]:
# Label (target) column.
label_col = 'Attrition_Flag'

# Categorical columns, label first.
categorical_cols = [
    label_col,
    'Gender',
    'Education_Level',
    'Marital_Status',
    'Income_Category',
    'Card_Category',
]
# Categorical columns other than the label.
onehot_cols = [col for col in categorical_cols if col != label_col]

# Numeric columns; CLIENTNUM is listed here but left out of scale_cols below.
numerical_cols = [
    'CLIENTNUM',
    'Customer_Age',
    'Dependent_count',
    'Months_on_book',
    'Total_Relationship_Count',
    'Months_Inactive_12_mon',
    'Contacts_Count_12_mon',
    'Credit_Limit',
    'Total_Revolving_Bal',
    'Avg_Open_To_Buy',
    'Total_Amt_Chng_Q4_Q1',
    'Total_Trans_Amt',
    'Total_Trans_Ct',
    'Total_Ct_Chng_Q4_Q1',
    'Avg_Utilization_Ratio',
]
# Numeric columns to scale: everything in numerical_cols except CLIENTNUM.
scale_cols = [col for col in numerical_cols if col != 'CLIENTNUM']


In [10]:
import xgboost as xgb
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)


In [None]:
# Pipeline not used here, so that individual steps can be controlled separately
# label_encoder = LabelEncoder()
# df[label_col] = label_encoder.fit_transform(df[label_col])

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('scale_num', MaxAbsScaler(), scale_cols),
#         # fix handle_unknown='ignore' to avoid errors with unseen categories
#         ('onehot_cate', OneHotEncoder(categories='auto', handle_unknown='ignore'), onehot_cols)
#     ],
#     remainder='passthrough'
# )
# processed = preprocessor.fit_transform(df.drop(columns=[label_col]))

# pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('classifier', xgb.XGBClassifier())])
# X = df[scale_cols + onehot_cols ]
# y = df[label_col]
# pipeline.fit(X, y)

# val_df = pd.read_csv('val.csv')
# X_val = val_df[scale_cols + onehot_cols]
# X_val = preprocessor.transform(X_val)
# y_pred = pipeline.predict(X_val)
# y_val = label_encoder.transform(val_df[label_col])  

# acc = accuracy_score(y_val, y_pred)
# prec = precision_score(y_val, y_pred, average='macro')  # use macro averaging when there are multiple classes
# rec = recall_score(y_val, y_pred, average='macro')
# f1 = f1_score(y_val, y_pred, average='macro')
# # Print the confusion matrix
# print("Confusion Matrix:")
# # Print the evaluation metrics
# metrics_df = pd.DataFrame([[acc, prec, rec, f1]], columns= ["Accuracy", "Precision", "Recall", "F1 Score"])
# print("\nEvaluation Metrics:")
# print(metrics_df)

In [None]:
import logging
import os

import joblib
import pandas as pd
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

# Load encoders, scaler, and model from disk.
# NOTE(review): joblib.load unpickles arbitrary objects; only load artifacts from a trusted source.
label_encoder = joblib.load("encoder/label_encoder_attrition.pkl")
onehot_encoder = joblib.load("encoder/onehot_encoder.pkl")
scaler = joblib.load("encoder/scaler.pkl")
model = joblib.load("model/model.pkl")

# Init FastAPI
app = FastAPI()

# Logging: basicConfig(filename=...) opens the log file right away, which raises
# FileNotFoundError when the directory is missing, so create it first.
os.makedirs('logs', exist_ok=True)
logging.basicConfig(filename='logs/predict.log', level=logging.INFO)

# Columns the /predict handler one-hot encodes.
categorical_cols = ['Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']
# Columns the /predict handler scales (CLIENTNUM is not included).
numerical_cols = [
    'Customer_Age', 'Dependent_count', 'Months_on_book',
    'Total_Relationship_Count', 'Months_Inactive_12_mon',
    'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
    'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
    'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio'
]

# Schema input
# Request body schema for POST /predict: one record with the typed fields below.
class InputData(BaseModel):
    # Required by the schema, but not among the columns predict() passes to the model.
    CLIENTNUM: int
    # NOTE(review): this is the label column in the training cell (label_col), yet
    # predict() label-encodes it and feeds it to the model as a feature — confirm intended.
    Attrition_Flag: str
    Customer_Age: int
    Gender: str
    Dependent_count: int
    Education_Level: str
    Marital_Status: str
    Income_Category: str
    Card_Category: str
    Months_on_book: int
    Total_Relationship_Count: int
    Months_Inactive_12_mon: int
    Contacts_Count_12_mon: int
    Credit_Limit: float
    Total_Revolving_Bal: float
    Avg_Open_To_Buy: float
    Total_Amt_Chng_Q4_Q1: float
    Total_Trans_Amt: float
    Total_Trans_Ct: int
    Total_Ct_Chng_Q4_Q1: float
    Avg_Utilization_Ratio: float

@app.post("/predict")
def predict(data: InputData):
    """Preprocess one record and return the model's prediction for it.

    Args:
        data: Validated request body (see InputData).

    Returns:
        dict with "prediction" (the predicted value cast to int) and
        "probability" (the model's predict_proba row as a list).

    Raises:
        HTTPException: status 400 if any preprocessing or prediction step
            fails; the failure is also written to the log with the input.
    """
    try:
        # Convert input to a one-row DataFrame.
        # The handler's parameter is `data`; referencing the undefined name
        # `input_data` here made every request fail with a NameError.
        df = pd.DataFrame([data.dict()])

        # Encode Attrition_Flag with the fitted label encoder.
        # NOTE(review): Attrition_Flag is then passed to the model as a feature —
        # confirm the model was trained with this column.
        df['Attrition_Flag'] = label_encoder.transform(df['Attrition_Flag'])

        # One-hot encode the other categorical features (.toarray() densifies the encoder output).
        df_cat = onehot_encoder.transform(df[categorical_cols])
        df_cat = pd.DataFrame(df_cat.toarray(), columns=onehot_encoder.get_feature_names_out(categorical_cols))

        # Scale numerical features (numerical_cols does not include CLIENTNUM).
        df_num = scaler.transform(df[numerical_cols])
        df_num = pd.DataFrame(df_num, columns=numerical_cols)

        # Combine everything: encoded label, scaled numerics, one-hot columns.
        df_final = pd.concat([df[['Attrition_Flag']], df_num, df_cat], axis=1)

        # Predict
        pred = model.predict(df_final)[0]
        prob = model.predict_proba(df_final)[0]

        return {
            "prediction": int(pred),
            "probability": prob.tolist()
        }

    except Exception as e:
        # Log the failure together with the offending input, then report it as a client error.
        logging.error(f"Prediction error: {str(e)} | Data: {data.dict()}")
        raise HTTPException(status_code=400, detail=f"Prediction failed: {str(e)}")
