In [2]:
import pandas as pd
import sys
pd.set_option('display.max_columns', None)  
pd.set_option('display.max_rows', 20)


In [2]:
df = pd.read_csv('Bank.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 23 columns):
 #   Column                                                                                                       Non-Null Count  Dtype  
---  ------                                                                                                       --------------  -----  
 0   CLIENTNUM                                                                                                    10127 non-null  int64  
 1   Attrition_Flag                                                                                               10127 non-null  object 
 2   Customer_Age                                                                                                 10127 non-null  int64  
 3   Gender                                                                                                       10127 non-null  object 
 4   Dependent_count                                           

In [4]:
df.drop(columns=df.columns[-2:], axis=1, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CLIENTNUM                 10127 non-null  int64  
 1   Attrition_Flag            10127 non-null  object 
 2   Customer_Age              10127 non-null  int64  
 3   Gender                    10127 non-null  object 
 4   Dependent_count           10127 non-null  int64  
 5   Education_Level           10127 non-null  object 
 6   Marital_Status            10127 non-null  object 
 7   Income_Category           10127 non-null  object 
 8   Card_Category             10127 non-null  object 
 9   Months_on_book            10127 non-null  int64  
 10  Total_Relationship_Count  10127 non-null  int64  
 11  Months_Inactive_12_mon    10127 non-null  int64  
 12  Contacts_Count_12_mon     10127 non-null  int64  
 13  Credit_Limit              10127 non-null  float64
 14  Total_

In [5]:
from sklearn.model_selection import train_test_split


In [8]:
df.columns[1]

'Attrition_Flag'

In [None]:
train_df, temp_df = train_test_split(df, test_size=0.3,  random_state=42, shuffle=True, stratify=df[df.columns[1]])
test_df, val_df = train_test_split(temp_df, test_size=1/3, random_state=42, shuffle=True, stratify=temp_df[temp_df.columns[1]])
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)
val_df.to_csv('val.csv', index=False)

In [3]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')   

In [4]:
cate_columns = train_df.select_dtypes(include=['object']).columns.tolist()
print("Categorical columns:", cate_columns)
num_columns = train_df.select_dtypes(exclude=['object']).columns.tolist()
print("Numerical columns:", num_columns)

Categorical columns: ['Attrition_Flag', 'Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']
Numerical columns: ['CLIENTNUM', 'Customer_Age', 'Dependent_count', 'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']


In [73]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MaxAbsScaler, LabelEncoder
import pickle as pkl
import os
from pathlib import Path
import json

In [74]:
def data_preprocessing_train(df, cate_columns, num_columns):

    """ Preprocess the training data by encoding categorical features, scaling numerical features, and saving the encoders, scaler."""    
    cols = {}
    df_temp = df.copy(deep = True)

    X_train = df_temp.drop(columns=df_temp.iloc[:,0:2], axis=1, inplace=False)
    y_train = df_temp.iloc[:, 1]
    cate_columns = [col for col in cate_columns if col in X_train.columns]
    num_columns = [col for col in num_columns if col in X_train.columns]
    # Ensure the output directory exists    
    output_dir = Path("models")
    output_dir.mkdir(parents=True, exist_ok=True)

    # 1. Label Encoding for target column                   
    le = LabelEncoder()     
    y_encoded = le.fit_transform(y_train)
    # Save the label encoder
    file_path = output_dir / "label_encoder.pkl"
    with open(file_path, 'wb') as f:
        pkl.dump(le, f)

    # 2. One-Hot Encoding for categorical columns
    ohe = OneHotEncoder(handle_unknown='ignore')
    cate_encoded = ohe.fit_transform(X_train[cate_columns])
    print(type(cate_encoded))
    file_path = output_dir / "ohe.pkl"
    # Save the OneHotEncoder    
    with open(file_path, 'wb') as f:
        pkl.dump(ohe, f)

    # 3. MaxAbsScaler for numerical columns
    scaler = MaxAbsScaler()
    num_scaled = scaler.fit_transform(X_train[num_columns])
    print(type(num_scaled))
    file_path = output_dir / "scaler.pkl"
    # Save the MaxAbsScaler
    with open(file_path, 'wb') as f:
        pkl.dump(scaler, f)
    # 4. concatenate the processed features
    # Convert the encoded categorical features and scaled numerical features back to DataFrame
    X_train_processed = pd.concat([pd.DataFrame(cate_encoded.toarray(), columns= ohe.get_feature_names_out(cate_columns)), pd.DataFrame(num_scaled, columns=num_columns)], axis=1)
    cols['cate_columns'] = ohe.get_feature_names_out(cate_columns).tolist()
    cols['num_columns'] = num_columns
    cols['target_column'] = y_train.name
    # Save the column names
    cols_file_path = output_dir / "columns.json"    
    with open(cols_file_path, 'w') as f:
        json.dump(cols, f)
    return X_train_processed, y_encoded
    


In [75]:
X_train_processed, y_encoded = data_preprocessing_train(train_df, cate_columns, num_columns)
X_train_processed.head(5)

<class 'scipy.sparse._csr.csr_matrix'>
<class 'numpy.ndarray'>


Unnamed: 0,Gender_F,Gender_M,Education_Level_College,Education_Level_Doctorate,Education_Level_Graduate,Education_Level_High School,Education_Level_Post-Graduate,Education_Level_Uneducated,Education_Level_Unknown,Marital_Status_Divorced,Marital_Status_Married,Marital_Status_Single,Marital_Status_Unknown,Income_Category_$120K +,Income_Category_$40K - $60K,Income_Category_$60K - $80K,Income_Category_$80K - $120K,Income_Category_Less than $40K,Income_Category_Unknown,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver,Customer_Age,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.69863,0.6,0.642857,1.0,0.5,0.5,0.104589,0.617402,0.059567,0.147483,0.119887,0.244604,0.08625,0.43043
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.684932,0.8,0.642857,0.333333,0.333333,0.5,0.071764,0.0,0.071764,0.130998,0.110203,0.309353,0.108373,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.452055,0.0,0.642857,1.0,0.166667,0.5,0.079239,0.702821,0.027987,0.431263,0.178857,0.510791,0.272193,0.647648
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.780822,0.4,0.714286,0.5,0.333333,0.5,0.353691,0.786651,0.296326,0.181336,0.076228,0.244604,0.173341,0.162162
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.808219,0.4,0.946429,0.666667,0.833333,0.666667,0.078196,0.962257,0.008025,0.227554,0.140608,0.309353,0.121255,0.897898


In [76]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)


In [79]:
def load_pickle(model_name=None):
    """Load a pickle file."""
    folder_path = Path("models") 
    file_path = folder_path / model_name
    if not file_path.exists():
        print(f"Model file {file_path} does not exist.")
        return None 
    else:
        print(f"Loading model from {file_path}")
        with open(file_path, 'rb') as f:
            return pkl.load(f)

In [81]:
def process_val():

    val_df = pd.read_csv('val.csv')
    val_df_temp = val_df.copy(deep=True)

    cate_columns = val_df_temp.select_dtypes(include=['object']).columns.tolist()

    cate_columns_processed = json.load(open('models/columns.json', 'r'))['cate_columns']
    num_columns = json.load(open('models/columns.json', 'r'))['num_columns']
    cols = json.load(open('models/columns.json', 'r'))  

    le = load_pickle("label_encoder.pkl")
    ohe = load_pickle("ohe.pkl")        
    scaler = load_pickle("scaler.pkl")
    if le is None or ohe is None or scaler is None:
        print("Error loading preprocessing models.")
        return None, None   
    else:
        # Preprocess the validation data
        X_val = val_df_temp.drop(columns=val_df_temp.iloc[:,0:2], axis=1, inplace=False)
        y_val = val_df_temp.iloc[:, 1]
        # Ensure the columns are in the same order as during training   
        cate_columns = [col for col in cate_columns if col in X_val.columns]
        num_columns = [col for col in num_columns if col in X_val.columns]
        # 1. Label Encoding for target column
        y_val_encoded = le.transform(y_val) 
        # 2. One-Hot Encoding for categorical columns
        cate_encoded = ohe.transform(X_val[cate_columns])
        if cate_columns_processed != ohe.get_feature_names_out(cate_columns).tolist():
            print("Warning: Categorical columns in validation data do not match training data.")
            return None, None
        else:
            # 3. MaxAbsScaler for numerical columns
            num_scaled = scaler.transform(X_val[num_columns])
            # 4. concatenate the processed features
            X_val_processed = pd.concat([pd.DataFrame(cate_encoded.toarray(), columns= ohe.get_feature_names_out(cate_columns)), pd.DataFrame(num_scaled, columns=num_columns)], axis=1)
            return X_val_processed, y_val_encoded
        
    

In [100]:
X_train = X_train_processed
y_train = y_encoded
X_val_processed, y_val_encoded = process_val()
X_val = X_val_processed
y_val = y_val_encoded

model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

cm = confusion_matrix(y_val, y_pred)

acc = accuracy_score(y_val, y_pred)
prec = precision_score(y_val, y_pred, average='macro')  # dùng macro nếu có nhiều class
rec = recall_score(y_val, y_pred, average='macro')
f1 = f1_score(y_val, y_pred, average='macro')
# In ma trận nhầm lẫn
print("Confusion Matrix:")
print(cm)
# In các chỉ số đánh giá
metrics_df = pd.DataFrame([[acc, prec, rec, f1]], columns= ["Accuracy", "Precision", "Recall", "F1 Score"])
print("\nEvaluation Metrics:")
print(metrics_df)
folder_path = Path("models")
file_path = folder_path / "model.pkl"   
if not file_path.exists():
    with open(file_path, 'wb') as f:
        pkl.dump(model, f)
    print(f"Model saved to {file_path}")
file_path = folder_path / "metrics.json"
with open(file_path, 'w') as f:
    json.dump(metrics_df.T.to_dict(), f)


Loading model from models\label_encoder.pkl
Loading model from models\ohe.pkl
Loading model from models\scaler.pkl
Confusion Matrix:
[[150  13]
 [ 16 834]]

Evaluation Metrics:
   Accuracy  Precision    Recall  F1 Score
0  0.971372   0.944133  0.950711  0.947383


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [1]:
import pandas as pd

df = pd.read_csv('test.csv')

In [None]:
df.columns[1]

'Attrition_Flag'

In [5]:
df.drop(columns=df.columns[1], axis=1, inplace=True)

In [6]:
df.columns

Index(['CLIENTNUM', 'Customer_Age', 'Gender', 'Dependent_count',
       'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category',
       'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio'],
      dtype='object')

In [9]:
df.T.to_json('test_r.json', index=False)