In [1]:
import pandas as pd
# from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier

In [2]:
df = pd.read_csv("assets/BankChurners.csv")
df.head(3)

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,9.3e-05,0.99991
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,5.7e-05,0.99994
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,2.1e-05,0.99998


In [3]:
df.columns

Index(['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender',
       'Dependent_count', 'Education_Level', 'Marital_Status',
       'Income_Category', 'Card_Category', 'Months_on_book',
       'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
       'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
       'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'],
      dtype='object')

In [4]:
def clean(df: pd.DataFrame) -> pd.DataFrame:
    clean_df = df.drop(columns=[df.columns[len(df.columns)-1], df.columns[len(df.columns)-2]], axis=1)
    clean_df.set_index("CLIENTNUM", inplace=True)
    clean_df.Income_Category = clean_df.Income_Category.replace(
        {
            "Unknown": 0,
            "Less than $40K": 1,
            "$40K - $60K": 2,
            "$60K - $80K": 3,
            "$80K - $120K": 4,
            "$120K +": 5
        
        }
    )
    clean_df["Attrition_Flag"] = clean_df["Attrition_Flag"].astype('category').cat.codes.replace({0:1, 1:0})
    return clean_df


In [5]:
df = clean(df)

In [6]:
cat_ft = ["Gender", "Education_Level", "Marital_Status", "Income_Category", "Card_Category"]
num_ft = [i for i in list(df.drop("Attrition_Flag", axis=1).columns) if i not in cat_ft]
oversample = SMOTE()

colt = ColumnTransformer(
    [
        ("norm", MinMaxScaler(), num_ft),
        ("ohe", OneHotEncoder(handle_unknown="error", drop="if_binary"), cat_ft)
    ]
)


In [7]:
x = df.drop("Attrition_Flag", axis=1)
y = df.Attrition_Flag

X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=0.8)

## LinearSVC

In [8]:
lsvc = LinearSVC(random_state=42)

svcp = Pipeline(
    steps=[
        ("transform", colt),
        ("oversample", oversample),
        ("cls", lsvc)
    ]
)


In [9]:
svcp.fit(X_train, y_train)
pred = svcp.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.97      0.87      0.91      1703
           1       0.55      0.84      0.66       323

    accuracy                           0.86      2026
   macro avg       0.76      0.85      0.79      2026
weighted avg       0.90      0.86      0.87      2026



## LGBMClassifier

In [10]:
lgbm = LGBMClassifier(
    n_estimators=600,
    random_state=42,
    n_jobs=-1
)

lgbmp = Pipeline(
    steps=[
        ("transform", colt),
        ("oversample", oversample),
        ("lgbm", lgbm)
    ]
)

In [11]:
lgbmp.fit(X_train, y_train)
pred = lgbmp.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1703
           1       0.92      0.92      0.92       323

    accuracy                           0.97      2026
   macro avg       0.95      0.95      0.95      2026
weighted avg       0.97      0.97      0.97      2026



## LGBMClassifier test

In [16]:
test_pred = lgbmp.predict(x)
print(classification_report(y, test_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8500
           1       0.98      0.98      0.98      1627

    accuracy                           0.99     10127
   macro avg       0.99      0.99      0.99     10127
weighted avg       0.99      0.99      0.99     10127

