In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_parquet("s3://dev-ds-clean-data-1654/clean_data.parquet")
df.head()

Unnamed: 0,attrition_flag,customer_age,gender,dependent_count,education_level,marital_status,income_category,card_category,months_on_book,total_relationship_count,months_inactive_12_mon,contacts_count_12_mon,credit_limit,total_revolving_bal,avg_open_to_buy,total_amt_chng_q4_q1,total_trans_amt,total_trans_ct,total_ct_chng_q4_q1,avg_utilization_ratio
0,existing customer,45,m,3,high school,married,$60k - $80k,blue,39,5,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,existing customer,49,f,5,graduate,single,less than $40k,blue,44,6,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,existing customer,51,m,3,graduate,married,$80k - $120k,blue,36,4,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,existing customer,40,f,4,high school,unknown,less than $40k,blue,34,3,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,existing customer,40,m,3,uneducated,married,$60k - $80k,blue,21,5,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   attrition_flag            10127 non-null  object 
 1   customer_age              10127 non-null  int64  
 2   gender                    10127 non-null  object 
 3   dependent_count           10127 non-null  int64  
 4   education_level           10127 non-null  object 
 5   marital_status            10127 non-null  object 
 6   income_category           10127 non-null  object 
 7   card_category             10127 non-null  object 
 8   months_on_book            10127 non-null  int64  
 9   total_relationship_count  10127 non-null  int64  
 10  months_inactive_12_mon    10127 non-null  int64  
 11  contacts_count_12_mon     10127 non-null  int64  
 12  credit_limit              10127 non-null  float64
 13  total_revolving_bal       10127 non-null  int64  
 14  avg_op

In [4]:
df['attrition_flag'].unique()

array(['existing customer', 'attrited customer'], dtype=object)

In [5]:
df['attrition_flag'] = df['attrition_flag'].map({
    'existing customer': 0,
    'attrited customer': 1
})

In [6]:
from sklearn.preprocessing import OneHotEncoder

gender_ohe = OneHotEncoder(drop='first', sparse_output=False)
gender_encoded = gender_ohe.fit_transform(df[['gender']])
gender_cols = gender_ohe.get_feature_names_out(['gender'])
df_gender = pd.DataFrame(gender_encoded, columns=gender_cols, index=df.index)
df = pd.concat([df.drop(columns=['gender']), df_gender], axis=1)

In [7]:
from sklearn.preprocessing import OrdinalEncoder

education_order = [
    'unknown',
    'uneducated',
    'high school',
    'college',
    'graduate',
    'post-graduate',
    'doctorate'
]

edu_encoder = OrdinalEncoder(
    categories=[education_order],
    handle_unknown='use_encoded_value',
    unknown_value=-1
)

df['education_level'] = edu_encoder.fit_transform(
    df[['education_level']]
)

In [8]:
marital_ohe = OneHotEncoder(drop='first', sparse_output=False)
marital_encoded = marital_ohe.fit_transform(df[['marital_status']])
marital_cols = marital_ohe.get_feature_names_out(['marital_status'])
df_marital = pd.DataFrame(marital_encoded, columns=marital_cols, index=df.index)
df = pd.concat([df.drop(columns=['marital_status']), df_marital], axis=1)

In [9]:
income_order = [
    "unknown",
    "less than $40k",
    "$40k - $60k",
    "$60k - $80k",
    "$80k - $120k",
    "$120k +"
]

income_encoder = OrdinalEncoder(
    categories=[income_order],
    handle_unknown='use_encoded_value',
    unknown_value=-1
)

df['income_category'] = income_encoder.fit_transform(
    df[['income_category']]
)

In [10]:
card_order = ["blue", "silver", "gold", "platinum"]

card_encoder = OrdinalEncoder(
    categories=[card_order],
    handle_unknown='use_encoded_value',
    unknown_value=-1
)

df['card_category'] = card_encoder.fit_transform(
    df[['card_category']]
)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   attrition_flag            10127 non-null  int64  
 1   customer_age              10127 non-null  int64  
 2   dependent_count           10127 non-null  int64  
 3   education_level           10127 non-null  float64
 4   income_category           10127 non-null  float64
 5   card_category             10127 non-null  float64
 6   months_on_book            10127 non-null  int64  
 7   total_relationship_count  10127 non-null  int64  
 8   months_inactive_12_mon    10127 non-null  int64  
 9   contacts_count_12_mon     10127 non-null  int64  
 10  credit_limit              10127 non-null  float64
 11  total_revolving_bal       10127 non-null  int64  
 12  avg_open_to_buy           10127 non-null  float64
 13  total_amt_chng_q4_q1      10127 non-null  float64
 14  total_

In [12]:
df['credit_limit'] = np.log1p(df['credit_limit'])

In [13]:
df['avg_open_to_buy'] = np.log1p(df['avg_open_to_buy'])

In [14]:
from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer(method='yeo-johnson')

df['total_amt_chng_q4_q1'] = pt.fit_transform(
    df[['total_amt_chng_q4_q1']]
)

In [15]:
df['total_trans_amt'] = np.log1p(df['total_trans_amt'])

In [16]:
df['total_ct_chng_q4_q1'] = pt.fit_transform(
    df[['total_ct_chng_q4_q1']]
)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   attrition_flag            10127 non-null  int64  
 1   customer_age              10127 non-null  int64  
 2   dependent_count           10127 non-null  int64  
 3   education_level           10127 non-null  float64
 4   income_category           10127 non-null  float64
 5   card_category             10127 non-null  float64
 6   months_on_book            10127 non-null  int64  
 7   total_relationship_count  10127 non-null  int64  
 8   months_inactive_12_mon    10127 non-null  int64  
 9   contacts_count_12_mon     10127 non-null  int64  
 10  credit_limit              10127 non-null  float64
 11  total_revolving_bal       10127 non-null  int64  
 12  avg_open_to_buy           10127 non-null  float64
 13  total_amt_chng_q4_q1      10127 non-null  float64
 14  total_

In [18]:
X = df.drop(columns=['attrition_flag'])
y = df['attrition_flag']


In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [20]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [21]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(
    max_iter=1000,
    solver='lbfgs',
    #class_weight='balanced'
)

cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)

cv_scores = cross_val_score(
    log_reg,
    X_train_scaled,
    y_train,
    cv=cv,
    scoring='roc_auc'
)

print("CV ROC-AUC scores:", cv_scores)
print("Mean CV ROC-AUC:", cv_scores.mean())

CV ROC-AUC scores: [0.93673372 0.93550622 0.94637443 0.94241516 0.93270068]
Mean CV ROC-AUC: 0.9387460430393733


In [22]:
log_reg.fit(X_train_scaled, y_train)

from sklearn.metrics import roc_auc_score, classification_report

y_pred = log_reg.predict(X_test_scaled)
y_prob = log_reg.predict_proba(X_test_scaled)[:, 1]

print("Test ROC-AUC:", roc_auc_score(y_test, y_prob))
print(classification_report(y_test, y_pred))

Test ROC-AUC: 0.935165739610184
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      1701
           1       0.82      0.62      0.71       325

    accuracy                           0.92      2026
   macro avg       0.88      0.80      0.83      2026
weighted avg       0.91      0.92      0.91      2026

