## Santander Customer Satisfaction
### Source
https://www.kaggle.com/competitions/santander-customer-satisfaction/overview

### Tasks
Predict whether each customer is dissatisfied (`TARGET = 1`) or satisfied (`TARGET = 0`).

### Evaluation
ROC AUC (area under the ROC curve), because dissatisfied customers make up only a small portion of the data.

In [2]:
# Standard library
import warnings

# Third-party
import matplotlib
# `plt` must alias the pyplot submodule; aliasing the top-level `matplotlib`
# package leaves `plt` without the plotting functions (plt.subplots, plt.figure, ...).
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

Matplotlib is building the font cache; this may take a moment.


In [10]:
# Suppress every warning category from here on (process-wide side effect).
warnings.filterwarnings('ignore')

# NOTE(review): absolute, machine-specific directory; the notebook only runs
# where this exact path exists. Consider a configurable data directory.
base_path='/Users/minha/ai/ai-practice/python-machine-learning-guide/classification/santander-customer-satisfaction/'
train_csv_path = f'{base_path}train.csv'

# Read the training set (decoded as latin-1) and report its dimensions.
customer_df = pd.read_csv(train_csv_path, encoding='latin-1')
print('data shape:', customer_df.shape)

# Preview the first three rows; the frame has 371 columns, ID and TARGET included.
customer_df.head(3)

data shape: (76020, 371)


Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0


In [11]:
# Summarize the frame: every column is numeric (111 float64 and 260 int64).
# NOTE(review): for a frame this wide info() prints only a truncated summary
# without per-column non-null counts, so "no NULL values" is not shown by this
# output alone; confirm with customer_df.isna().sum().sum() if needed.
customer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76020 entries, 0 to 76019
Columns: 371 entries, ID to TARGET
dtypes: float64(111), int64(260)
memory usage: 215.2 MB


In [18]:
## Share of dissatisfied customers
# value_counts(): frequency of each distinct value in the column (here 0 and 1)
# count(): number of non-null entries

target_series = customer_df['TARGET']
print(target_series.value_counts())

# TARGET == 1 marks a dissatisfied customer.
unsatisfied_customer = target_series[target_series == 1].count()
total_customer = target_series.count()

unsatisfied_ratio = unsatisfied_customer / total_customer
print(f'unsatisfied customers ratio: {unsatisfied_ratio}')

TARGET
0    73012
1     3008
Name: count, dtype: int64
unsatisfied customers ratio: 0.0395685345961589


In [19]:
## Distribution of each feature's values
# The minimum of var3 is -999999, far outside the rest of its range;
# presumably NaN or some other exceptional value was encoded as this
# sentinel (verify against the data description).
customer_df.describe()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
count,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,...,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0
mean,75964.050723,-1523.199277,33.212865,86.208265,72.363067,119.529632,3.55913,6.472698,0.412946,0.567352,...,7.935824,1.365146,12.21558,8.784074,31.505324,1.858575,76.026165,56.614351,117235.8,0.039569
std,43781.947379,39033.462364,12.956486,1614.757313,339.315831,546.266294,93.155749,153.737066,30.604864,36.513513,...,455.887218,113.959637,783.207399,538.439211,2013.125393,147.786584,4040.337842,2852.579397,182664.6,0.194945
min,1.0,-999999.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5163.75,0.0
25%,38104.75,2.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67870.61,0.0
50%,76043.0,2.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,106409.2,0.0
75%,113748.75,2.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,118756.3,0.0
max,151838.0,238.0,105.0,210000.0,12888.03,21024.81,8237.82,11073.57,6600.0,6600.0,...,50003.88,20385.72,138831.63,91778.73,438329.22,24650.01,681462.9,397884.3,22034740.0,1.0


In [21]:
## Check for missing values
# Show the ten most frequent values of the var3 column.
# -999999 occurs 116 times. Because var3 is numeric, such an extreme value
# skews the column heavily, so the plan is to replace it with 2, the most
# frequent value (mode imputation). The replacement is not done in this cell.
var3_top10 = customer_df['var3'].value_counts().head(10)
print(var3_top10)

var3
 2         74165
 8           138
-999999      116
 9           110
 3           108
 1           105
 13           98
 7            97
 4            86
 12           85
Name: count, dtype: int64
