In [None]:
import pandas as pd

# Load the training split and preview its first ten rows.
playground_df = pd.read_csv('train.csv')
playground_df.iloc[:10]

## 컬럼 설명 
- id             : id 번호 
- age            : 나이
- job            : 직업
- marital        : 결혼 여부
- education      : 교육 수준
- default        : 신용 불량 여부         
- balance        : 계좌 잔고
- housing        : 주택 대출 보유 여부
- loan           : 개인 대출 보유 여부
- contact        : 마지막 접촉 수단
- day            : 마지막 접촉 일자
- month          : 마지막 접촉 월
- duration       : 마지막 접촉 통화 시간
- campaign       : 이번 캠페인 동안 연락한 횟수
- pdays          : 이전 캠페인에서 마지막으로 연락한 후 경과일 (-1은 이전에 연락한 적 없음)
- previous       : 이전 캠페인에서 연락 횟수
- poutcome       : 이전 캠페인의 결과
- y              : 예금 가입 여부

- 입력 변수 : id와 y를 제외한 나머지 모든 컬럼
- 출력 변수 : y

In [None]:
# (rows, columns) of the raw training frame.
playground_df.shape

In [None]:
# Frequency of each job category.
playground_df.loc[:, 'job'].value_counts()

In [None]:
# Number of missing values per column.
playground_df.isnull().sum()

In [None]:
# Pearson correlation between the two contact-count columns.
playground_df.loc[:, ['previous', 'campaign']].corr(method='pearson')

In [None]:
# value_counts() already returns the counts in descending order, so no
# additional sort is needed.
playground_df['poutcome'].value_counts()

In [None]:
# Frequency of each pdays value.
playground_df.loc[:, 'pdays'].value_counts()

In [None]:
# Rows ordered from the highest age to the lowest.
playground_df.sort_values('age', ascending=False)

In [None]:
# Rows whose pdays equals -1.
playground_df.loc[playground_df['pdays'].eq(-1)]

## 카이제곱 검정 시행

In [None]:
# Categorical feature columns to test for association with the target `y`.
# `job` and `default` are included so that every categorical feature in the
# column description is covered by the test.
chi = ['job','marital','education','default','housing','loan','contact','month','poutcome']

In [None]:
import scipy.stats as stats
from scipy.stats import chi2_contingency

# Chi-square test of independence between each categorical column and `y`.
for col in chi:
    # Contingency table: category level (rows) x target class (columns).
    ct = pd.crosstab(playground_df[col], playground_df['y'])
    chi2, p, dof, _ = chi2_contingency(ct)

    # Cramér's V = sqrt(chi2 / (n * (min(rows, cols) - 1))). With a large
    # sample almost any difference is "significant", so an effect size is
    # reported alongside the p-value.
    n_obs = ct.to_numpy().sum()
    min_dim = min(ct.shape) - 1
    cramers_v = (chi2 / (n_obs * min_dim)) ** 0.5 if n_obs and min_dim else float('nan')

    print(f'== {col} vs y ==')
    print(ct)
    # Scientific notation keeps very small p-values from printing as 0.00000000.
    print(f"Chi2: {chi2:.3f}, dof: {dof}, p-value: {p:.3e}, Cramer's V: {cramers_v:.4f}")
    print('\n')

- 표본 수가 매우 많아서 모든 변수의 p-value가 사실상 0으로 출력된 것으로 보임. 표본이 크면 아주 작은 차이도 통계적으로 유의하게 나오므로, p-value만으로 변수의 중요도를 판단하기는 어려움.
- 그래도 통계적 유의성만 본다면, 검정한 범주형 컬럼 중에 제외해야 할 컬럼은 없는 것으로 보임.

In [None]:
# Binary yes/no columns, encoded as 0/1.
map_YN = ['default','housing','loan']
yes_no_codes = {'no' : 0, 'yes' : 1}

for col in map_YN:
    # Skip columns that are already numeric: mapping 0/1 through the
    # yes/no dictionary again would turn every value into NaN, so this
    # guard makes the cell safe to re-run.
    if pd.api.types.is_numeric_dtype(playground_df[col]):
        continue
    playground_df[col] = playground_df[col].map(yes_no_codes)

In [None]:
# Preview the first rows to check the 0/1 encoding instead of rendering the
# whole frame.
playground_df.head(10)

In [None]:
# One-hot encode the multi-level categorical columns of the training frame.
playground_df = pd.get_dummies(
    playground_df,
    columns=['job', 'marital', 'education', 'contact', 'month', 'poutcome'],
)

In [None]:
# (rows, columns) of the training frame after one-hot encoding.
playground_df.shape

In [None]:
# Features are every column except the row id and the target; the target is `y`.
X = playground_df.drop(['id', 'y'], axis=1)
y = playground_df.loc[:, 'y']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. Stratifying on the target keeps the
# class ratio the same in both splits, and the fixed seed (the same one used
# for the model and CV folds) makes the split reproducible across runs.
X_train,X_val,y_train,y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=2020158011
)

In [None]:
from lazypredict.Supervised import LazyClassifier

# Fit LazyPredict's classifiers on the training split, score them on the
# validation split, and print the first five rows of the results table.
clf = LazyClassifier()
models, predictions = clf.fit(X_train, X_val, y_train, y_val)
print(models.iloc[:5])

In [None]:
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_score, StratifiedKFold

def objective(trial):
    """Return the mean 5-fold ROC AUC of an AdaBoost model for one Optuna trial.

    Samples `n_estimators` (integer, 50-500) and `learning_rate`
    (float, 0.01-1.0 on a log scale) from the trial, builds an
    AdaBoostClassifier on top of a DecisionTreeClassifier, and scores it on
    `X_train` / `y_train` with shuffled, stratified 5-fold cross-validation.
    """
    search_space = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1.0, log=True),
    }

    # NOTE(review): the base tree has no depth limit (a max_depth search was
    # previously commented out here) - confirm that boosting unbounded trees
    # is intended.
    booster = AdaBoostClassifier(
        estimator=DecisionTreeClassifier(),
        random_state=2020158011,
        **search_space,
    )

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020158011)
    return cross_val_score(booster, X_train, y_train, cv=folds, scoring='roc_auc').mean()

# Maximise the cross-validated ROC AUC returned by `objective`. The sampler is
# seeded (same seed as the model and CV folds) so the sequence of suggested
# hyperparameters is reproducible from run to run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=2020158011),
)
study.optimize(objective, n_trials=50)

# Report the best trial's score and hyperparameters.
print("best trial : ")
trial = study.best_trial
print(f" AUC score : {trial.value}")
print(f" Params   : {trial.params}")

In [None]:
# Hyperparameters of the best trial found by the study.
best_params = study.best_trial.params
print(best_params)

In [None]:
# Rebuild the model with the same configuration that was scored during tuning.
# `best_params` only carries the values suggested inside `objective`
# (n_estimators, learning_rate), so the base estimator and the seed are passed
# explicitly; otherwise the tuned values would be applied to AdaBoost's own
# default base estimator and an unseeded model.
best_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(),
    random_state=2020158011,
    **best_params,
)
# Final fit uses every labelled row (training + validation).
best_model.fit(X, y)

## 테스트 데이터 전처리

In [None]:
# Load the test split and preview its first five rows.
test_df = pd.read_csv('test.csv')
test_df.head(n=5)

In [None]:
# Encode the yes/no columns of the test frame as 0/1, mirroring the training
# preprocessing.
for col in map_YN:
    # Skip columns that are already numeric: mapping 0/1 through the
    # yes/no dictionary again would turn every value into NaN, so this
    # guard makes the cell safe to re-run.
    if pd.api.types.is_numeric_dtype(test_df[col]):
        continue
    test_df[col] = test_df[col].map({
        'no' : 0,
        'yes' : 1
    })

In [None]:
# One-hot encode the same categorical columns that were encoded for training.
# NOTE(review): the test frame is encoded independently of the training frame,
# so its dummy columns may differ if a category is missing from either split -
# verify the columns match the training features before predicting.
test_df = pd.get_dummies(
    test_df,
    columns=['job', 'marital', 'education', 'contact', 'month', 'poutcome'],
)

In [None]:
# Column dtypes of the encoded test frame (dtypes belong to the columns, so
# they are the same for any row subset).
test_df.dtypes

In [None]:
# Align the test features with the training feature matrix: same columns in
# the same order. Dummy columns absent from the test split are filled with 0,
# and columns the model never saw during training are dropped.
X_test = test_df.drop(columns='id').reindex(columns=X.columns, fill_value=0)
y_test_pred = best_model.predict(X_test)

# NOTE(review): the model was tuned on ROC AUC; if the submission is scored on
# AUC as well, best_model.predict_proba(X_test)[:, 1] may be the better output
# than hard class labels - confirm against the competition rules.
submission = pd.DataFrame({
    'id' : test_df['id'],
    'y' : y_test_pred
})
# Write with a .csv extension so the file is recognised as CSV.
submission.to_csv('submission.csv',index=False)

In [None]:
# Load the sample submission and preview its first five rows to check the
# expected output format.
sub = pd.read_csv('sample_submission.csv')
sub.head()