# K-fold practice
## 1. Lib. dataload


In [49]:
import pandas as pd

df = pd.read_csv("kaggle_titanic.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1.0,0.0,3.0,male,22.0,1.0,0.0,7.25,S
1,2.0,1.0,1.0,female,38.0,1.0,0.0,71.2833,C
2,3.0,1.0,3.0,female,26.0,0.0,0.0,7.925,S
3,4.0,1.0,1.0,female,35.0,1.0,0.0,53.1,S
4,5.0,0.0,3.0,male,35.0,0.0,0.0,8.05,S


## 2. PreProcessing
- Noise 처리, Scaling, Labeling, One-hot Encoding ... 

In [50]:
# 결측값이 존재하는지?
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       2
dtype: int64

In [51]:
df["Embarked"].value_counts()

Embarked
S    782
C    212
Q     50
Name: count, dtype: int64

In [52]:
pd.get_dummies(df["Sex"]).head()

Unnamed: 0,female,male
0,False,True
1,True,False
2,True,False
3,True,False
4,False,True


In [53]:
pd.get_dummies(df["Embarked"]).head()

Unnamed: 0,C,Q,S
0,False,False,True
1,True,False,False
2,False,False,True
3,False,False,True
4,False,False,True


In [54]:
df["Embarked"].mode()

0    S
Name: Embarked, dtype: object

In [55]:
# 결측값 처리
def fill_na(df):
    df["Embarked"].fillna('S', inplace=True)
    return df

#불필요한 변수 삭제
def drop_features(df):
    remove_cols = ["PassengerId", "Sex", "Embarked"]
    df.drop(remove_cols, axis=1, inplace=True)
    return df

#categorical -> numeric : label or one-hot encoding
def format_features(df):
    
    tmp_df1 = pd.get_dummies(df["Sex"])
    tmp_df2= pd.get_dummies(df["Embarked"])

    df["female"] = tmp_df1["female"]
    df["male"] = tmp_df1["male"]
    
    df["C"] = tmp_df2["C"]
    df["Q"] = tmp_df2["Q"]
    df["S"] = tmp_df2["S"]

    return df

In [56]:
# 전처리 기능을 한번에 수행하기 위한 함수
def transform_features(df):
    df = fill_na(df)
    df = format_features(df)
    df = drop_features(df)
    return df

In [57]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1.0,0.0,3.0,male,22.0,1.0,0.0,7.25,S
1,2.0,1.0,1.0,female,38.0,1.0,0.0,71.2833,C
2,3.0,1.0,3.0,female,26.0,0.0,0.0,7.925,S
3,4.0,1.0,1.0,female,35.0,1.0,0.0,53.1,S
4,5.0,0.0,3.0,male,35.0,0.0,0.0,8.05,S


In [58]:
y = df["Survived"]
X = df.drop("Survived", axis=1)

X = transform_features(X)
X.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Embarked"].fillna('S', inplace=True)


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S
0,3.0,22.0,1.0,0.0,7.25,False,True,False,False,True
1,1.0,38.0,1.0,0.0,71.2833,True,False,True,False,False
2,3.0,26.0,0.0,0.0,7.925,True,False,False,False,True
3,1.0,35.0,1.0,0.0,53.1,True,False,False,False,True
4,3.0,35.0,0.0,0.0,8.05,False,True,False,False,True


In [59]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12)
x_train.shape, x_test.shape

((732, 10), (314, 10))

## 3. EDA ==> insight(생략)
- 예, 가설 : 돈이 세상에서 제일 좋다 --> 선실등급이 1등급인 경우, 생존확률이 높다


## 4. Model 생성
- 어떤 고객은 생존했는가? 사망했는가? 
- model : decistion tree, logistic regression, random forest, xgbclassifier

In [60]:
# import lib.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# 모델생성
dt = DecisionTreeClassifier(random_state=12)
rf = RandomForestClassifier(random_state=12)
lr = LogisticRegression(random_state=12, solver="lbfgs", max_iter=500)

# 모델학습
dt.fit(x_train, y_train)
rf.fit(x_train, y_train)
lr.fit(x_train, y_train)

# 예측
dt_y_pred = dt.predict(x_test)
rf_y_pred = rf.predict(x_test)
lr_y_pred = lr.predict(x_test)

## 5. 모델검증
- classification report를 출력

In [61]:
from sklearn.metrics import classification_report, accuracy_score
print("DT", classification_report(y_test, dt_y_pred))
print("RF", classification_report(y_test, rf_y_pred))
print("LR", classification_report(y_test, lr_y_pred))

print("DT", accuracy_score(y_test, dt_y_pred), 
      "RF", accuracy_score(y_test, rf_y_pred), 
      "LR", accuracy_score(y_test, lr_y_pred))

DT               precision    recall  f1-score   support

         0.0       0.87      0.85      0.86       188
         1.0       0.78      0.81      0.80       126

    accuracy                           0.83       314
   macro avg       0.83      0.83      0.83       314
weighted avg       0.84      0.83      0.83       314

RF               precision    recall  f1-score   support

         0.0       0.87      0.87      0.87       188
         1.0       0.81      0.81      0.81       126

    accuracy                           0.85       314
   macro avg       0.84      0.84      0.84       314
weighted avg       0.85      0.85      0.85       314

LR               precision    recall  f1-score   support

         0.0       0.89      0.88      0.89       188
         1.0       0.83      0.83      0.83       126

    accuracy                           0.86       314
   macro avg       0.86      0.86      0.86       314
weighted avg       0.86      0.86      0.86       314

DT 0.83439

## 6. K-Fold 를 적용시켜서 학습

In [62]:
from sklearn.model_selection import KFold

In [63]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1.0,0.0,3.0,male,22.0,1.0,0.0,7.25,S
1,2.0,1.0,1.0,female,38.0,1.0,0.0,71.2833,C
2,3.0,1.0,3.0,female,26.0,0.0,0.0,7.925,S
3,4.0,1.0,1.0,female,35.0,1.0,0.0,53.1,S
4,5.0,0.0,3.0,male,35.0,0.0,0.0,8.05,S


In [64]:
import numpy as np

In [65]:
indexs = [1, 2]
indexs = np.array(indexs)
print(indexs)
df.values[indexs]

[1 2]


array([[2.0, 1.0, 1.0, 'female', 38.0, 1.0, 0.0, 71.2833, 'C'],
       [3.0, 1.0, 3.0, 'female', 26.0, 0.0, 0.0, 7.925, 'S']],
      dtype=object)

In [68]:
transform_df = df.copy()

In [69]:
transform_df = transform_features(transform_df)

x_trans_df = transform_df.loc[:,"Pclass":"S"]
y_trans_df = transform_df["Survived"]
x_trans_df.shape, y_trans_df.shape

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Embarked"].fillna('S', inplace=True)


((1046, 10), (1046,))

In [70]:
x_trans_df.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S
0,3.0,22.0,1.0,0.0,7.25,False,True,False,False,True
1,1.0,38.0,1.0,0.0,71.2833,True,False,True,False,False
2,3.0,26.0,0.0,0.0,7.925,True,False,False,False,True
3,1.0,35.0,1.0,0.0,53.1,True,False,False,False,True
4,3.0,35.0,0.0,0.0,8.05,False,True,False,False,True


In [78]:
def exec_kfold(model, nFold=5):

    kfold = KFold(n_splits=nFold)
    acc_scores = list()

    for iter_count, (train_index, test_index)in enumerate(kfold.split(x_trans_df)):
        # print(iter_count, train_index)
        x_train, x_test = x_trans_df.values[train_index], x_trans_df.values[test_index]
        y_train, y_test = y_trans_df.values[train_index], y_trans_df.values[test_index]

        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)

        acc = accuracy_score(y_test, y_pred)
        acc_scores.append(acc)
        print(f"iter_count :{iter_count}, acc :{acc}")

    mean_acc_score = np.mean(acc_scores)
    print(f"Final Acc : {mean_acc_score}")

In [79]:
df = DecisionTreeClassifier(random_state=12)
exec_kfold(df, 10)

iter_count :0, acc :0.780952380952381
iter_count :1, acc :0.7714285714285715
iter_count :2, acc :0.8095238095238095
iter_count :3, acc :0.7047619047619048
iter_count :4, acc :0.7714285714285715
iter_count :5, acc :0.780952380952381
iter_count :6, acc :0.8269230769230769
iter_count :7, acc :0.8557692307692307
iter_count :8, acc :0.8653846153846154
iter_count :9, acc :0.8653846153846154
Final Acc : 0.8032509157509157


In [80]:
dt = DecisionTreeClassifier(random_state=12)
rf = RandomForestClassifier(random_state=12)
lr = LogisticRegression(random_state=12, solver="lbfgs", max_iter=500)

models = [dt, rf, lr]

for m in models:
    exec_kfold(m, 10)

iter_count :0, acc :0.780952380952381
iter_count :1, acc :0.7714285714285715
iter_count :2, acc :0.8095238095238095
iter_count :3, acc :0.7047619047619048
iter_count :4, acc :0.7714285714285715
iter_count :5, acc :0.780952380952381
iter_count :6, acc :0.8269230769230769
iter_count :7, acc :0.8557692307692307
iter_count :8, acc :0.8653846153846154
iter_count :9, acc :0.8653846153846154
Final Acc : 0.8032509157509157
iter_count :0, acc :0.819047619047619
iter_count :1, acc :0.7904761904761904
iter_count :2, acc :0.8476190476190476
iter_count :3, acc :0.7619047619047619
iter_count :4, acc :0.7619047619047619
iter_count :5, acc :0.8
iter_count :6, acc :0.8557692307692307
iter_count :7, acc :0.9134615384615384
iter_count :8, acc :0.9038461538461539
iter_count :9, acc :0.9230769230769231
Final Acc : 0.8377106227106227
iter_count :0, acc :0.7714285714285715
iter_count :1, acc :0.819047619047619
iter_count :2, acc :0.8095238095238095
iter_count :3, acc :0.7523809523809524
iter_count :4, acc :0

## 7. k-fold 구현 -> cross_val_score() 사용

In [5]:
# K Fold Validation을 위해서 lib.
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
import numpy as np
import pandas as pd

In [2]:
# load data
iris = load_iris()
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [3]:
data = iris["data"]
label = iris["target"]

In [6]:
df = pd.DataFrame(data=data, columns=iris["feature_names"])
df["label"] = label
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [13]:
# 모델 생성
dt = DecisionTreeClassifier(random_state=47)

In [7]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2, random_state=47)
train.shape, test.shape

((120, 5), (30, 5))

In [9]:
train.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
108,6.7,2.5,5.8,1.8,2
91,6.1,3.0,4.6,1.4,1
107,7.3,2.9,6.3,1.8,2
46,5.1,3.8,1.6,0.2,0
138,6.0,3.0,4.8,1.8,2


In [11]:
train.loc[:,"sepal length (cm)":"petal width (cm)"].head(1)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
108,6.7,2.5,5.8,1.8


In [14]:
scores = cross_val_score(estimator=dt,
                         X=train.loc[:,"sepal length (cm)":"petal width (cm)"], 
                         y=train["label"],
                         scoring="accuracy",
                         cv=3)

In [15]:
scores

array([0.95 , 0.925, 0.95 ])

In [None]:
y_pred = dt.predict(test.loc[:,"sepal length (cm)":"petal width (cm)"])

## GridSearchCV
- 위의 cross_val_score의 단점을 보완

In [17]:
from sklearn.model_selection import GridSearchCV

In [26]:
x_train, x_test, y_train, y_test = train_test_split(iris["data"],
                                                    iris["target"],
                                                    test_size=0.2,
                                                    random_state=47)

In [27]:
#모델
dt = DecisionTreeClassifier(random_state=47)

In [28]:
h_params = {
    "max_depth":[1, 2, 3],
    "min_samples_split":[2, 3]
}

In [31]:
#GV 적용
grid_dt = GridSearchCV(estimator=dt,
                param_grid=h_params,
                cv=3,
                refit=True
                )

In [33]:
grid_dt.fit(x_train, y_train)

In [37]:
grid_dt.best_params_

{'max_depth': 2, 'min_samples_split': 2}

In [38]:
grid_dt.best_score_

np.float64(0.9333333333333335)

In [39]:
opt_dt_model = grid_dt.best_estimator_

In [42]:
# 예측과 검증
from sklearn.metrics import classification_report

y_pred = opt_dt_model.predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       0.88      1.00      0.93         7
           2       1.00      0.92      0.96        12

    accuracy                           0.97        30
   macro avg       0.96      0.97      0.96        30
weighted avg       0.97      0.97      0.97        30

