# K-fold practice
## 1. Lib. dataload


In [19]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [24]:
df = pd.read_csv("kaggle_titanic.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1.0,0.0,3.0,male,22.0,1.0,0.0,7.25,S
1,2.0,1.0,1.0,female,38.0,1.0,0.0,71.2833,C
2,3.0,1.0,3.0,female,26.0,0.0,0.0,7.925,S
3,4.0,1.0,1.0,female,35.0,1.0,0.0,53.1,S
4,5.0,0.0,3.0,male,35.0,0.0,0.0,8.05,S


## 2. PreProcessing
- Noise 처리, Scaling, Labeling, One-hot Encoding ... 

In [25]:
#결측값이 존재하는지 확인

df.isnull().sum()

#Embarked가 2개 결측값이 존재함을 확인

#변수 -> Parch 부모님과 같이탔는지여부
#SibSp -> 친척 1촌이상과 같이 탔는지
#Embarked -> 내가 어디서 탔는가, 항구의 이니셜 3곳임

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       2
dtype: int64

In [26]:
df["Embarked"].value_counts()

# S로시작되는항구, C로시작되는항구, Q로 시작되는 항구 확인

# 수치형이 아닌 문자로일단 보이는거는 Sex, Embarked 두가지가 보임
# male과 female은 명목변수 -> 이름이라는상징적인 의미밖에없고, 순위를 매길수없음 -> 원핫인코딩
# Embarked도 항구에 순위를 매겨서 의미가 없음 -> 따라서 얘네도 원핫인코딩

## 라벨인코딩을 하는경우 -> 순위가 있을 때, 만약에 Pclass가 A, B, C로 되어있다하면은,(1,2,3이 아니라) 그러면 Pclass는 라벨인코딩을 해서 수치형으로 변환해야함
#PassengerId 변수같은것들은 모델에 넣어도 쓸모가없을것같다 그냥 ID일 뿐이니까

# 이렇게까지가 df로만봤을 때 눈으로 보여지는 데이터 정보들

Embarked
S    782
C    212
Q     50
Name: count, dtype: int64

In [31]:
df["Embarked"].mode()

0    S
Name: Embarked, dtype: object

In [50]:
#embarked에 2개있는 결측값 해결

def fill_na(df):
    df["Embarked"].fillna('S', inplace=True)   #그냥 결측값을 N으로 채워넣기 (원래는), 근데 어차피 데이터가 적고 결측값이 2개밖에없으니까 이건 3%도 해당이안되서
    return df                                  #그냥 S로해서 데이터를 합쳐버리는

#불필요한 변수를 삭제

def drop_features(df):
    remove_cols = ["PassengerId", "Sex", "Embarked"]
    df.drop(remove_cols, axis=1, inplace=True)   #axis=1은 컬럼을 의미하니까, ㅁ 에서 -> axis=1, 아래방향이 axis=0
    return df

# 문자 categorical 데이터를 -> numeric으로 변경을 해야함 -> 이런게 라벨인코딩을 사용하던지, one-hot 인코딩을 사용하던지


def format_features(df):
    
    tmp_df1 = pd.get_dummies(df["Sex"])  #이거는 판다스모듈에서 원핫인코딩을 해주는함수 female, False, male True 이런식으로 False True로 해도 0, 1이니까
    tmp_df2 = pd.get_dummies(df["Embarked"])
    
    df["Female"] = tmp_df1["female"]
    df["male"] = tmp_df1["male"]
    
    df["C"] = tmp_df2["C"]
    df["Q"] = tmp_df2["Q"]
    df["S"] = tmp_df2["S"]

    return df



In [47]:
pd.get_dummies(df["Sex"]).head()  ##한번씩 찍어보면서 확인

Unnamed: 0,female,male
0,False,True
1,True,False
2,True,False
3,True,False
4,False,True


In [51]:
#전처리 기능을 한번에 수행하기 위한 함수 (위에서 만든 def들)

def transform_features(df):
    df = fill_na(df)
    df = format_features(df)
    df = drop_features(df)

    return df

#이 펑션만 수행하면 위에 전처리과정이 일괄로 수행되니까 이렇게 구분함

#스케일링은 해도되고 안해도된다 => 무슨의미지?

In [52]:
y = df["Survived"]
x = df.drop("Survived", axis=1)    #서바이벌은 y에서 종속변수가 될꺼니까 뺴고 나머지 살리는

x = transform_features(x)
x.head()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Embarked"].fillna('S', inplace=True)   #그냥 결측값을 N으로 채워넣기 (원래는), 근데 어차피 데이터가 적고 결측값이 2개밖에없으니까 이건 3%도 해당이안되서


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Female,male,C,Q,S
0,3.0,22.0,1.0,0.0,7.25,False,True,False,False,True
1,1.0,38.0,1.0,0.0,71.2833,True,False,True,False,False
2,3.0,26.0,0.0,0.0,7.925,True,False,False,False,True
3,1.0,35.0,1.0,0.0,53.1,True,False,False,False,True
4,3.0,35.0,0.0,0.0,8.05,False,True,False,False,True


In [None]:
##위에 결과가 이렇게 나왔는데, 문제는 라벨링을 하고나서 sex와 embarked는 넣으면 안된다
#Sex을 가지고 female, male / embarked를 가지고 C, Q, S를 만들었으니까 중복이된거잖아
# 
# 그래서 위에코드에서 다시 drop_features여기 함수에 Sex랑 Embarked를 drop으로 추가함, 그리고나서 밑에 transform_features에서 드랍하는 순서를 조정해서 바꿔줌

In [55]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=27)
x_train.shape, x_test.shape

((732, 10), (314, 10))

## 3. EDA ==> insight(생략)
- 예, 가설 : 돈이 세상에서 제일 좋다 --> 선실등급이 1등급인 경우, 생존확률이 높다


## 4. Model 생성
- 어떤 고객은 생존했는가? 사망했는가? 
- model : decistion tree, logistic regression, random forest, xgbclassifier

In [60]:
# 모델 생성을 위해 라이브러리 임포트

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# 제일먼저 모델 생성

dt = DecisionTreeClassifier(random_state=27)     #다른거없이 그냥, 여기서는 데이터에 대한 유의성 검정이나 그런부분은 빠져있음, 여기서는 그냥 큰 맥락으로만
rf = RandomForestClassifier(random_state=27)     #다른 옵션은 안넣고 튜닝없이 그냥 하면 디폴트적으로 그냥 각각하는거임
lr = LogisticRegression(max_iter=500, random_state=27, solver="lbfgs")

# 모델을 생성했으면 이제 학습을 시켜야하니까 -> 학습은 그냥 fit만 넣어주면 됨
dt.fit(x_train, y_train)
rf.fit(x_train, y_train)
lr.fit(x_train, y_train)

#이제 예측을 해야함 y의 햇을 구해야지 성능 비교를 할 수 있으니까

dt_y_pred = dt.predict(x_test)        #x_test값을 넣어야지 성능비교를 할 수 있으니까
rf_y_pred = rf.predict(x_test)
lr_y_pred = lr.predict(x_test)



## 5. 모델검증
- classification report를 출력

In [62]:
from sklearn.metrics import classification_report, accuracy_score

print("dt:", classification_report(y_test, dt_y_pred))
print("rf:", classification_report(y_test, rf_y_pred))
print("lr:", classification_report(y_test, lr_y_pred))

#precision하고 recall을 보는데 1.0부분 -> 이거 확인필요

print("dt", accuracy_score(y_test, dt_y_pred),
      "rf", accuracy_score(y_test, rf_y_pred),
      "lr", accuracy_score(y_test, lr_y_pred),)

dt:               precision    recall  f1-score   support

         0.0       0.80      0.86      0.83       174
         1.0       0.81      0.74      0.77       140

    accuracy                           0.81       314
   macro avg       0.81      0.80      0.80       314
weighted avg       0.81      0.81      0.80       314

rf:               precision    recall  f1-score   support

         0.0       0.84      0.87      0.85       174
         1.0       0.83      0.79      0.81       140

    accuracy                           0.83       314
   macro avg       0.83      0.83      0.83       314
weighted avg       0.83      0.83      0.83       314

lr:               precision    recall  f1-score   support

         0.0       0.85      0.90      0.87       174
         1.0       0.86      0.81      0.83       140

    accuracy                           0.86       314
   macro avg       0.86      0.85      0.85       314
weighted avg       0.86      0.86      0.86       314

dt 0.80

## 6. K-Fold를 적용시켜서 학습

In [63]:
from sklearn.model_selection import KFold


In [66]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1.0,0.0,3.0,male,22.0,1.0,0.0,7.25,S
1,2.0,1.0,1.0,female,38.0,1.0,0.0,71.2833,C
2,3.0,1.0,3.0,female,26.0,0.0,0.0,7.925,S
3,4.0,1.0,1.0,female,35.0,1.0,0.0,53.1,S
4,5.0,0.0,3.0,male,35.0,0.0,0.0,8.05,S


In [71]:
import numpy as np

In [77]:
indexs = [1, 2]
indexs = np.array(indexs) #넘파이 어레이에 인덱스를 넣으면 가져오니까, 거기에 values를 가져오면 어레이의 값을 가져옴
print(indexs)
df.values[np.array([0,1])]

[1 2]


array([[1.0, 0.0, 3.0, 'male', 22.0, 1.0, 0.0, 7.25, 'S'],
       [2.0, 1.0, 1.0, 'female', 38.0, 1.0, 0.0, 71.2833, 'C']],
      dtype=object)

In [78]:
transform_df = df.copy()

In [79]:
transform_df = transform_features(transform_df)

x_trans_df = transform_df.loc[:,"Pclass":"S"]
y_trans_df = transform_df["Survived"]
x_trans_df.shape, y_trans_df.shape

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Embarked"].fillna('S', inplace=True)   #그냥 결측값을 N으로 채워넣기 (원래는), 근데 어차피 데이터가 적고 결측값이 2개밖에없으니까 이건 3%도 해당이안되서


((1046, 10), (1046,))

In [80]:
x_trans_df.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Female,male,C,Q,S
0,3.0,22.0,1.0,0.0,7.25,False,True,False,False,True
1,1.0,38.0,1.0,0.0,71.2833,True,False,True,False,False
2,3.0,26.0,0.0,0.0,7.925,True,False,False,False,True
3,1.0,35.0,1.0,0.0,53.1,True,False,False,False,True
4,3.0,35.0,0.0,0.0,8.05,False,True,False,False,True


In [84]:

def exec_kfold(model, nfold=5):

    kfold = KFold(n_splits=nfold)   #데이터셋을 설정하게되면은 5개의 폴드를 가질것이다 까지만
    acc_scores = list()


    for iter_count, (train_index, test_index) in enumerate(kfold.split(df)):
        # print(iter_count, test_index)
        x_train, x_test = x_trans_df.values[train_index], x_trans_df.values[test_index]
        y_train, y_test = y_trans_df.values[train_index], y_trans_df.values[test_index]

        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)

        acc = accuracy_score(y_test, y_pred)
        acc_scores.append(acc)
        print(f"iter_count: {iter_count}, acc: {acc}")

    mean_acc_score = np.mean(acc_scores)
    print(f"Final Acc: {mean_acc_score}")




In [86]:
df = DecisionTreeClassifier(random_state=27)
exec_kfold(df, 10)

TypeError: Singleton array array(DecisionTreeClassifier(random_state=27), dtype=object) cannot be considered a valid collection.

In [None]:
df = DecisionTreeClassifier(random_state=12)
exec_kfold(df, 10)

In [None]:
dt = DecisionTreeClassifier(random_state=12)
rf = RandomForestClassifier(random_state=12)
lr = LogisticRegression(random_state=12, solver="lbfgs", max_iter=500)

models = [dt, rf, lr]

for m in models:
    exec_kfold(m, 10)

## 7. k-fold 구현 -> cross_val_score() 사용

In [4]:
# k Fold validation을 위해서 lib.
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
import numpy as np
import pandas as pd

In [2]:
# load data

iris = load_iris()
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [3]:
data = iris["data"]
label = iris["target"]


In [6]:
df = pd.DataFrame(data=data, columns=iris["feature_names"])
df["label"] = label
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [14]:
#모델 생성
dt = DecisionTreeClassifier(random_state=27)


In [7]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2, random_state=27)
train.shape, test.shape

((120, 5), (30, 5))

In [8]:
train.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
123,6.3,2.7,4.9,1.8,2
40,5.0,3.5,1.3,0.3,0
111,6.4,2.7,5.3,1.9,2
97,6.2,2.9,4.3,1.3,1
86,6.7,3.1,4.7,1.5,1


In [12]:
train.loc[:,"sepal length (cm)":"petal width (cm)"].head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
123,6.3,2.7,4.9,1.8
40,5.0,3.5,1.3,0.3
111,6.4,2.7,5.3,1.9
97,6.2,2.9,4.3,1.3
86,6.7,3.1,4.7,1.5


In [16]:
scores = cross_val_score(estimator=dt,
                X=train.loc[:,"sepal length (cm)":"petal width (cm)"],
                y=train["label"],
                scoring="accuracy",
                cv=3)


In [17]:
scores

array([0.975, 1.   , 0.925])

## GridsearchCV
 - 위의 cross_val_score의 단점을 보완

In [18]:
from sklearn.model_selection import GridSearchCV

In [19]:
x_train, x_test, y_train, y_test = train_test_split(iris["data"], iris["target"], test_size=0.2, random_state=27)

In [20]:
#모델생성
dt = DecisionTreeClassifier(random_state=27)

In [21]:
h_params = {
    "max_depth":[1, 2, 3],
    "min_samples_split":[2, 3]
}

In [22]:
#GV 적용
grid_dt = GridSearchCV(estimator=dt,
                        param_grid=h_params,
                        cv=3,
                        refit=True)

In [23]:
grid_dt.fit(x_train, y_train)

In [24]:
grid_dt.best_params_

{'max_depth': 2, 'min_samples_split': 2}

In [25]:
grid_dt.best_score_

np.float64(0.975)

In [27]:
opt_dt_model = grid_dt.best_estimator_

In [29]:
#예측과 검증
from sklearn.metrics import classification_report

y_pred = opt_dt_model.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.83      0.91      0.87        11
           2       0.91      0.83      0.87        12

    accuracy                           0.90        30
   macro avg       0.91      0.91      0.91        30
weighted avg       0.90      0.90      0.90        30

