In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Perceptron
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [2]:
train_df = pd.read_csv('./train_scaling.csv')
test_df = pd.read_csv('./test_scaling.csv')
combine = [train_df, test_df] # 데이터 프레임 두개를 한번에 저장한 리스트 타입의 변수 선언

# preview the data
train_df.info()
print('_'*40)
test_df.info()
print('_'*40)
print(combine)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  891 non-null    int64
 1   Survived     891 non-null    int64
 2   Pclass       891 non-null    int64
 3   Sex          891 non-null    int64
 4   Age          891 non-null    int64
 5   Fare         891 non-null    int64
 6   Embarked     891 non-null    int64
 7   Title        891 non-null    int64
 8   IsAlone      891 non-null    int64
 9   Age*Class    891 non-null    int64
dtypes: int64(10)
memory usage: 69.7 KB
________________________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   Pclass       418 non-null    int64
 2   Sex          418 non-null    int64
 3   Age          418 non-null

In [3]:
# 제출용 파일 제작
Pid_df = test_df['PassengerId']

In [4]:
# K fold 를 위한 데이터 분할
index_train_df=train_df.drop(['Survived'], axis=1)
y = train_df['Survived']

In [5]:
# k fold는 배열로 이루어짐 -> 데이터 프레임을 배열로 변환
X = np.array(index_train_df.iloc[:, :]) # survived 제외한 피처 전부 열들 모음

In [6]:
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [7]:
# K-fold 검증 과정으로 실제 랜덤 포레스트 모델을 학습하여 정확도 평균을 내는 방법
def kFold(clf):
    kf = KFold(n_splits = 5, shuffle = True)
    accuracy_history = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        clf.fit(X_train, y_train) # 모델 학습
        y_pred = clf.predict(X_test) # 예측 라벨
        
        accuracy_history.append(accuracy_score(y_pred, y_test)) # 정확도 측정 및 기록
    
    print("사용한 모델 :", clf)
    print("각 분할의 정확도 기록 :", accuracy_history)
    print("평균 정확도 :", np.mean(accuracy_history))

In [8]:
logreg = LogisticRegression()
decision_tree = DecisionTreeClassifier()
random_forest = RandomForestClassifier()
perceptron = Perceptron()
xgboost = XGBClassifier()

In [9]:
kFold(logreg)
kFold(decision_tree)
kFold(random_forest)
kFold(perceptron)
kFold(xgboost)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

사용한 모델 : LogisticRegression()
각 분할의 정확도 기록 : [0.8268156424581006, 0.8146067415730337, 0.7359550561797753, 0.7640449438202247, 0.8146067415730337]
평균 정확도 : 0.7912058251208336
사용한 모델 : DecisionTreeClassifier()
각 분할의 정확도 기록 : [0.7821229050279329, 0.7584269662921348, 0.7808988764044944, 0.6629213483146067, 0.7134831460674157]
평균 정확도 : 0.7395706484213169
사용한 모델 : RandomForestClassifier()
각 분할의 정확도 기록 : [0.8100558659217877, 0.8314606741573034, 0.7528089887640449, 0.797752808988764, 0.7808988764044944]
평균 정확도 : 0.7945954428472788
사용한 모델 : Perceptron()
각 분할의 정확도 기록 : [0.6201117318435754, 0.5617977528089888, 0.6235955056179775, 0.38764044943820225, 0.6629213483146067]
평균 정확도 : 0.5712133576046702
사용한 모델 : XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_

In [10]:
Y_pred = logreg.predict(test_df)
submission = pd.DataFrame({
        "PassengerId": Pid_df,
        "Survived": Y_pred
    })
submission.to_csv('submission_preprocessing_crossVal_LogisticRegression.csv', index=False)

Y_pred = decision_tree.predict(test_df)
submission = pd.DataFrame({
        "PassengerId": Pid_df,
        "Survived": Y_pred
    })
submission.to_csv('submission_preprocessing_crossVal_DecisionTree.csv', index=False)


Y_pred = random_forest.predict(test_df)
submission = pd.DataFrame({
        "PassengerId": Pid_df,
        "Survived": Y_pred
    })
submission.to_csv('submission_preprocessing_crossVal_RandomForest.csv', index=False)


Y_pred = xgboost.predict(test_df)
submission = pd.DataFrame({
        "PassengerId": Pid_df,
        "Survived": Y_pred
    })
submission.to_csv('submission_preprocessing_crossVal_XGboost.csv', index=False)

# Submission 결과
- k fold 방식에서는 오히려 성능 저하가 일어났음
- 전부 classifier 모델이라 그럴수도?
- 그렇다면 stratified K fold는? -> 실험 진행

# cross_val_score 이용

In [11]:
scoring = 'accuracy'

def validation(clf):
    print(clf)
    score = cross_val_score(clf,X,y,scoring=scoring)
    print("교차 검증 정확도:", score)
    print("교차 검증 평균:", score.mean())

In [12]:
validation(logreg)
validation(decision_tree)
validation(random_forest)
validation(perceptron)
validation(xgboost)

LogisticRegression()
교차 검증 정확도: [0.77653631 0.82022472 0.81460674 0.78089888 0.78089888]
교차 검증 평균: 0.7946331052664617
DecisionTreeClassifier()
교차 검증 정확도: [0.67039106 0.7752809  0.61235955 0.79213483 0.79213483]
교차 검증 평균: 0.728460234762413
RandomForestClassifier()


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

교차 검증 정확도: [0.67597765 0.79213483 0.81460674 0.80337079 0.83146067]
교차 검증 평균: 0.78351013746783
Perceptron()
교차 검증 정확도: [0.62011173 0.61797753 0.61797753 0.61797753 0.39325843]
교차 검증 평균: 0.573460548615906
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
교차 검증 정확도: [0.61452514 0.75842697 0.8258427 

# logistic regression, RandomForest 가 가장 높은 정확도를 보임
-> XGBoost가 갈수록 높아지는 정확도를 보임 -> 왜인지 알아보기

In [13]:
# 제출 파일 생성 -> score 확인
# 그 전에 prediction 진행
from sklearn.model_selection import train_test_split

# 학습용, 테스트용 데이터 분할
# y_train 구성
y_train_df = train_df['Survived']
# x_train 을 위한  survived 값 드롭
X_train_df= train_df.drop('Survived',axis=1)

X_train, X_test, y_train, y_test=train_test_split(X_train_df, y_train_df, \
                                                  test_size=0.2, random_state=11)

In [14]:
#Logistic Regression
logreg.fit(X_train,y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_test, y_test) * 100, 2) # accuracy percentage
acc_log

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


83.8

In [15]:
# Random Forest

random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, y_train)
acc_random_forest = round(random_forest.score(X_test, y_test) * 100, 2)
acc_random_forest

83.8

In [19]:
Pid_df = test_df['PassengerId']
X_test_df = test_df

In [23]:
Y_pred = logreg.predict(X_test_df)
submission = pd.DataFrame({
        "PassengerId": Pid_df,
        "Survived": Y_pred
    })
submission.to_csv('submission_preprocessing_crossVal_LogisticRegression.csv', index=False)

Y_pred = random_forest.predict(X_test_df)
submission = pd.DataFrame({
        "PassengerId": Pid_df,
        "Survived": Y_pred
    })
submission.to_csv('submission_preprocessing_crossVal_randomForest.csv', index=False)
