In [19]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Perceptron
from sklearn.model_selection import KFold

# 1. 데이터셋 연결, 결측값 확인

In [20]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')
combine = [train_df, test_df] # 데이터 프레임 두개를 한번에 저장한 리스트 타입의 변수 선언

# preview the data
train_df.info()
print('_'*40)
test_df.info()
print('_'*40)
print(combine)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
________________________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Passenger

In [21]:
combine = [train_df, test_df]

In [22]:
# train의 데이터셋은 11개, 테스트 데이터셋은 10개임 -> test 데이터셋에는 survived 피처가 없음
print("After", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)

After (891, 12) (418, 11) (891, 12) (418, 11)


# 해당 인물에 특정한 문자열이 있다면 추출 -> 성별과 함께 묶어 비율 확인

In [23]:
# 이름에서 문자열 추출
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
    print(*combine)
    print('*'*40)

#combine[0] -> train_df , combine[1] -> test_df 

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

# 정규 표현식
[A-Za-z] : 모든 알파벳
\. : \을 포함한 모든 문자
-> 문자에서 "," 와 "." 사이 문자열을 추출함 

In [24]:
# 이름에서 추출한 특정 문자열을 기반으로 성별, 기혼 여부에 따른 생존률 확인
for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt',\
                                            'Col','Don', 'Dr', 'Major', \
                                            'Rev', 'Sir', 'Jonkheer', 'Dona'],\
                                            'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    
train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.156673
3,Mrs,0.793651
4,Rare,0.347826


In [25]:
pd.crosstab(train_df['Title'], train_df['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Master,0,40
Miss,185,0
Mr,0,517
Mrs,126,0
Rare,3,20


In [26]:
#원핫 인코딩
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    #dataset['Title'] = dataset['Title'].fillna(0)

# 성별 인코딩
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

In [27]:
train_df['Title'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 891 entries, 0 to 890
Series name: Title
Non-Null Count  Dtype
--------------  -----
891 non-null    int64
dtypes: int64(1)
memory usage: 7.1 KB


In [28]:
test_df['Title'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 418 entries, 0 to 417
Series name: Title
Non-Null Count  Dtype
--------------  -----
418 non-null    int64
dtypes: int64(1)
memory usage: 3.4 KB


# 이름에서 성별,혼인 여부 추출 완료 -> 이름과 승객아이디는 필요 없는 데이터는 아래에서 drop 진행

In [29]:
# 새로운 combine 실행
combine = [train_df, test_df]
train_df.shape, test_df.shape

((891, 13), (418, 12))

In [30]:
# 선실 등급을 행으로, 성별을 열로 하는 2차원 배열 생성
guess_ages = np.zeros((2,3))
guess_ages

array([[0., 0., 0.],
       [0., 0., 0.]])

In [31]:
# 이게 뭐지? -> 특정 성별, 클래스의 평균 나이를 구함 -> 구간에 해당하는 결측값을 채워넣음
# print(combine)

for dataset in combine:
    for i in range(0, 2):
        for j in range(0, 3):
            guess_df = dataset[(dataset['Sex'] == i) & (dataset['Pclass'] == j+1)]['Age'].dropna()
            # dataset['Sex'] == i) & (dataset['Pclass'] == j+1) == true 인 애트리뷰트 중 결측값이 있으면 결측값 삭제
            print("guess_df:")
            print(guess_df)
            print('*'*40)
            print(guess_ages)
            print('*'*40)
            # age_guess = rnd.uniform(age_mean - age_std, age_mean + age_std) 
            
           
            age_guess = guess_df.median()
            # Convert random age float to nearest .5 age
            guess_ages[i,j] = int(age_guess/0.5 + 0.5 ) * 0.5
            print(int(age_guess/0.5 + 0.5 ) * 0.5)
            #guess_ages_pd = pd.DataFrame(guess_ages)

# 탑승실의 등급에 따른 여성, 남성의 평균 나이 구함 -> 2번 진행 -> 왜?

guess_df:
6      54.0
23     28.0
27     19.0
30     40.0
34     28.0
       ... 
822    38.0
857    51.0
867    31.0
872    33.0
889    26.0
Name: Age, Length: 101, dtype: float64
****************************************
[[0. 0. 0.]
 [0. 0. 0.]]
****************************************
40.0
guess_df:
20     35.0
21     34.0
33     66.0
70     32.0
72     21.0
       ... 
848    28.0
861    21.0
864    24.0
883    28.0
886    27.0
Name: Age, Length: 99, dtype: float64
****************************************
[[40.  0.  0.]
 [ 0.  0.  0.]]
****************************************
30.0
guess_df:
0      22.0
4      35.0
7       2.0
12     20.0
13     39.0
       ... 
876    20.0
877    19.0
881    33.0
884    25.0
890    32.0
Name: Age, Length: 253, dtype: float64
****************************************
[[40. 30.  0.]
 [ 0.  0.  0.]]
****************************************
25.0
guess_df:
1      38.0
3      35.0
11     58.0
52     49.0
61     38.0
       ... 
856    45.0
862    48.0
871 

In [32]:
for dataset in combine:
    for i in range(0, 2):
        for j in range(0, 3):
            dataset.loc[ (dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j+1),'Age'] = guess_ages[i,j]

    dataset['Age'] = dataset['Age'].astype(int)

In [33]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",0,22,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38,1,0,PC 17599,71.2833,C85,C,3
2,3,1,3,"Heikkinen, Miss. Laina",1,26,0,0,STON/O2. 3101282,7.925,,S,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35,1,0,113803,53.1,C123,S,3
4,5,0,3,"Allen, Mr. William Henry",0,35,0,0,373450,8.05,,S,1


# age 피처를 범주형으로 변환
연속형인 나이 피처를 일정한 범위로 나누어 단순한 숫자형 데이터로 변환

In [34]:
# train_df의 전체 구간을 5개로 나눔
train_df['AgeBand'] = pd.cut(train_df['Age'], 5)

# 나눈 5개 구간의 평균 생존률을 분석
train_df[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)

Unnamed: 0,AgeBand,Survived
0,"(-0.08, 16.0]",0.55
1,"(16.0, 32.0]",0.337374
2,"(32.0, 48.0]",0.412037
3,"(48.0, 64.0]",0.434783
4,"(64.0, 80.0]",0.090909


In [35]:
# 나눈 5개의 구간을 원핫 인코딩
for dataset in combine:    
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age']

train_df = train_df.drop(['AgeBand'], axis=1)
combine = [train_df, test_df]
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",0,1,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,2,1,0,PC 17599,71.2833,C85,C,3
2,3,1,3,"Heikkinen, Miss. Laina",1,1,0,0,STON/O2. 3101282,7.925,,S,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,2,1,0,113803,53.1,C123,S,3
4,5,0,3,"Allen, Mr. William Henry",0,2,0,0,373450,8.05,,S,1


# 동승 부모/자식의 인원수, 동승 형제/배우자의 인원수를 합한 새로운 피처 생성

In [36]:
for dataset in combine:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1 # 한 가족의 총 인원수
    
#Family Size에 대한 생존률을 그룹화, 평균값 print -> 생존률이 가장 높은 순으로 출력
train_df[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,FamilySize,Survived
3,4,0.724138
2,3,0.578431
1,2,0.552795
6,7,0.333333
0,1,0.303538
4,5,0.2
5,6,0.136364
7,8,0.0
8,11,0.0


In [37]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int64  
 5   Age          891 non-null    int64  
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
 12  Title        891 non-null    int64  
 13  FamilySize   891 non-null    int64  
dtypes: float64(1), int64(9), object(4)
memory usage: 97.6+ KB


In [19]:
# 1인 탑승객 처리
for dataset in combine:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

train_df[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()

Unnamed: 0,IsAlone,Survived
0,0,0.50565
1,1,0.303538


In [20]:
for dataset in combine:
    dataset['Age*Class'] = dataset.Age * dataset.Pclass

train_df.loc[:, ['Age*Class', 'Age', 'Pclass']].head(10)


Unnamed: 0,Age*Class,Age,Pclass
0,3,1,3
1,2,2,1
2,3,1,3
3,2,2,1
4,6,2,3
5,3,1,3
6,3,3,1
7,0,0,3
8,3,1,3
9,0,0,2


# embarked 데이터의 결측치 2건
가장 빈도수가 높은 S로 채워넣음

In [21]:
freq_port = train_df.Embarked.dropna().mode()[0]
# mode() : 최빈값

for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)
    
train_df[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.339009


In [22]:
# 결측치 replace 후 원핫 인코딩
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FamilySize,IsAlone,Age*Class
0,1,0,3,"Braund, Mr. Owen Harris",0,1,1,0,A/5 21171,7.25,,0,1,2,0,3
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,2,1,0,PC 17599,71.2833,C85,1,3,2,0,2
2,3,1,3,"Heikkinen, Miss. Laina",1,1,0,0,STON/O2. 3101282,7.925,,0,2,1,1,3
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,2,1,0,113803,53.1,C123,0,3,2,0,2
4,5,0,3,"Allen, Mr. William Henry",0,2,0,0,373450,8.05,,0,1,1,1,6


# fare 피처의 결측값 처리
티켓의 가격은 선실의 등급으로 대체 가능한 데이터 -> drop 할수는 없으니, 결측값은 중앙값으로 처리

In [23]:
test_df['Fare'].fillna(test_df['Fare'].dropna().median(), inplace=True)
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FamilySize,IsAlone,Age*Class
0,892,3,"Kelly, Mr. James",0,2,0,0,330911,7.8292,,2,1,1,1,6
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,2,1,0,363272,7.0,,0,3,2,0,6
2,894,2,"Myles, Mr. Thomas Francis",0,3,0,0,240276,9.6875,,2,1,1,1,6
3,895,3,"Wirz, Mr. Albert",0,1,0,0,315154,8.6625,,0,1,1,1,3
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,1,1,1,3101298,12.2875,,0,3,3,0,3


In [24]:
# fare 범주 생성
train_df['FareBand'] = pd.qcut(train_df['Fare'], 4)
train_df[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True)

Unnamed: 0,FareBand,Survived
0,"(-0.001, 7.91]",0.197309
1,"(7.91, 14.454]",0.303571
2,"(14.454, 31.0]",0.454955
3,"(31.0, 512.329]",0.581081


In [25]:
# 인코딩
for dataset in combine:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

train_df = train_df.drop(['FareBand'], axis=1)
combine = [train_df, test_df]
    
train_df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FamilySize,IsAlone,Age*Class
0,1,0,3,"Braund, Mr. Owen Harris",0,1,1,0,A/5 21171,0,,0,1,2,0,3
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,2,1,0,PC 17599,3,C85,1,3,2,0,2
2,3,1,3,"Heikkinen, Miss. Laina",1,1,0,0,STON/O2. 3101282,1,,0,2,1,1,3
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,2,1,0,113803,3,C123,0,3,2,0,2
4,5,0,3,"Allen, Mr. William Henry",0,2,0,0,373450,1,,0,1,1,1,6
5,6,0,3,"Moran, Mr. James",0,1,0,0,330877,1,,2,1,1,1,3
6,7,0,1,"McCarthy, Mr. Timothy J",0,3,0,0,17463,3,E46,0,1,1,1,3
7,8,0,3,"Palsson, Master. Gosta Leonard",0,0,3,1,349909,2,,0,4,5,0,0
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,1,0,2,347742,1,,0,3,3,0,3
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",1,0,1,0,237736,2,,1,3,2,0,0


# 모든 데이터의 범주화, 원앤핫 인코딩 완료
이전 데이터보다 조금 더 간결해짐 -> 입력값으로 사용하기에 이전의 데이터보다 원활함

# 합치고 남은 데이터셋은 drop
합쳐져 원핫인코딩 되어 새로 만들어진 피처의 재료 피처들은 drop

In [26]:
# drop 만 모아놓음
train_df = train_df.drop(['Name','Ticket', 'Cabin','Parch', 'SibSp', 'FamilySize'], axis=1)
test_df = test_df.drop(['Name','Ticket', 'Cabin','Parch', 'SibSp', 'FamilySize'], axis=1)

combine = [train_df, test_df]

# 학습용, 테스트용 데이터 나누기

In [27]:
from sklearn.model_selection import train_test_split

# y_train 구성
y_train_df = train_df['Survived']
# x_train 을 위한  survived 값 드롭
X_train_df= train_df.drop('Survived',axis=1)

X_train, X_test, y_train, y_test=train_test_split(X_train_df, y_train_df, \
                                                  test_size=0.2, random_state=11)

In [28]:
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone,Age*Class
0,1,0,3,0,1,0,0,1,0,3
1,2,1,1,1,2,3,1,3,0,2
2,3,1,3,1,1,1,0,2,1,3
3,4,1,1,1,2,3,0,3,0,2
4,5,0,3,0,2,1,0,1,1,6
...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,0,1,1,0,5,1,2
887,888,1,1,1,1,2,0,2,1,1
888,889,0,3,1,1,2,0,2,0,3
889,890,1,1,0,1,2,1,1,1,1


In [29]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone,Age*Class
0,1,0,3,0,1,0,0,1,0,3
1,2,1,1,1,2,3,1,3,0,2
2,3,1,3,1,1,1,0,2,1,3
3,4,1,1,1,2,3,0,3,0,2
4,5,0,3,0,2,1,0,1,1,6


In [30]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone,Age*Class
0,892,3,0,2,0,2,1,1,6
1,893,3,1,2,0,0,3,0,6
2,894,2,0,3,1,2,1,1,6
3,895,3,0,1,1,0,1,1,3
4,896,3,1,1,1,0,3,0,3


In [42]:
# y_train 구성
Pid_df = test_df['PassengerId']
# x_train 을 위한  survived 값 드롭
X_test_df= test_df

# 각 데이터셋의 피처 개수가 동일한 것을 확인

In [43]:
X_train.info()
print("*"*40)
X_test.info()
print("*"*40)
y_train.info()
print("*"*40)
y_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 333 to 703
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  712 non-null    int64
 1   Pclass       712 non-null    int64
 2   Sex          712 non-null    int64
 3   Age          712 non-null    int64
 4   Fare         712 non-null    int64
 5   Embarked     712 non-null    int64
 6   Title        712 non-null    int64
 7   IsAlone      712 non-null    int64
 8   Age*Class    712 non-null    int64
dtypes: int64(9)
memory usage: 55.6 KB
****************************************
<class 'pandas.core.frame.DataFrame'>
Int64Index: 179 entries, 431 to 484
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  179 non-null    int64
 1   Pclass       179 non-null    int64
 2   Sex          179 non-null    int64
 3   Age          179 non-null    int64
 4   Fare         179 non-nu

# 마찬가지로 train set과 피처의 개수가 같은것을 확인

In [44]:
X_test_df.info()
print("*"*40)
Pid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   Pclass       418 non-null    int64
 2   Sex          418 non-null    int64
 3   Age          418 non-null    int64
 4   Fare         418 non-null    int64
 5   Embarked     418 non-null    int64
 6   Title        418 non-null    int64
 7   IsAlone      418 non-null    int64
 8   Age*Class    418 non-null    int64
dtypes: int64(9)
memory usage: 29.5 KB
****************************************
<class 'pandas.core.series.Series'>
RangeIndex: 418 entries, 0 to 417
Series name: PassengerId
Non-Null Count  Dtype
--------------  -----
418 non-null    int64
dtypes: int64(1)
memory usage: 3.4 KB


# Logistic Regression 
 로지스틱 선형분석은 범주화된 변수와 하나 혹은 그 이상의 독립적 변수들의 상관과녜를 분석, 사건이 일어날 확률을 예측한다

In [45]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
# 왜 test 데이터는 x_train 인건지?
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_test, y_test) * 100, 2) # accuracy percentage
acc_log

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


83.8

# 결정트리

In [46]:
# Decision Tree

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree_test = round(decision_tree.score(X_test, y_test) * 100, 2)
acc_decision_tree_test

78.77

# 랜덤포레스트

In [47]:
# Random Forest

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, y_train)
acc_random_forest = round(random_forest.score(X_test, y_test) * 100, 2)
acc_random_forest

82.12

# 퍼셉트론

In [48]:
# Perceptron

perceptron = Perceptron()
perceptron.fit(X_train, y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_test, y_test) * 100, 2)
acc_perceptron

36.31

# XGboost

In [49]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

xgboost = xgb.XGBClassifier(objective='binary:logistic',n_estimators=20, seed=123)
xgboost.fit(X_train, y_train)
Y_pred = xgboost.predict(X_test)
acc_xgboost = round(xgboost.score(X_test,y_test) * 100, 2)
acc_xgboost

86.59

# 모델 평가

In [50]:
models = pd.DataFrame({
    'Model': ['Logistic Regression', 
              'Random Forest', 'Decision Tree','Perceptron','XGBoost'],
    'Score': [acc_log, acc_random_forest, acc_decision_tree_test,acc_perceptron,acc_xgboost]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
4,XGBoost,86.59
0,Logistic Regression,83.8
1,Random Forest,82.12
2,Decision Tree,78.77
3,Perceptron,36.31


가장 적합한 모델은 랜덤 포레스트/결정 트리 모델로 판단 됨

In [51]:
Y_pred = random_forest.predict(X_test_df)
submission = pd.DataFrame({
        "PassengerId": Pid_df,
        "Survived": Y_pred
    })
submission.to_csv('submission.csv', index=False)

# K-fold 교차 검증
과적합의 우려가 있음 -> 교차 검증으로 과적합 교정

In [119]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# K fold 를 위한 데이터 분할
index_train_df=train_df.drop(['Survived'], axis=1)
y = train_df['Survived']

In [120]:
X = np.array(index_train_df.iloc[:, :]) # survived 제외한 피처 전부 열들 모음

In [121]:
print(X)

[[  1   3   0 ...   1   0   3]
 [  2   1   1 ...   3   0   2]
 [  3   3   1 ...   2   1   3]
 ...
 [889   3   1 ...   2   0   3]
 [890   1   0 ...   1   1   1]
 [891   3   0 ...   1   1   3]]


In [122]:
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [123]:
kf = KFold(n_splits = 5, shuffle = True)
accuracy_history = []

# K-fold 검증 과정으로 실제 랜덤 포레스트 모델을 학습하여 정확도 평균을 내는 방법
def kFold(clf):
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        # random_forest.fit(X_train, y_train) # 모델 학습
        y_pred = clf.predict(X_test) # 예측 라벨
        
        accuracy_history.append(accuracy_score(y_pred, y_test)) # 정확도 측정 및 기록
    
    print("각 분할의 정확도 기록 :", accuracy_history)
    print("평균 정확도 :", np.mean(accuracy_history))

In [124]:
kFold(decision_tree)
kFold(random_forest)
kFold(perceptron)
kFold(xgboost)

각 분할의 정확도 기록 : [0.9608938547486033, 0.9606741573033708, 0.9438202247191011, 0.9606741573033708, 0.9606741573033708]
평균 정확도 : 0.9573473102755635


ValueError: X has 9 features, but DecisionTreeClassifier is expecting 10 features as input.

In [102]:
# y_train 구성
y_train_df = train_df['Survived']
# x_train 을 위한  survived 값 드롭
X_train_df= train_df.drop('Survived',axis=1)

X_train, X_test, y_train, y_test=train_test_split(X_train_df, y_train_df, \
                                                  test_size=0.2, random_state=11)

from sklearn.model_selection import cross_val_score

basic_scores = cross_val_score(random_forest, X_test, y_test, cv = 5)
basic_scores

# 하이퍼 파라미터 튜닝 (GridSearchCV)

print("하이퍼 파라미터 튜닝 적용 이전:", random_forest.score(X_train, y_train))

# Random Forest
params = {
    'n_estimators':[100,150,200,250],
    'max_depth':[6, 8, 10, 12, 16, 20],
    'min_samples_leaf':[8, 12, 16, 20],
    'min_samples_split':[8, 16, 20]
}

rf_clf = RandomForestClassifier(random_state = 0, n_jobs = -1)
grid_cv = GridSearchCV(rf_clf, param_grid = params, cv = 2, n_jobs = -1)
grid_cv.fit(X_train, y_train)

grid_cv.score(X_train, y_train)
print('최적의 하이퍼 파라미터:\n',grid_cv.best_params_)
print('최고 예측 정확도:{0:.4f}'.format(grid_cv.best_score_))

#gridsearchCV 말고 k-fold 검증이 더 적합함