In [1]:
import pandas as pd

In [2]:
train = pd.read_csv('titanic.csv')

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train.shape

(891, 12)

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## 데이터 전처리하기
1. 성별 수치화하기
2. 이름에서 연령대 추측할 수 있는 정보 찾기
3. 1번에서 찾은 정보 바탕으로 Age 결측값 채우기
4. Age 수치화하기
5. Fare 수치화하기
6. Cabin 수치화하기
7. Cabin 결측값 채우기
8. Embarked 결측값 채우고 수치화하기
9. FamilySize 구하고 수치화하기

In [7]:
# 1. 성별 수치화하기.

In [8]:
train.groupby('Sex')['Survived'].mean()

Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

In [9]:
# 위 결과를 살펴보면 여자의 경우가 훨씬 생존율이 높음을 알 수 있다.

In [10]:
train['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [11]:
# male: 0, female: 1로 mapping하기.

In [12]:
train['Sex2'] = train['Sex'].map({'male':0, 'female':1})

In [13]:
train['Sex2'].value_counts()

0    577
1    314
Name: Sex2, dtype: int64

In [14]:
# 2. 이름에서 연령대 추측할 수 있는 정보 찾기.

In [15]:
train['Name'].head()

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
Name: Name, dtype: object

In [16]:
train['Title'] = train['Name'].str.extract('([A-Za-z]+)\.')

In [17]:
train['Title'].value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Major         2
Col           2
Countess      1
Capt          1
Ms            1
Sir           1
Lady          1
Mme           1
Don           1
Jonkheer      1
Name: Title, dtype: int64

In [18]:
# Mr: 0, Miss:1, Mrs: 2, 나머지: 3으로 mapping하기.

In [19]:
def titleMap(title):
    if title == 'Mr':
        return 0
    elif title == 'Miss':
        return 1
    elif title == 'Mrs':
        return 2
    else :
        return 3

In [20]:
train['Title2'] = train['Title'].apply(titleMap)

In [21]:
train['Title2'].value_counts()

0    517
1    182
2    125
3     67
Name: Title2, dtype: int64

In [22]:
train.groupby('Title2')['Age'].median()

Title2
0    30.0
1    21.0
2    35.0
3     9.0
Name: Age, dtype: float64

In [23]:
# 아래의 코드는 모든 데이터를 변경하기 때문에 결측값만 채워야하는 현재의 상황에서는 맞지 않는다.
train.groupby('Title2')['Age'].transform('median')

0      30.0
1      35.0
2      21.0
3      35.0
4      30.0
       ... 
886     9.0
887    21.0
888    21.0
889    30.0
890    30.0
Name: Age, Length: 891, dtype: float64

In [24]:
# 3. 1번에서 찾은 정보 바탕으로 Age 결측값 채우기
train['Age'].fillna(train.groupby('Title2')['Age'].transform('median'), inplace=True)

In [25]:
train['Age'].isnull().sum()

0

## Age 수치화 기준
child(<=16) : 0<br>
young(16<age<=26) : 1<br>
adult(26<age<=36) : 2<br>
mid-age(36<age<=62) : 3<br>
senoir(age>62) : 4

In [26]:
# 4.Age 수치화하기

def ageMap(age):
    if age<=16:
        return 0
    elif 16<age<=26:
        return 1
    elif 26<age<=36:
        return 2
    elif 36<age<=62:
        return 3
    else :
        return 4

In [27]:
train['Age2'] = train['Age'].apply(ageMap)

In [28]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex2,Title,Title2,Age2
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,Mr,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,Mrs,2,3
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,Miss,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,Mrs,2,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,Mr,0,2


In [29]:
# 5. Fare 수치화하기

fare <= 17 : 0 <br>
17 < fare <=30 : 1 <br>
30 < fare <= 100 : 2 <br>
fare > 100 : 3

In [30]:
train['Fare']

0       7.2500
1      71.2833
2       7.9250
3      53.1000
4       8.0500
        ...   
886    13.0000
887    30.0000
888    23.4500
889    30.0000
890     7.7500
Name: Fare, Length: 891, dtype: float64

In [31]:
train['Fare'].isnull().sum()

0

In [32]:
def fareMap(fare):
    if fare <= 17:
        return 0
    elif 17 < fare <=30:
        return 1
    elif 30 < fare <= 100:
        return 2
    else :
        return 3

In [33]:
train['Fare2'] = train['Fare'].apply(fareMap)

In [34]:
train['Fare2'].value_counts()

0    496
2    181
1    161
3     53
Name: Fare2, dtype: int64

In [35]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex2,Title,Title2,Age2,Fare2
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,Mr,0,1,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,Mrs,2,3,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,Miss,1,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,Mrs,2,2,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,Mr,0,2,0


In [36]:
# Cabin 결측값 채울 때 Pclass별로 중앙값 이용해서 채우기

In [37]:
train['Cabin2'] = train['Cabin'].str[0]

In [38]:
train['Cabin2'].value_counts()

C    59
B    47
D    33
E    32
A    15
F    13
G     4
T     1
Name: Cabin2, dtype: int64

In [39]:
cabin_mapping = {
'A' : 0,
'B' : 0.4,
'C' : 0.8,
'D' : 1.2,
'E' : 1.6,
'F' : 2.0,
'G' : 2.4,
'T' : 2.8
}

In [40]:
train['Cabin2'] = train['Cabin2'].map(cabin_mapping)

In [41]:
train.groupby('Pclass')['Cabin2'].median()

Pclass
1    0.8
2    1.8
3    2.0
Name: Cabin2, dtype: float64

In [42]:
train['Cabin2'].fillna(train.groupby('Pclass')['Cabin2'].transform('median'), inplace=True)

In [43]:
train.groupby('Pclass')['Cabin2'].median()

Pclass
1    0.8
2    1.8
3    2.0
Name: Cabin2, dtype: float64

In [44]:
train['Cabin2']

0      2.0
1      0.8
2      2.0
3      0.8
4      2.0
      ... 
886    1.8
887    0.4
888    2.0
889    0.8
890    2.0
Name: Cabin2, Length: 891, dtype: float64

In [45]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Sex2             0
Title            0
Title2           0
Age2             0
Fare2            0
Cabin2           0
dtype: int64

In [46]:
train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [47]:
train['Embarked'].fillna('S', inplace=True)

In [48]:
train['Embarked'].value_counts()

S    646
C    168
Q     77
Name: Embarked, dtype: int64

In [49]:
def embarkedMap(embarked):
    if embarked == 'S':
        return 0
    elif embarked == 'C':
        return 1
    else :
        return 2

In [50]:
train['Embarked2'] = train['Embarked'].apply(embarkedMap)

In [51]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex2,Title,Title2,Age2,Fare2,Cabin2,Embarked2
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,Mr,0,1,0,2.0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,Mrs,2,3,2,0.8,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,Miss,1,1,0,2.0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,Mrs,2,2,2,0.8,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,Mr,0,2,0,2.0,0


In [52]:
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1

In [53]:
train['FamilySize'].value_counts()

1     537
2     161
3     102
4      29
6      22
5      15
7      12
11      7
8       6
Name: FamilySize, dtype: int64

In [54]:
familysize_mapping = {}
for i in range(1, 12):
    familysize_mapping[i] = round(((i-1)*0.4), 2)

In [55]:
familysize_mapping

{1: 0.0,
 2: 0.4,
 3: 0.8,
 4: 1.2,
 5: 1.6,
 6: 2.0,
 7: 2.4,
 8: 2.8,
 9: 3.2,
 10: 3.6,
 11: 4.0}

In [56]:
train['FamilySize'] = train['FamilySize'].map(familysize_mapping)

In [57]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex2,Title,Title2,Age2,Fare2,Cabin2,Embarked2,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,Mr,0,1,0,2.0,0,0.4
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,Mrs,2,3,2,0.8,1,0.4
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,Miss,1,1,0,2.0,0,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,Mrs,2,2,2,0.8,0,0.4
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,Mr,0,2,0,2.0,0,0.0


In [58]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Sex2', 'Title',
       'Title2', 'Age2', 'Fare2', 'Cabin2', 'Embarked2', 'FamilySize'],
      dtype='object')

In [59]:
train_data = train[['Survived', 'Pclass', 'Sex2', 'Title2', 'Age2', 'Fare2', 'Cabin2', 'Embarked2', 'FamilySize']]

In [60]:
# train_data 완성, test_data 시작

In [61]:
test = pd.read_csv('test.csv')

In [62]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [63]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [64]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [65]:
train.groupby('Pclass')['Fare'].median()

Pclass
1    60.2875
2    14.2500
3     8.0500
Name: Fare, dtype: float64

In [66]:
train_data.head()

Unnamed: 0,Survived,Pclass,Sex2,Title2,Age2,Fare2,Cabin2,Embarked2,FamilySize
0,0,3,0,0,1,0,2.0,0,0.4
1,1,1,1,2,3,2,0.8,1,0.4
2,1,3,1,1,1,0,2.0,0,0.0
3,1,1,1,2,2,2,0.8,0,0.4
4,0,3,0,0,2,0,2.0,0,0.0


In [67]:
test[test['Fare'].isnull()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S


In [68]:
test['Fare'].fillna(8.05, inplace=True)

In [69]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64

In [70]:
test['Sex2'] = test['Sex'].map({'male':0, 'female':1})

In [71]:
train.groupby('Title2')['Age'].median()

Title2
0    30.0
1    21.0
2    35.0
3     9.0
Name: Age, dtype: float64

In [72]:
test['Title'] = test['Name'].str.extract('([A-Za-z]+)\.')

In [73]:
test['Title2'] = test['Title'].apply(titleMap)

In [74]:
test['Title2'].head()

0    0
1    2
2    0
3    0
4    2
Name: Title2, dtype: int64

In [75]:
def title_ageMap(title):
    if title == 0:
        return 30.0
    elif title == 1:
        return 21.0
    elif title == 2:
        return 35.0
    else :
        return 9.0

In [76]:
test['Age'].fillna(test['Title2'].apply(title_ageMap), inplace=True)

In [77]:
train_data

Unnamed: 0,Survived,Pclass,Sex2,Title2,Age2,Fare2,Cabin2,Embarked2,FamilySize
0,0,3,0,0,1,0,2.0,0,0.4
1,1,1,1,2,3,2,0.8,1,0.4
2,1,3,1,1,1,0,2.0,0,0.0
3,1,1,1,2,2,2,0.8,0,0.4
4,0,3,0,0,2,0,2.0,0,0.0
...,...,...,...,...,...,...,...,...,...
886,0,2,0,3,2,0,1.8,0,0.0
887,1,1,1,1,1,1,0.4,0,0.0
888,0,3,1,1,1,1,2.0,0,1.2
889,1,1,0,0,1,1,0.8,1,0.0


In [78]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
Sex2             0
Title            0
Title2           0
dtype: int64

In [79]:
test['Age2'] = test['Age'].apply(ageMap)

In [80]:
test['Fare2'] = test['Fare'].apply(fareMap)

In [81]:
test['Cabin2'] = test['Cabin'].str[0]

In [82]:
test['Cabin2'].value_counts()

C    35
B    18
D    13
E     9
F     8
A     7
G     1
Name: Cabin2, dtype: int64

In [83]:
test['Cabin2'] = test['Cabin2'].map(cabin_mapping)

In [84]:
test['Cabin2']

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
      ... 
413    NaN
414    0.8
415    NaN
416    NaN
417    NaN
Name: Cabin2, Length: 418, dtype: float64

In [85]:
def cabin_pclassMap(pclass):
    if pclass == 1:
        return 0.8
    elif pclass == 2:
        return 1.8
    else :
        return 2.0

In [86]:
test['Cabin2'].fillna(test['Pclass'].apply(cabin_pclassMap), inplace=True)

In [87]:
test['Embarked2'] = test['Embarked'].apply(embarkedMap)

In [88]:
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1

In [89]:
test['FamilySize'] = test['FamilySize'].map(familysize_mapping)

In [90]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex2,Title,Title2,Age2,Fare2,Cabin2,Embarked2,FamilySize
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0,Mr,0,2,0,2.0,2,0.0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1,Mrs,2,3,0,2.0,0,0.4
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0,Mr,0,3,0,1.8,2,0.0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0,Mr,0,2,0,2.0,0,0.0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1,Mrs,2,1,0,2.0,0,0.8


In [91]:
test_data = test[['Pclass', 'Sex2', 'Title2', 'Age2', 'Fare2', 'Cabin2', 'Embarked2', 'FamilySize']]

In [92]:
train_data

Unnamed: 0,Survived,Pclass,Sex2,Title2,Age2,Fare2,Cabin2,Embarked2,FamilySize
0,0,3,0,0,1,0,2.0,0,0.4
1,1,1,1,2,3,2,0.8,1,0.4
2,1,3,1,1,1,0,2.0,0,0.0
3,1,1,1,2,2,2,0.8,0,0.4
4,0,3,0,0,2,0,2.0,0,0.0
...,...,...,...,...,...,...,...,...,...
886,0,2,0,3,2,0,1.8,0,0.0
887,1,1,1,1,1,1,0.4,0,0.0
888,0,3,1,1,1,1,2.0,0,1.2
889,1,1,0,0,1,1,0.8,1,0.0


In [93]:
test_data

Unnamed: 0,Pclass,Sex2,Title2,Age2,Fare2,Cabin2,Embarked2,FamilySize
0,3,0,0,2,0,2.0,2,0.0
1,3,1,2,3,0,2.0,0,0.4
2,2,0,0,3,0,1.8,2,0.0
3,3,0,0,2,0,2.0,0,0.0
4,3,1,2,1,0,2.0,0,0.8
...,...,...,...,...,...,...,...,...
413,3,0,0,2,0,2.0,0,0.0
414,1,1,3,3,3,0.8,1,0.0
415,3,0,0,3,0,2.0,0,0.0
416,3,0,0,2,0,2.0,0,0.0


In [94]:
target = train_data['Survived']
y = target.values
y

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,

In [95]:
x = train_data.drop(['Survived'], axis = 1).values

In [96]:
x

array([[3. , 0. , 0. , ..., 2. , 0. , 0.4],
       [1. , 1. , 2. , ..., 0.8, 1. , 0.4],
       [3. , 1. , 1. , ..., 2. , 0. , 0. ],
       ...,
       [3. , 1. , 1. , ..., 2. , 0. , 1.2],
       [1. , 0. , 0. , ..., 0.8, 1. , 0. ],
       [3. , 0. , 0. , ..., 2. , 2. , 0. ]])

# 데이터 자르기

In [97]:
from sklearn.model_selection import train_test_split

In [98]:
import numpy as np

In [99]:
from sklearn.metrics import accuracy_score

In [100]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)
len(x_train), len(x_test)

(712, 179)

In [101]:
x_train.shape

(712, 8)

# 모델 설계

In [102]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [103]:
model = Sequential()

In [104]:
model.add(Dense(1024, activation = 'relu', input_dim = 8))
model.add(Dense(512, activation = 'relu'))
model.add(Dense(256, activation = 'relu'))
model.add(Dense(128, activation = 'relu'))
model.add(Dense(64, activation = 'relu'))
model.add(Dense(32, activation = 'relu'))
model.add(Dense(16, activation = 'relu'))
model.add(Dense(8, activation = 'relu'))
model.add(Dense(4, activation = 'relu'))
model.add(Dense(2, activation = 'relu'))
model.add(Dense(1, activation = 'sigmoid'))

In [105]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1024)              9216      
                                                                 
 dense_1 (Dense)             (None, 512)               524800    
                                                                 
 dense_2 (Dense)             (None, 256)               131328    
                                                                 
 dense_3 (Dense)             (None, 128)               32896     
                                                                 
 dense_4 (Dense)             (None, 64)                8256      
                                                                 
 dense_5 (Dense)             (None, 32)                2080      
                                                                 
 dense_6 (Dense)             (None, 16)                5

In [106]:
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [121]:
model.fit(x_train, y_train, epochs=500, verbose = 0)

<keras.src.callbacks.History at 0x7e63b8089120>

In [122]:
y_pred = model.predict(x_test)



In [123]:
y_pred

array([[1.6487570e-10],
       [9.9934769e-01],
       [9.9930215e-01],
       [8.9253634e-02],
       [1.0000000e+00],
       [1.0000000e+00],
       [1.8318556e-06],
       [9.9999994e-01],
       [1.0000000e+00],
       [6.2266159e-01],
       [4.0263575e-01],
       [1.0000000e+00],
       [9.9326479e-01],
       [7.4750719e-06],
       [3.5843381e-03],
       [1.9918854e-05],
       [1.0986514e-01],
       [5.3793225e-02],
       [1.4406358e-02],
       [5.2328065e-02],
       [2.0456081e-02],
       [1.0000000e+00],
       [5.2328065e-02],
       [1.2396154e-02],
       [5.2328065e-02],
       [1.0000000e+00],
       [9.9819684e-01],
       [1.0000000e+00],
       [1.0000000e+00],
       [4.0263575e-01],
       [1.0654001e-01],
       [1.0000000e+00],
       [1.0000000e+00],
       [1.0074128e-01],
       [1.0074128e-01],
       [1.0000000e+00],
       [6.9059053e-04],
       [2.2890702e-02],
       [5.3793225e-02],
       [5.2328065e-02],
       [8.1211847e-01],
       [4.651448

In [124]:
t  = []
for i in range(len(y_pred)):
  if y_pred[i] > 0.5:
    t.append(1)
  else :
    t.append(0)

In [125]:
t

[0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1]

In [126]:
accuracy_score(y_test, t) * 100

75.97765363128491