## 타이타닉 데이터 분석

1. survived : 생존여부 (1 생존/ 0 사망)
2. pclass: 사회적 지위를 나타내는 지표\
    1st = Upper\
    2nd = Middle\
    3rd = Lower
3. age: 연령. 1세 미만은 소수로 표기되며, 추정된 연령은 xx.5의 형태로 되어 있음
4. sibsp: 형제/배우자 여부\
    Sibling = brother, sister, stepbrother, stepsister\
    Spouse = husband, wife (mistresses and fiancés were ignored)
5. parch: 부모/자식 여부\
    Parent = mother, father\
    Child = daughter, son, stepdaughter, stepson\
    Some children travelled only with a nanny, therefore parch=0 for them.
6. ticket : 티켓 번호
7. fare : 승객요금
8. cabin : 캐빈 번호
9. embarked\
    탑승했던 곳

### 1. 문제설정

어떤 군의 사람들이 가장 높은 사망률을 보이는가?/사망을 예측하는 지표는?

In [22]:
import pandas as pd

titanic = pd.read_csv("data/titanic.csv")
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 2. 데이터 관찰/정제

In [23]:
# 총 데이터는 891명의 데이터로 확인됨.

titanic.shape

(891, 12)

In [3]:
# 중복된 데이터가 있는지?

titanic.Name.nunique(), titanic.PassengerId.nunique()

(891, 891)

In [7]:
# 데이터 집계 

titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [29]:
# 빈값이 있는지 확인

titanic.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [32]:
# 컬럼별로 살펴보자.
# 특별히 비어있는 값이 많은 cabin을 살펴보자.

titanic['Cabin'].isna().value_counts()

Cabin
True     687
False    204
Name: count, dtype: int64

In [34]:
# 먼저 사망/생존의 비율을 살펴보자.

titanic['Survived'].value_counts(normalize=True)

Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64

In [39]:
# Na 값을 다루는 방법
# 1. 컬럼을 삭제하는 방법
# 2. 있는 데이터만 사용
# 3. 새로운 데이터로 형성해서 최대한으로 유지

# 데이터가 있는 경우 분포가 달라지는가? 
titanic[~titanic['Cabin'].isna()]["Survived"].value_counts(normalize=True)

Survived
1    0.666667
0    0.333333
Name: proportion, dtype: float64

In [40]:
# Cabin 정보가 없는 경우의 생존율은 약 30%로, 정보가 있는 경우(약 67%)와 큰 차이가 있는 것을 확인할 수 있다.
# 즉, 캐빈정보의 있고/없고는 사망과 관련이 있어 보이지만, 결측치가 687건(약 77%)으로 너무 많아 컬럼 자체는 분석에서 제외한다.

titanic[titanic['Cabin'].isna()]["Survived"].value_counts(normalize=True)

Survived
0    0.700146
1    0.299854
Name: proportion, dtype: float64

In [41]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [42]:
# cabin 정보는 drop 

titanic = titanic.drop(columns="Cabin")
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C


In [43]:
# 연령정보를 추가할 수 있는가?

titanic[titanic['Age'].isna()]['Survived'].value_counts(normalize=True)

Survived
0    0.706215
1    0.293785
Name: proportion, dtype: float64

In [44]:
# 전체 중에서 177명을 삭제하는 것은 좋은 선택은 아니다.
# 일단 데이터가 없는 상태에서 그대로 진행하기로 한다.

titanic[titanic['Age'].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,Q
...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,S


In [48]:
# Name에서는 얻을 수 있는 insight가 없으므로 일단 drop

titanic.drop(columns = "Name", inplace=True)

In [49]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,female,35.0,1,0,113803,53.1,S
4,5,0,3,male,35.0,0,0,373450,8.05,S


In [21]:
titanic.groupby(['Survived', 'Pclass'], as_index=False)['PassengerId'].count()

Unnamed: 0,Survived,Pclass,PassengerId
0,0,1,80
1,0,2,97
2,0,3,372
3,1,1,136
4,1,2,87
5,1,3,119


In [50]:
# groupby를 통해서 두 군의 차이를 확인해보자.

pclass = titanic.groupby(['Survived', 'Pclass'], as_index=False)['PassengerId'].count()
pclass.pivot_table(index="Survived", columns="Pclass").reset_index(drop=True)


Unnamed: 0_level_0,PassengerId,PassengerId,PassengerId
Pclass,1,2,3
0,80.0,97.0,372.0
1,136.0,87.0,119.0


**위에서 보는 것처럼 Pclass가 3인 경우 사망률이 가장 높다. 3등급에서는 사망(Survived = 0)이 372명으로 생존(119명)보다 훨씬 많음을 볼 수 있다.**

In [17]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,female,35.0,1,0,113803,53.1,S
4,5,0,3,male,35.0,0,0,373450,8.05,S


In [18]:
# 성별에 따라서 예측 가능한지 관찰해보자.
# 관찰하면 남성이 사망한 경우가 더 많은 것을 볼 수 있다. 

gender = titanic.groupby(['Survived', 'Sex'], as_index=False)['PassengerId'].count()
gender.pivot_table(index="Survived", columns="Sex").reset_index(drop=True)

Unnamed: 0_level_0,PassengerId,PassengerId
Sex,female,male
0,81.0,468.0
1,233.0,109.0


In [19]:
# SibSp를 보자

titanic['SibSp'].value_counts().sort_index()

SibSp
0    608
1    209
2     28
3     16
4     18
5      5
8      7
Name: count, dtype: int64

In [51]:
# 새롭게 그루핑을 해보자
# SibSp를 0, 1, >=2 의 세 그룹으로 만든다.

def encode_sibsp(value):
    """Bucket a SibSp (siblings/spouses aboard) count into a category label.

    Args:
        value: A raw SibSp count, or a label previously produced by this
            function.

    Returns:
        "0" when value equals 0, "1" when value equals 1, and ">=2" for
        every other raw value. An existing label ("0", "1", ">=2") is
        returned unchanged.
    """
    # Pass already-encoded labels through so that applying the encoder a
    # second time (e.g. re-running the notebook cell) does not collapse
    # every row into ">=2".
    if isinstance(value, str) and value in ("0", "1", ">=2"):
        return value
    if value == 0:
        return "0"
    if value == 1:
        return "1"
    return ">=2"
    
titanic['SibSp'] = titanic['SibSp'].apply(encode_sibsp)
titanic['SibSp'].value_counts()

SibSp
0      608
1      209
>=2     74
Name: count, dtype: int64

In [52]:
# Count passengers for every (Survived, SibSp) pair, then pivot once so that
# rows are Survived and columns are the SibSp buckets.
sibsp = titanic.groupby(['Survived', 'SibSp'], as_index=False)['PassengerId'].count()
sibsp = sibsp.pivot_table(index="Survived", columns="SibSp")
# droplevel() removes the outer level of the MultiIndex columns; the flattened
# labels are kept in `cols` so they can be assigned back to the frame later.
cols = sibsp.columns.droplevel()

In [53]:
# Replace the two-level column index with the flattened SibSp labels.
sibsp.columns = cols
# Name the index on `sibsp` itself: reset_index() returns a new frame, so
# naming the index of that temporary copy would leave `sibsp` untouched.
sibsp.index.name = "Survived"
sibsp

SibSp,0,1,>=2
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,398.0,97.0,54.0
1,210.0,112.0,20.0


In [23]:
titanic['Parch'].value_counts().sort_index()

Parch
0    678
1    118
2     80
3      5
4      4
5      5
6      1
Name: count, dtype: int64

In [54]:
# 동일하게 그룹화를 해보자.
def encode_parch(value):
    """Bucket a Parch (parents/children aboard) count into a category label.

    Args:
        value: A raw Parch count, or a label previously produced by this
            function.

    Returns:
        ">=2" when value is at least 2, "1" when value equals 1, and "0"
        for every other raw value. An existing label ("0", "1", ">=2") is
        returned unchanged.
    """
    # Pass already-encoded labels through so that applying the encoder a
    # second time (e.g. re-running the notebook cell) does not fail on the
    # str-vs-int comparison below.
    if isinstance(value, str) and value in ("0", "1", ">=2"):
        return value
    if value >= 2:
        return ">=2"
    if value == 1:
        return "1"
    return "0"

titanic['Parch'] = titanic['Parch'].apply(encode_parch)
titanic['Parch'].value_counts()

Parch
0      678
1      118
>=2     95
Name: count, dtype: int64

In [55]:
titanic.groupby(['Survived','Parch'], as_index=False)['PassengerId'].count()

Unnamed: 0,Survived,Parch,PassengerId
0,0,0,445
1,0,1,53
2,0,>=2,51
3,1,0,233
4,1,1,65
5,1,>=2,44


In [56]:
parch = titanic.groupby(['Survived','Parch'], as_index=False)['PassengerId'].count()
parch.pivot_table(index="Survived",columns="Parch")

Unnamed: 0_level_0,PassengerId,PassengerId,PassengerId
Parch,0,1,>=2
Survived,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,445.0,53.0,51.0
1,233.0,65.0,44.0


In [57]:
# 이제 ticket을 본다.
titanic['Ticket']

0             A/5 21171
1              PC 17599
2      STON/O2. 3101282
3                113803
4                373450
             ...       
886              211536
887              112053
888          W./C. 6607
889              111369
890              370376
Name: Ticket, Length: 891, dtype: object

In [58]:
# 무의미한 것 같아 drop
titanic.drop(columns = 'Ticket', inplace=True)

In [59]:
# fare 비용에 따라 Survived가 차이가 나는지
# 생존자의 평균 fare가 사망자보다 높다 (fare가 높을수록 생존하는 경향)

titanic.groupby(['Survived'], as_index=False)['Fare'].mean()

Unnamed: 0,Survived,Fare
0,0,22.117887
1,1,48.395408


In [60]:
# 마지막으로 Embarked 관찰

embarked = titanic.groupby(['Survived','Embarked'], as_index=False)['PassengerId'].count()
embarked

Unnamed: 0,Survived,Embarked,PassengerId
0,0,C,75
1,0,Q,47
2,0,S,427
3,1,C,93
4,1,Q,30
5,1,S,217


In [61]:
# 사망한 사람 중에 S의 비율이 높다.

embarked.pivot_table(index="Survived", columns="Embarked")

Unnamed: 0_level_0,PassengerId,PassengerId,PassengerId
Embarked,C,Q,S
Survived,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,75.0,47.0,427.0
1,93.0,30.0,217.0


### 3. 가설 설정
- 관찰한 변수들이 모두 사망/생존과 관련이 있어 보였다.
- 그렇다면 Pclass와 사망/생존 사이에 통계적으로 유의한 연관성이 있는 것인지
    - `Pclass와 사망/생존은 관련성이 없다` = 귀무가설(H0, Null Hypothesis)
- 이것을 확인하기 위한 test : Chi-Square Test

### 4. 가설 검정

In [33]:
# 기술통계를 위한 tableone 패키지 설치
!pip3 install tableone



In [65]:
pclass = titanic.groupby(["Survived","Pclass"], as_index=False)['PassengerId'].count()
pclass

Unnamed: 0,Survived,Pclass,PassengerId
0,0,1,80
1,0,2,97
2,0,3,372
3,1,1,136
4,1,2,87
5,1,3,119


In [66]:
pclass.rename(columns = {"PassengerId" : "counts"}, inplace=True)
pclass

Unnamed: 0,Survived,Pclass,counts
0,0,1,80
1,0,2,97
2,0,3,372
3,1,1,136
4,1,2,87
5,1,3,119


In [67]:
pclass = pclass.pivot_table(index='Pclass', columns='Survived')
pclass

Unnamed: 0_level_0,counts,counts
Survived,0,1
Pclass,Unnamed: 1_level_2,Unnamed: 2_level_2
1,80.0,136.0
2,97.0,87.0
3,372.0,119.0


In [72]:
# survival과 titanic Pclass의 유의성 검정
# chisquare-test를 진행한다.
# pvalue가 유의수준 0.05보다 작으므로 귀무가설을 기각하며, 연관이 있음을 볼 수 있다.

from scipy.stats import chi2_contingency
chi2_contingency(pclass.values)

Chi2ContingencyResult(statistic=102.88898875696056, pvalue=4.549251711298793e-23, dof=2, expected_freq=array([[133.09090909,  82.90909091],
       [113.37373737,  70.62626263],
       [302.53535354, 188.46464646]]))

In [73]:
# tableone은 전체적인 기술통계를 확인하는 좋은 패키지다.

from tableone import TableOne

columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
categorical = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
continuous = ['Age', 'Fare']

TableOne(titanic, groupby="Survived", 
         columns=columns, categorical=categorical, continuous=continuous,
         normal_test=True, pval=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by Survived,Grouped by Survived,Grouped by Survived,Grouped by Survived,Grouped by Survived
Unnamed: 0_level_1,Unnamed: 1_level_1,Missing,Overall,0,1,P-Value
n,,,891,549,342,
"Pclass, n (%)",1,,216 (24.2),80 (14.6),136 (39.8),<0.001
"Pclass, n (%)",2,,184 (20.7),97 (17.7),87 (25.4),
"Pclass, n (%)",3,,491 (55.1),372 (67.8),119 (34.8),
"Sex, n (%)",female,,314 (35.2),81 (14.8),233 (68.1),<0.001
"Sex, n (%)",male,,577 (64.8),468 (85.2),109 (31.9),
"Age, mean (SD)",,177.0,29.7 (14.5),30.6 (14.2),28.3 (15.0),0.041
"SibSp, n (%)",0,,608 (68.2),398 (72.5),210 (61.4),<0.001
"SibSp, n (%)",1,,209 (23.5),97 (17.7),112 (32.7),
"SibSp, n (%)",>=2,,74 (8.3),54 (9.8),20 (5.8),


In [74]:
# 더 세부적으로 pClass를 검정해보자.

pclass = titanic.groupby(['Survived','Pclass'], as_index=False)['PassengerId'].count()
pclass

Unnamed: 0,Survived,Pclass,PassengerId
0,0,1,80
1,0,2,97
2,0,3,372
3,1,1,136
4,1,2,87
5,1,3,119


In [75]:
pclass.pivot_table(index="Survived", columns='Pclass')

Unnamed: 0_level_0,PassengerId,PassengerId,PassengerId
Pclass,1,2,3
Survived,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,80.0,97.0,372.0
1,136.0,87.0,119.0


### 5. 결론

- 결국 모든 변수들이 사망/생존에 있어 영향을 줌을 알 수 있었다. 성별에 따라서는 남성이 여성보다 많이 사망했다.
- 사망한 사람의 연령이 조금 더 높았으며
- 형제 혹은 자녀와 같이 가족이 있는 경우의 생존율이 더 높았다.
- 특별히 Passenger Class에서 사망한 경우는 3등칸에 있던 사람들의 사망이 압도적으로 많았다.