In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

In [13]:
import warnings
warnings.simplefilter("ignore")

In [2]:
# Load seaborn's 'titanic' example dataset into a DataFrame

titanic_df = sns.load_dataset('titanic')

#### 데이터 탐색

In [3]:
# Column overview: name, non-null count and dtype of every column
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


- survived : 생존여부(0:사망, 1:생존)
- sibsp : Sibling, Spouse(탑승한 형제자매/배우자 수)
- parch : parent, Child (탑승한 부모/자녀 수)
- pclass : 객실 등급, 1st=Upper, 2nd=Middle, 3rd=Lower(1에 가까울수록 좋은 자리)
- embarked(승선지): C(Cherbourg), Q(Queenstown), S(Southampton)
- who : man, woman, child
- fare : 요금
- deck : 객실이 위치한 갑판(cabin 번호의 첫 글자, 예: B, C)

In [4]:
# Dimensions as (rows, columns)

titanic_df.shape

(891, 15)

In [5]:
# Descriptive statistics summary (count, mean, std, min, quartiles, max);
# called without arguments it reports on the numeric columns only
titanic_df.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
# Number of missing values in each column (isna is the canonical spelling of isnull)
titanic_df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [7]:
# Descriptive statistics restricted to complete rows.
# dropna() discards a row as soon as ANY of its columns is missing, so sparsely
# populated columns such as `deck` shrink the sample considerably.
complete_rows = titanic_df.dropna(how="any")
complete_rows.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,182.0,182.0,182.0,182.0,182.0,182.0
mean,0.675824,1.192308,35.623187,0.467033,0.478022,78.919735
std,0.469357,0.516411,15.671615,0.645007,0.755869,76.490774
min,0.0,1.0,0.92,0.0,0.0,0.0
25%,0.0,1.0,24.0,0.0,0.0,29.7
50%,1.0,1.0,36.0,0.0,0.0,57.0
75%,1.0,1.0,47.75,1.0,1.0,90.0
max,1.0,3.0,80.0,3.0,4.0,512.3292


In [15]:
# How many passengers travelled in each class?
#
# Counting the non-null `survived` values per class gives the head count,
# because `survived` has no missing entries. Selecting with [['survived']]
# (a list) keeps the result a DataFrame; ['survived'] would return a Series,
# and count() without a selection would count every column.
# observed=False pins the current handling of the categorical `class` key
# instead of relying on the deprecated implicit default.
titanic_df.groupby('class', observed=False)[['survived']].count()

Unnamed: 0_level_0,survived
class,Unnamed: 1_level_1
First,216
Second,184
Third,491


In [16]:
# What is the median fare paid per class? (the mean is printed for comparison)
#
# The grouped `fare` selection is built once and reused for both statistics.
# observed=False pins the current handling of the categorical `class` key
# instead of relying on the deprecated implicit default.
fare_by_class = titanic_df.groupby('class', observed=False)['fare']
print("중앙값 ", fare_by_class.median())
print("평균 ", fare_by_class.mean())

중앙값  class
First     60.2875
Second    14.2500
Third      8.0500
Name: fare, dtype: float64
평균  class
First     84.154687
Second    20.662183
Third     13.675550
Name: fare, dtype: float64


In [20]:
# Rows for the passengers who survived.
#
# Only the last expression of a cell is displayed, so the cell keeps a single
# statement. The boolean mask titanic_df[titanic_df['alive'] == 'yes'] selects
# the same rows as this query.
titanic_df.query("alive == 'yes'")

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
875,1,3,female,15.0,0,0,7.2250,C,Third,child,False,,Cherbourg,yes,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
880,1,2,female,25.0,0,1,26.0000,S,Second,woman,False,,Southampton,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [21]:
# Per-class non-null counts of every column, survivors only.
#
# observed=False pins the current handling of the categorical `class` key
# instead of relying on the deprecated implicit default.
survivors = titanic_df.query("alive == 'yes'")
survivors.groupby('class', observed=False).count()

Unnamed: 0_level_0,survived,pclass,sex,age,sibsp,parch,fare,embarked,who,adult_male,deck,embark_town,alive,alone
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
First,136,136,136,122,136,136,136,134,136,136,117,134,136,136
Second,87,87,87,83,87,87,87,87,87,87,13,87,87,87
Third,119,119,119,85,119,119,119,119,119,119,6,119,119,119


In [24]:
# Per-class medians of the numeric columns for passengers older than 30.
#
# observed=False pins the current handling of the categorical `class` key
# instead of relying on the deprecated implicit default.
over_30 = titanic_df.query("age > 30")
over_30.groupby('class', observed=False).median(numeric_only=True)

Unnamed: 0_level_0,survived,pclass,age,sibsp,parch,fare,adult_male,alone
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
First,1.0,1.0,45.0,0.0,0.0,56.9292,1.0,0.0
Second,0.0,2.0,39.0,0.0,0.0,14.0,1.0,1.0
Third,0.0,3.0,38.0,0.0,0.0,8.05,1.0,1.0


In [25]:
# Per-class medians of the numeric columns for passengers who paid a fare below 20.
#
# observed=False pins the current handling of the categorical `class` key
# instead of relying on the deprecated implicit default.
low_fare = titanic_df.query("fare < 20")
low_fare.groupby('class', observed=False).median(numeric_only=True)

Unnamed: 0_level_0,survived,pclass,age,sibsp,parch,fare,adult_male,alone
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
First,0.0,1.0,38.5,0.0,0.0,0.0,1.0,1.0
Second,0.0,2.0,30.0,0.0,0.0,13.0,1.0,1.0
Third,0.0,3.0,25.0,0.0,0.0,7.8958,1.0,1.0
