<a href="https://colab.research.google.com/github/dantae74/machine-learning/blob/main/kaggle/02-data-analysis-for-titanic-by-pclass-sex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### [이유한님] 캐글 스터디 강좌에서 가져왔습니다.

# Titanic 데이터 분석

In [None]:
! pip install kaggle

In [None]:
! mkdir ~/.kaggle

In [None]:
# Log in to Kaggle, open the Account tab and use API -> "Create New API Token" to download
# kaggle.json. Upload that file to Colab, then run the command below.

! cp kaggle.json ~/.kaggle/

In [None]:
# Restrict the copied API token file to owner read/write only (mode 600).
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Kaggle Competititions에서 과제 검색 -> Data에서 복사하여 실행

! kaggle competitions download -c titanic

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('seaborn')
sns.set(font_scale=2)

import missingno as msno

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [None]:
# Load the train and test splits from the CSV files in the working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Print, for every column of the training set, the percentage of rows that are null.
for column_name in df_train.columns:
    train_column = df_train[column_name]
    missing_pct = 100 * train_column.isnull().sum() / train_column.shape[0]
    print('column: {:>10}\t Percent of NaN value: {:.2f}%'.format(column_name, missing_pct))

In [None]:
# Print, for every column of the test set, the percentage of rows that are null.
for column_name in df_test.columns:
    test_column = df_test[column_name]
    missing_pct = 100 * test_column.isnull().sum() / test_column.shape[0]
    print('column: {:>10}\t Percent of NaN value: {:.2f}%'.format(column_name, missing_pct))

In [None]:
# Visualize the missing-value pattern of the whole training set with missingno's matrix view.
msno.matrix(
    df=df_train.iloc[:, :],
    figsize=(8, 8),
    color=(0.8, 0.5, 0.2),
)

In [None]:
# Same training data shown with missingno's bar view, one bar per column.
msno.bar(
    df=df_train,
    figsize=(8, 8),
    color=(0.8, 0.5, 0.2),
)

In [None]:
# Overall balance of the target: share of each class (pie, left) and raw counts (right).
f, ax = plt.subplots(1, 2, figsize=(18, 8))
df_train['Survived'].value_counts().plot.pie(ax=ax[0], explode=[0, 0.05], autopct='%.1f%%', shadow=True)
ax[0].set_title('Pie plot - Survived')
ax[0].set_ylabel('')
# The column must be passed as x=...: seaborn >= 0.12 accepts only `data` positionally,
# so the old positional form raises a TypeError there.
sns.countplot(x='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Count plot - Survived')
plt.show()

## 2.1 Pclass

In [None]:
# Number of non-null Survived entries in each passenger class.
df_train.groupby('Pclass')[['Survived']].count()

In [None]:
# Sum of the Survived column within each passenger class.
df_train.groupby('Pclass')[['Survived']].sum()

In [None]:
# Distinct values that occur in the Survived column.
pd.unique(df_train['Survived'])

In [None]:
# Pclass x Survived contingency table with row/column totals, shaded by magnitude.
pclass_survival_counts = pd.crosstab(df_train['Pclass'], df_train['Survived'], margins=True)
pclass_survival_counts.style.background_gradient(cmap='cool')

In [None]:
# Vertical offset for the subplot titles (previously defined but never used).
y_position = 1.02

# Left: passengers per class. Right: per-class counts split by the Survived value.
f, ax = plt.subplots(1, 2, figsize=(18, 8))
df_train['Pclass'].value_counts().plot.bar(color=['#CD7F32', '#FFDF00', '#D3D3D3'], ax=ax[0])
ax[0].set_title('Number of passenger by Pclass', y=y_position)
ax[0].set_ylabel('Count')
# The column must be passed as x=...: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x='Pclass', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Pclass: Survived vs Dead', y=y_position)
ax[1].set_ylabel('Count')
plt.show()

## 2.2 Sex

In [None]:
# Left: mean of Survived per sex. Right: per-sex counts split by the Survived value.
f, ax = plt.subplots(1, 2, figsize=(18, 8))
df_train[['Sex', 'Survived']].groupby(['Sex'], as_index=True).mean().plot.bar(ax=ax[0])
ax[0].set_title('Survived vs Sex')
# The column must be passed as x=...: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x='Sex', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Sex: Survived vs Dead')
plt.show()

In [None]:
# Sex x Survived contingency table with row/column totals, shaded by magnitude.
sex_survival_counts = pd.crosstab(df_train['Sex'], df_train['Survived'], margins=True)
sex_survival_counts.style.background_gradient(cmap='summer_r')

## 2.3 Both Sex and Pclass

In [None]:
# Mean of Survived per passenger class, one line per sex.
# factorplot was renamed to catplot (and its `size` argument to `height`) in seaborn 0.9 and
# has since been removed; kind='point' reproduces factorplot's default plot type.
sns.catplot(x='Pclass', y='Survived', hue='Sex', data=df_train, kind='point', height=6, aspect=1.5)

# Lady first
# Money brings Survival? 

In [None]:
# Mean of Survived per sex, drawn in one column of the grid per passenger class.
# catplot(kind='point') replaces the removed factorplot, and `height` replaces `size`.
# The former `saturation` argument is dropped: it is not a point-plot option and is
# rejected by current seaborn.
sns.catplot(x='Sex', y='Survived', col='Pclass', data=df_train, kind='point', height=9, aspect=1)

In [None]:
# Mean of Survived per sex, one line per passenger class on a single axes.
# catplot(kind='point') replaces the removed factorplot, and `height` replaces `size`.
# The former `saturation` argument is dropped: it is not a point-plot option and is
# rejected by current seaborn.
sns.catplot(x='Sex', y='Survived', hue='Pclass', data=df_train, kind='point', height=9, aspect=1)

## Age

In [None]:
# Print the maximum, minimum and mean of the Age column (oldest / youngest / average passenger).
passenger_age = df_train['Age']
oldest_age = passenger_age.max()
youngest_age = passenger_age.min()
average_age = passenger_age.mean()
print("제일 나이 많은 탑승객: {:.1f} year".format(oldest_age))
print("제일 나이 어린 탑승객: {:.1f} year".format(youngest_age))
print("평균 탑승객 나이: {:.1f} year".format(average_age))

In [None]:
# Age density for each Survived value, overlaid on a single axes.
fig, ax = plt.subplots(1, 1, figsize=(9, 5))
for survived_flag in (1, 0):  # same drawing order as the legend entries below
    ages_for_flag = df_train.loc[df_train['Survived'] == survived_flag, 'Age']
    sns.kdeplot(ages_for_flag, ax=ax)
plt.legend(['Survived == 1', 'Survived == 0'])
plt.show()

## PClass

In [None]:
# Age density for each passenger class, overlaid on one figure.
plt.figure(figsize=(8, 9))
for pclass in (1, 2, 3):  # drawing order matches the legend entries below
    df_train.loc[df_train['Pclass'] == pclass, 'Age'].plot(kind='kde')
plt.xlabel("Age")
plt.legend(['1st class', '2nd class', '3rd class'])
plt.show()

In [None]:
# Age density per passenger class, restricted to rows with Survived == 1.
plt.figure(figsize=(8, 9))
is_survivor = df_train['Survived'] == 1
for pclass in (1, 2, 3):  # drawing order matches the legend entries below
    in_class = df_train['Pclass'] == pclass
    df_train.loc[in_class & is_survivor, 'Age'].plot(kind='kde')
plt.xlabel("Age")
plt.legend(['1st class', '2nd class', '3rd class'])
plt.show()

In [None]:
# Survival rate among passengers younger than each age limit x (x = 1 .. 79).
age_limits = range(1, 80)

# Taking the mean of Survived over the filtered rows equals sum / count, evaluates the
# filter only once, and yields NaN instead of raising ZeroDivisionError when no passenger
# is below the limit.
change_age_range_survival_ratio = [
    df_train.loc[df_train['Age'] < age_limit, 'Survived'].mean()
    for age_limit in age_limits
]

plt.figure(figsize=(7, 7))
# Plot against the age limits themselves; plotting the list alone would put the
# "Age < 1" value at x = 0 and shift the whole curve one year to the left.
plt.plot(list(age_limits), change_age_range_survival_ratio)
plt.title('Survival ratio change depending on range of Age', y=1.02)
plt.ylabel('Survival rate')
plt.xlabel('Range of Age(0~x)')
plt.show()

# Pclass, Sex, Age

In [None]:
# Age distribution split by the Survived value: per passenger class (left) and per sex (right).
fig, ax = plt.subplots(1, 2, figsize=(18, 8))

# x / y must be keywords: seaborn >= 0.12 accepts only `data` positionally.
sns.violinplot(x='Pclass', y='Age', hue='Survived', data=df_train, scale='count', split=True, ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(0, 100, 10))

sns.violinplot(x='Sex', y='Age', hue='Survived', data=df_train, scale='count', split=True, ax=ax[1])
ax[1].set_title('Sex and Age vs Survived')
ax[1].set_yticks(range(0, 100, 10))

# Render the figure explicitly so the cell does not echo the tick-object list.
plt.show()