In [197]:
import numpy as np 
import pandas as pd  
import matplotlib.pyplot as plt 
import seaborn as sns

In [198]:
df = pd.read_csv(r'C:\labs\src\datasets\seaborn-data\titanic.csv')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [None]:
df.shape, df.info(), df.describe()

In [199]:
df['survived'].value_counts()

0    549
1    342
Name: survived, dtype: int64

In [200]:
df['family_size'] = df['sibsp'] + df['parch'] # sibsp : 형재 혹은 부부의 수 / parch : 부모 혹은 자녀의 수

In [201]:
df1 = df.drop(['sibsp','parch','fare','class','adult_male','embark_town','alive'], axis=1, inplace=False)
df1.head()

Unnamed: 0,survived,pclass,sex,age,embarked,who,deck,alone,family_size
0,0,3,male,22.0,S,man,,False,1
1,1,1,female,38.0,C,woman,C,False,1
2,1,3,female,26.0,S,woman,,True,0
3,1,1,female,35.0,S,woman,C,False,1
4,0,3,male,35.0,S,man,,True,0


In [202]:
df1.isnull().sum()/len(df)

survived       0.000000
pclass         0.000000
sex            0.000000
age            0.198653
embarked       0.002245
who            0.000000
deck           0.772166
alone          0.000000
family_size    0.000000
dtype: float64

In [None]:
fig = plt.figure(figsize=(6,3))
ax = fig.add_subplot(111)
sns.heatmap(df1.isnull())
plt.show()

In [203]:
df1['deck'].fillna('Nan_data', inplace=True)
df1
groupby_count = df1.groupby(df1['deck']).count()
groupby_sum = df1.groupby(df1['deck']).sum()
groupby_sum['survived_rate'] = groupby_sum['survived'] / groupby_count['survived']
groupby_sum
# when deck is not missing, the survived rate is much higher than when deck data is missing 

Unnamed: 0_level_0,survived,pclass,age,alone,family_size,survived_rate
deck,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,7,15,538.0,12,4,0.466667
B,35,47,1573.0,22,44,0.744681
C,35,59,1840.42,22,66,0.59322
D,25,37,1210.0,13,24,0.757576
E,24,42,1143.5,17,20,0.75
F,8,31,219.5,7,14,0.615385
G,2,12,59.0,0,7,0.5
Nan_data,206,1814,14621.75,444,627,0.299419


In [None]:
bins = [5*i for i in range(20)]
kws = {'edgecolor':'black', 'linewidth':'2'}

sns.distplot(df1[df1['sex']=='male']['age'], color='blue', bins=bins, hist_kws=kws, label='male')
sns.distplot(df1[df1['sex']=='female']['age'], color='red', bins=bins, hist_kws=kws, label='female')
plt.legend()
plt.show()

In [204]:
# age,embarked의 missing data에 중앙 값, 최빈 값을 추가

df1['age'] = df1['age'].fillna(df1['age'].median()) 
df1['embarked'] = df1['embarked'].fillna(df1['embarked'].mode()[0]) 
df1.isnull().sum()/len(df1)

survived       0.0
pclass         0.0
sex            0.0
age            0.0
embarked       0.0
who            0.0
deck           0.0
alone          0.0
family_size    0.0
dtype: float64

In [None]:
def plot_row(src, ref, col_list):
    for index, value in enumerate(col_list):
        plt.subplot(1, len(col_list), index+1)
        sns.countplot(x=value, hue=ref, data=src)
        plt.title(value)
    plt.tight_layout()
    plt.show()

In [None]:
plt.figure(figsize=(15,3))
plot_row(df1, 'survived', ['pclass', 'sex', 'age', 'embarked'])

In [None]:
plt.figure(figsize=(15,3))
plot_row(df1, 'survived', ['who', 'alone', 'family_size', 'deck'])

In [205]:
# 범주형 처리1 
labels, unique = pd.factorize(df1['sex'])
df1['sex'] = labels 
df1['sex'].value_counts()

0    577
1    314
Name: sex, dtype: int64

In [206]:
# 범주형 처리2 
labels, unique = pd.factorize(df1['embarked'])
df1['embarked'] = labels
df1['embarked'].value_counts()

0    646
1    168
2     77
Name: embarked, dtype: int64

In [207]:
# 범주형 처리3
labels, unique = pd.factorize(df1['who'])
df1['who'] = labels
df1['who'].value_counts()

0    537
1    271
2     83
Name: who, dtype: int64

In [208]:
# 범주형 처리4
labels, unique = pd.factorize(df1['alone'])
df1['alone'] = labels
df1['alone'].value_counts()

1    537
0    354
Name: alone, dtype: int64

In [None]:
# data 타입 변경
df1['age'] = df1['age'].astype(int)
df1.drop(['deck'], axis=1, inplace=True)

In [210]:
# 학습
from sklearn.model_selection import train_test_split 

y_df = df1['survived']
x_df = df1.drop(['survived'], axis=1)

x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.25, random_state=5)

In [224]:
# 호지스틱 회귀
from sklearn.linear_model import LogisticRegression 
lr = LogisticRegression(random_state=15)
lr.fit(x_train, y_train)
lr_y_pred = lr.predict(x_test)
lr_y_pred[:5]

array([0, 0, 0, 0, 0], dtype=int64)

In [212]:
# 평가 
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, lr_y_pred))
print(accuracy_score(lr_y_pred, y_test))

0.8430493273542601
0.8430493273542601


In [213]:
lr.score(x_test, y_test)

0.8430493273542601

In [214]:
print('테스트 셋 관측 수 : ', y_test.shape[0])
print('오분류 관측 수 : ', sum(lr.predict(x_test) != y_test))

테스트 셋 관측 수 :  223
오분류 관측 수 :  35


In [215]:
# 예측 확률 반환
lr.predict_proba(x_test)[:3] # more.....

array([[0.86834406, 0.13165594],
       [0.89513474, 0.10486526],
       [0.92551434, 0.07448566]])

In [216]:
# Attributes (속성)
print('classes:', lr.classes_)
print('beta:', lr.coef_)
print('intercept:', lr.intercept_)


classes: [0 1]
beta: [[-1.06173319  1.62013784 -0.01679187  0.25790195  1.13157741 -0.47209086
  -0.45177897]]
intercept: [1.72526284]


In [None]:
# Methods 
# fit(X,y)
# predict(X)
# score(X,y) : accuracy 값 반환 
# decision_function(X) : 클래스 레비을 예측의 확신 점수 / 각 클래스의 분류 초평면으로부터 떨어진 거리 
# predict_proba(X)

In [None]:
lr.decision_function(x_test)

In [223]:
df3 = df1.copy()
df3.head()

Unnamed: 0,survived,pclass,sex,age,embarked,who,alone,family_size
0,0,3,0,22,0,0,0,1
1,1,1,1,38,1,1,0,1
2,1,3,1,26,0,1,1,0
3,1,1,1,35,0,1,0,1
4,0,3,0,35,0,0,1,0


Label Encoding

In [222]:
# label Encoding 
from sklearn.preprocessing import LabelEncoder, OneHotEncoder 

lb_encoder = LabelEncoder()
lb_encoder.fit(df3['sex'])
labels = lb_encoder.transform(df3['sex'])
labels[:5]

array([0, 1, 1, 1, 0], dtype=int64)

In [219]:
lb_encoder.classes_

array([0, 1], dtype=int64)

One-Hot Encoding

In [220]:
df4 = df.copy()
# 2차원 변환
labels = labels.reshape(-1,1)
labels[:5]

array([[0],
       [1],
       [1],
       [1],
       [0]], dtype=int64)

In [221]:
oh_encoder = OneHotEncoder()
oh_encoder.fit(labels)
oh_labels = oh_encoder.transform(labels)

df5 = pd.DataFrame(oh_labels.toarray(), columns=lb_encoder.classes_, dtype=int) 

df6 = pd.concat([df3, df5], axis=1)
df6.head()

Unnamed: 0,survived,pclass,sex,age,embarked,who,alone,family_size,0,1
0,0,3,0,22,0,0,0,1,1,0
1,1,1,1,38,1,1,0,1,0,1
2,1,3,1,26,0,1,1,0,0,1
3,1,1,1,35,0,1,0,1,0,1
4,0,3,0,35,0,0,1,0,1,0
