# 타이타닉 생존자 분석

## 기본 데이터 분석

In [789]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [790]:
# Load test.csv into a DataFrame.
# NOTE(review): `test` is not referenced by any later cell visible in this
# notebook export -- confirm whether it is still needed.
test = pd.read_csv('test.csv')

In [791]:
# Load train.csv; this frame is cleaned, encoded and split into the
# fitting / held-out arrays in the cells below.
train = pd.read_csv('train.csv')

In [792]:
# Remove the columns that are not used as model features.
train = train.drop(
    columns=[
        'PassengerId',
        'Name',
        'Ticket',
        'Cabin',
    ]
)

# Discard every row that still contains a missing value in the remaining
# columns (this must run after 'Cabin' has been removed above).
train = train.dropna(axis=0)

# One-hot encode the categorical columns in a single pass. Pclass is
# intentionally not passed to get_dummies and stays a single column.
train = pd.get_dummies(
    data=train,
    columns=['Sex', 'Embarked'],
    prefix=['Sex', 'Embarked'],
)

In [793]:
# Preview the first five rows of the cleaned, one-hot encoded frame.
train.head(5)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,0,1,0,0,1
1,1,1,38.0,1,0,71.2833,1,0,1,0,0
2,1,3,26.0,0,0,7.925,1,0,0,0,1
3,1,1,35.0,1,0,53.1,1,0,0,0,1
4,0,3,35.0,0,0,8.05,0,1,0,0,1


In [794]:
# Every column except the label is a feature. Index.difference returns the
# remaining labels sorted, so the feature matrix columns end up in
# alphabetical order rather than in the frame's original order.
feature_columns = train.columns.difference(['Survived'])

# Plain NumPy arrays: feature matrix and the matching 'Survived' labels.
data = train[feature_columns].to_numpy()
target = train['Survived'].to_numpy()

In [795]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.3,
    random_state=16,
)

In [796]:
# Baseline model: LightGBM's scikit-learn style classifier constructed with
# no arguments (library defaults), fitted on the training split.
# The `lgb` alias imported here is reused by the later tuning cell.
import lightgbm as lgb
clf = lgb.LGBMClassifier()
clf.fit(x_train,y_train)

LGBMClassifier()

In [797]:
from sklearn.metrics import accuracy_score

# Predict labels for the held-out split with the baseline classifier.
y_pred = clf.predict(x_test)

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric so the value is the same either way, but passing the ground
# truth first keeps this call correct if it is later swapped for an
# asymmetric metric (precision, recall, confusion matrix, ...).
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8364485981308412

In [798]:
# Score the baseline classifier on the data it was fitted on and on the
# held-out split, then report both so the gap between them is visible.
train_score = clf.score(x_train, y_train)
test_score = clf.score(x_test, y_test)

print(f'Training set score : {train_score}')
print(f'Test set score : {test_score}')

Training set score : 0.9457831325301205
Test set score : 0.8364485981308412


## 하이퍼파라미터 튜닝

In [799]:
# Wrap both splits in LightGBM's native Dataset container for lgb.train.
train_ds = lgb.Dataset(x_train, label=y_train)
test_ds = lgb.Dataset(x_test, label=y_test)

# Booster parameters. The former 'application' entry was dropped: it is only
# an alias of 'objective' and duplicated the same value.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.01,
    'verbose': -1,
}

# NOTE(review): the held-out split doubles as the validation set and no early
# stopping is configured, so all 2000 rounds always run and the reported AUC
# is measured on the same rows used for the final accuracy check. Consider a
# separate validation split plus early stopping.
model = lgb.train(
    params,
    train_ds,
    num_boost_round=2000,
    valid_sets=[test_ds],  # documented form is a list of Datasets
)

[1]	valid_0's auc: 0.799499
[2]	valid_0's auc: 0.812169
[3]	valid_0's auc: 0.832451
[4]	valid_0's auc: 0.84127
[5]	valid_0's auc: 0.843776
[6]	valid_0's auc: 0.852873
[7]	valid_0's auc: 0.861134
[8]	valid_0's auc: 0.85575
[9]	valid_0's auc: 0.857096
[10]	valid_0's auc: 0.858071
[11]	valid_0's auc: 0.856307
[12]	valid_0's auc: 0.858999
[13]	valid_0's auc: 0.860763
[14]	valid_0's auc: 0.859649
[15]	valid_0's auc: 0.86002
[16]	valid_0's auc: 0.861134
[17]	valid_0's auc: 0.86262
[18]	valid_0's auc: 0.863455
[19]	valid_0's auc: 0.862155
[20]	valid_0's auc: 0.862805
[21]	valid_0's auc: 0.861691
[22]	valid_0's auc: 0.86132
[23]	valid_0's auc: 0.860206
[24]	valid_0's auc: 0.859928
[25]	valid_0's auc: 0.860438
[26]	valid_0's auc: 0.859881
[27]	valid_0's auc: 0.860345
[28]	valid_0's auc: 0.859231
[29]	valid_0's auc: 0.859324
[30]	valid_0's auc: 0.85951
[31]	valid_0's auc: 0.859788
[32]	valid_0's auc: 0.859928
[33]	valid_0's auc: 0.860206
[34]	valid_0's auc: 0.860299
[35]	valid_0's auc: 0.860763


[1360]	valid_0's auc: 0.883227
[1361]	valid_0's auc: 0.883227
[1362]	valid_0's auc: 0.883505
[1363]	valid_0's auc: 0.883598
[1364]	valid_0's auc: 0.883784
[1365]	valid_0's auc: 0.883691
[1366]	valid_0's auc: 0.883876
[1367]	valid_0's auc: 0.883969
[1368]	valid_0's auc: 0.884062
[1369]	valid_0's auc: 0.883876
[1370]	valid_0's auc: 0.884062
[1371]	valid_0's auc: 0.884248
[1372]	valid_0's auc: 0.884248
[1373]	valid_0's auc: 0.88434
[1374]	valid_0's auc: 0.884805
[1375]	valid_0's auc: 0.884805
[1376]	valid_0's auc: 0.884805
[1377]	valid_0's auc: 0.884712
[1378]	valid_0's auc: 0.885083
[1379]	valid_0's auc: 0.885083
[1380]	valid_0's auc: 0.885362
[1381]	valid_0's auc: 0.885269
[1382]	valid_0's auc: 0.885362
[1383]	valid_0's auc: 0.885454
[1384]	valid_0's auc: 0.885454
[1385]	valid_0's auc: 0.88564
[1386]	valid_0's auc: 0.88564
[1387]	valid_0's auc: 0.885454
[1388]	valid_0's auc: 0.885547
[1389]	valid_0's auc: 0.885362
[1390]	valid_0's auc: 0.885733
[1391]	valid_0's auc: 0.885826
[1392]	vali

In [800]:
# Score the held-out rows with the trained booster. The printed values are
# floats between 0 and 1 (not class labels); they are thresholded at 0.5 in
# the following cell.
y_pred = model.predict(x_test)
print(y_pred)

[0.81448623 0.16424132 0.14219096 0.05745426 0.20255803 0.98354658
 0.9806228  0.10668306 0.96881066 0.8004608  0.13373227 0.92008124
 0.04409191 0.65642635 0.10131262 0.40390504 0.88554925 0.83354964
 0.09886422 0.29995429 0.64914652 0.01444132 0.78321959 0.03275707
 0.09464793 0.79305495 0.57633773 0.55730653 0.27298188 0.11138336
 0.01975637 0.62080442 0.27943472 0.29928297 0.05425084 0.12954714
 0.32843468 0.99437324 0.98927284 0.6471257  0.95443878 0.21353488
 0.17737254 0.16034171 0.37510441 0.97364931 0.3236662  0.10146269
 0.14599309 0.73122055 0.90617455 0.13514277 0.26958184 0.35720914
 0.04542096 0.99284025 0.4182779  0.97665482 0.19525343 0.05813378
 0.07734999 0.23827332 0.14138663 0.09995592 0.19934018 0.07524827
 0.63413679 0.98148375 0.32785621 0.0316764  0.37331541 0.17995895
 0.7763668  0.5280965  0.19116276 0.16353983 0.97376733 0.06994879
 0.43420106 0.70080458 0.8781839  0.61852389 0.64907486 0.72504161
 0.86996971 0.19721947 0.03275012 0.31721296 0.86965767 0.2886

In [801]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Convert the predicted scores into hard labels with a 0.5 cut-off in one
# vectorised step instead of mutating the array element by element.
# The labels stay floats (0.0 / 1.0), matching what the loop used to produce.
y_pred = np.where(y_pred >= .5, 1.0, 0.0)

# scikit-learn metrics expect (y_true, y_pred) in that order; accuracy is
# symmetric, so the value is unchanged, but both calls are now consistent.
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

In [802]:
# Report the confusion matrix (built with y_test as the true labels and the
# thresholded y_pred as predictions) followed by the overall accuracy.
print(cm)
print(accuracy)

[[115  18]
 [ 15  66]]
0.8457943925233645
