In [58]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split

df = sns.load_dataset('titanic')
X_train, X_test, y_train, y_test = train_test_split(df, df['survived'], test_size=0.2, random_state=42, stratify=df['survived'])
X_train = X_train.drop(['alive', 'survived'], axis=1)
X_test = X_test.drop(['alive', 'survived'], axis=1)

#사용자 코드
#1. 결측치 입력
missing = ['age']
for i in missing:
    X_train[i] = X_train[i].fillna(X_train[i].mean())
    X_test[i] = X_test[i].fillna(X_test[i].mean())
    
X_train['deck'] =  X_train['deck'].fillna('C')
X_test['deck'] =  X_test['deck'].fillna('C')

X_train['embarked'] = X_train['embarked'].fillna('S')
X_test['embarked'] = X_test['embarked'].fillna('S')

X_train['embark_town'] = X_train['embark_town'].fillna('S')
X_test['embark_town'] = X_test['embark_town'].fillna('S')

#2. 라벨인코딩
from sklearn.preprocessing import LabelEncoder
label =  ['sex', 'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town', 'alone']
X_train[label] = X_train[label].apply(LabelEncoder().fit_transform)
X_test[label] = X_test[label].apply(LabelEncoder().fit_transform)

#.3 데이터 타입변환, 더미
print(X_train.dtypes)
dtype = ['pclass', 'sex', 'class']
for i in X_train[dtype]:
    X_train[i] = X_train[i].astype('category')
for i in X_test[dtype]:
    X_test[i] = X_test[i].astype('category')

X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

#4. 파생변수
X_train['age_qcut'] = pd.qcut(X_train['age'], 5, labels=False)
X_test['age_qcut'] = pd.qcut(X_test['age'], 5, labels=False)

#5. 스케일
from sklearn.preprocessing import MinMaxScaler
scaler = ['age', 'fare']
min = MinMaxScaler()
min.fit(X_train[scaler])

X_train[scaler] = min.transform(X_train[scaler])
X_test[scaler] = min.transform(X_test[scaler])

#6. 데이터 분리
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)



pclass           int64
sex              int64
age            float64
sibsp            int64
parch            int64
fare           float64
embarked         int64
class            int64
who              int64
adult_male       int64
deck             int64
embark_town      int64
alone            int64
dtype: object


In [59]:
#7. 모형학습, 앙상블
from sklearn.linear_model import LogisticRegression
model1 = LogisticRegression()
model1.fit(X_train, y_train)
pred1 = pd.DataFrame(model1.predict_proba(X_valid))
pred1

Unnamed: 0,0,1
0,0.066004,0.933996
1,0.604567,0.395433
2,0.229059,0.770941
3,0.930458,0.069542
4,0.845249,0.154751
...,...,...
138,0.816158,0.183842
139,0.865457,0.134543
140,0.839259,0.160741
141,0.044794,0.955206


In [60]:
from sklearn.ensemble import RandomForestClassifier
model2 = RandomForestClassifier()
model2.fit(X_train, y_train)
pred2 = pd.DataFrame(model2.predict_proba(X_valid))
model2
pred2

Unnamed: 0,0,1
0,0.060000,0.940000
1,0.780000,0.220000
2,0.100000,0.900000
3,0.997500,0.002500
4,0.880000,0.120000
...,...,...
138,1.000000,0.000000
139,0.957381,0.042619
140,1.000000,0.000000
141,0.000000,1.000000


In [61]:
from sklearn.ensemble import VotingClassifier
model3 = VotingClassifier(estimators = [('logistic', model1), ('random', model2)], voting='soft')
model3.fit(X_train, y_train)
pred3 = pd.DataFrame(model3.predict_proba(X_valid))
pred3


Unnamed: 0,0,1
0,0.053002,0.946998
1,0.702284,0.297716
2,0.139530,0.860470
3,0.965229,0.034771
4,0.847625,0.152375
...,...,...
138,0.908079,0.091921
139,0.918562,0.081438
140,0.919630,0.080370
141,0.042397,0.957603


In [62]:
#7. 모형학습, 앙상블
from sklearn.linear_model import LogisticRegression
model1 = LogisticRegression()
model1.fit(X_train, y_train)
pred1 = pd.DataFrame(model1.predict_proba(X_valid))

from sklearn.ensemble import RandomForestClassifier
model2 = RandomForestClassifier()
model2.fit(X_train, y_train)
pred2 = pd.DataFrame(model2.predict_proba(X_valid))

from sklearn.ensemble import VotingClassifier
model3 = VotingClassifier(estimators = [('logistic', model1), ('random', model2)], voting='soft')
model3.fit(X_train, y_train)
pred3 = pd.DataFrame(model3.predict_proba(X_valid))

print(pred3)

            0         1
0    0.103002  0.896998
1    0.697284  0.302716
2    0.144530  0.855470
3    0.965229  0.034771
4    0.862625  0.137375
..        ...       ...
138  0.908079  0.091921
139  0.917312  0.082688
140  0.919630  0.080370
141  0.027397  0.972603
142  0.490595  0.509405

[143 rows x 2 columns]


In [63]:
#9. 모형평가
from sklearn.metrics import roc_auc_score
print('로지스틱', roc_auc_score(y_valid, pred1.iloc[:, 1]))
print('랜포', roc_auc_score(y_valid, pred2.iloc[:, 1]))
print('보팅', roc_auc_score(y_valid, pred3.iloc[:, 1]))

로지스틱 0.8560950413223141
랜포 0.8412190082644628
보팅 0.8643595041322314


In [67]:
model5 = RandomForestClassifier()
model5

RandomForestClassifier()

In [65]:
#10. 하이퍼파라미터 튜닝
from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators':[50, 100], 'max_depth':[4,6]}
model5 = RandomForestClassifier()
clf = GridSearchCV(estimator=model5, param_grid=parameters, cv=3)
clf.fit(X_train, y_train)
print('최적의 파라미터', clf.best_params_)

최적의 파라미터 {'max_depth': 6, 'n_estimators': 100}


In [68]:
#11. 파일저장
result = pd.DataFrame(model3.predict_proba(X_test))
result = result.iloc[:,1]
pd.DataFrame({'id':X_test.index, 'result':result}).to_csv('00300_11.csv', index=False)

In [69]:
# 확인
check = pd.read_csv('00300_11.csv')
check

Unnamed: 0,id,result
0,565,0.205380
1,160,0.093969
2,553,0.059872
3,860,0.091200
4,241,0.744738
...,...,...
174,880,0.844054
175,91,0.144595
176,883,0.181882
177,473,0.919581
