# 1. 데이터 불러오기

In [34]:
import pandas as pd

train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
submission = pd.read_csv('./data/submission.csv')

# 2. 전처리

1. 독립변수 columns 7개 사용
2. Age, Fare 결측치 평균으로 대체
3. Age, SibSp, Parch, Fare 수치 표준화 (로지스틱회귀를 위한)
4. Sex, Embarked, Pclass 카테고리화
5. 독립변수, 종속변수 나누기
6. 데이터 불균형 해결 - SMOTE

In [35]:
# 1.
columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
train = train[columns+['Survived']]
test = test[columns]

# 2.
mean_age = train['Age'].mean()
mean_fare = train['Fare'].mean()

train['Age'] = train['Age'].fillna(mean_age)
test['Age'] = test['Age'].fillna(mean_age)
train['Fare'] = train['Fare'].fillna(mean_fare)
test['Fare'] = test['Fare'].fillna(mean_fare)

# 3.
float_columns = ['Age', 'SibSp', 'Parch', 'Fare']
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(train[float_columns])
train[float_columns] = scaler.transform(train[float_columns])
test[float_columns] = scaler.transform(test[float_columns])

# 4. 
train = pd.get_dummies(train, columns=['Sex','Embarked','Pclass'])
test = pd.get_dummies(test, columns=['Sex','Embarked','Pclass'])

# 5.
train_x = train.drop(columns='Survived')
train_y = train['Survived']
from sklearn.model_selection import train_test_split
train_x, val_x, train_y, val_y  = train_test_split(train_x, train_y, test_size=0.2, random_state=0)

# 6.
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=0)
X_resampled, y_resampled = smote.fit_resample(train_x,list(train_y))
X_resampled['Survived'] = y_resampled
train_dataset = X_resampled

In [36]:
train_dataset.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3,Survived
0,0.0,-0.474545,2.008933,-0.341452,True,False,True,False,False,False,False,True,0
1,0.100109,-0.474545,-0.473674,-0.437007,False,True,False,False,True,False,True,False,0
2,0.100109,0.432793,0.76763,0.096646,False,True,True,False,False,False,True,False,0
3,-0.746389,-0.474545,-0.473674,-0.567631,False,True,True,False,False,False,False,True,0
4,-0.669435,-0.474545,-0.473674,-0.502445,False,True,False,False,True,False,False,True,0


# 3. 모델

In [37]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

### 3-1. 로지스틱 회귀 모델

In [38]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(train_dataset.drop(columns='Survived'),train_dataset['Survived'])

y_pred = model.predict(val_x)
print(confusion_matrix(val_y, y_pred))
print(classification_report(val_y, y_pred))
print('AUC :',accuracy_score(val_y,y_pred))

[[87 23]
 [12 57]]
              precision    recall  f1-score   support

           0       0.88      0.79      0.83       110
           1       0.71      0.83      0.77        69

    accuracy                           0.80       179
   macro avg       0.80      0.81      0.80       179
weighted avg       0.81      0.80      0.81       179

AUC : 0.8044692737430168


### 3-2. 의사결정나무 모델

In [39]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(max_depth=6, random_state=0)
model.fit(train_dataset.drop(columns='Survived'),train_dataset['Survived'])

y_pred = model.predict(val_x)
print(confusion_matrix(val_y, y_pred))
print(classification_report(val_y, y_pred))
print('AUC :',accuracy_score(val_y,y_pred))

[[95 15]
 [16 53]]
              precision    recall  f1-score   support

           0       0.86      0.86      0.86       110
           1       0.78      0.77      0.77        69

    accuracy                           0.83       179
   macro avg       0.82      0.82      0.82       179
weighted avg       0.83      0.83      0.83       179

AUC : 0.8268156424581006


### 3-3. 랜덤 포레스트 모델

In [40]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=0)
model.fit(train.drop(columns='Survived'),train['Survived'])

y_pred = model.predict(val_x)
print(confusion_matrix(val_y, y_pred))
print(classification_report(val_y, y_pred))
print('AUC :',accuracy_score(val_y,y_pred))

[[105   5]
 [ 21  48]]
              precision    recall  f1-score   support

           0       0.83      0.95      0.89       110
           1       0.91      0.70      0.79        69

    accuracy                           0.85       179
   macro avg       0.87      0.83      0.84       179
weighted avg       0.86      0.85      0.85       179

AUC : 0.8547486033519553


### 3-4. XGBoost 모델

In [41]:
from xgboost import XGBClassifier
model = XGBClassifier(n_estimators=200, learning_rate=0.01, max_depth=5, random_state = 0)
model.fit(train.drop(columns='Survived'),train['Survived'])

y_pred = model.predict(val_x)
print(confusion_matrix(val_y, y_pred))
print(classification_report(val_y, y_pred))
print('AUC :',accuracy_score(val_y,y_pred))

[[105   5]
 [ 18  51]]
              precision    recall  f1-score   support

           0       0.85      0.95      0.90       110
           1       0.91      0.74      0.82        69

    accuracy                           0.87       179
   macro avg       0.88      0.85      0.86       179
weighted avg       0.88      0.87      0.87       179

AUC : 0.8715083798882681


xgboost의 성능이 가장 좋다

# 4. 모델 선택, 제출

In [42]:
pick_model = XGBClassifier(n_estimators=200, learning_rate=0.01, max_depth=5, random_state = 0)
pick_model.fit(train_dataset.drop(columns='Survived'),train_dataset['Survived'])
XGB_pred = pick_model.predict(test)

sub1 = submission.copy()
sub1['Survived'] = XGB_pred
sub1.to_csv('./sub/sub_smote_xgboost.csv',index=False)
# 0.7500243427

In [43]:
# 랜덤 포레스트 결과
rf = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=0)
rf.fit(train.drop(columns='Survived'),train['Survived'])

rf_pred = rf.predict(test)
sub2 = submission.copy()
sub2['Survived'] = rf_pred
sub2.to_csv('./sub/sub_smote_randomForest.csv',index=False)
# 0.7574001947