# 1. 데이터 불러오기

In [23]:
import pandas as pd

train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
submission = pd.read_csv('./data/submission.csv')

# 2. 결측치 채우기

In [24]:
mean_age = train['Age'].mean()
mean_fare = train['Fare'].mean()

# ‘Age’ ‘Fare’ feature의 Null 값을 각 feature의 평균(mean)값으로 대체
train['Age'] = train['Age'].fillna(mean_age)
test['Age'] = test['Age'].fillna(mean_age)
train['Fare'] = train['Fare'].fillna(mean_fare)
test['Fare'] = test['Fare'].fillna(mean_fare)

# 3. 데이터 분류 (학습용, 검증용)

In [25]:
train_x = train.drop(columns='Survived')
train_y = train['Survived']

from sklearn.model_selection import train_test_split
train_x, val_x, train_y, val_y  = train_test_split(train_x, train_y, test_size=0.2, random_state=0)

# 4. 모델

### 4-1. 로지스틱 모델
sklearn라이브러리 아닌 다른 라이브러리 사용

In [26]:
# 로지스틱 회귀 모델
import statsmodels.api as sm
train_dataset = pd.concat([train_x,train_y],axis=1)

# C()는 해당변수를 categorical 변수로 인식하도록 변환해주는 역할
# scale()는 수치형 변수를 표준화하기 위해 사용되는 함수
formula = """
Survived ~ C(Pclass)+ C(Sex) + C(Embarked) + scale(Age) + scale(SibSp) + scale(Parch) + scale(Fare) 
"""

model = sm.Logit.from_formula(formula,data=train_dataset)
result = model.fit()

print(result.summary())

Optimization terminated successfully.
         Current function value: 0.446245
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               Survived   No. Observations:                  710
Model:                          Logit   Df Residuals:                      700
Method:                           MLE   Df Model:                            9
Date:                Tue, 04 Jul 2023   Pseudo R-squ.:                  0.3288
Time:                        23:20:01   Log-Likelihood:                -316.83
converged:                       True   LL-Null:                       -472.07
Covariance Type:            nonrobust   LLR p-value:                 1.567e-61
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            2.6579      0.354      7.505      0.000       1.964       3.352
C(Pclass)[T

### 4-2 모델 검증

In [27]:
val_pred = result.predict(val_x)
val_pred_s = val_pred.apply(lambda x:1 if x >=0.5 else 0)

from sklearn.metrics import confusion_matrix
print(confusion_matrix(val_y,val_pred_s))

from sklearn.metrics import classification_report
print(classification_report(val_y,val_pred_s))

[[93 17]
 [18 51]]
              precision    recall  f1-score   support

           0       0.84      0.85      0.84       110
           1       0.75      0.74      0.74        69

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179



평가지표 AUC

In [28]:
from sklearn.metrics import roc_auc_score
print(roc_auc_score(val_y,val_pred_s))

0.792292490118577


In [32]:
# 0과 1로 분류하지 않은 결과
from sklearn.metrics import roc_auc_score
print(roc_auc_score(val_y,val_pred))

0.8665349143610013


# 5. 제출

In [35]:
y_pred = result.predict(test)
y_pred_s = y_pred.apply(lambda x:1 if x>=0.5 else 0)

sub_s = submission.copy()
sub_s['Survived'] = y_pred_s
sub_s.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [30]:
sub_s.to_csv('./sub/sub_logistic_sm.csv',index=False)
# 0.7513875365

In [36]:
# 0과 1로 분류하지 않은 결과
sub = submission.copy()
sub['Survived'] = y_pred
sub.head()

Unnamed: 0,PassengerId,Survived
0,892,0.124315
1,893,0.33018
2,894,0.121448
3,895,0.106257
4,896,0.583853


In [37]:
sub.to_csv('./sub/sub_logistic_sm_float.csv',index=False)
# 0.8091407011	

<hr/>

sklearn 회귀와 비교

In [31]:
s = pd.read_csv('./sub/sub_logisticRegression.csv')
merge_df = pd.merge(submission,s,on='PassengerId')
merge_df[merge_df['Survived_x']!=merge_df['Survived_y']]

Unnamed: 0,PassengerId,Survived_x,Survived_y
118,1010,1,0


sklearn라이브러리 사용과 1행 차이남