In [13]:
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [14]:
# Load the training data and preview the first rows.
train = pd.read_csv('train.csv')
train.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [15]:
# Count the missing values in each column.
train.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [16]:
# Data preprocessing
# Fixed label -> integer tables for the categorical columns. The codes are fixed
# rather than derived from the labels present in a given frame, so the train and
# test frames (which are encoded in separate calls) always agree.
# NOTE(review): the label sets are assumed to be the complete vocabularies of the
# dataset — confirm with e.g. train['HomePlanet'].unique() on the raw file.
_HOME_PLANET_CODES = {'Earth': 1, 'Europa': 2, 'Mars': 3}
_DESTINATION_CODES = {'55 Cancri e': 1, 'PSO J318.5-22': 2, 'TRAPPIST-1e': 3}
_DECK_CODES = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'T': 7}
_CABIN_SIDE_CODES = {'P': 0, 'S': 1}

# Code used when the label is missing (or not in the table above).
_HOME_PLANET_MISSING = 0
_DESTINATION_MISSING = 0
_DECK_MISSING = 8
_CABIN_SIDE_MISSING = 2

_NUMERIC_COLUMNS = ['FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Age', 'RoomService']
_FLAG_COLUMNS = ['VIP', 'CryoSleep']


def _encode_labels(labels: pd.Series, codes: dict, missing_code: int) -> pd.Series:
    """Map labels to fixed integer codes; NaN and unknown labels become ``missing_code``."""
    # Series.map leaves NaN for anything that is not a key of ``codes``.
    return labels.map(codes).fillna(missing_code).astype(int)


def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Turn a raw passenger frame into an all-numeric/boolean feature frame.

    Steps:
      * drop ``PassengerId`` and ``Name``;
      * fill missing numeric values with 0 and missing ``VIP``/``CryoSleep`` with False;
      * cast ``Transported`` to float when the column is present;
      * encode ``HomePlanet`` and ``Destination`` as integers (0 = missing/unknown);
      * split ``Cabin`` (``deck/num/side``) into ``Deck``, ``CabinNum`` and
        ``CabinSide`` and drop ``Cabin``; rows without a parsable cabin get
        ``Deck=8``, ``CabinNum=0``, ``CabinSide=2``.

    Args:
        df: Raw frame containing at least the columns named above
            (``Transported`` is optional).

    Returns:
        A new DataFrame; ``df`` itself is not modified.

    Raises:
        KeyError: If ``PassengerId`` or ``Name`` is absent, e.g. when the function
            is called a second time on its own output.
    """
    # Identifier columns are dropped; this also yields a new frame to work on.
    df = df.drop(['PassengerId', 'Name'], axis=1)

    # Missing values: amounts and age become 0, flags become False.
    df[_NUMERIC_COLUMNS] = df[_NUMERIC_COLUMNS].fillna(0)
    df[_FLAG_COLUMNS] = df[_FLAG_COLUMNS].fillna(False).astype(bool)
    if 'Transported' in df.columns:
        df['Transported'] = df['Transported'].astype(float)

    # HomePlanet / Destination: strings -> fixed integer codes.
    df['HomePlanet'] = _encode_labels(df['HomePlanet'], _HOME_PLANET_CODES, _HOME_PLANET_MISSING)
    df['Destination'] = _encode_labels(df['Destination'], _DESTINATION_CODES, _DESTINATION_MISSING)

    # Cabin has the form deck/num/side; rows that do not match yield NaN in every part.
    cabin_parts = df['Cabin'].str.extract(r'(\w)/(\d+)/(\w)')
    df['Deck'] = _encode_labels(cabin_parts[0], _DECK_CODES, _DECK_MISSING)
    df['CabinNum'] = pd.to_numeric(cabin_parts[1]).fillna(0).astype(int)
    df['CabinSide'] = _encode_labels(cabin_parts[2], _CABIN_SIDE_CODES, _CABIN_SIDE_MISSING)
    return df.drop('Cabin', axis=1)
# Replace the raw training frame with its preprocessed version.
# NOTE: this cell is not re-runnable on its own — preprocess() drops
# 'PassengerId' and 'Name', so running it again on the already processed frame
# raises KeyError. Re-run the cell that loads train.csv first.
train = train.pipe(preprocess)

train.head(5)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,CabinNum,CabinSide
0,2,False,3,39.0,False,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0
1,1,False,3,24.0,False,109.0,9.0,25.0,549.0,44.0,1.0,5,0,1
2,2,False,3,58.0,True,43.0,3576.0,0.0,6715.0,49.0,0.0,0,0,1
3,2,False,3,33.0,False,0.0,1283.0,371.0,3329.0,193.0,0.0,0,0,1
4,1,False,3,16.0,False,303.0,70.0,151.0,565.0,2.0,1.0,5,1,1


In [17]:
# Separate the target from the features, then hold out 20% of the rows for
# validation (fixed seed so the split is reproducible).
y = train['Transported']
X = train.drop(columns=['Transported'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Logistic regression: fit on the training split, then report accuracy on the
# held-out split (the last expression is displayed as the cell output).
# max_iter is set explicitly to 5000 — presumably so the solver converges on
# these features; confirm before lowering it.
logistic = LogisticRegression(max_iter=5000).fit(X_train, y_train)
y_pred = logistic.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7711328349626222

In [23]:
# Predict on the competition test set and write the submission file.
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')
test = test.pipe(preprocess)
test_pred = logistic.predict(test)
# Predictions are attached by position — this assumes sample_submission.csv lists
# its rows in the same order as test.csv (TODO confirm).
# The model was trained on a float target, so predictions are cast back to bool.
submission = submission.assign(Transported=test_pred.astype(bool))
submission.to_csv('submission.csv', index=False)