In [2]:
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import torch
import torch.nn as nn

In [3]:
# Load the training data and preview the first rows
train = pd.read_csv('train.csv')
train.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
# Count missing values per column
train.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [5]:
# Number of distinct values in each column
train.nunique()

PassengerId     8693
HomePlanet         3
CryoSleep          2
Cabin           6560
Destination        3
Age               80
VIP                2
RoomService     1273
FoodCourt       1507
ShoppingMall    1115
Spa             1327
VRDeck          1306
Name            8473
Transported        2
dtype: int64

In [6]:
# Show the distinct labels of the two low-cardinality string columns
for column in ('HomePlanet', 'Destination'):
    display(train[column].unique())

array(['Europa', 'Earth', 'Mars', nan], dtype=object)

array(['TRAPPIST-1e', 'PSO J318.5-22', '55 Cancri e', nan], dtype=object)

In [7]:
# Data preprocessing
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Turn a raw passenger table into an all-float feature frame.

    The input frame is not modified; a new frame is returned.

    Steps:
      * drop the identifier columns ``PassengerId`` and ``Name``;
      * fill missing spending amounts and ``Age`` with 0, and missing
        ``VIP`` / ``CryoSleep`` with False;
      * cast ``Transported`` to float when the column is present;
      * encode ``HomePlanet`` and ``Destination`` as integer codes;
      * split ``Cabin`` (``deck/num/side``) into ``Deck``, ``CabinNum`` and
        ``CabinSide`` and drop the original column.

    Categorical codes come from fixed vocabularies rather than from the
    values that happen to occur in ``df``. Per-frame category codes shift
    whenever a label (or the missing-value marker) is absent, so the same
    planet or deck could be encoded differently in two files.

    Encodings (missing or unrecognised labels get the "missing" code):
      HomePlanet:  missing=0, Earth=1, Europa=2, Mars=3
      Destination: missing=0, '55 Cancri e'=1, 'PSO J318.5-22'=2, 'TRAPPIST-1e'=3
      Deck:        A=0, B=1, C=2, D=3, E=4, F=5, G=6, T=7, missing=8
      CabinSide:   P=0, S=1, missing=2

    Args:
        df: Raw frame containing at least PassengerId, Name, HomePlanet,
            CryoSleep, Cabin, Destination, Age, VIP and the five spending
            columns. ``Transported`` is optional.

    Returns:
        A new DataFrame in which every column has dtype float.

    Raises:
        KeyError: if a required column is missing (for example when called
            on a frame that has already been preprocessed).
    """
    home_planet_codes = {'Earth': 1, 'Europa': 2, 'Mars': 3}
    destination_codes = {'55 Cancri e': 1, 'PSO J318.5-22': 2, 'TRAPPIST-1e': 3}
    deck_codes = {deck: code for code, deck in enumerate('ABCDEFGT')}
    side_codes = {'P': 0, 'S': 1}

    bool_cols = ['VIP', 'CryoSleep']
    numeric_cols = ['FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Age', 'RoomService']

    # Drop PassengerId and Name (drop returns a copy, so the caller's frame is untouched)
    df = df.drop(['PassengerId', 'Name'], axis=1)

    # Fill nulls before the bool cast: NaN would otherwise become True
    df[numeric_cols] = df[numeric_cols].fillna(0)
    df[bool_cols] = df[bool_cols].fillna(False).astype(bool)
    if 'Transported' in df.columns:
        df['Transported'] = df['Transported'].astype(float)

    # Convert the HomePlanet / Destination strings to numbers; labels absent from
    # the mapping (including NaN) come back as NaN from map() and are sent to 0
    df['HomePlanet'] = df['HomePlanet'].map(home_planet_codes).fillna(0)
    df['Destination'] = df['Destination'].map(destination_codes).fillna(0)

    # Cabin is made of deck/num/side; rows that do not match yield NaN in all three parts
    cabin = df['Cabin'].str.extract(r'(\w)/(\d+)/(\w)')
    df['Deck'] = cabin[0].map(deck_codes).fillna(len(deck_codes))
    df['CabinNum'] = pd.to_numeric(cabin[1], errors='coerce').fillna(0)
    df['CabinSide'] = cabin[2].map(side_codes).fillna(len(side_codes))
    df = df.drop('Cabin', axis=1)
    return df.astype(float)
# Encode the training frame, then confirm no nulls remain and preview the result
train = preprocess(train)
display(train.isnull().sum(), train.head())


HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
Deck            0
CabinNum        0
CabinSide       0
dtype: int64

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,CabinNum,CabinSide
0,2.0,0.0,3.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,3.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,1.0,5.0,0.0,1.0
2,2.0,0.0,3.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,0.0,0.0,0.0,1.0
3,2.0,0.0,3.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,0.0,0.0,0.0,1.0
4,1.0,0.0,3.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,1.0,5.0,1.0,1.0


In [None]:
# Standardization (currently disabled: only the feature column list is computed)
from sklearn.preprocessing import StandardScaler
cols = train.drop(columns='Transported').columns
#train[cols]=StandardScaler().fit_transform(train[cols])
#train.head()
#display(train.to_numpy())

In [1]:
# Separate the label from the features, then hold out 20% of the rows for evaluation
y = train['Transported']
X = train.drop(columns='Transported')
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

NameError: name 'train' is not defined

In [None]:
class Model(nn.Module):
    """Small feed-forward binary classifier: in_features -> 64 -> 32 -> 1.

    Args:
        in_features: Number of input features per sample. When omitted, it
            falls back to the column count of the notebook-level feature
            frame ``X`` (``X.shape[1]``), so ``Model()`` keeps working.

    The forward pass returns a tensor of shape (batch, 1) with values in
    (0, 1), suitable for ``nn.BCELoss``.
    """
    def __init__(self, in_features=None):
        super().__init__()
        if in_features is None:
            # The first layer must match the number of columns (features),
            # not the number of rows (X.shape[0]).
            in_features = X.shape[1]
        self.fc1 = nn.Linear(in_features, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # Non-linearities between the layers; without them the three Linear
        # layers compose into a single linear map.
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.sigmoid(self.fc3(x))
import tqdm
from torchmetrics.functional import r2_score

# Hyperparameters for the PyTorch baseline
lr=0.01
num_epochs=30
validation_split=0.2
batch_size=128
#patience=10
device=torch.device('cpu')

model=Model().to(device)
optimizer=torch.optim.Adam(model.parameters(), lr=lr)
criterion=nn.BCELoss()

# The preprocessed frame is float64, but nn.Linear parameters default to float32,
# so cast here. Labels are reshaped to (N, 1) to match the model's (N, 1) output,
# which BCELoss requires.
X_np=X.to_numpy(dtype='float32')
y_np=y.to_numpy(dtype='float32').reshape(-1, 1)

# Use tensor-specific names: X_train / y_train must stay the DataFrame split from
# the earlier cell because the gradient-boosting cells below fit on them.
X_tr_np, X_val_np, y_tr_np, y_val_np=train_test_split(X_np, y_np, test_size=validation_split, random_state=42)
X_tr_t=torch.from_numpy(X_tr_np).to(device)
X_val_t=torch.from_numpy(X_val_np).to(device)
y_tr_t=torch.from_numpy(y_tr_np).to(device)
y_val_t=torch.from_numpy(y_val_np).to(device)


# Named train_model (not train) so the `train` DataFrame is not shadowed by a function.
def train_model():
    """Run mini-batch training for num_epochs and print per-epoch metrics.

    Uses the module-level model, optimizer, criterion and the X_tr_t / y_tr_t /
    X_val_t / y_val_t tensors. After each epoch it prints the loss of the last
    training batch, the validation loss, and the validation R2 score.
    """
    for epoch in range(num_epochs):
        model.train()
        for i in tqdm.tqdm(range(0, X_tr_t.shape[0], batch_size)):
            X_batch=X_tr_t[i:i+batch_size]
            y_batch=y_tr_t[i:i+batch_size]

            optimizer.zero_grad()
            y_pred=model(X_batch)
            loss=criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            y_pred_val=model(X_val_t)
            val_loss=criterion(y_pred_val, y_val_t)
            # torchmetrics expects (preds, target); R2 is not symmetric in its arguments.
            r2=r2_score(y_pred_val.squeeze(1), y_val_t.squeeze(1))
            print(f'Epoch: {epoch}, Loss: {loss.item()}, Val Loss: {val_loss.item()}, R2 Score: {r2}')
train_model()


In [1]:
# LightGBM
# Early stopping is monitored on a slice carved out of the training split, so that
# X_test stays untouched by model selection and the accuracy below is an unbiased
# hold-out estimate.
X_fit, X_es, y_fit, y_es = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
lgbm = LGBMClassifier(objective='binary', boosting_type='gbdt')
lgbm.fit(X_fit, y_fit, eval_set=[(X_es, y_es)], callbacks=[lgb.early_stopping(70)])
lgbm_y_pred = lgbm.predict(X_test)
accuracy_score(y_test, lgbm_y_pred)

NameError: name 'LGBMClassifier' is not defined

In [None]:
# CatBoost with default hyperparameters, scored on the hold-out split
cb = CatBoostClassifier()
cb.fit(X=X_train, y=y_train)
cb_y_pred = cb.predict(X_test)
accuracy_score(y_true=y_test, y_pred=cb_y_pred)

In [None]:
# XGBoost with default hyperparameters, scored on the hold-out split
xgb = XGBClassifier()
xgb.fit(X=X_train, y=y_train)
xgb_y_pred = xgb.predict(X_test)
accuracy_score(y_true=y_test, y_pred=xgb_y_pred)

In [None]:
# Predict on the competition test file with the LightGBM model and write the submission
submission = pd.read_csv('sample_submission.csv')
test = preprocess(pd.read_csv('test.csv'))
test_pred = lgbm.predict(test)
# Alternative kept for reference: majority vote over the three boosted models
#test_pred=np.median([lgbm.predict(test).astype(int), cb.predict(test).astype(int), xgb.predict(test).astype(int)], axis=0)
submission['Transported'] = test_pred.astype(bool)
submission.to_csv('submission.csv', index=False)