In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

train = pd.read_csv("./dataset/Space_Titanic/train.csv")
test = pd.read_csv("./dataset/Space_Titanic/test.csv")
sub = pd.read_csv("./dataset/Space_Titanic/sample_submission.csv")

train.shape, test.shape, sub.shape

((8693, 14), (4277, 13), (4277, 2))

In [2]:
train.info(), test.info(), sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  

(None, None, None)

In [3]:
sel = [ 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck' ]
X = train[sel]
y = train['Transported']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier as DTC

In [5]:
imputer = SimpleImputer(strategy = "mean")
scaler = MinMaxScaler()
model = DTC()

In [6]:
from sklearn.pipeline import Pipeline

pipe_line = Pipeline([ ("imputer", imputer), ("scaler", scaler), ("model", model) ])
pipe_line.fit(X_train, y_train)

Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', MinMaxScaler()),
                ('model', DecisionTreeClassifier())])

In [7]:
pipe_line.predict(X_test)

array([False,  True, False, ...,  True,  True,  True])

In [8]:
pipe_line.score(X_test, y_test)

0.735050597976081

In [11]:
from sklearn.preprocessing import StandardScaler

imputer2 = SimpleImputer(strategy = "mean")
scaler2 = StandardScaler()
model2 = DTC()

### pipeline 함수 만들기

In [13]:
imputer = SimpleImputer(strategy = "mean")
scaler = MinMaxScaler()
model = DTC()

In [14]:
def pipe_line_fnc(X, imputer, scaler, model):
    X = imputer.transform(X)
    X = scaler.transform(X)
    pred = model.predict(X)
    return pred

pred_Y = pipe_line_fnc(X_test, imputer, scaler, model)

pred_Y[:5]

NotFittedError: This SimpleImputer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

### 학습내용 저장 및 불러오기

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
# from sklearn.externals import joblib
import sklearn.externals
import joblib

In [16]:
sel = [ 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck' ]
X = train[sel]
y = train['Transported']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [17]:
imputer3 = SimpleImputer(strategy = "mean")
scaler3 = MinMaxScaler()
model3 = LogisticRegression()

In [18]:
pipe_line_Log = Pipeline([ ("imputer", imputer3), 
                          ("scaler", scaler3), 
                          ("model", model3) ])

pipe_line_Log.fit(X_train, y_train)
pred = pipe_line_Log.predict(X_test)
pred[0:5]

array([ True,  True, False, False,  True])

In [19]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# 정확도 확인
print( accuracy_score(pred, y_test) )

0.7382704691812327


In [20]:
model3 = KNeighborsClassifier()
pipe_line_knn = Pipeline([ ("imputer", imputer3), 
                          ("scaler", scaler3), 
                          ("model", model3) ])

pipe_line_knn.fit(X_train, y_train)
pred = pipe_line_knn.predict(X_test)

# 정확도 확인
print( accuracy_score(pred, y_test) )

0.7723091076356946


### joblib 이용 모델을 파일로 저장하기

In [21]:
import os

joblib.dump(pipe_line_knn, "./dataset/Space_Titanic/model_pipe_knn.joblib" )
os.listdir("./dataset/Space_Titanic/")

['first_sub.csv',
 'first_sub_2209.csv',
 'model_pipe_knn.joblib',
 'sample_submission.csv',
 'test.csv',
 'train.csv']

In [23]:
import sklearn.externals
import joblib

pipe_knn = joblib.load("./dataset/Space_Titanic/model_pipe_knn.joblib")

pred = pipe_knn.predict(X_test)

# 정확도 확인
print( accuracy_score(pred, y_test) )

0.7723091076356946
