In [182]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

# read data

In [183]:
df_train = pd.read_csv('../datasets/train.csv')
df_test = pd.read_csv('../datasets/test.csv')

In [184]:
def convert_passengerID_2_index(df: pd.DataFrame):

    df = df.set_index("PassengerId")
    return df

def split_carbin(df: pd.DataFrame):

    for key, value in enumerate(['deck', 'num', 'side']):
        df[f'Carbin_{value}'] = df['Cabin'].apply(lambda x: str(x).split("/")[key] if len(str(x).split("/")) >= key+1 else None)

    df = df.drop('Cabin', axis=1)
    return df

def drop_columns(df: pd.DataFrame, columns:list):
    
    df = df.drop(columns=columns, axis=1)

    return df

def create_total_bill(df: pd.DataFrame):
    
    df['total_bill'] = df['RoomService'] + df['FoodCourt'] + df['ShoppingMall']+df['Spa'] + df['VRDeck'] 
    df['total_bill'] = df['total_bill'].apply(lambda x: 0.1 if x==0.0 else x)

    return df


In [185]:
df_train_cleansed = (
    df_train.pipe(convert_passengerID_2_index)
    .pipe(split_carbin)
    .pipe(create_total_bill)
    .pipe(drop_columns, ['Name', 'Carbin_num'])
    
)

In [186]:
df_test_cleansed = (
    df_test.pipe(convert_passengerID_2_index)
    .pipe(split_carbin)
    .pipe(create_total_bill)
    .pipe(drop_columns, ['Name', 'Carbin_num'])
    
)

In [187]:
df_train_cleansed.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Carbin_deck,Carbin_side,total_bill
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,P,0.1
0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,S,736.0
0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,S,10383.0
0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,S,5176.0
0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,S,1091.0


# split data into X and y

In [188]:
def split_data_2_x_y(df:pd.DataFrame):

    columns = df.columns 
    X_columns = [i for i in columns if i != 'Transported']
    y_column = 'Transported' if 'Transported' in columns else None 

    if y_column:
        return df[X_columns], df[y_column]
    else:
        return df[X_columns], None

In [189]:
X, y = split_data_2_x_y(df_train_cleansed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [190]:
X_test, _ = split_data_2_x_y(df_test_cleansed)

In [191]:
X_train.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Carbin_deck,Carbin_side,total_bill
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1510_01,Mars,False,TRAPPIST-1e,31.0,False,1226.0,0.0,1.0,0.0,0.0,F,S,1227.0
7253_01,Europa,False,TRAPPIST-1e,26.0,False,0.0,896.0,0.0,690.0,1.0,D,P,1587.0
4714_01,Earth,True,TRAPPIST-1e,24.0,False,0.0,0.0,0.0,0.0,0.0,G,P,0.1
7727_01,Earth,False,55 Cancri e,33.0,False,0.0,0.0,0.0,436.0,224.0,E,S,660.0
3237_01,Mars,False,TRAPPIST-1e,21.0,False,1097.0,0.0,80.0,589.0,0.0,D,P,1766.0


# data pipeline

In [192]:
# create datapipeline functions

In [193]:
num_pipeline = Pipeline([
    ("Fill N/A value", SimpleImputer(strategy="median")),
    ("standardize", StandardScaler())
])

cate_pipeline = Pipeline([
    ("Fill N/A value", SimpleImputer(strategy="most_frequent")),
    ("One Hot Encoder", OneHotEncoder(handle_unknown="ignore"))
])

In [194]:
num_attribs = X_train.select_dtypes(include="float").columns
cat_attribs = X_train.select_dtypes(include="object").columns

In [195]:
# preprocessing = ColumnTransformer([
#     ("num", num_pipeline, make_column_selector(dtype_include=np.number)),
#     ("cat", cate_pipeline, make_column_selector(dtype_include=object))
# ])

In [196]:
preprocessing = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cate_pipeline, cat_attribs)
])

In [197]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7823 entries, 1510_01 to 7775_01
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    7637 non-null   object 
 1   CryoSleep     7624 non-null   object 
 2   Destination   7658 non-null   object 
 3   Age           7659 non-null   float64
 4   VIP           7642 non-null   object 
 5   RoomService   7675 non-null   float64
 6   FoodCourt     7660 non-null   float64
 7   ShoppingMall  7637 non-null   float64
 8   Spa           7669 non-null   float64
 9   VRDeck        7654 non-null   float64
 10  Carbin_deck   7823 non-null   object 
 11  Carbin_side   7647 non-null   object 
 12  total_bill    7030 non-null   float64
dtypes: float64(7), object(6)
memory usage: 855.6+ KB


# Train the model

In [198]:
# Randome Forest Tree
rnd_clf = make_pipeline(preprocessing, RandomForestClassifier(random_state=42))
rnd_clf.fit(X_train, y_train)

In [250]:
# SVC
svc_clf = make_pipeline(preprocessing, SVC(C=5, probability=True, random_state=42))
svc_clf.fit(X_train, y_train)

In [200]:
# knn
knn = make_pipeline(preprocessing, KNeighborsClassifier(n_neighbors = 3))
knn.fit(X_train, y_train)

In [201]:
# xgboost
xg_clf = make_pipeline(preprocessing, XGBClassifier(learning_rate=0.1, max_depth=3, n_estimators=200, gamma=0.2))
xg_clf.fit(X_train, y_train)

In [202]:
# LogisticRegression
lg = make_pipeline(preprocessing, LogisticRegression())
lg.fit(X_train, y_train)

# Tunning the model

### GridSearch CV - Randome Forest Tree

In [203]:
full_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("random_forest", RandomForestClassifier(random_state=42))
])

# param_grid = [
#     {'preprocessing__geo__n_clusters': [5, 8, 10],
#     'random_forest__max_features': [4, 6, 8]},
#     {'preprocessing__geo__n_clusters': [10, 15],
#      'random_forest__max_features': [6, 8, 10]}
# ]
param_grid = [
    {
    'random_forest__max_features': [6, 8, 10],
    'random_forest__n_estimators': [50, 100, 200, 500],
    'random_forest__max_depth': [5, 10, None],
    'random_forest__min_samples_split': [2, 5, 10],
    'random_forest__min_samples_leaf': [1, 2, 4],
    # 'random_forest__max_features': ['auto', 'sqrt']
    }
]
grid_search_rnd_clf = GridSearchCV(full_pipeline, param_grid, cv=3,
                           scoring="f1", #"roc_auc", 
                           n_jobs=-1)
grid_search_rnd_clf.fit(X_train, y_train)



In [204]:
print('Best hyperparameters:', grid_search_rnd_clf.best_params_)
print('Best score:', grid_search_rnd_clf.best_score_)

Best hyperparameters: {'random_forest__max_depth': 10, 'random_forest__max_features': 8, 'random_forest__min_samples_leaf': 4, 'random_forest__min_samples_split': 10, 'random_forest__n_estimators': 50}
Best score: 0.8092304193284318


In [205]:
cv_res = pd.DataFrame(grid_search_rnd_clf.cv_results_)
cv_res.sort_values(by='mean_test_score', ascending=False, inplace=True)
cv_res.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_random_forest__max_depth,param_random_forest__max_features,param_random_forest__min_samples_leaf,param_random_forest__min_samples_split,param_random_forest__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
176,0.186764,0.003492,0.016554,0.002775,10,8,4,10,50,"{'random_forest__max_depth': 10, 'random_fores...",0.8118,0.805225,0.810667,0.80923,0.00287,1
123,1.648165,0.011468,0.122016,0.011823,10,6,2,2,500,"{'random_forest__max_depth': 10, 'random_fores...",0.809115,0.802464,0.814275,0.808618,0.004834,2
137,0.326345,0.007731,0.026162,0.002467,10,6,4,5,100,"{'random_forest__max_depth': 10, 'random_fores...",0.809064,0.801999,0.814078,0.80838,0.004955,3
133,0.324239,0.006073,0.027808,0.002374,10,6,4,2,100,"{'random_forest__max_depth': 10, 'random_fores...",0.809064,0.801999,0.814078,0.80838,0.004955,3
132,0.167916,0.010059,0.014622,2.5e-05,10,6,4,2,50,"{'random_forest__max_depth': 10, 'random_fores...",0.809701,0.803966,0.811429,0.808366,0.00319,5


### GridSearch CV - XGBoost

In [206]:
full_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("xg", XGBClassifier(learning_rate=0.1, max_depth=3, n_estimators=200, gamma=0.2))
])

param_grid = [
    {
    'xg__learning_rate': [0.01, 0.1, 0.5],
    'xg__max_depth': [3, 4, 5],
    'xg__n_estimators': [100, 200, 500],
    'xg__gamma': [0.05, 0.2, 1]
    }
]
grid_search_xg = GridSearchCV(full_pipeline, param_grid, cv=3,
                           scoring="f1", #"roc_auc", 
                           n_jobs=-1)
grid_search_xg.fit(X_train, y_train)

In [207]:
print('Best hyperparameters:', grid_search_xg.best_params_)
print('Best score:', grid_search_xg.best_score_)

Best hyperparameters: {'xg__gamma': 0.05, 'xg__learning_rate': 0.1, 'xg__max_depth': 3, 'xg__n_estimators': 200}
Best score: 0.8167461241242533


In [208]:
cv_res = pd.DataFrame(grid_search_xg.cv_results_)
cv_res.sort_values(by='mean_test_score', ascending=False, inplace=True)
cv_res.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_xg__gamma,param_xg__learning_rate,param_xg__max_depth,param_xg__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
10,0.638032,0.026879,0.012155,0.003082,0.05,0.1,3,200,"{'xg__gamma': 0.05, 'xg__learning_rate': 0.1, ...",0.814948,0.809365,0.825926,0.816746,0.00688,1
42,0.520851,0.013876,0.01126,0.004134,0.2,0.1,5,100,"{'xg__gamma': 0.2, 'xg__learning_rate': 0.1, '...",0.817651,0.816403,0.815315,0.816457,0.000954,2
65,1.516886,0.014825,0.008764,0.000168,1.0,0.1,3,500,"{'xg__gamma': 1, 'xg__learning_rate': 0.1, 'xg...",0.812815,0.809701,0.826667,0.816394,0.007374,3
64,0.621012,0.017229,0.007942,0.000129,1.0,0.1,3,200,"{'xg__gamma': 1, 'xg__learning_rate': 0.1, 'xg...",0.812815,0.809683,0.826667,0.816388,0.007379,4
37,0.621316,0.011321,0.008083,0.000175,0.2,0.1,3,200,"{'xg__gamma': 0.2, 'xg__learning_rate': 0.1, '...",0.813853,0.812802,0.820685,0.81578,0.003495,5


### GridSearch CV - SVC

In [209]:
full_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("svc", SVC(kernel='rbf', probability=True, random_state=42))
])

param_grid = [
    {
    'svc__gamma': [0.005, 0.01, 0.5],
    'svc__C': [10, 15, 20],
    'svc__coef0': [0.1, 0.5, 1],
    'svc__degree': [3, 5]
    }
]
grid_search_svc = GridSearchCV(full_pipeline, param_grid, cv=3,
                           scoring="f1", #"roc_auc", 
                           n_jobs=-1)
grid_search_svc.fit(X_train, y_train)



In [210]:
print('Best hyperparameters:', grid_search_svc.best_params_)
print('Best score:', grid_search_svc.best_score_)

Best hyperparameters: {'svc__C': 20, 'svc__coef0': 0.1, 'svc__degree': 3, 'svc__gamma': 0.01}
Best score: 0.8086550192719907


In [211]:
cv_res = pd.DataFrame(grid_search_svc.cv_results_)
cv_res.sort_values(by='mean_test_score', ascending=False, inplace=True)
cv_res.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svc__C,param_svc__coef0,param_svc__degree,param_svc__gamma,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
43,2.610192,0.061181,0.379539,0.012704,20,0.5,3,0.01,"{'svc__C': 20, 'svc__coef0': 0.5, 'svc__degree...",0.806156,0.803179,0.81663,0.808655,0.005768,1
52,2.692915,0.047655,0.320429,0.001741,20,1.0,5,0.01,"{'svc__C': 20, 'svc__coef0': 1, 'svc__degree':...",0.806156,0.803179,0.81663,0.808655,0.005768,1
37,2.812893,0.05703,0.364842,0.029427,20,0.1,3,0.01,"{'svc__C': 20, 'svc__coef0': 0.1, 'svc__degree...",0.806156,0.803179,0.81663,0.808655,0.005768,1
49,2.573155,0.038431,0.380321,0.056539,20,1.0,3,0.01,"{'svc__C': 20, 'svc__coef0': 1, 'svc__degree':...",0.806156,0.803179,0.81663,0.808655,0.005768,1
40,2.674205,0.040554,0.372555,0.04149,20,0.1,5,0.01,"{'svc__C': 20, 'svc__coef0': 0.1, 'svc__degree...",0.806156,0.803179,0.81663,0.808655,0.005768,1


## Stacking

In [212]:
estimators = [
    ('rnd_best_clf', RandomForestClassifier(max_depth=10, max_features=10, min_samples_leaf=4,
                                       min_samples_split=10, n_estimators =100,
                                       random_state=42)),
    ('svc', SVC(probability=True, random_state=42)),
    ('best_svc', SVC(C=20, coef0=0.1, degree=3, gamma=0.01, probability=True)),
    ('rnd_clf', RandomForestClassifier(random_state=42)),
    ('xgboost', XGBClassifier(gamma=0.05, learning_rate=0.1, max_depth=4, n_estimators=100))
    
]

stacking_classifer = StackingClassifier(
    estimators=estimators,
    final_estimator=XGBClassifier(learning_rate=0.1, max_depth=3, n_estimators=200, gamma=0.2),
    cv=5,
    n_jobs=-1
)
stacking_clf = Pipeline([
    ('preprocessing', preprocessing),
    ('stacking_classifier', stacking_classifer)
])
stacking_clf.fit(X_train, y_train)

# Voting

In [213]:
estimators = [
    ('rnd_best_clf', RandomForestClassifier(max_depth=10, max_features=10, min_samples_leaf=4,
                                       min_samples_split=10, n_estimators =100,
                                       random_state=42)),
    ('svc', SVC(C=5, probability=True, random_state=42)),
    ('best_svc', SVC(C=20, coef0=0.1, degree=3, gamma=0.01, probability=True)),
    ('rnd_clf', RandomForestClassifier(random_state=42)),
    ('xgboost', XGBClassifier(gamma=0.05, learning_rate=0.1, max_depth=4, n_estimators=100))
]

voting_classifer = VotingClassifier(
    estimators=estimators,
    voting='soft',
    n_jobs=-1
)
voting_clf = Pipeline([
    ('preprocessing', preprocessing),
    ('stacking_classifier', stacking_classifer)
])
voting_clf.fit(X_train, y_train)

# Evaluate the performance

## Evaluate the performance on the training set

In [214]:
# Randome Forest Tree
y_proba = rnd_clf.predict_proba(X_train)[:, 1]
y_pred = rnd_clf.predict(X_train)
auc_roc = roc_auc_score(y_train, y_proba)
f1 = f1_score(y_train, y_pred)

# Print the AUC-ROC score
print('AUC-ROC score:', auc_roc)
print('F1 score:', f1)

AUC-ROC score: 0.9904827159564166
F1 score: 0.9538690476190477


In [251]:
# SVC
y_proba = svc_clf.predict_proba(X_train)[:, 1]
y_pred = svc_clf.predict(X_train)
auc_roc = roc_auc_score(y_train, y_proba)
f1 = f1_score(y_train, y_pred)

# Print the AUC-ROC score
print('AUC-ROC score:', auc_roc)
print('F1 score:', f1)

AUC-ROC score: 0.9017773379895455
F1 score: 0.824669603524229


In [216]:
# SVC Grid Search CV
y_proba = grid_search_svc.predict_proba(X_train)[:, 1]
y_pred = grid_search_svc.predict(X_train)
auc_roc = roc_auc_score(y_train, y_proba)
f1 = f1_score(y_train, y_pred)

# Print the AUC-ROC score
print('AUC-ROC score:', auc_roc)
print('F1 score:', f1)

AUC-ROC score: 0.8915791273681106
F1 score: 0.8141351518908866


In [217]:
# Randome Forest Tree Grid Search CV
y_proba = grid_search_rnd_clf.predict_proba(X_train)[:, 1]
y_pred = grid_search_rnd_clf.predict(X_train)
auc_roc = roc_auc_score(y_train, y_proba)
f1 = f1_score(y_train, y_pred)

# Print the AUC-ROC score
print('AUC-ROC score:', auc_roc)
print('F1 score:', f1)

AUC-ROC score: 0.9290274744755487
F1 score: 0.8426204819277108


In [218]:
# knn
y_proba = knn.predict_proba(X_train)[:, 1]
y_pred = knn.predict(X_train)
auc_roc = roc_auc_score(y_train, y_proba)
f1 = f1_score(y_train, y_pred)

# Print the AUC-ROC score
print('AUC-ROC score:', auc_roc)
print('F1 score:', f1)

AUC-ROC score: 0.9329592584051323
F1 score: 0.8569250317662007


In [219]:
# xgboost
y_proba = xg_clf.predict_proba(X_train)[:, 1]
y_pred = xg_clf.predict(X_train)
auc_roc = roc_auc_score(y_train, y_proba)
f1 = f1_score(y_train, y_pred)

# Print the AUC-ROC score
print('AUC-ROC score:', auc_roc)
print('F1 score:', f1)

AUC-ROC score: 0.9149627487339747
F1 score: 0.8305810397553516


In [220]:
# Search CV - xgboost

y_proba = grid_search_xg.predict_proba(X_train)[:, 1]
y_pred = grid_search_xg.predict(X_train)
auc_roc = roc_auc_score(y_train, y_proba)
f1 = f1_score(y_train, y_pred)

# Print the AUC-ROC score
print('AUC-ROC score:', auc_roc)
print('F1 score:', f1)

AUC-ROC score: 0.9155833090168615
F1 score: 0.8308408923755821


In [221]:
# lg
y_proba = lg.predict_proba(X_train)[:, 1]
y_pred = lg.predict(X_train)
auc_roc = roc_auc_score(y_train, y_proba)
f1 = f1_score(y_train, y_pred)

# Print the AUC-ROC score
print('AUC-ROC score:', auc_roc)
print('F1 score:', f1)

AUC-ROC score: 0.8819079697367128
F1 score: 0.797305725333666


In [222]:
# Stacking
y_proba = stacking_clf.predict_proba(X_train)[:, 1]
y_pred = stacking_clf.predict(X_train)
auc_roc = roc_auc_score(y_train, y_proba)
f1 = f1_score(y_train, y_pred)

# Print the AUC-ROC score
print('AUC-ROC score:', auc_roc)
print('F1 score:', f1)


AUC-ROC score: 0.9259033307814112
F1 score: 0.8324847250509165


In [223]:
# voting
y_proba = voting_clf.predict_proba(X_train)[:, 1]
y_pred = voting_clf.predict(X_train)
auc_roc = roc_auc_score(y_train, y_proba)
f1 = f1_score(y_train, y_pred)

# Print the AUC-ROC score
print('AUC-ROC score:', auc_roc)
print('F1 score:', f1)


AUC-ROC score: 0.9259033307814112
F1 score: 0.8324847250509165


### Analyse Features

In [224]:
grid_search_model = grid_search_rnd_clf.best_estimator_
# feature_importances = grid_search_model
feature_importances = grid_search_model["random_forest"].feature_importances_
feature_importances.round(4)

sorted(zip(feature_importances, grid_search_model["preprocessing"].get_feature_names_out()), reverse=True)[:20]

[(0.1473930751351653, 'cat__CryoSleep_True'),
 (0.13248608804962086, 'num__total_bill'),
 (0.10828634553739785, 'num__Spa'),
 (0.10249393987391306, 'cat__CryoSleep_False'),
 (0.08225238695592116, 'num__FoodCourt'),
 (0.08174521348482224, 'num__VRDeck'),
 (0.07546629878766548, 'num__RoomService'),
 (0.057216926858965914, 'num__Age'),
 (0.053034988315429896, 'num__ShoppingMall'),
 (0.027669876953494876, 'cat__HomePlanet_Earth'),
 (0.021877901675473748, 'cat__Carbin_deck_G'),
 (0.017442484175296924, 'cat__HomePlanet_Europa'),
 (0.016539541406891903, 'cat__Carbin_deck_E'),
 (0.012961977798554633, 'cat__Carbin_deck_F'),
 (0.012605151219294704, 'cat__Carbin_side_S'),
 (0.010522779241722697, 'cat__Carbin_side_P'),
 (0.007637193268740191, 'cat__Carbin_deck_C'),
 (0.007432724855283484, 'cat__HomePlanet_Mars'),
 (0.006184551202688218, 'cat__Destination_55 Cancri e'),
 (0.005986661513862615, 'cat__Destination_TRAPPIST-1e')]

## Evaluate the performance on the validation set

In [225]:
# randome forest tree
y_proba = rnd_clf.predict_proba(X_val)[:, 1]
y_pred = rnd_clf.predict(X_val)
auc_roc = roc_auc_score(y_val, y_proba)
f1 = f1_score(y_val, y_pred)

# Print the AUC-ROC score
print('AUC-ROC score:', auc_roc)
print('F1 score:', f1)

AUC-ROC score: 0.8612055339102669
F1 score: 0.7727272727272727


In [252]:
# SVC
y_proba = svc_clf.predict_proba(X_val)[:, 1]
y_pred = svc_clf.predict(X_val)
auc_roc = roc_auc_score(y_val, y_proba)
f1 = f1_score(y_val, y_pred)

# Print the AUC-ROC score
print('AUC-ROC score:', auc_roc)
print('F1 score:', f1)

AUC-ROC score: 0.8673299464477349
F1 score: 0.7900113507377979


In [227]:
# gridsearchCV - SVC
y_proba = grid_search_svc.predict_proba(X_val)[:, 1]
y_pred = grid_search_svc.predict(X_val)
auc_roc = roc_auc_score(y_val, y_proba)
f1 = f1_score(y_val, y_pred)

# Print the AUC-ROC score
print('AUC-ROC score:', auc_roc)
print('F1 score:', f1)

AUC-ROC score: 0.869013697326616
F1 score: 0.7857935627081022


In [228]:
# gridsearchCV - Random Forest Tree
y_proba = grid_search_rnd_clf.predict_proba(X_val)[:, 1]
y_pred = grid_search_rnd_clf.predict(X_val)
auc_roc = roc_auc_score(y_val, y_proba)
f1 = f1_score(y_val, y_pred)

# Print the AUC-ROC score
print('AUC-ROC score:', auc_roc)
print('F1 score:', f1)

AUC-ROC score: 0.8733882777105216
F1 score: 0.7890535917901939


In [229]:
# knn
y_proba = knn.predict_proba(X_val)[:, 1]
y_pred = knn.predict(X_val)
auc_roc = roc_auc_score(y_val, y_proba)
f1 = f1_score(y_val, y_pred)

# Print the AUC-ROC score
print('AUC-ROC score:', auc_roc)
print('F1 score:', f1)

AUC-ROC score: 0.8462050845575991
F1 score: 0.767962308598351


In [230]:
# xgboost
y_proba = xg_clf.predict_proba(X_train)[:, 1]
y_pred = xg_clf.predict(X_train)
auc_roc = roc_auc_score(y_train, y_proba)
f1 = f1_score(y_train, y_pred)

# Print the AUC-ROC score
print('AUC-ROC score:', auc_roc)
print('F1 score:', f1)

AUC-ROC score: 0.9149627487339747
F1 score: 0.8305810397553516


In [231]:
# lg
y_proba = lg.predict_proba(X_train)[:, 1]
y_pred = lg.predict(X_train)
auc_roc = roc_auc_score(y_train, y_proba)
f1 = f1_score(y_train, y_pred)

# Print the AUC-ROC score
print('AUC-ROC score:', auc_roc)
print('F1 score:', f1)

AUC-ROC score: 0.8819079697367128
F1 score: 0.797305725333666


In [232]:
# searchgrid cv - xgboost
y_proba = grid_search_xg.predict_proba(X_train)[:, 1]
y_pred = grid_search_xg.predict(X_train)
auc_roc = roc_auc_score(y_train, y_proba)
f1 = f1_score(y_train, y_pred)

# Print the AUC-ROC score
print('AUC-ROC score:', auc_roc)
print('F1 score:', f1)

AUC-ROC score: 0.9155833090168615
F1 score: 0.8308408923755821


In [233]:
# Stacking
y_proba = stacking_clf.predict_proba(X_val)[:, 1]
y_pred = stacking_clf.predict(X_val)
auc_roc = roc_auc_score(y_val, y_proba)
f1 = f1_score(y_val, y_pred)

# Print the AUC-ROC score
print('AUC-ROC score:', auc_roc)
print('F1 score:', f1)


AUC-ROC score: 0.8720640089659074
F1 score: 0.7808219178082192


In [234]:
# voting
y_proba = voting_clf.predict_proba(X_val)[:, 1]
y_pred = voting_clf.predict(X_val)
auc_roc = roc_auc_score(y_val, y_proba)
f1 = f1_score(y_val, y_pred)

# Print the AUC-ROC score
print('AUC-ROC score:', auc_roc)
print('F1 score:', f1)


AUC-ROC score: 0.8720640089659074
F1 score: 0.7808219178082192


# output data

In [235]:
# randome forest tree
y_test = rnd_clf.predict(X_test)
output = pd.DataFrame(y_test, index=X_test.index, columns=['Transported'])
output.to_csv('submission.csv') # 0.7912

In [236]:
# SVC
y_test = svc_clf.predict(X_test)
output = pd.DataFrame(y_test, index=X_test.index, columns=['Transported'])
output.to_csv('submission.csv') # 0.80406

In [237]:
# gridsearchCV - SVC
y_test = grid_search_svc.predict(X_test)
output = pd.DataFrame(y_test, index=X_test.index, columns=['Transported'])
output.to_csv('submission.csv') # 0.80173

In [238]:
# gridsearchCV - Random Forest Tree
y_test = grid_search_rnd_clf.predict(X_test)
output = pd.DataFrame(y_test, index=X_test.index, columns=['Transported'])
output.to_csv('submission.csv') # 0.7877

In [239]:
# knn
y_test = knn.predict(X_test)
output = pd.DataFrame(y_test, index=X_test.index, columns=['Transported'])
output.to_csv('submission.csv')

In [240]:
# xgboost
y_test = xg_clf.predict(X_test)
y_test = [[False, True][i] for i in y_test ]
output = pd.DataFrame(y_test, index=X_test.index, columns=['Transported'])
output.to_csv('submission.csv') # 0.80243

In [241]:
# Search CV - xgboost
y_test = grid_search_xg.predict(X_test)
y_test = [[False, True][i] for i in y_test ]
output = pd.DataFrame(y_test, index=X_test.index, columns=['Transported'])
output.to_csv('submission.csv') 

In [242]:
# stacking
y_test = stacking_clf.predict(X_test)
output = pd.DataFrame(y_test, index=X_test.index, columns=['Transported'])
output.to_csv('submission.csv') # 0.79939

In [243]:
# voting
y_test = voting_clf.predict(X_test)
output = pd.DataFrame(y_test, index=X_test.index, columns=['Transported'])
output.to_csv('submission.csv') 