In [1]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split

<br>
<br>
<br>

### Data collection & inspection

Loading preprocessed data

In [2]:
# training data
X = pd.read_csv("../data/x_train_preprocessed.csv")
y = pd.read_csv("../data/y_train_preprocessed.csv")
# test data
X_test = pd.read_csv("../data/x_test_preprocessed.csv")

<br>

Inspecting data, checking if data is loaded properly

In [3]:
# training data
X.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,NumInGroup,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,...,Deck_T,Side_P,Side_S,CabinNumberCategory_a,CabinNumberCategory_b,CabinNumberCategory_c,CabinNumberCategory_d,CryoSleep,VIP,IsMale
0,0.711945,-0.655221,-0.667333,-0.640175,-0.683054,-0.657316,-0.491161,0,1,0,...,0,1,0,1,0,0,0,0,0,1
1,-0.334037,1.056551,0.112472,0.611179,1.574204,0.716881,-0.491161,1,0,0,...,0,0,1,1,0,0,0,0,0,0
2,2.036857,0.722866,2.10372,-0.640175,2.469368,0.754916,-0.491161,0,1,0,...,0,0,1,1,0,0,0,0,1,1
3,0.293552,-0.655221,1.756743,1.633126,2.218411,1.244369,0.457443,0,1,0,...,0,0,1,1,0,0,0,0,0,1
4,-0.891895,1.426747,0.776288,1.289373,1.584463,-0.260719,-0.491161,1,0,0,...,0,0,1,1,0,0,0,0,0,0


In [4]:
# testing data
X_test.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,NumInGroup,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,...,Deck_T,Side_P,Side_S,CabinNumberCategory_a,CabinNumberCategory_b,CabinNumberCategory_c,CabinNumberCategory_d,CryoSleep,VIP,IsMale
0,-0.124841,-0.655221,-0.667333,-0.640175,-0.683054,-0.657316,-0.491161,1,0,0,...,0,0,1,1,0,0,0,1,0,0
1,-0.682698,-0.655221,0.112472,-0.640175,2.159451,-0.657316,-0.491161,1,0,0,...,0,0,1,1,0,0,0,0,0,0
2,0.154088,-0.655221,-0.667333,-0.640175,-0.683054,-0.657316,-0.491161,0,1,0,...,0,0,1,1,0,0,0,1,0,1
3,0.642213,-0.655221,2.313877,-0.640175,1.178585,1.643438,-0.491161,0,1,0,...,0,0,1,1,0,0,0,0,0,1
4,-0.612966,0.21802,-0.667333,1.839107,-0.683054,-0.657316,-0.491161,1,0,0,...,0,0,1,1,0,0,0,0,0,0


In [5]:
# training target
y.head()

Unnamed: 0,Transported
0,0
1,1
2,0
3,0
4,1


<br>

Checking for null values in the datasets

In [6]:
print(X.isna().any().value_counts())
print(X_test.isna().any().value_counts())
print(y.isna().any().value_counts())

False    36
dtype: int64
False    36
dtype: int64
False    1
dtype: int64


<br>

Verifying all columns have numerical datatype

In [7]:
X.dtypes.value_counts()

int64      29
float64     7
dtype: int64

In [8]:
X_test.dtypes.value_counts()

int64      29
float64     7
dtype: int64

In [9]:
y.dtypes

Transported    int64
dtype: object

<br>

Now that we have verified the data is in proper format, let's prepare the data for modelling

As most of the preprocessing of data is done in data_preprocessing notebook, the only thing left is creating a validation set to evalute the performance of model during training

Creating validation set

In [10]:
# splitting training data into 80-20 ratio
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [11]:
# shape of splitted data
print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)

(6954, 36)
(1739, 36)
(6954, 1)
(1739, 1)


In [12]:
# flattening 1D array
y_train = np.ravel(y_train)
y_train.shape

(6954,)

In [13]:
# flattening 1D array
y_val = np.ravel(y_val)
y_val.shape

(1739,)

<br>
<br>
<br>

### Model selection

In [14]:
# supervised learning models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB

# for hyperparameter tunning
from sklearn.model_selection import GridSearchCV, StratifiedKFold

  from pandas import MultiIndex, Int64Index


Models we will be trying out:
- Logistic Regression
- k-Nearest Neighbor CLassifier
- Support Vector Machine Classifier
- Random Forest Classifier
- XGBoost Classifier
- LGBM Classifier
- CatBoost Classifier
- Naive Bayes

In [15]:
# dictionary of models
# specifying random_state to conisistency across model runs
classifiers = {
    'logistic_regression': LogisticRegression(random_state=42),
    'k_nearest_neighbor': KNeighborsClassifier(),
    'support_vector_machine': SVC(random_state=42, probability=True),
    'random_forest': RandomForestClassifier(random_state=42),
    # takes too long
    # 'xg_boost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    'lgbm': LGBMClassifier(random_state=42),
    'cat_boost': CatBoostClassifier(random_state=42, verbose=False),
    'naive_bayes': GaussianNB()
}

Parameter grid for hypertunning of each classifier

In [16]:
# logistic regression
LR_grid = {'penalty': ['l1','l2'],
           'C': [0.25, 0.5, 0.75, 1, 1.25, 1.5],
           'max_iter': [50, 100, 150]}

# k nearest neighbor
KNN_grid = {'n_neighbors': [3, 5, 7, 9],
            'p': [1, 2]}

# support vector machine
SVC_grid = {'C': [0.25, 0.5, 0.75, 1, 1.25, 1.5],
            'kernel': ['linear', 'rbf'],
            'gamma': ['scale', 'auto']}

# random forest
RF_grid = {'n_estimators': [50, 100, 150, 200, 250, 300],
        'max_depth': [4, 6, 8, 10, 12]}

# boosting algorithms (xg_boost, lgbm, cat_boost)
boosted_grid = {'n_estimators': [50, 100, 150, 200],
        'max_depth': [4, 8, 12],
        'learning_rate': [0.05, 0.1, 0.15]}

# naive bayes
NB_grid={'var_smoothing': [1e-10, 1e-9, 1e-8, 1e-7]}

In [17]:
# mapping models to parameter grid
grid = {
    "logistic_regression" : LR_grid,
    "k_nearest_neighbor" : KNN_grid,
    "support_vector_machine" : SVC_grid,
    "random_forest" : RF_grid,
    "lgbm" : boosted_grid,
    "cat_boost" : boosted_grid,
    "naive_bayes": NB_grid
}

<br>

Training and evaluating models without cross validation, to get a general idea about each models performance.<br>
Best performing models will later be cross validated

In [27]:
# index of current model
i: int =0

# holds best params for classifier
clf_best_params = classifiers.copy()

# holds validation score and training time of each model
valid_scores=pd.DataFrame({'Classifer':classifiers.keys(), 'Validation accuracy': np.zeros(len(classifiers)), 'Training time': np.zeros(len(classifiers))})


for key, classifier in classifiers.items():
    # starting timer
    start = time.time()

    # grid search classifier
    clf = GridSearchCV(estimator=classifier, param_grid=grid[key], n_jobs=-1, cv=None)

    # training the model
    clf.fit(X_train, y_train)

    # saving validation score
    valid_scores.iloc[i,1] = clf.score(X_val, y_val)

    # saving best parameters
    clf_best_params[key] = clf.best_params_
    
    # stopping timer
    stop = time.time()

    # saving time taken
    valid_scores.iloc[i,2] = np.round((stop - start) / 60, 2)
    
    # logging data
    print('model:', key)
    print('training score:', valid_scores.iloc[i,1])
    print('training time (mins):', valid_scores.iloc[i,2])
    print('')
    i+=1

90 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\kshit\anaconda3\envs\machine_learning\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\kshit\anaconda3\envs\machine_learning\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\kshit\anaconda3\envs\machine_learning\lib\site-packages\sklearn\linear_model\_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' 

model: logistic_regression
training score: 0.7906843013225991
training time (mins): 0.06

model: k_nearest_neighbor
training score: 0.7814836112708453
training time (mins): 0.05

model: support_vector_machine
training score: 0.8010350776308223
training time (mins): 7.3

model: random_forest
training score: 0.8062104657849338
training time (mins): 0.57

model: lgbm
training score: 0.8136860264519838
training time (mins): 0.7



7 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
7 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\kshit\anaconda3\envs\machine_learning\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\kshit\anaconda3\envs\machine_learning\lib\site-packages\catboost\core.py", line 5128, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline, use_best_model,
  File "c:\Users\kshit\anaconda3\envs\machine_learning\lib\site-packages\catboost\core.py", line 2355, in _fit
    self._train(
  File "c:\Users\

model: cat_boost
training score: 0.8096607245543416
training time (mins): 11.6

model: naive_bayes
training score: 0.7619321449108684
training time (mins): 0.01



In [28]:
# validation scores
valid_scores

Unnamed: 0,Classifer,Validation accuracy,Training time
0,logistic_regression,0.790684,0.06
1,k_nearest_neighbor,0.781484,0.05
2,support_vector_machine,0.801035,7.3
3,random_forest,0.80621,0.57
4,lgbm,0.813686,0.7
5,cat_boost,0.809661,11.6
6,naive_bayes,0.761932,0.01


In [29]:
# best performing parameters
clf_best_params

{'logistic_regression': {'C': 1.25, 'max_iter': 50, 'penalty': 'l2'},
 'k_nearest_neighbor': {'n_neighbors': 9, 'p': 2},
 'support_vector_machine': {'C': 1.25, 'gamma': 'scale', 'kernel': 'rbf'},
 'random_forest': {'max_depth': 12, 'n_estimators': 300},
 'lgbm': {'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 200},
 'cat_boost': {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200},
 'naive_bayes': {'var_smoothing': 1e-07}}

<br>
<br>
<br>

### Modelling

Selecting top 2 models for final training on whole training set: LGBM and CATBoost<br>
<br>
Predictions of these models will be ensembled together using soft voting.<br>
This averages the predicted probabilies to produce the most confident predictions.

In [18]:
# best performing classifiers with best set of parameters
best_classifiers = {
    "lgbm" : LGBMClassifier(learning_rate=0.05, max_depth=4, n_estimators=200, random_state=42),
    "cat_boost" : CatBoostClassifier(learning_rate=0.1, max_depth=4, n_estimators=200, random_state=42, verbose=False),
}

Training our best classifiers on complete training set with cross validation

In [19]:
# number of folds in cross validation
folds: int = 10
# will hold prediction probabilities
preds = np.zeros(len(X_test))

for key, classifier in best_classifiers.items():
    # starting timer
    start = time.time()
    
    # 10-fold cross validation
    cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    
    score = 0

    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        # getting training and validation sets
        X_train, X_valid = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[val_idx]

        # training model
        clf = classifier
        clf.fit(X_train, y_train)

        # making predictions and measuring accuracy
        preds += clf.predict_proba(X_test)[:,1]
        score += clf.score(X_valid, y_valid)

    # averaging accuracy    
    score = score / folds
    
    # stopping timer
    stop = time.time()

    # print accuracy and time
    print('Model:', key)
    print('Average validation accuracy:', np.round(100*score,2))
    print('Training time (mins):', np.round((stop - start)/60,2))
    print('')
    
# ensemble predictions
preds = preds / (folds * len(best_classifiers))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Model: lgbm
Average validation accuracy: 81.08
Training time (mins): 0.06

Model: cat_boost
Average validation accuracy: 81.13
Training time (mins): 0.15



We got a nice accuracy of 81%

The prediction probabilities are saved in **`preds`** variable

<br>
<br>
<br>

### Predictions post-processing

Let's take a look at prediction probabilities from our model

In [20]:
preds

array([0.53768736, 0.02089198, 0.99408553, ..., 0.95153401, 0.7722816 ,
       0.60342787])

In [21]:
# transported percentage at default (0.5) threshold
print(np.round(100*np.round(preds).sum()/len(preds),2))

52.7


This means our model has predicted that 52.7% of passengers will be transported

But from the EDA, we know that the transportation percentage should be around 50.36%<br>
*(test datasets tend to have same class distribution as training datasets in competitions)*

So we need to use a different threshold (higher) than 0.5 to predict outcome

After some experimenting, a threshold value of 0.520 gives transportation percentage of 50.53%.<br>
We will use this value to predict the outcome

In [None]:
# selected threshold
threshold = 0.520

In [34]:
# tunning predictions and getting outcome
preds_tuned = (preds >= threshold).astype(int)

In [35]:
# final transported percentage
print(np.round(100*np.round(preds_tuned).sum()/len(preds_tuned),2))

50.53


In [36]:
# final outcome
preds_tuned

array([1, 0, 1, ..., 1, 1, 1])

In [37]:
# checking length of predictons
len(preds_tuned)

4277

Now that we have our predictions ready, it's time to submit them

<br>
<br>
<br>

### Creating submission file

We need PassengerId in out submission file. <br>
So let's extract it from our original test data.

In [40]:
# original test data
test_data = pd.read_csv("../data/test.csv")
test_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [42]:
# checking shape
test_data.shape

(4277, 13)

In [46]:
# getting passengerId
submission_df = test_data[['PassengerId']].copy()
submission_df.head()

Unnamed: 0,PassengerId
0,0013_01
1,0018_01
2,0019_01
3,0021_01
4,0023_01


In [47]:
# appending predictions to submission_df
submission_df['Transported'] = preds_tuned
submission_df.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,1
1,0018_01,0
2,0019_01,1
3,0021_01,1
4,0023_01,1


In [48]:
# checking for null values
submission_df.isna().any()

PassengerId    False
Transported    False
dtype: bool

In [49]:
# transportation value count
submission_df.Transported.value_counts()

1    2161
0    2116
Name: Transported, dtype: int64

<br>

In the submission file, Transported column should have (True, False) value instead of (1, 0)

So converting (1, 0) to (True, False)

In [50]:
# converting transportation from int to boolean
submission_df.Transported = submission_df.Transported.apply(lambda x: True if x == 1 else False)

In [51]:
# inspecting final dataframe
submission_df.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [52]:
# verifying value counts
submission_df.Transported.value_counts()

True     2161
False    2116
Name: Transported, dtype: int64

In [53]:
# saving as csv file which will be submitted to the competition
submission_df.to_csv("../data/submission_1.csv", index=None)

<br>
<br>
<br>

### Conclusion

Final score after submitting the predictions to the competition is 80.5% which is very close to our validation score.<br>
<br>
Rank: **499** /2660<br>
[Spaceship Titanic](https://www.kaggle.com/chickooo/competitions?tab=active)<br>
<br>
Achievements:
- under 500
- top 20%