In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import OneHotEncoder

from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_absolute_error

In [61]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.dropna(inplace=True)
test.dropna(inplace=True)
df = train.append(test)

In [62]:
df[['Group','Position']] = df[['PassengerId']].apply(lambda row: row.PassengerId.split("_"), axis = 1, result_type = 'expand')
df['GroupSize'] = df.groupby(['Group']).Group.transform('count')
df['GroupSizeCat'] = pd.cut(df['GroupSize'], bins=[1,2,8,10], labels = ['small','med', 'big'], right=False)
df[["FirstName","LastName"]] = df['Name'].str.split(' ', 1, expand=True)

In [63]:
print("Num rows: ", df.shape[0])
print("Num attributes: ", df.shape[1])

Num rows:  9887
Num attributes:  20


# Do Exploratory Data Analysis here

In [64]:
# buncha EDA

# Select features and do ML here

In [65]:
features = ['CryoSleep', 'VIP', 'RoomService', 'Spa']
X = train[features]
y = train['Transported']
train_X, val_X, train_y, val_y = train_test_split(X, y)

In [76]:
#Decision Tree

hyperparameters = {
    'max_depth': [2,3,5,10],
    'min_samples_leaf': [5,10],
    "max_leaf_nodes":[None,10,20],
    'splitter': ["best","random"]
}

dt = DecisionTreeRegressor()
dt_clf = GridSearchCV(dt, hyperparameters, cv=10)

dt_model = dt_clf.fit(train_X, train_y)
print('Best max_depth:', dt_model.best_estimator_.get_params()['max_depth'])
print('Best min_samples_leaf:', dt_model.best_estimator_.get_params()['min_samples_leaf'])
print('Best max_leaf_nodes:', dt_model.best_estimator_.get_params()['max_leaf_nodes'])
print('Best splitter:', dt_model.best_estimator_.get_params()['splitter'])
print('MAE: ', mean_absolute_error(dt_model.predict(val_X), val_y))

Best max_depth: 10
Best min_samples_leaf: 10
Best max_leaf_nodes: 10
Best splitter: best
MAE:  0.36518501335575715


In [80]:
#K-nearest neighbors

hyperparameters = {
    'leaf_size': [1,2,3,4,5],
    'n_neighbors': [1,2,3,4,5],
    "p":[1,2],
}

knn = KNeighborsRegressor()
knn_clf = GridSearchCV(knn, hyperparameters, cv=10)

knn_model = knn_clf.fit(train_X, train_y)
print('Best leaf_size:', knn_model.best_estimator_.get_params()['leaf_size'])
print('Best p:', knn_model.best_estimator_.get_params()['p'])
print('Best n_neighbors:', knn_model.best_estimator_.get_params()['n_neighbors'])
print('MAE: ', mean_absolute_error(knn_model.predict(val_X), val_y))

Best leaf_size: 2
Best p: 2
Best n_neighbors: 5
MAE:  0.326271186440678


In [81]:
#SVM

hyperparameters = {
    'C': [0.1], 
    'gamma': [1], 
    'kernel': ['rbf']
}

svr = SVR()
svr_clf = GridSearchCV(svr, hyperparameters, cv=10)

svr_model = svr_clf.fit(train_X, train_y)
print('Best C:', svr_model.best_estimator_.get_params()['C'])
print('Best gamma:', svr_model.best_estimator_.get_params()['gamma'])
print('Best kernel:', svr_model.best_estimator_.get_params()['kernel'])
print('MAE: ', mean_absolute_error(svr_model.predict(val_X), val_y))

Best C: 0.1
Best gamma: 1
Best kernel: rbf
MAE:  0.3306048994060747
