<a href="https://www.kaggle.com/cicostos/spaceship-titanic?scriptVersionId=89425865" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Importing and viewing training data**

In [None]:
# Read the competition training set. `train` is the frame as read from disk,
# `df_train` is the handle the following cells work on.
train = pd.read_csv('../input/spaceship-titanic/train.csv')
df_train = pd.DataFrame(train)

# Preview the first ten rows.
df_train.head(n=10)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal bar chart: number of passengers per home planet.
fig, ax = plt.subplots(figsize=(8, 3))
sns.countplot(y=df_train['HomePlanet'], palette='cool', ax=ax)
ax.set_title('Checking Home Planet')

In [None]:
# Passenger counts per CryoSleep value, first as a table ...
cryo_counts = df_train.groupby('CryoSleep').size()
print(cryo_counts)

# ... then as a horizontal bar chart.
fig, ax = plt.subplots(figsize=(8, 3))
sns.countplot(y=df_train['CryoSleep'], palette='cool', ax=ax)
ax.set_title('Checking Cryo Sleep')

In [None]:
# Number of passengers recorded for each cabin identifier.
cabin_counts = df_train.groupby('Cabin').size()
print(cabin_counts)

In [None]:
# Horizontal bar chart: number of passengers per destination.
fig, ax = plt.subplots(figsize=(8, 3))
sns.countplot(y=df_train['Destination'], palette='cool', ax=ax)
ax.set_title('Checking Destination')

In [None]:
# One bar per distinct age value, hence the tall figure.
fig, ax = plt.subplots(figsize=(10, 15))
sns.countplot(y=df_train['Age'], palette='cool', ax=ax)
ax.set_title('Checking Age')

In [None]:
# Horizontal bar chart: VIP vs. non-VIP passengers.
fig, ax = plt.subplots(figsize=(8, 3))
sns.countplot(y=df_train['VIP'], palette='cool', ax=ax)
ax.set_title('Checking Vip')

In [None]:
# Number of missing values in each column of the raw training data.
df_train.isna().sum()

In [None]:
# Correlation heatmap of the numeric and boolean columns.
# The columns are selected explicitly: pandas >= 2.0 no longer drops
# non-numeric columns inside DataFrame.corr() and raises on the string columns
# of the raw frame instead.
plt.figure(figsize=(12,5))
numeric_train = df_train.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_train.corr(), cmap='rocket', annot=True, linewidth=0.2, linecolor='black')

Preprocessing

In [None]:
# Impute missing values in the training set.
# The filled frame is assigned back to df_train: calling
# fillna(..., inplace=True) on a selected column is chained assignment, which
# is deprecated and does not update the parent frame under pandas Copy-on-Write.
# NOTE(review): the constants look like the most frequent category / a typical
# age for this dataset - confirm against the plots above.
df_train = df_train.fillna({
    'HomePlanet': 'Earth',
    'CryoSleep': False,
    'Destination': 'TRAPPIST-1e',
    'VIP': False,
    'Age': 24.0,
})

# Spending columns: replace missing amounts with the mean of the column.
l = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for column in l:
    df_train[column] = df_train[column].fillna(df_train[column].mean())

In [None]:
# Re-check the per-column missing-value counts after imputation.
df_train.isna().sum()

In [None]:
# Derive the family name: the part of "Name" after the first space.
# - Splitting once (n=1) keeps this working when a name has more than two
#   words (the two-column rename used before would raise in that case).
# - .str.get(1) yields NaN for a missing or single-word name, which is then
#   replaced by the placeholder 'None'.
# - Assigning the column directly avoids both the chained in-place fillna and
#   appending a duplicate 'LastName' column.
last_name = df_train['Name'].str.split(' ', n=1).str.get(1)
df_train['LastName'] = last_name.fillna('None')

df_train.head()

In [None]:
# Split PassengerId on '_' into two integer columns: Group and GroupNumber.
passenger_id_parts = df_train['PassengerId'].str.split('_', expand=True)
df_train[['Group', 'GroupNumber']] = passenger_id_parts.astype('int32')

df_train.head()

In [None]:
# Eyeball group id, family name and target side by side for the first 20 rows.
df_train.loc[:, ['Group', 'LastName', 'Transported']].head(20)

In [None]:
# Split Cabin on '/' into deck, number and side, then drop the raw text
# columns that have been decomposed into features.
cabin_parts = df_train['Cabin'].str.split('/', expand=True)
df_train[['CabinDeck', 'CabinNum', 'CabinSide']] = cabin_parts
df_train = df_train.drop(columns=['Name', 'Cabin'])

df_train.head()

In [None]:
# Three panels describing the cabin-derived features.
# Data is passed by keyword (x=...): in seaborn >= 0.12 the first positional
# argument of countplot is `data`, so a positional Series is no longer read as
# the x variable. The deprecated distplot is replaced by its documented
# equivalent, histplot with a KDE overlay on a density scale.
fig, ax = plt.subplots(1, 3, figsize=(18, 5))

# Passengers per deck.
sns.countplot(x=df_train['CabinDeck'], ax=ax[0], palette='cool_r')

# Distribution of cabin numbers; CabinNum still holds the split strings here,
# so it is converted to numbers for plotting only.
sns.histplot(x=pd.to_numeric(df_train['CabinNum']), ax=ax[1], color='blue',
             kde=True, stat='density')

# Passengers per ship side, split by the target.
sns.countplot(x=df_train['CabinSide'], hue=df_train['Transported'], ax=ax[2], palette='cool')

fig.suptitle('Cabin Feature')

In [None]:
# Bucket Age into ten equal-width bins labelled 1..10.
age_bin_labels = list(range(1, 11))
df_train['AgeGroup'] = pd.cut(df_train['Age'], bins=10, labels=age_bin_labels)

df_train.head()

In [None]:
# Horizontal bar chart: number of passengers per age bucket.
fig, ax = plt.subplots(figsize=(8, 3))
sns.countplot(y=df_train['AgeGroup'], palette='cool', ax=ax)
ax.set_title('Checking Age Group')

In [None]:
# Passengers without a cabin get a default deck, side and cabin number.
# NOTE(review): the defaults are presumably the most common values - confirm.
cabin_defaults = {'CabinDeck': 'F', 'CabinSide': 'S', 'CabinNum': '82'}
for column, default in cabin_defaults.items():
    df_train[column] = df_train[column].fillna(default)

df_train.isna().sum()

In [None]:
# Make sure no passenger is left without a LastName value.
# The filled column is assigned back: fillna(..., inplace=True) on a selected
# column is chained assignment and does not update df_train under pandas
# Copy-on-Write.
df_train['LastName'] = df_train['LastName'].fillna('None')

df_train.isnull().sum()

In [None]:
# Cabin number and age bucket become plain integers.
for column in ('CabinNum', 'AgeGroup'):
    df_train[column] = df_train[column].astype('int64')

# Total spend across the five on-board services.
df_train['TotalAmount'] = (
    df_train['RoomService']
    + df_train['FoodCourt']
    + df_train['ShoppingMall']
    + df_train['Spa']
    + df_train['VRDeck']
)

# One 0/1 flag per service: did the passenger spend anything on it?
for service in ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    df_train[f'{service}Used'] = (df_train[service] > 0).astype('int64')

df_train.head()

# **Feature Encoding**

In [None]:
# One-hot encode the nominal columns (first level dropped), append the
# indicator columns and remove the originals.
nominal_cols = ['HomePlanet', 'Destination', 'CabinSide']
planeti, destinacija, side = (
    pd.get_dummies(df_train[column], drop_first=True) for column in nominal_cols
)

df_train = pd.concat([df_train, planeti, destinacija, side], axis=1)
df_train = df_train.drop(columns=nominal_cols)

df_train.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode the remaining text columns. The same encoder object is
# re-fitted for each column, so after this cell it holds the CabinDeck classes.
label = LabelEncoder()

for column in ('LastName', 'CabinDeck'):
    df_train[column] = label.fit_transform(df_train[column])

In [None]:
# Turn the boolean columns (including the target) into 0/1 integers.
bool_to_int = {True: 1, False: 0}
for column in ('CryoSleep', 'VIP', 'Transported'):
    df_train[column] = df_train[column].map(bool_to_int)

df_train.head()

In [None]:
# Second correlation check, now that the engineered features exist.
# Only numeric/boolean columns are passed to corr(): PassengerId is still a
# string column at this point and pandas >= 2.0 raises on it instead of
# silently dropping it.
plt.figure(figsize=(20,20))
numeric_features = df_train.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_features.corr(), cmap='rocket', annot=True, linewidth=0.2, linecolor='black')

**Conclusions**
CabinDeck and Europa are highly correlated, so I'm dropping Europa.

In [None]:
# Remove the Europa indicator column from the training features.
df_train = df_train.drop(columns=['Europa'])

In [None]:
# Persist the cleaned training frame in the working directory.
df_train.to_csv(path_or_buf='df_train_cleaned.csv')

# **Test data**

In [None]:
# Read the competition test set. `test` is the frame as read from disk,
# `df_test` is the handle the following cells work on.
test = pd.read_csv('../input/spaceship-titanic/test.csv')
df_test = pd.DataFrame(test)

df_test.head()

In [None]:
# Impute missing categorical/age values in the test set with the same
# constants used for the training set.
# The filled frame is assigned back to df_test: calling
# fillna(..., inplace=True) on a selected column is chained assignment, which
# is deprecated and does not update the parent frame under pandas Copy-on-Write.
df_test = df_test.fillna({
    'HomePlanet': 'Earth',
    'CryoSleep': False,
    'Destination': 'TRAPPIST-1e',
    'VIP': False,
    'Age': 24.0,
})

df_test.head()

In [None]:
# Spending columns: replace missing amounts with the mean of the column,
# computed on the test set itself.
l = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for column in l:
    column_mean = df_test[column].mean()
    df_test[column] = df_test[column].fillna(column_mean)

df_test.head()

In [None]:
# Derive the family name: the part of "Name" after the first space.
# - Splitting once (n=1) keeps this working when a name has more than two
#   words (the two-column rename used before would raise in that case).
# - .str.get(1) yields NaN for a missing or single-word name, which is then
#   replaced by the placeholder 'None'.
# - Assigning the column directly avoids both the chained in-place fillna and
#   appending a duplicate 'LastName' column.
last_name = df_test['Name'].str.split(' ', n=1).str.get(1)
df_test['LastName'] = last_name.fillna('None')

df_test.head()

In [None]:
# Apply to the test set the same feature-engineering steps used for training.

# PassengerId split on '_' -> integer Group and GroupNumber.
passenger_id_parts = df_test['PassengerId'].str.split('_', expand=True)
df_test[['Group', 'GroupNumber']] = passenger_id_parts.astype('int32')

# Cabin split on '/' -> deck, number, side; the raw text columns are dropped.
cabin_parts = df_test['Cabin'].str.split('/', expand=True)
df_test[['CabinDeck', 'CabinNum', 'CabinSide']] = cabin_parts
df_test = df_test.drop(columns=['Name', 'Cabin'])

# Ten equal-width age buckets labelled 1..10 (bin edges come from the test ages).
df_test['AgeGroup'] = pd.cut(df_test['Age'], bins=10, labels=list(range(1, 11)))

# Default cabin parts for passengers without a cabin.
cabin_defaults = {'CabinDeck': 'F', 'CabinSide': 'S', 'CabinNum': '82'}
for column, default in cabin_defaults.items():
    df_test[column] = df_test[column].fillna(default)

for column in ('CabinNum', 'AgeGroup'):
    df_test[column] = df_test[column].astype('int64')

# Total spend plus one 0/1 "used" flag per on-board service.
df_test['TotalAmount'] = (
    df_test['RoomService']
    + df_test['FoodCourt']
    + df_test['ShoppingMall']
    + df_test['Spa']
    + df_test['VRDeck']
)
for service in ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    df_test[f'{service}Used'] = (df_test[service] > 0).astype('int64')

# One-hot encode the nominal columns (first level dropped) and remove the originals.
nominal_cols = ['HomePlanet', 'Destination', 'CabinSide']
planeti, destinacije, side = (
    pd.get_dummies(df_test[column], drop_first=True) for column in nominal_cols
)
df_test = pd.concat([df_test, planeti, destinacije, side], axis=1)
df_test = df_test.drop(columns=nominal_cols)

# NOTE(review): the encoder is re-fitted on the test values here, so the
# integer assigned to a given LastName is not guaranteed to match the one used
# in training - confirm whether that is intended.
for column in ('LastName', 'CabinDeck'):
    df_test[column] = label.fit_transform(df_test[column])

# Boolean columns -> 0/1 integers.
bool_to_int = {True: 1, False: 0}
for column in ('CryoSleep', 'VIP'):
    df_test[column] = df_test[column].map(bool_to_int)

df_test.head()

In [None]:
# Remove the Europa indicator column, as was done for the training features.
df_test = df_test.drop(columns='Europa')

df_test.head()

In [None]:
# Test features without the passenger identifier.
X_test = df_test.drop(columns='PassengerId')

X_test.head()

In [None]:
# Persist the cleaned test frame in the working directory.
df_test.to_csv(path_or_buf='df_test_cleaned.csv')

Modeling

In [None]:
from sklearn.model_selection import train_test_split

# Target and feature matrix: every column except the identifier and the target.
y = df_train['Transported']
X = df_train.drop(columns=['PassengerId', 'Transported'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=1312)
X_train, X_test, y_train, y_test = split_parts

# Shapes of the four pieces, in the order they were returned.
tuple(part.shape for part in split_parts)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [None]:
def get_score(model,X_train,X_test,y_train,y_test):
    """Fit ``model`` on the training split and return its score on the test split.

    ``model`` only needs ``fit(X, y)`` and ``score(X, y)`` methods; the value
    returned is whatever ``model.score(X_test, y_test)`` produces.
    """
    model.fit(X_train, y_train)
    holdout_score = model.score(X_test, y_test)
    return holdout_score

In [None]:
# Fit each baseline classifier on the training split and print its accuracy on
# the hold-out split. The randomised models (random forest, decision tree) are
# seeded with the same seed as the train/test split so the printed scores are
# reproducible from run to run.
baseline_models = [
    ('svc_score', SVC()),
    ('LogisticRegression_score', LogisticRegression()),
    ('RandomForestClassifier_score', RandomForestClassifier(random_state=1312)),
    ('DecisionTreeClassifier_score', DecisionTreeClassifier(random_state=1312)),
]

baseline_scores = {}
for score_name, estimator in baseline_models:
    baseline_scores[score_name] = get_score(estimator, X_train, X_test, y_train, y_test)
    print(score_name + ' ' + str(baseline_scores[score_name]))

# Keep the individual score variables available under their original names.
svc_score = baseline_scores['svc_score']
LogisticRegression_score = baseline_scores['LogisticRegression_score']
RandomForestClassifier_score = baseline_scores['RandomForestClassifier_score']
DecisionTreeClassifier_score = baseline_scores['DecisionTreeClassifier_score']

In [None]:
from sklearn.model_selection import GridSearchCV 

#forest = RandomForestClassifier()

# Create the grid parameter:
#grid_rf = {'n_estimators': [100, 120, 150],
#           'criterion': ['entropy', 'gini'], 
#           'max_depth': [None,1,3,5,7,9],
#           'max_features': range(1,11),  
#           'min_samples_split': range(2, 10),
#           'min_samples_leaf': [1,3,5]}

# Create the grid:
#gs_rf = GridSearchCV(forest, grid_rf, cv=3, n_jobs=-1)

# Fit using grid search:
#gs_rf.fit(X_train, y_train)

# Print best accuracy and best parameters:
#print('Best accuracy: %.3f' % gs_rf.best_score_)
#print('\nBest params:\n', gs_rf.best_params_)

In [None]:
#Best params:
 #{'criterion': 'entropy', 'max_depth': None, 'max_features': 4, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 150}

In [None]:
# Random forest configured with the hyper-parameters reported as best by the
# grid search. random_state is fixed (same seed as the train/test split) so the
# fitted forest, its accuracy and the resulting submission are reproducible.
model = RandomForestClassifier(
    criterion='entropy',
    max_depth=None,
    max_features=4,
    min_samples_leaf=1,
    min_samples_split=4,
    n_estimators=150,
    random_state=1312,
)

model.fit(X_train,y_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict on both splits and report accuracy for each.
y_pred_train, y_pred_test = (model.predict(features) for features in (X_train, X_test))

accuracy_rows = (
    ('\nAccuracy train set:\n', y_train, y_pred_train),
    ('\nAccuracy test set:\n', y_test, y_pred_test),
)
for heading, y_true, y_hat in accuracy_rows:
    print(heading, accuracy_score(y_true, y_hat))


In [None]:
# Summarise the prepared test frame (columns, non-null counts, dtypes) before predicting.
df_test.info()

In [None]:
# Drop the identifier so only model features remain.
# errors="ignore" keeps the cell safe to re-run: once PassengerId has been
# removed, running it again is a no-op instead of a KeyError.
df_test = df_test.drop(columns=["PassengerId"], errors="ignore")

In [None]:
# Predict on the competition test set.
# The columns are selected by name in the order of the training matrix, so the
# prediction does not depend on df_test happening to share that column order,
# and a feature missing from the test frame fails loudly with a KeyError.
result = model.predict(df_test[X_train.columns])

print(result)

In [None]:
# Translate the 0/1 predictions into "True"/"False" strings; any other value
# is skipped.
label_res = [
    "True" if i == 1 else "False"
    for i in result
    if i == 1 or i == 0
]

label_ = pd.DataFrame(label_res, columns=["Transported"])

label_.head()

In [None]:
# Load the sample submission file shipped with the competition data.
df_sample = pd.read_csv(filepath_or_buffer="../input/spaceship-titanic/sample_submission.csv")

df_sample.head()

In [None]:
# Replace the placeholder Transported column of the sample file with the
# predicted labels; the two frames are joined side by side on their row index.
df_sample = df_sample.drop(columns=["Transported"])

submission_parts = [df_sample, label_]
df_submission = pd.concat(submission_parts, axis=1)

df_submission

In [None]:
# Write the submission file without the row index.
df_submission.to_csv(path_or_buf="submission.csv", index=False)

In [None]:
# Shell escape: submit submission.csv to the spaceship-titanic competition via the Kaggle CLI.
!kaggle competitions submit -c spaceship-titanic -f submission.csv -m "Message"