In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [2]:
# Locations of the competition CSV files.
TRAIN_CSV = '/kaggle/input/spaceship-titanic/train.csv'
TEST_CSV = '/kaggle/input/spaceship-titanic/test.csv'

# Load both files into DataFrames; the training frame is displayed as the cell output.
train_data, test_data = pd.read_csv(TRAIN_CSV), pd.read_csv(TEST_CSV)
train_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [3]:
# Per-column overview of missing values and data types
def visualization(data):
    """Summarise a DataFrame column by column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with two columns:
        'number of null' (count of missing values) and 'data type' (the dtype).
    """
    null_counts = data.isna().sum()
    column_types = data.dtypes
    # Stack the two Series as rows, then transpose so each input column is a row.
    summary = pd.DataFrame([null_counts, column_types]).transpose()
    return summary.set_axis(['number of null', 'data type'], axis=1)

# Missing-value counts and dtypes of the raw training frame.
visualization(data=train_data)

Unnamed: 0,number of null,data type
PassengerId,0,object
HomePlanet,201,object
CryoSleep,217,object
Cabin,199,object
Destination,182,object
Age,179,float64
VIP,203,object
RoomService,181,float64
FoodCourt,183,float64
ShoppingMall,208,float64


### We drop the following columns: HomePlanet, Destination, Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck, Name
### because we consider them uninformative for this baseline, so they add no value to our model.

In [4]:
# Columns excluded from the model; PassengerId, CryoSleep, Cabin, VIP and the
# Transported label are what remains.
dropped_columns = ['HomePlanet', 'Destination', 'Age', 'RoomService', 'FoodCourt',
                   'ShoppingMall', 'Spa', 'VRDeck', 'Name']

train_data_clean = train_data.drop(columns=dropped_columns)
# NOTE(review): this frame is derived from train_data, not test_data, so it holds
# the training rows and still carries 'Transported' (the encoding cell drops that
# column from it). Kept as-is here because that downstream drop relies on it.
test_data_clean = train_data.drop(columns=dropped_columns)
train_data_clean

Unnamed: 0,PassengerId,CryoSleep,Cabin,VIP,Transported
0,0001_01,False,B/0/P,False,False
1,0002_01,False,F/0/S,False,True
2,0003_01,False,A/0/S,True,False
3,0003_02,False,A/0/S,False,False
4,0004_01,False,F/1/S,False,True
...,...,...,...,...,...
8688,9276_01,False,A/98/P,True,False
8689,9278_01,True,G/1499/S,False,False
8690,9279_01,False,G/1500/S,False,True
8691,9280_01,False,E/608/S,False,False


In [5]:
# Missing-value counts and dtypes after dropping the unused training columns.
visualization(data=train_data_clean)

Unnamed: 0,number of null,data type
PassengerId,0,object
CryoSleep,217,object
Cabin,199,object
VIP,203,object
Transported,0,bool


In [6]:
# Missing-value counts and dtypes of the reduced test_data_clean frame.
visualization(data=test_data_clean)

Unnamed: 0,number of null,data type
PassengerId,0,object
CryoSleep,217,object
Cabin,199,object
VIP,203,object
Transported,0,bool


## Next, convert the object-typed (categorical) columns to numeric features so the model can use them

In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Split the cleaned training frame into the label and the feature columns.
y = train_data_clean['Transported']
X = train_data_clean.drop(columns='Transported')
X_test = test_data_clean.drop(columns='Transported')

# Columns that are one-hot encoded; every other column (PassengerId) is passed
# through unchanged as the last column of the output.
categorical_features = ['CryoSleep', 'Cabin', 'VIP']
one_hot = OneHotEncoder()

# sparse_threshold=0 forces a dense array as the result.
transformer = ColumnTransformer(
    transformers=[('one_hot', one_hot, categorical_features)],
    remainder='passthrough',
    sparse_threshold=0,
)

transformed_train_data = transformer.fit_transform(X)
transformed_train_data

array([[1.0, 0.0, 0.0, ..., 0.0, 0.0, '0001_01'],
       [1.0, 0.0, 0.0, ..., 0.0, 0.0, '0002_01'],
       [1.0, 0.0, 0.0, ..., 1.0, 0.0, '0003_01'],
       ...,
       [1.0, 0.0, 0.0, ..., 0.0, 0.0, '9279_01'],
       [1.0, 0.0, 0.0, ..., 0.0, 0.0, '9280_01'],
       [1.0, 0.0, 0.0, ..., 0.0, 0.0, '9280_02']], dtype=object)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier

# Fixed seed so the hold-out split and the forest are reproducible across runs.
SEED = 42

# Estimate accuracy on a 20% hold-out before committing to the model.
# Separate names are used so the X_test feature frame is not overwritten.
X_fit, X_val, y_fit, y_val = train_test_split(
    transformed_train_data, y, test_size=0.2, random_state=SEED)
holdout_clf = RandomForestClassifier(random_state=SEED).fit(X_fit, y_fit)
print(f"hold-out accuracy: {holdout_clf.score(X_val, y_val):.4f}")

# Final model, refitted on all training rows for the submission.
# Other estimators imported above for comparison: HistGradientBoostingClassifier,
# svm.LinearSVC, KNeighborsClassifier(n_neighbors=3).
clf = RandomForestClassifier(random_state=SEED)
clf.fit(transformed_train_data, y)

In [9]:
X_test = test_data_clean.drop('Transported',axis = 1)

transformer = ColumnTransformer([('one_hot',
                                 one_hot,
                                 categorical_features)],
                                 remainder = 'passthrough',
                               sparse_threshold=0)

transformed_test_data = transformer.fit_transform(X_test)

In [10]:
transformed_test_data.shape,transformed_train_data.shape

((8693, 6568), (8693, 6568))

In [11]:
predictions = clf.predict(transformed_test_data)

In [12]:
result = pd.DataFrame({'PassengerId' : train_data['PassengerId'],
                      'Transported':predictions})
result.to_csv('submission.csv', index = False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
