# **Using RandomForest and XGBoost**

In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


Importing Training Data from train.csv file

In [4]:
df = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB

In [4]:
df.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


Here we can see that the dataset have Target veriable Transported.This is what we want to predict using remaining features
Other Features have numerical as well as categorical data. we will transform categorical data.

First we will try droping rows with missing values in data and see what result it gives.

In [8]:
df_with_dropped_vals = df.dropna()
df_with_dropped_vals.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6606 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   6606 non-null   object 
 1   HomePlanet    6606 non-null   object 
 2   CryoSleep     6606 non-null   object 
 3   Cabin         6606 non-null   object 
 4   Destination   6606 non-null   object 
 5   Age           6606 non-null   float64
 6   VIP           6606 non-null   object 
 7   RoomService   6606 non-null   float64
 8   FoodCourt     6606 non-null   float64
 9   ShoppingMall  6606 non-null   float64
 10  Spa           6606 non-null   float64
 11  VRDeck        6606 non-null   float64
 12  Name          6606 non-null   object 
 13  Transported   6606 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 729.0+ KB


# Performing basic trnasformations on categorical data to be used for training

In [24]:
from sklearn.preprocessing import LabelEncoder
def splitcolumn(collum):
    try:
        # Check if collum is a valid string
        if isinstance(collum, str):
            info = collum.split('/')
            # Ensure info has exactly 3 parts
            if len(info) == 3:
                return pd.Series([info[0], int(info[1]), info[2]], index=('Deck', 'cabin_no', 'side'))
        return pd.Series([None, None, None], index=('Deck', 'cabin_no', 'side'))
    except Exception as e:
        # Handle parsing issues gracefully
        return pd.Series([None, None, None], index=('Deck', 'cabin_no', 'side'))
def IDConerter(column):
    id = int(column.replace('_',''))
    return id
    
le = LabelEncoder()
df_with_dropped_vals[['Deck','cabin_no','side']]=df_with_dropped_vals['Cabin'].apply(splitcolumn)

df_with_dropped_vals['VIP']=le.fit_transform(df_with_dropped_vals['VIP'])
df_with_dropped_vals['CryoSleep']=le.fit_transform(df_with_dropped_vals['CryoSleep'])
X = df_with_dropped_vals[['PassengerId','CryoSleep','Age','VIP','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']]
y = df_with_dropped_vals['Transported']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_with_dropped_vals[['Deck','cabin_no','side']]=df_with_dropped_vals['Cabin'].apply(splitcolumn)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_with_dropped_vals['VIP']=le.fit_transform(df_with_dropped_vals['VIP'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_with_dropped_vals['CryoSleep']

# Define a function which checks the accuracy of our model using accuracy_score()

In [13]:
def getAccuracy(x_train,x_test,y_train,y_test):    
    model = RandomForestClassifier(random_state=1)
    model.fit(x_train,y_train)
    pred = model.predict(x_test)
    return accuracy_score(y_test,pred)    

In [14]:
getAccuracy(X_train, X_test, y_train, y_test)

0.7628657921291625


**As expected, Deleting rows from dataset doesn't perform very well.Second and more efficient way to handle missing values is to use simpleImputer**


In [18]:
df_fill_vals= df.copy()
df_fill_vals = df_fill_vals[['PassengerId','CryoSleep','Age','VIP','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Transported']]
df_fill_vals.PassengerId.apply(IDConerter)
df_fill_vals['VIP']=le.fit_transform(df_fill_vals['VIP'])
df_fill_vals['CryoSleep']=le.fit_transform(df_fill_vals['CryoSleep'])
df_fill_vals['Age']=df_fill_vals['Age'].astype(int, errors='ignore')

In [19]:
df_fill_vals.Age.fillna(df_fill_vals.Age.mean(),inplace=True)
df_fill_vals.VIP.fillna(method='ffill',inplace= True)
df_fill_vals.RoomService.fillna(df_fill_vals.RoomService.mean(),inplace = True)
df_fill_vals.FoodCourt.fillna(df_fill_vals.FoodCourt.mean(),inplace= True)
df_fill_vals.ShoppingMall.fillna(df_fill_vals.ShoppingMall.mean(),inplace= True)
df_fill_vals.Spa.fillna(df_fill_vals.Spa.mean(),inplace= True)
df_fill_vals.VRDeck.fillna(df_fill_vals.VRDeck.mean(),inplace= True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_fill_vals.Age.fillna(df_fill_vals.Age.mean(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_fill_vals.VIP.fillna(method='ffill',inplace= True)
  df_fill_vals.VIP.fillna(method='ffill',inplace= True)
The behavior will change in pandas 3.0. This inplace method 

In [20]:
df_fill_vals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   CryoSleep     8693 non-null   int64  
 2   Age           8693 non-null   float64
 3   VIP           8693 non-null   int64  
 4   RoomService   8693 non-null   float64
 5   FoodCourt     8693 non-null   float64
 6   ShoppingMall  8693 non-null   float64
 7   Spa           8693 non-null   float64
 8   VRDeck        8693 non-null   float64
 9   Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), int64(2), object(1)
memory usage: 619.8+ KB


Now our data doesn't contain any null values, we can train our model 

In [21]:
X = df_fill_vals.drop('Transported',axis=1)
y = df_fill_vals.Transported
fill_X_train, fill_X_test, fill_y_train, fill_y_test = train_test_split(X, y, test_size=0.3, random_state=0)
getAccuracy(fill_X_train, fill_X_test, fill_y_train, fill_y_test)

0.7664877300613497

# **Results are slightly better than previous approch. We can do even better using XGBoost**

In [22]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier

# Creating a pipeline to preprocess data efficiently

In [25]:
categorical_cols = [col for col in df.columns if df[col].dtype == 'object']
numerical_cols =[col for col in df.columns if df[col].dtype in ['int64','float64']]
del categorical_cols[0]
categorical_cols.extend(['Deck','cabin_no','side'])
df[['Deck','cabin_no','side']]=df['Cabin'].apply(splitcolumn)

numerical_processor = SimpleImputer(strategy='mean')


categorical_processor = Pipeline(steps = [    
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('Onehot',OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers = [    
    ('num',numerical_processor,numerical_cols),    
    ('cat',categorical_processor,categorical_cols)])

pipeline = Pipeline(steps = [    
    ('processing',preprocessor),    
    ('model',RandomForestClassifier(n_estimators=100))])
X = df.drop('Transported',axis=1)
y = df.Transported.copy()
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=0)
pipeline.fit(X_train,y_train)
preds = pipeline.predict(X_test)
print('accuracy = ',accuracy_score(y_test,preds))

accuracy =  0.7809085681426107


# Now Implementing XGBoost to make predictions 

In [27]:
XG_pipeline = Pipeline(steps = [    
    ('processing',preprocessor),    
    ('model',XGBClassifier(n_estimators = 33))])

X_test_transformed = XG_pipeline['processing'].transform(X_test)
XG_pipeline.set_params(model__early_stopping_rounds=10)
XG_pipeline.fit(X_train,y_train,model__eval_set=[(X_test_transformed,y_test)])
preds = XG_pipeline.predict(X_test)
print('accuracy = ',accuracy_score(y_test,preds))

[0]	validation_0-logloss:0.59027
[1]	validation_0-logloss:0.53231
[2]	validation_0-logloss:0.49534
[3]	validation_0-logloss:0.47157
[4]	validation_0-logloss:0.45563
[5]	validation_0-logloss:0.44481
[6]	validation_0-logloss:0.43643
[7]	validation_0-logloss:0.42971
[8]	validation_0-logloss:0.42628
[9]	validation_0-logloss:0.42299
[10]	validation_0-logloss:0.42099
[11]	validation_0-logloss:0.41873
[12]	validation_0-logloss:0.41632
[13]	validation_0-logloss:0.41604
[14]	validation_0-logloss:0.41505
[15]	validation_0-logloss:0.41417
[16]	validation_0-logloss:0.41337
[17]	validation_0-logloss:0.41301
[18]	validation_0-logloss:0.41271
[19]	validation_0-logloss:0.41287
[20]	validation_0-logloss:0.41295
[21]	validation_0-logloss:0.41221
[22]	validation_0-logloss:0.41229
[23]	validation_0-logloss:0.41191
[24]	validation_0-logloss:0.41285
[25]	validation_0-logloss:0.41310
[26]	validation_0-logloss:0.41300
[27]	validation_0-logloss:0.41238
[28]	validation_0-logloss:0.41197
[29]	validation_0-loglos

In [None]:
test = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
test[['Deck','cabin_no','side']]=test['Cabin'].apply(splitcolumn)
xg_preds = XG_pipeline.predict(test)
output = pd.DataFrame({'PassengerId':test.PassengerId,'Transported':xg_preds})
output.to_csv('XG_Pipeline_preds.csv',index = False)

# **Using XGBoost we have developed a model which can predict the passenger is transported in another dimention or not with 80% acccuracy.** 