In [1]:
import pandas as pd
import seaborn as sns

In [2]:
# Read the train and test CSV files (paths are relative to the working
# directory) and keep them untouched as the raw inputs for later cells.
raw_train_data, raw_test_data = (
    pd.read_csv(path) for path in ('train.csv', 'test.csv')
)

In [3]:
def my_analysis(dataset):
    """Build a per-column overview of ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to profile.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``dataset`` (indexed by column name) with the
        columns 'Datatype', 'NA percentage', 'Unique values', 'mode',
        'mode contribution %', 'min value' and 'max value'.
        Percentages are relative to the total row count and rounded to two
        decimals. 'min value' / 'max value' hold the string 'none' for
        object-dtype columns. For a column without any non-missing value,
        'mode' is None and 'mode contribution %' is 0.0.
    """
    summary_columns = [
        'Datatype', 'NA percentage', 'Unique values',
        'mode', 'mode contribution %', 'min value', 'max value',
    ]
    length = len(dataset.index)
    records = []
    for col in dataset.columns:
        series = dataset[col]
        # value_counts() is sorted by descending frequency and skips NaN,
        # so its first entry is the most frequent value.
        counts = series.value_counts()
        if counts.empty:
            mode_value, mode_share = None, 0.0
        else:
            mode_value = counts.index[0]
            # .iloc is positional; plain [0] would look up the *label* 0 on
            # integer-valued columns and return the wrong count or fail.
            mode_share = round(counts.iloc[0] * 100 / length, 2)
        is_object = series.dtype == 'object'
        records.append({
            'Datatype': series.dtype,
            # Guard the division so an empty frame reports 0.0 instead of NaN.
            'NA percentage': round(series.isna().sum() * 100 / length, 2) if length else 0.0,
            'Unique values': series.nunique(),
            'mode': mode_value,
            'mode contribution %': mode_share,
            'min value': 'none' if is_object else series.min(),
            'max value': 'none' if is_object else series.max(),
        })
    # Build the frame once from all records instead of concatenating per row.
    return pd.DataFrame(records, index=dataset.columns, columns=summary_columns)

#### Extracting important information from some columns

- **PassengerId** column:\
    As per data description, *'This is an unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.'*\
    We will split this column into two new columns, **Group** and **PeopleId**.
    
    
- **Cabin** column:\
    As per data description, *'The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.'*\
    We will split this column into **Deck**, **Num** and **Side**.


In [4]:
# Keep only the training rows whose Cabin value is present; the test frame
# is not filtered here.
train_data = raw_train_data.loc[raw_train_data['Cabin'].notna()]

In [5]:
def split_columns(raw_dataset):
    """Return a copy of ``raw_dataset`` with PassengerId and Cabin split up.

    'PassengerId' contributes 'Group' (its first four characters) and
    'PeopleId' (its last two characters), both cast to float. 'Cabin'
    contributes 'Deck' (first character), 'Num' (everything between the
    third character and the second-to-last one, cast to float) and 'Side'
    (last character). The two source columns are dropped from the result;
    ``raw_dataset`` itself is left untouched.
    """
    passenger_id = raw_dataset['PassengerId'].str
    cabin = raw_dataset['Cabin'].str

    derived = {
        'Group': passenger_id[:4].astype(float),
        'PeopleId': passenger_id[-2:].astype(float),
        'Deck': cabin[0],
        'Num': cabin[2:-2].astype(float),
        'Side': cabin[-1],
    }

    dataset = raw_dataset.copy()
    for name, values in derived.items():
        dataset[name] = values
    return dataset.drop(columns=['PassengerId', 'Cabin'])


# Derive both frames from the raw inputs instead of from an earlier value of
# train_data: split_columns drops 'PassengerId' and 'Cabin', so feeding its
# own output back in on a second run of this cell would raise a KeyError.
# Training rows with a missing Cabin are excluded; test rows are all kept.
train_data = split_columns(raw_train_data[raw_train_data['Cabin'].notna()])
test_data = split_columns(raw_test_data)

##### Removing unnecessary columns
Since the **Name** column won't really help in training the algorithm, we will remove it from our data.

In [7]:
# Columns that are dropped from both frames before modelling.
remove_columns = ['Name']

train_data = train_data.drop(columns=remove_columns)
test_data = test_data.drop(columns=remove_columns)

##### Filling Missing Values

In [9]:
# Columns whose missing values are imputed with the column's mode.
mode_columns = [
    'HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
]


def fillmode(dataset, catcol):
    """Fill missing values of the given columns with each column's mode.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to modify; its columns are replaced in place.
    catcol : iterable of str
        Names of the columns to impute.

    Returns
    -------
    None
        ``dataset`` is updated in place. When several values tie for the
        mode, the first one returned by ``Series.mode()`` is used. A column
        that contains only missing values is left unchanged.
    """
    for col in catcol:
        modes = dataset[col].mode()
        if modes.empty:
            # Nothing to impute with: every value in the column is missing.
            continue
        # Assign the filled column back explicitly. Calling
        # fillna(inplace=True) on the selected column works on a temporary
        # object and does not update `dataset` under Copy-on-Write.
        dataset[col] = dataset[col].fillna(modes.iloc[0])

# Impute the train frame first, then the test frame; each frame is filled
# with its own column modes.
for frame in (train_data, test_data):
    fillmode(frame, mode_columns)

##### Categorical columns

In [11]:
# Columns that are converted to integers by boolean_to_num.
boolean_columns = ['CryoSleep', 'VIP']


def boolean_to_num(dataset, columns):
    """Cast every listed column of ``dataset`` to ``int``, in place.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are overwritten with their integer version.
    columns : iterable of str
        Names of the columns to convert.

    Returns
    -------
    None
    """
    for name in columns:
        as_int = dataset[name].astype(int)
        dataset[name] = as_int

# Convert the boolean columns of the train frame, then of the test frame.
for frame in (train_data, test_data):
    boolean_to_num(frame, boolean_columns)

In [16]:
from sklearn.preprocessing import OneHotEncoder

categorical_columns = ['HomePlanet','Destination','Deck','Side']

# The encoder is fitted on the training frame only and then reused for the
# test frame. With handle_unknown='ignore', a test category that was not seen
# during fitting is encoded as all zeros instead of raising.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
train_encoded = ohe.fit_transform(train_data[categorical_columns])

# Name the indicator columns after their source feature and category rather
# than leaving them as integers 0..n-1: this keeps every column label a
# string and makes the encoded features identifiable later on.
encoded_names = ohe.get_feature_names_out(categorical_columns)

df1 = pd.DataFrame(train_encoded, columns=encoded_names, index=train_data.index)
train_data = pd.concat([train_data.drop(categorical_columns, axis=1), df1], axis=1)

df2 = pd.DataFrame(
    ohe.transform(test_data[categorical_columns]),
    columns=encoded_names,
    index=test_data.index,
)
test_data = pd.concat([test_data.drop(categorical_columns, axis=1), df2], axis=1)

In [18]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is cast to int.
x = train_data.drop(columns=['Transported'])
y = train_data['Transported'].astype(int)

# Hold out 10% of the rows for validation, with a fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=42,
)

#### Model: XGBoost

In [19]:
from xgboost import XGBClassifier

# Up to 500 boosting rounds, with early stopping (10 rounds) evaluated on
# the validation split passed as eval_set below.
xgb = XGBClassifier(
    n_estimators=500,
    early_stopping_rounds=10,
)
xgb.fit(
    x_train,
    y_train,
    eval_set=[(x_val, y_val)],
)

[0]	validation_0-logloss:0.58744
[1]	validation_0-logloss:0.52452
[2]	validation_0-logloss:0.48801
[3]	validation_0-logloss:0.46351
[4]	validation_0-logloss:0.44382
[5]	validation_0-logloss:0.43195
[6]	validation_0-logloss:0.42102
[7]	validation_0-logloss:0.41256
[8]	validation_0-logloss:0.40661
[9]	validation_0-logloss:0.40019
[10]	validation_0-logloss:0.39528
[11]	validation_0-logloss:0.39280
[12]	validation_0-logloss:0.39145
[13]	validation_0-logloss:0.39054
[14]	validation_0-logloss:0.38917
[15]	validation_0-logloss:0.38843
[16]	validation_0-logloss:0.38735
[17]	validation_0-logloss:0.38718
[18]	validation_0-logloss:0.38764
[19]	validation_0-logloss:0.38763
[20]	validation_0-logloss:0.38685
[21]	validation_0-logloss:0.38582
[22]	validation_0-logloss:0.38658
[23]	validation_0-logloss:0.38566
[24]	validation_0-logloss:0.38737
[25]	validation_0-logloss:0.38670
[26]	validation_0-logloss:0.38563
[27]	validation_0-logloss:0.38531
[28]	validation_0-logloss:0.38499
[29]	validation_0-loglos

In [20]:
# Predict on the prepared test frame and turn each prediction into a
# Python bool for the submission file.
result = list(map(bool, xgb.predict(test_data)))

# PassengerId was dropped from test_data, so it is read again from the CSV.
final_data = pd.read_csv('test.csv')

df = pd.DataFrame({
    'PassengerId': final_data['PassengerId'],
    'Transported': result,
})
df.to_csv('submission.csv', index=False)