In [250]:
import optuna
from sklearn.ensemble import StackingClassifier
from sklearn.experimental import enable_iterative_imputer
import catboost
import numpy as np
import pandas as pd
import plotly.express as px
from catboost import CatBoostClassifier
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler

In [251]:
random_id = 17  #ai23m017 - 17 is the id
random_state = np.random.RandomState(random_id)
np.random.seed(17)

In [252]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,S


In [253]:
test_pass_id = test_df.pop('PassengerId')

X_max_index = train_df.shape[0]
y = train_df.Survived

df = train_df.drop(['Survived', 'PassengerId'], axis=1)
df = pd.concat([df, test_df], axis=0).reset_index(drop=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    1309 non-null   int64  
 1   Name      1309 non-null   object 
 2   Age       1046 non-null   float64
 3   SibSp     1309 non-null   int64  
 4   Parch     1309 non-null   int64  
 5   Ticket    1309 non-null   object 
 6   Fare      1308 non-null   float64
 7   Cabin     295 non-null    object 
 8   Embarked  1307 non-null   object 
dtypes: float64(2), int64(3), object(4)
memory usage: 92.2+ KB


### Embarked Imputation 

A Simple way of imputing Embarked data is to fill in the most common destination embarked from

In [254]:
most_common_embarked = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(most_common_embarked)

### Age Imputation

In [255]:
np.random.randint(df['Age'].mean() - df['Age'].std(), df['Age'].mean() + df['Age'].std(), 177)

df_age = np.random.randint(df['Age'].mean() - df['Age'].std(), df['Age'].mean() + df['Age'].std(), 177)
df_age = df['Age'].mean()

df.loc[df['Age'].isnull(), 'Age'] = df_age

clean up the fares which are 0 (doesn't make logical sense)

In [256]:
df.loc[df.Fare.eq(0), 'Fare'] = np.nan

In [257]:
df

Unnamed: 0,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",22.000000,1,0,A/5 21171,7.2500,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.000000,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",26.000000,0,0,STON/O2. 3101282,7.9250,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.000000,1,0,113803,53.1000,C123,S
4,3,"Allen, Mr. William Henry",35.000000,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...
1304,3,"Spector, Mr. Woolf",29.881138,0,0,A.5. 3236,8.0500,,S
1305,1,"Oliva y Ocana, Dona. Fermina",39.000000,0,0,PC 17758,108.9000,C105,C
1306,3,"Saether, Mr. Simon Sivertsen",38.500000,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,3,"Ware, Mr. Frederick",29.881138,0,0,359309,8.0500,,S


In [258]:
df['Lastname'] = df.Name.str.split(', ').str[0]

In [259]:
df['Title'] = df.Name.str.split(', ').str[1]
df['Title'] = df.Title.str.split('.').str[0]

In [260]:
df.Title.value_counts()

Title
Mr              757
Miss            260
Mrs             197
Master           61
Rev               8
Dr                8
Col               4
Mlle              2
Major             2
Ms                2
Lady              1
Sir               1
Mme               1
Don               1
Capt              1
the Countess      1
Jonkheer          1
Dona              1
Name: count, dtype: int64

#### Similar titles are grouped into fewer titles / title categories

In [261]:
def map_to_sex(value):
    if value in ["Miss", "Mrs"]:
        return "female"
    else:
        return "male"


df['Sex'] = df['Title'].apply(map_to_sex)

In [262]:
mrs = ['Mrs', 'Mme']
df.loc[df.Title.isin(mrs), 'Title'] = 'Mrs'

miss = ['Ms', 'Miss', 'Mlle']
df.loc[df.Title.isin(miss), 'Title'] = 'Miss'

df.loc[(df.Title.eq('Mr') & df.Age.le(18)), 'Title'] = 'Master'

df.loc[(df.Title.eq('Mrs') & df.Age.le(18)), 'Title'] = 'Miss'

df.loc[(~df.Title.isin(['Mrs', 'Miss', 'Mr', 'Master'])
        & df.Sex.eq('male')), 'Title'] = 'Mr'

df.loc[(~df.Title.isin(['Mrs', 'Miss', 'Mr', 'Master'])
        & df.Sex.eq('female')), 'Title'] = 'Mrs'

In [263]:
df

Unnamed: 0,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Lastname,Title,Sex
0,3,"Braund, Mr. Owen Harris",22.000000,1,0,A/5 21171,7.2500,,S,Braund,Mr,male
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.000000,1,0,PC 17599,71.2833,C85,C,Cumings,Mrs,female
2,3,"Heikkinen, Miss. Laina",26.000000,0,0,STON/O2. 3101282,7.9250,,S,Heikkinen,Miss,female
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.000000,1,0,113803,53.1000,C123,S,Futrelle,Mrs,female
4,3,"Allen, Mr. William Henry",35.000000,0,0,373450,8.0500,,S,Allen,Mr,male
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,"Spector, Mr. Woolf",29.881138,0,0,A.5. 3236,8.0500,,S,Spector,Mr,male
1305,1,"Oliva y Ocana, Dona. Fermina",39.000000,0,0,PC 17758,108.9000,C105,C,Oliva y Ocana,Mr,male
1306,3,"Saether, Mr. Simon Sivertsen",38.500000,0,0,SOTON/O.Q. 3101262,7.2500,,S,Saether,Mr,male
1307,3,"Ware, Mr. Frederick",29.881138,0,0,359309,8.0500,,S,Ware,Mr,male


Split Tickets into Ticket_Series and Ticket_nr

In [264]:
df['Ticket_series'] = [i[0] if len(i) > 1 else 0 for i in df.Ticket.str.split()]
df['Ticket_nr'] = [i[-1] for i in df.Ticket.str.split()]

In [265]:
ticket_dict = df.groupby('Ticket_nr').Lastname.count().to_dict()
df['Passengers_ticket'] = df.Ticket_nr.map(ticket_dict)
df['Price'] = (df.Fare / df.Passengers_ticket).round(1)

df['Deck'] = df.Cabin.str[0]
df.Deck.isna().sum()

1014

### Deck Imputer

The deck feature can serve to determine socio-economic-status as cabins and passenger class is tied to decks

In [266]:
def impute_deck_by(feature):
    for pclass in range(1, 4):
        map_dic = (df[~df.Deck.isna() & df.Pclass.eq(pclass)]
                   .groupby(feature).Deck.unique()
                   .apply(list).to_dict())
        map_dic = {i: j[0] for i, j in map_dic.items()
                   if len(j) == 1}
        df.loc[df.Deck.isna() & df.Pclass.eq(pclass),
        'Deck'] = df[feature].map(map_dic)

    print(df.Deck.isna().sum())

In [267]:
impute_deck_by('Ticket_nr')
impute_deck_by('Lastname')

997
989


### Deck Survival Feature —> based on existing training data

In [268]:
train_df['Deck'] = train_df.Cabin.str[0]

deck_total_survived = train_df.groupby('Deck').Survived.sum()
deck_people = train_df.groupby('Deck').Deck.count()

deck_survived_dict = (deck_total_survived / deck_people).round(2).to_dict()
deck_nan_survived = train_df.loc[train_df.Deck.isna(), 'Survived']

deck_survived_dict['M'] = (deck_nan_survived.value_counts() / deck_nan_survived.count()).round(2)[1]
df['Deck_survive_ratio'] = df.Deck.fillna('M')

df.Deck_survive_ratio = df.Deck_survive_ratio.map(deck_survived_dict).astype('float')

### Manual Price Mapping for outliers

In [269]:
df.loc[df.Deck.eq('T'), 'Deck'] = 'A'

In [270]:
df.groupby(['Pclass', 'Deck']).Price.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
Pclass,Deck,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,A,22.0,32.945455,6.909671,26.0,27.9,30.6,35.5,50.5
1,B,65.0,42.983077,25.865606,5.0,29.0,37.5,45.5,128.1
1,C,105.0,35.78381,9.73254,25.3,27.7,33.9,42.3,68.4
1,D,42.0,31.77619,9.69865,19.7,26.0,27.15,37.75,66.8
1,E,35.0,26.177143,5.033715,8.8,26.45,26.9,27.7,38.5
2,D,6.0,13.6,0.812404,12.9,13.0,13.4,13.875,15.0
2,E,6.0,9.783333,3.64879,5.2,6.525,11.45,12.4,13.0
2,F,16.0,10.925,1.756701,8.7,9.8,10.5,13.0,13.0
3,E,3.0,6.8,1.03923,6.2,6.2,6.2,7.1,8.0
3,F,11.0,7.518182,0.177866,7.2,7.5,7.6,7.6,7.8


In [271]:
df[df.Deck.eq('B')].sort_values('Price', ascending=False).head(10)

Unnamed: 0,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Lastname,Title,Sex,Ticket_series,Ticket_nr,Passengers_ticket,Price,Deck,Deck_survive_ratio
1234,1,"Cardeza, Mrs. James Warburton Martinez (Charlo...",58.0,0,1,PC 17755,512.3292,B51 B53 B55,C,Cardeza,Mrs,female,PC,17755,4,128.1,B,0.74
258,1,"Ward, Miss. Anna",35.0,0,0,PC 17755,512.3292,,C,Ward,Miss,female,PC,17755,4,128.1,B,0.74
737,1,"Lesurer, Mr. Gustave J",35.0,0,0,PC 17755,512.3292,B101,C,Lesurer,Mr,male,PC,17755,4,128.1,B,0.74
679,1,"Cardeza, Mr. Thomas Drake Martinez",36.0,0,1,PC 17755,512.3292,B51 B53 B55,C,Cardeza,Mr,male,PC,17755,4,128.1,B,0.74
118,1,"Baxter, Mr. Quigg Edmond",24.0,0,1,PC 17558,247.5208,B58 B60,C,Baxter,Mr,male,PC,17558,3,82.5,B,0.74
1075,1,"Douglas, Mrs. Frederick Charles (Mary Helene B...",27.0,1,1,PC 17558,247.5208,B58 B60,C,Douglas,Mrs,female,PC,17558,3,82.5,B,0.74
299,1,"Baxter, Mrs. James (Helene DeLaudeniere Chaput)",50.0,0,1,PC 17558,247.5208,B58 B60,C,Baxter,Mrs,female,PC,17558,3,82.5,B,0.74
730,1,"Allen, Miss. Elisabeth Walton",29.0,0,0,24160,211.3375,B5,S,Allen,Miss,female,0,24160,4,52.8,B,0.74
1215,1,"Kreuchen, Miss. Emilie",39.0,0,0,24160,211.3375,,S,Kreuchen,Miss,female,0,24160,4,52.8,B,0.74
779,1,"Robert, Mrs. Edward Scott (Elisabeth Walton Mc...",43.0,0,1,24160,211.3375,B3,S,Robert,Mrs,female,0,24160,4,52.8,B,0.74


In [272]:
df[df.Deck.eq('B')].sort_values('Price').head()

Unnamed: 0,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Lastname,Title,Sex,Ticket_series,Ticket_nr,Passengers_ticket,Price,Deck,Deck_survive_ratio
872,1,"Carlsson, Mr. Frans Olof",33.0,0,0,695,5.0,B51 B53 B55,S,Carlsson,Mr,male,0,695,1,5.0,B,0.74
690,1,"Dick, Mr. Albert Adrian",31.0,1,0,17474,57.0,B20,S,Dick,Mr,male,0,17474,3,19.0,B,0.74
781,1,"Dick, Mrs. Albert Adrian (Vera Gillespie)",17.0,1,0,17474,57.0,B20,S,Dick,Miss,female,0,17474,3,19.0,B,0.74
1199,1,"Hays, Mr. Charles Melville",55.0,1,1,12749,93.5,B69,S,Hays,Mr,male,0,12749,4,23.4,B,0.74
1281,1,"Payne, Mr. Vivian Ponsonby",23.0,0,0,12749,93.5,B24,S,Payne,Mr,male,0,12749,4,23.4,B,0.74


Price outliers for Deck B were capped

In [273]:
df.loc[(df.Deck.eq('B') & df.Price.lt(19)), 'Price'] = 19
df.loc[(df.Deck.eq('B') & df.Price.gt(68)), 'Price'] = 68

Impute prices based on the average for the entire deck

In [274]:
class_deck_price = pd.DataFrame(df.groupby(['Pclass', 'Deck'])
                                .Price.mean().round(2)).reset_index()

for index, row in df.loc[df.Price.isna(),
['Pclass', 'Deck']].iterrows():
    if not pd.isna(row.Deck):
        new_price = class_deck_price.loc[
            (class_deck_price.Pclass.eq(row.Pclass)
             & class_deck_price.Deck.eq(row.Deck)), 'Price'].mean()
    else:
        new_price = class_deck_price[
            class_deck_price.Pclass.eq(row.Pclass)].Price.mean()

    df.loc[[index], 'Price'] = new_price

### Deck Imputing based on the price ranges

In [275]:
first_cl = {'A': [25, 30],
            'B': [35, 70],
            'C': [30, 35],
            'D': [19, 25],
            'E': [9, 19]}

second_cl = {'D': [13, 17],
             'E': [5, 9],
             'F': [9, 13]}

third_cl = {'E': [8, 9],
            'F': [9, 21],
            'G': [0, 8]}

class_dict = {1: first_cl,
              2: second_cl,
              3: third_cl}

for index, row in df.loc[df.Deck.isna(), ['Pclass', 'Price']].iterrows():
    for c, d in class_dict.items():
        if row.Pclass == c:
            for i, j in d.items():
                if max(j) > row.Price >= min(j):
                    df.loc[[index], 'Deck'] = i

# Encode Deck with it's deck level number counting from the bottom
deck_level = {'G': 1, 'F': 2, 'E': 3, 'D': 4, 'C': 5, 'B': 6, 'A': 7}

df.Deck = df.Deck.replace(deck_level)

In [276]:
df

Unnamed: 0,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Lastname,Title,Sex,Ticket_series,Ticket_nr,Passengers_ticket,Price,Deck,Deck_survive_ratio
0,3,"Braund, Mr. Owen Harris",22.000000,1,0,A/5 21171,7.2500,,S,Braund,Mr,male,A/5,21171,1,7.2,1,0.30
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.000000,1,0,PC 17599,71.2833,C85,C,Cumings,Mrs,female,PC,17599,2,35.6,5,0.59
2,3,"Heikkinen, Miss. Laina",26.000000,0,0,STON/O2. 3101282,7.9250,,S,Heikkinen,Miss,female,STON/O2.,3101282,1,7.9,1,0.30
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.000000,1,0,113803,53.1000,C123,S,Futrelle,Mrs,female,0,113803,2,26.6,5,0.59
4,3,"Allen, Mr. William Henry",35.000000,0,0,373450,8.0500,,S,Allen,Mr,male,0,373450,1,8.0,3,0.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,"Spector, Mr. Woolf",29.881138,0,0,A.5. 3236,8.0500,,S,Spector,Mr,male,A.5.,3236,1,8.0,3,0.30
1305,1,"Oliva y Ocana, Dona. Fermina",39.000000,0,0,PC 17758,108.9000,C105,C,Oliva y Ocana,Mr,male,PC,17758,3,36.3,5,0.59
1306,3,"Saether, Mr. Simon Sivertsen",38.500000,0,0,SOTON/O.Q. 3101262,7.2500,,S,Saether,Mr,male,SOTON/O.Q.,3101262,1,7.2,1,0.30
1307,3,"Ware, Mr. Frederick",29.881138,0,0,359309,8.0500,,S,Ware,Mr,male,0,359309,1,8.0,3,0.30


### Escape Density Feature
Since all the lifeboats were on the top deck, everyone was aiming to go there. Hence, It's crucial to check how many people
 one had to go through to get to the lifeboats.

In [277]:
deck_people = df.Deck.value_counts().sort_index()
deck_people_dic = deck_people.to_dict()
deck_people_dic

{1: 537, 2: 167, 3: 193, 4: 172, 5: 112, 6: 76, 7: 52}

In [278]:
escape_density = {}
for i in range(1, 8):
    escape_density[i] = sum(deck_people_dic.values())
    del deck_people_dic[i]

escape_density

{1: 1309, 2: 772, 3: 605, 4: 412, 5: 240, 6: 128, 7: 52}

In [279]:
# Create Escape_density column
df['Escape_density'] = df.Deck.replace(escape_density)

### Family Size = SibSp + Parch

In [280]:
df['Family_size'] = 1 + df.SibSp + df.Parch
df['IsAlone'] = 0
df.loc[df['Family_size'] == 1, 'IsAlone'] = 1

### Family_survivers is a flawed feature due to leakeage which can skew accuracy on training data

In [281]:
X = df[:X_max_index].copy()
test_df = df[X_max_index:].copy()
full_df = pd.concat([X, y], axis=1).copy()

# Check for families that has survivers and create a dictionary with mean value of their family survivability
family_survivers = full_df[['Lastname', 'Survived']].groupby('Lastname').mean().round(2).reset_index()
family_survivers_dict = dict(zip(family_survivers.Lastname, family_survivers.Survived))

# Reduce the dictionary to the list of families that are both in train and test data
common_survivers = {}
for lastname, survived in family_survivers_dict.items():
    if lastname in list(test_df['Lastname'].unique()):
        common_survivers[lastname] = survived

# Create Family_survivers feature
df['Family_survivers'] = df.Lastname.map(common_survivers)

# For the families that are not present in both train and test we will impute the overall mean value
df.Family_survivers = df.Family_survivers.fillna(df.Family_survivers.mean())

In [282]:
df.Pclass = df.Pclass.astype('object')

In [283]:
col_drop = ['Name', 'Ticket', 'Ticket_nr', 'Ticket_series',
            'Fare', 'Cabin', 'Lastname', 'Passengers_ticket']
df = df.drop(col_drop, axis=1)

### Categorical Feature encoding + imputing

In [284]:
categ_cols = list(df.select_dtypes(exclude=[np.number]).columns)
cat_imputer = SimpleImputer(strategy='most_frequent')

df_cat = pd.DataFrame(cat_imputer.fit_transform(df[categ_cols]),
                      columns=df[categ_cols].columns)
df_cat = pd.get_dummies(df_cat)

### Numerical Feature encoding + imputing

In [285]:
num_cols = list(df.select_dtypes([np.number]).columns)
it_imp = IterativeImputer()

df_num = pd.DataFrame(it_imp.fit_transform(df[num_cols]),
                      columns=df[num_cols].columns)

df = pd.concat([df_cat, df_num], axis=1)

In [286]:
X = df[:X_max_index]
full_df = pd.concat([X, y], axis=1)


# Function for kde plotting
def survive_chance(feature, xticks=None, xlim=None):
    # Filtering the DataFrame based on survival
    survived = full_df[full_df['Survived'] == 1]
    not_survived = full_df[full_df['Survived'] == 0]

    combined = pd.concat([survived.assign(Survival='Survived'), not_survived.assign(Survival='Not Survived')])
    fig = px.histogram(combined, x=feature, color="Survival",
                       marginal="violin",
                       hover_data=combined.columns,
                       nbins=50,
                       opacity=0.6,
                       barmode='overlay',
                       histnorm='density')

    if xticks is not None:
        fig.update_xaxes(tickvals=xticks)
    if xlim is not None:
        fig.update_xaxes(range=xlim)

    fig.update_layout(
        title_text=f'Distribution of {feature} by Survival',
        xaxis_title_text=feature,
        yaxis_title_text='Density',
        bargap=0.2,
        bargroupgap=0.1
    )
    fig.show()

In [287]:
survive_chance('Age', np.arange(0, 80, 5), (0, 80))

In [288]:
df['Age_group'] = pd.cut(x=df.Age, labels=[5, 1, 4, 3, 2],
                         bins=[-1, 15, 33, 45, 60, df.Age.max()]).astype('float')

In [289]:
survive_chance('Family_size', np.arange(0, 10, 1), (0, 10))

In [290]:
df['Family_group'] = pd.cut(x=df.Family_size, labels=[1, 3, 2],
                            bins=[-1, 1, 4, df.Family_size.max()]).astype('float')

In [291]:
survive_chance('Family_survivers', np.arange(0, 1, 0.1), (0, 1))

In [292]:
df['Lucky_family'] = pd.cut(x=df.Family_survivers, labels=[2, 3, 1, 4],
                            bins=[-1, 0.22, 0.35, 0.49, df.Family_survivers.max()]).astype('float')

In [293]:
df

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,...,Price,Deck,Deck_survive_ratio,Escape_density,Family_size,IsAlone,Family_survivers,Age_group,Family_group,Lucky_family
0,False,False,True,False,False,True,False,False,True,False,...,7.2,1.0,0.30,1309.0,2.0,0.0,0.449863,1.0,3.0,1.0
1,True,False,False,True,False,False,False,False,False,True,...,35.6,5.0,0.59,240.0,2.0,0.0,1.000000,4.0,3.0,4.0
2,False,False,True,False,False,True,False,True,False,False,...,7.9,1.0,0.30,1309.0,1.0,1.0,0.449863,1.0,1.0,1.0
3,True,False,False,False,False,True,False,False,False,True,...,26.6,5.0,0.59,240.0,2.0,0.0,0.449863,4.0,3.0,1.0
4,False,False,True,False,False,True,False,False,True,False,...,8.0,3.0,0.30,605.0,1.0,1.0,0.449863,4.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,False,False,True,False,False,True,False,False,True,False,...,8.0,3.0,0.30,605.0,1.0,1.0,0.449863,1.0,1.0,1.0
1305,True,False,False,True,False,False,False,False,True,False,...,36.3,5.0,0.59,240.0,1.0,1.0,0.449863,4.0,1.0,1.0
1306,False,False,True,False,False,True,False,False,True,False,...,7.2,1.0,0.30,1309.0,1.0,1.0,0.449863,4.0,1.0,1.0
1307,False,False,True,False,False,True,False,False,True,False,...,8.0,3.0,0.30,605.0,1.0,1.0,0.449863,1.0,1.0,1.0


### Standardize data

In [294]:
df.Price = df.Price.apply(np.log1p)

# Standardize 
std_scaler = StandardScaler()

df_scaled = std_scaler.fit_transform(df)
df = pd.DataFrame(df_scaled, columns=df.columns)

In [295]:
df.drop(["Family_survivers"], axis=1, inplace=True)

In [296]:
X = df.iloc[:X_max_index]
test_df = df.iloc[X_max_index:]

correlation = full_df.corr()['Survived'].sort_values(ascending=False)

correlation = correlation[1:]

fig = px.bar(correlation,
             y=correlation.values,
             x=correlation.index,
             text_auto='.2f',
             labels={'x': 'Feature', 'y': 'Correlation with Survival'},
             title='Survivability Dependency on Features based on correlation Matrix')

fig.update_layout(
    xaxis_title="Features",
    yaxis_title="Correlation with Survival",
    xaxis_tickangle=-45,
    width=1300,
    height=700
)

fig.show()

In [297]:

X_train, X_test, y_train, y_test = train_test_split(X, train_df["Survived"], test_size=0.2)

In [298]:
# # Define model
'''
cat_model = CatBoostClassifier()

# # Define parameters' grid
grid = {'verbose': [False],
         'thread_count': [-1],
         'depth': [3, 4, 5, 6],
         'iterations': [500, 1000, 2000, 3000],
         'learning_rate': [0.0001, 0.001, 0.01]
        }

# # Define GridSearchCV
grid_cat = GridSearchCV(estimator=cat_model, param_grid=grid, cv=3, n_jobs=-1)
grid_cat.fit(X,y)
catboost_params = grid_cat.best_params_

print('\n Best Score:\n', grid_cat.best_score_)
print('\n Best parameters:\n', catboost_params)
'''

"\ncat_model = CatBoostClassifier()\n\n# # Define parameters' grid\ngrid = {'verbose': [False],\n         'thread_count': [-1],\n         'depth': [3, 4, 5, 6],\n         'iterations': [500, 1000, 2000, 3000],\n         'learning_rate': [0.0001, 0.001, 0.01]\n        }\n\n# # Define GridSearchCV\ngrid_cat = GridSearchCV(estimator=cat_model, param_grid=grid, cv=3, n_jobs=-1)\ngrid_cat.fit(X,y)\ncatboost_params = grid_cat.best_params_\n\nprint('\n Best Score:\n', grid_cat.best_score_)\nprint('\n Best parameters:\n', catboost_params)\n"

In [299]:
'''
def objective(trial):
    param = {
        'iterations': trial.suggest_int('iterations', 600, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.0005, 0.001),
        'depth': trial.suggest_int('depth', 4, 4),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 50),
        'random_strength': trial.suggest_float('random_strength', 1e-9, 10),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_state': random_id,
        'verbose': False,
        'thread_count': -1,
    }

    # Perform stratified k-fold cross-validation
    num_folds = 5
    kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=random_id)

    accuracy_list = []
    for train_index, test_index in kf.split(X_train, y_train):
        
        X_train_fold, X_valid_fold = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_fold, y_valid_fold = y_train.iloc[train_index], y_train.iloc[test_index]

        train_data = catboost.Pool(data=X_train_fold, label=y_train_fold)
        valid_data = catboost.Pool(data=X_valid_fold, label=y_valid_fold)

        model = CatBoostClassifier(**param)
        model.fit(train_data, eval_set=valid_data, early_stopping_rounds=10, use_best_model=True)

        preds = model.predict(X_valid_fold)
        accuracy = accuracy_score(y_valid_fold, preds)
        accuracy_list.append(accuracy)
    
    mean_accuracy = sum(accuracy_list) / num_folds
    return mean_accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
'''

"\ndef objective(trial):\n    param = {\n        'iterations': trial.suggest_int('iterations', 600, 1000),\n        'learning_rate': trial.suggest_float('learning_rate', 0.0005, 0.001),\n        'depth': trial.suggest_int('depth', 4, 4),\n        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 50),\n        'random_strength': trial.suggest_float('random_strength', 1e-9, 10),\n        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),\n        'border_count': trial.suggest_int('border_count', 32, 255),\n        'random_state': random_id,\n        'verbose': False,\n        'thread_count': -1,\n    }\n\n    # Perform stratified k-fold cross-validation\n    num_folds = 5\n    kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=random_id)\n\n    accuracy_list = []\n    for train_index, test_index in kf.split(X_train, y_train):\n        \n        X_train_fold, X_valid_fold = X_train.iloc[train_index], X_train.iloc[test_index]\n        y_train_fold,

In [300]:

### Most Ideal params found during all the optuna trials
catboost_params = {
    'verbose': False,
    'thread_count': -1,
    'depth': 4,
    'iterations': 1000,
    'learning_rate': 0.0005,
}

# Perform cross-validation with CatBoost
num_folds = 5
kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=random_id)

best_test_accuracy = 0
best_test_predictions = []

for train_index, test_index in kf.split(X_train, y_train):
    X_train_fold, X_valid_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_valid_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    # Create CatBoost datasets
    train_data_fold = catboost.Pool(X_train_fold, label=y_train_fold)
    valid_data_fold = catboost.Pool(X_valid_fold, label=y_valid_fold)

    catboost_model = CatBoostClassifier(**catboost_params)
    catboost_model.fit(train_data_fold, eval_set=valid_data_fold, early_stopping_rounds=10)

    valid_predictions_fold = catboost_model.predict(X_valid_fold)
    valid_accuracy_fold = accuracy_score(valid_predictions_fold, y_valid_fold)

    if valid_accuracy_fold > best_test_accuracy:
        best_test_accuracy = valid_accuracy_fold
        best_test_predictions = catboost_model.predict(X_test)

print(f'Best Testing Accuracy: {best_test_accuracy}')

Best Testing Accuracy: 0.8591549295774648


In [301]:
predictions = catboost_model.predict(test_df)
output = pd.DataFrame({'PassengerId': test_pass_id,
                       'Survived': predictions})
output.set_index("PassengerId", inplace=True)

In [302]:
output.to_csv("submission_test.csv")