In [1276]:
import catboost
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler

In [1277]:
random_id = 17 #ai23m017 - 17 is the id
random_state = np.random.RandomState(random_id)

In [1278]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,S


In [1279]:
test_pass_id = test_df.pop('PassengerId')

X_max_index = train_df.shape[0]
y = train_df.Survived

df = train_df.drop(['Survived', 'PassengerId'], axis=1)
df = pd.concat([df, test_df], axis=0).reset_index(drop=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    1309 non-null   int64  
 1   Name      1309 non-null   object 
 2   Age       1046 non-null   float64
 3   SibSp     1309 non-null   int64  
 4   Parch     1309 non-null   int64  
 5   Ticket    1309 non-null   object 
 6   Fare      1308 non-null   float64
 7   Cabin     295 non-null    object 
 8   Embarked  1307 non-null   object 
dtypes: float64(2), int64(3), object(4)
memory usage: 92.2+ KB


In [1280]:
most_common_embarked = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(most_common_embarked)

In [1281]:
np.random.randint(df['Age'].mean()-df['Age'].std(),df['Age'].mean() + df['Age'].std(),177)

df_age = np.random.randint(df['Age'].mean()-df['Age'].std(),df['Age'].mean() + df['Age'].std(),177)
df_age = df['Age'].mean()

df.loc[df['Age'].isnull(), 'Age'] = df_age

In [1282]:
df.loc[df.Fare.eq(0), 'Fare'] = np.nan

In [1283]:
df

Unnamed: 0,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",22.000000,1,0,A/5 21171,7.2500,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.000000,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",26.000000,0,0,STON/O2. 3101282,7.9250,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.000000,1,0,113803,53.1000,C123,S
4,3,"Allen, Mr. William Henry",35.000000,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...
1304,3,"Spector, Mr. Woolf",29.881138,0,0,A.5. 3236,8.0500,,S
1305,1,"Oliva y Ocana, Dona. Fermina",39.000000,0,0,PC 17758,108.9000,C105,C
1306,3,"Saether, Mr. Simon Sivertsen",38.500000,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,3,"Ware, Mr. Frederick",29.881138,0,0,359309,8.0500,,S


In [1284]:
df['Lastname'] = df.Name.str.split(', ').str[0]

In [1285]:
df['Title'] = df.Name.str.split(', ').str[1]
df['Title'] = df.Title.str.split('.').str[0]

In [1286]:
df.Title.value_counts()

Title
Mr              757
Miss            260
Mrs             197
Master           61
Rev               8
Dr                8
Col               4
Mlle              2
Major             2
Ms                2
Lady              1
Sir               1
Mme               1
Don               1
Capt              1
the Countess      1
Jonkheer          1
Dona              1
Name: count, dtype: int64

#### Similar titles are grouped into fewer titles

In [1287]:
def map_to_sex(value):
    if value in ["Miss","Mrs"]:
        return "female"
    else:
        return "male"
df['Sex'] = df['Title'].apply(map_to_sex)

In [1288]:
mrs = ['Mrs', 'Mme']
df.loc[df.Title.isin(mrs), 'Title'] = 'Mrs'

miss = ['Ms', 'Miss', 'Mlle']
df.loc[df.Title.isin(miss), 'Title'] = 'Miss'

df.loc[(df.Title.eq('Mr') & df.Age.le(18)), 'Title'] = 'Master'

df.loc[(df.Title.eq('Mrs') & df.Age.le(18)), 'Title'] = 'Miss'

df.loc[(~df.Title.isin(['Mrs', 'Miss', 'Mr', 'Master']) 
        & df.Sex.eq('male')), 'Title'] = 'Mr'

df.loc[(~df.Title.isin(['Mrs', 'Miss', 'Mr', 'Master']) 
        & df.Sex.eq('female')), 'Title'] = 'Mrs'

In [1289]:
df

Unnamed: 0,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Lastname,Title,Sex
0,3,"Braund, Mr. Owen Harris",22.000000,1,0,A/5 21171,7.2500,,S,Braund,Mr,male
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.000000,1,0,PC 17599,71.2833,C85,C,Cumings,Mrs,female
2,3,"Heikkinen, Miss. Laina",26.000000,0,0,STON/O2. 3101282,7.9250,,S,Heikkinen,Miss,female
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.000000,1,0,113803,53.1000,C123,S,Futrelle,Mrs,female
4,3,"Allen, Mr. William Henry",35.000000,0,0,373450,8.0500,,S,Allen,Mr,male
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,"Spector, Mr. Woolf",29.881138,0,0,A.5. 3236,8.0500,,S,Spector,Mr,male
1305,1,"Oliva y Ocana, Dona. Fermina",39.000000,0,0,PC 17758,108.9000,C105,C,Oliva y Ocana,Mr,male
1306,3,"Saether, Mr. Simon Sivertsen",38.500000,0,0,SOTON/O.Q. 3101262,7.2500,,S,Saether,Mr,male
1307,3,"Ware, Mr. Frederick",29.881138,0,0,359309,8.0500,,S,Ware,Mr,male


Split Tickets into Ticket_Series and Ticket_nr

In [1290]:
df['Ticket_series'] = [i[0] if len(i) > 1 else 0 for i in df.Ticket.str.split()]
df['Ticket_nr'] = [i[-1] for i in df.Ticket.str.split()]

In [1291]:
ticket_dict = df.groupby('Ticket_nr').Lastname.count().to_dict()
df['Passengers_ticket'] = df.Ticket_nr.map(ticket_dict)
df['Price'] = (df.Fare / df.Passengers_ticket).round(1)

df['Deck'] = df.Cabin.str[0]
df.Deck.isna().sum()

1014

### Deck Imputer

In [1292]:
def impute_deck_by(feature):
    for pclass in range(1, 4):
        # Create a mapping dictionary
        map_dic = (df[~df.Deck.isna() & df.Pclass.eq(pclass)]
                   .groupby(feature).Deck.unique()
                   .apply(list).to_dict())

        # Keep just the keys with a single deck to avoid 
        # the same key on different decks
        map_dic = {i:j[0] for i, j in map_dic.items() 
                   if len(j) == 1}

        # Imputing Deck from map_dic
        df.loc[df.Deck.isna() & df.Pclass.eq(pclass), 
               'Deck'] = df[feature].map(map_dic)

    # Check how many missing values we have at this step
    print(df.Deck.isna().sum())

In [1293]:
impute_deck_by('Ticket_nr')
impute_deck_by('Lastname')

997
989


In [1294]:
train_df['Deck'] = train_df.Cabin.str[0]

# Total Survived by available Deck data
deck_total_survived = train_df.groupby('Deck').Survived.sum()

# Total passengers by available Deck data
deck_people = train_df.groupby('Deck').Deck.count()

# Deck and surviving chance pairs 
deck_survived_dict = (deck_total_survived / deck_people).round(2).to_dict()

# Total Survived where Deck data is missing
deck_nan_survived = train_df.loc[train_df.Deck.isna(), 'Survived']

# Add to dictionary an additional entry for missing Deck surviving chance
deck_survived_dict['M'] = (deck_nan_survived.value_counts() / deck_nan_survived.count()).round(2)[1]

# Create Deck_survive_ratio
df['Deck_survive_ratio'] = df.Deck.fillna('M')

df.Deck_survive_ratio = df.Deck_survive_ratio.map(deck_survived_dict).astype('float')

### Manual Deck Mapping for outliers

In [1295]:
df.loc[df.Deck.eq('T'), 'Deck'] = 'A'

In [1296]:
df.groupby(['Pclass', 'Deck']).Price.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
Pclass,Deck,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,A,22.0,32.945455,6.909671,26.0,27.9,30.6,35.5,50.5
1,B,65.0,42.983077,25.865606,5.0,29.0,37.5,45.5,128.1
1,C,105.0,35.78381,9.73254,25.3,27.7,33.9,42.3,68.4
1,D,42.0,31.77619,9.69865,19.7,26.0,27.15,37.75,66.8
1,E,35.0,26.177143,5.033715,8.8,26.45,26.9,27.7,38.5
2,D,6.0,13.6,0.812404,12.9,13.0,13.4,13.875,15.0
2,E,6.0,9.783333,3.64879,5.2,6.525,11.45,12.4,13.0
2,F,16.0,10.925,1.756701,8.7,9.8,10.5,13.0,13.0
3,E,3.0,6.8,1.03923,6.2,6.2,6.2,7.1,8.0
3,F,11.0,7.518182,0.177866,7.2,7.5,7.6,7.6,7.8


In [1297]:
df.loc[(df.Deck.eq('B') & df.Price.lt(19)), 'Price'] = 19
df.loc[(df.Deck.eq('B') & df.Price.gt(68)), 'Price'] = 68

In [1298]:
# Create a data frame of mean prices by Pclass and Deck 
class_deck_price = pd.DataFrame(df.groupby(['Pclass', 'Deck'])
                                .Price.mean().round(2)).reset_index()

# Impute missing prices 
# Where Deck is missing we will use the mean price by Pclass only
for index, row in df.loc[df.Price.isna(), 
                         ['Pclass', 'Deck']].iterrows():
    if not pd.isna(row.Deck):
        new_price = class_deck_price.loc[
            (class_deck_price.Pclass.eq(row.Pclass) 
            & class_deck_price.Deck.eq(row.Deck)), 'Price'].mean()
    else:
        new_price = class_deck_price[
            class_deck_price.Pclass.eq(row.Pclass)].Price.mean()

    df.loc[[index], 'Price'] = new_price

In [1299]:
first_cl = {'A': [25, 30],
            'B': [35, 70],
            'C': [30, 35],
            'D': [19, 25],
            'E': [9, 19]}

second_cl = {'D': [13, 17],
             'E': [5, 9],
             'F': [9, 13]}

third_cl = {'E': [8, 9],
            'F': [9, 21],
            'G': [0, 8]}

# Create a dictionary pairing Pclass and respective price dictionary
class_dict = {1: first_cl,
              2: second_cl,
              3: third_cl}

# Impute missing Deck values 
for index, row in df.loc[df.Deck.isna(), ['Pclass', 'Price']].iterrows():
    for c, d in class_dict.items():
        if row.Pclass == c:
            for i, j in d.items():
                if max(j) > row.Price >= min(j):
                    df.loc[[index], 'Deck'] = i

# Encode Deck with it's deck level number counting from the bottom
deck_level = {'G': 1, 'F': 2, 'E': 3, 'D': 4, 'C': 5, 'B': 6, 'A': 7}

df.Deck = df.Deck.replace(deck_level)

In [1300]:
df

Unnamed: 0,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Lastname,Title,Sex,Ticket_series,Ticket_nr,Passengers_ticket,Price,Deck,Deck_survive_ratio
0,3,"Braund, Mr. Owen Harris",22.000000,1,0,A/5 21171,7.2500,,S,Braund,Mr,male,A/5,21171,1,7.2,1,0.30
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.000000,1,0,PC 17599,71.2833,C85,C,Cumings,Mrs,female,PC,17599,2,35.6,5,0.59
2,3,"Heikkinen, Miss. Laina",26.000000,0,0,STON/O2. 3101282,7.9250,,S,Heikkinen,Miss,female,STON/O2.,3101282,1,7.9,1,0.30
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.000000,1,0,113803,53.1000,C123,S,Futrelle,Mrs,female,0,113803,2,26.6,5,0.59
4,3,"Allen, Mr. William Henry",35.000000,0,0,373450,8.0500,,S,Allen,Mr,male,0,373450,1,8.0,3,0.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,"Spector, Mr. Woolf",29.881138,0,0,A.5. 3236,8.0500,,S,Spector,Mr,male,A.5.,3236,1,8.0,3,0.30
1305,1,"Oliva y Ocana, Dona. Fermina",39.000000,0,0,PC 17758,108.9000,C105,C,Oliva y Ocana,Mr,male,PC,17758,3,36.3,5,0.59
1306,3,"Saether, Mr. Simon Sivertsen",38.500000,0,0,SOTON/O.Q. 3101262,7.2500,,S,Saether,Mr,male,SOTON/O.Q.,3101262,1,7.2,1,0.30
1307,3,"Ware, Mr. Frederick",29.881138,0,0,359309,8.0500,,S,Ware,Mr,male,0,359309,1,8.0,3,0.30


In [1301]:
deck_people = df.Deck.value_counts().sort_index()
deck_people_dic = deck_people.to_dict()
deck_people_dic

{1: 537, 2: 167, 3: 193, 4: 172, 5: 112, 6: 76, 7: 52}

In [1302]:
escape_density = {}
for i in range(1, 8):
    escape_density[i] = sum(deck_people_dic.values())
    del deck_people_dic[i]
    
escape_density

{1: 1309, 2: 772, 3: 605, 4: 412, 5: 240, 6: 128, 7: 52}

In [1303]:
# Create Escape_density column
df['Escape_density'] = df.Deck.replace(escape_density)

In [1304]:
df['Family_size'] = 1 + df.SibSp + df.Parch

In [1305]:
X = df[:X_max_index].copy()
test_df = df[X_max_index:].copy()
full_df = pd.concat([X, y], axis=1).copy()

# Check for families that has survivers and create a dictionary with mean value of their family survivability
family_survivers = full_df[['Lastname', 'Survived']].groupby('Lastname').mean().round(2).reset_index()
family_survivers_dict = dict(zip(family_survivers.Lastname, family_survivers.Survived))

# Reduce the dictionary to the list of families that are both in train and test data
common_survivers = {}
for lastname, survived in family_survivers_dict.items():
    if lastname in list(test_df['Lastname'].unique()):
        common_survivers[lastname] = survived

# Create Family_survivers feature
df['Family_survivers'] = df.Lastname.map(common_survivers)

# For the families that are not present in both train and test we will impute the overall mean value
df.Family_survivers = df.Family_survivers.fillna(df.Family_survivers.mean())

In [1306]:
df.Pclass = df.Pclass.astype('object')

In [1307]:
col_drop = ['Name', 'Ticket', 'Ticket_nr', 'Ticket_series', 
            'Fare', 'Cabin', 'Lastname', 'Passengers_ticket']
df = df.drop(col_drop, axis=1)

In [1308]:
# List of categorical columns
categ_cols = list(df.select_dtypes(exclude=[np.number]).columns)

# Impute categoricals with most frequent value
cat_imputer = SimpleImputer(strategy='most_frequent')

df_cat = pd.DataFrame(cat_imputer.fit_transform(df[categ_cols]), 
                      columns=df[categ_cols].columns)

# Encode categorical
df_cat = pd.get_dummies(df_cat)

In [1309]:
# List of numerical columns
num_cols = list(df.select_dtypes([np.number]).columns)

# Impute numericals
it_imp = IterativeImputer()

df_num = pd.DataFrame(it_imp.fit_transform(df[num_cols]),
                      columns=df[num_cols].columns)

# Concatenate with encoded categorical columns
df = pd.concat([df_cat, df_num], axis=1)

In [1310]:
df['Age_group'] = pd.cut(x=df.Age, labels=[5, 1, 4, 3, 2],
                         bins=[-1, 15, 33, 45, 60, df.Age.max()]).astype('float')

In [1311]:
df['Family_group'] = pd.cut(x=df.Family_size, labels=[1, 3, 2], 
                            bins=[-1, 1, 4, df.Family_size.max()]).astype('float')

In [1312]:
df['Lucky_family'] = pd.cut(x=df.Family_survivers, labels=[2, 3, 1, 4],
                            bins=[-1, 0.22, 0.35, 0.49, df.Family_survivers.max()]).astype('float')

In [1313]:
df

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,...,Parch,Price,Deck,Deck_survive_ratio,Escape_density,Family_size,Family_survivers,Age_group,Family_group,Lucky_family
0,False,False,True,False,False,True,False,False,True,False,...,0.0,7.2,1.0,0.30,1309.0,2.0,0.449863,1.0,3.0,1.0
1,True,False,False,True,False,False,False,False,False,True,...,0.0,35.6,5.0,0.59,240.0,2.0,1.000000,4.0,3.0,4.0
2,False,False,True,False,False,True,False,True,False,False,...,0.0,7.9,1.0,0.30,1309.0,1.0,0.449863,1.0,1.0,1.0
3,True,False,False,False,False,True,False,False,False,True,...,0.0,26.6,5.0,0.59,240.0,2.0,0.449863,4.0,3.0,1.0
4,False,False,True,False,False,True,False,False,True,False,...,0.0,8.0,3.0,0.30,605.0,1.0,0.449863,4.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,False,False,True,False,False,True,False,False,True,False,...,0.0,8.0,3.0,0.30,605.0,1.0,0.449863,1.0,1.0,1.0
1305,True,False,False,True,False,False,False,False,True,False,...,0.0,36.3,5.0,0.59,240.0,1.0,0.449863,4.0,1.0,1.0
1306,False,False,True,False,False,True,False,False,True,False,...,0.0,7.2,1.0,0.30,1309.0,1.0,0.449863,4.0,1.0,1.0
1307,False,False,True,False,False,True,False,False,True,False,...,0.0,8.0,3.0,0.30,605.0,1.0,0.449863,1.0,1.0,1.0


In [1314]:
df.Price = df.Price.apply(np.log1p)

# Standardize 
std_scaler = StandardScaler()

df_scaled = std_scaler.fit_transform(df)
df = pd.DataFrame(df_scaled, columns=df.columns)

In [1315]:
df.drop(["Family_survivers"], axis=1, inplace = True)

In [1316]:
X = df.iloc[:X_max_index]
test_df = df.iloc[X_max_index:]


X_train, X_test, y_train, y_test = train_test_split(X, train_df["Survived"], test_size=0.2)

In [1317]:
# # Define model
'''
cat_model = CatBoostClassifier()

# # Define parameters' grid
grid = {'verbose': [False],
         'thread_count': [-1],
         'depth': [3, 4, 5, 6],
         'iterations': [500, 1000, 2000, 3000],
         'learning_rate': [0.0001, 0.001, 0.01]
        }

# # Define GridSearchCV
grid_cat = GridSearchCV(estimator=cat_model, param_grid=grid, cv=3, n_jobs=-1)
grid_cat.fit(X,y)
catboost_params = grid_cat.best_params_

print('\n Best Score:\n', grid_cat.best_score_)
print('\n Best parameters:\n', catboost_params)
'''

"\ncat_model = CatBoostClassifier()\n\n# # Define parameters' grid\ngrid = {'verbose': [False],\n         'thread_count': [-1],\n         'depth': [3, 4, 5, 6],\n         'iterations': [500, 1000, 2000, 3000],\n         'learning_rate': [0.0001, 0.001, 0.01]\n        }\n\n# # Define GridSearchCV\ngrid_cat = GridSearchCV(estimator=cat_model, param_grid=grid, cv=3, n_jobs=-1)\ngrid_cat.fit(X,y)\ncatboost_params = grid_cat.best_params_\n\nprint('\n Best Score:\n', grid_cat.best_score_)\nprint('\n Best parameters:\n', catboost_params)\n"

In [1318]:


catboost_params = {
    'verbose': False,
    'thread_count': -1,
    'depth': 4, 
    'iterations': 1000, 
    'learning_rate': 0.0005,
}


# Perform cross-validation with CatBoost
num_folds = 5
kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=random_id)

best_test_accuracy = 0
best_test_predictions = []

for train_index, test_index in kf.split(X_train, y_train):
    X_train_fold, X_valid_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_valid_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    # Create CatBoost datasets
    train_data_fold = catboost.Pool(X_train_fold, label=y_train_fold)
    valid_data_fold = catboost.Pool(X_valid_fold, label=y_valid_fold)

    # Train the CatBoost model
    catboost_model = CatBoostClassifier(**catboost_params)
    catboost_model.fit(train_data_fold, eval_set=valid_data_fold, early_stopping_rounds=10)

    # Make predictions on the validation set
    valid_predictions_fold = catboost_model.predict(X_valid_fold)

    # Calculate accuracy on the validation set
    valid_accuracy_fold = accuracy_score(valid_predictions_fold, y_valid_fold)

    # If the current model has a higher accuracy on the validation set, update the best model
    if valid_accuracy_fold > best_test_accuracy:
        best_test_accuracy = valid_accuracy_fold
        best_test_predictions = catboost_model.predict(X_test)

# Print best testing accuracy
print(f'Best Testing Accuracy: {best_test_accuracy}')

Best Testing Accuracy: 0.8309859154929577


In [1319]:
predictions = catboost_model.predict(test_df)
output = pd.DataFrame({'PassengerId': test_pass_id,
                       'Survived': predictions})
output.set_index("PassengerId", inplace=True)

In [1320]:
actual_data = pd.read_csv("test_results.csv",index_col="PassengerId")
actual_data["survived"] = actual_data["survived"].astype(int)

In [1321]:
merged_df = pd.merge(actual_data, output, on='PassengerId', how ="left")
merged_df

Unnamed: 0_level_0,survived,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
892,0,0
892,0,0
893,1,1
894,0,0
895,0,0
...,...,...
1305,0,0
1306,1,0
1307,0,0
1308,0,0


In [1322]:
accuracy = accuracy_score(merged_df['survived'], merged_df['Survived'])

print(f"Accuracy Score: {accuracy}")

Accuracy Score: 0.8065326633165829


In [1323]:
#output.to_csv("submission_v2.csv")