In [40]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [41]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [42]:
X_full = train_data.copy()
X_test_full = test_data.copy()

#Separate target from predictors
y = X_full.Survived
X_full.drop(['Survived'], axis=1, inplace=True)

1. Quick Data Exploration

In [43]:
X_full.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,2.0,20.125,0.0,0.0,7.9104
50%,446.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,3.0,38.0,1.0,0.0,31.0
max,891.0,3.0,80.0,8.0,6.0,512.3292


In [44]:
#checking for correlation between numerical variables 
cor_numeric = X_full[['Age','Pclass','SibSp', 'Parch', 'Fare']]
correlog = cor_numeric.corr()
correlog #nothing higher than 0.7 so nothing to worry

Unnamed: 0,Age,Pclass,SibSp,Parch,Fare
Age,1.0,-0.369226,-0.308247,-0.189119,0.096067
Pclass,-0.369226,1.0,0.083081,0.018443,-0.5495
SibSp,-0.308247,0.083081,1.0,0.414838,0.159651
Parch,-0.189119,0.018443,0.414838,1.0,0.216225
Fare,0.096067,-0.5495,0.159651,0.216225,1.0


In [45]:
X_full.info() # we can see that age, cabin, and embarked have missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB


In [46]:
X_full['Ticket'].value_counts() #better drop this one, high cardinality

347082      7
CA. 2343    7
1601        7
347088      6
CA 2144     6
           ..
PC 17756    1
343276      1
349223      1
392091      1
367229      1
Name: Ticket, Length: 681, dtype: int64

In [47]:
X_full['Cabin'].value_counts() #also better drop, high cardinality and too many missing values

C23 C25 C27    4
G6             4
B96 B98        4
F2             3
D              3
              ..
D56            1
B94            1
D9             1
B50            1
B4             1
Name: Cabin, Length: 147, dtype: int64

In [48]:
#Remove variables that won't be used

variables_drop = [ 'Name', 'Cabin', 'Ticket']
X_full.drop(variables_drop, axis=1, inplace=True)


In [49]:
# X and X_test should have the same columns
X_test_full.drop(variables_drop, axis=1, inplace=True)


In [50]:
X_train = X_full
X_train.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,male,22.0,1,0,7.25,S
1,2,1,female,38.0,1,0,71.2833,C
2,3,3,female,26.0,0,0,7.925,S
3,4,1,female,35.0,1,0,53.1,S
4,5,3,male,35.0,0,0,8.05,S


In [51]:
X_test = X_test_full
X_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


Data preprocessing

In [52]:
# Numerical and categorical columns shoud be treated separately
num_columns = X_train.select_dtypes(exclude='object').columns
cat_columns = X_train.select_dtypes(include='object').columns

In [53]:
num_columns

Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

In [54]:
cat_columns

Index(['Sex', 'Embarked'], dtype='object')

Defining tranformes and pipelines

In [55]:
#all numerical columns will receive the same imputer and transformers
#categorical columns sex and embarked will be added constant to missing values and onehot encoder

oneHot_cat_cols = ['Sex', 'Embarked']
transf_num_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
my_cols = (transf_num_cols + oneHot_cat_cols)

    

In [56]:
# Define transformers
# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler())])

# Preprocessing for categorical data

categorical_transformer_onehot =  Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='NA')),
    ('onehot', OneHotEncoder(handle_unknown = 'ignore', sparse = False))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, transf_num_cols),
        ('cat_onehot', categorical_transformer_onehot, oneHot_cat_cols)
    ])

In [57]:
# Copying data
X_selected_train = X_train[my_cols].copy()
X_selected_test = X_test[my_cols].copy()

Choosing a model

In [58]:
# Test different models

from xgboost import XGBRegressor

xgb_model = XGBRegressor(learning_rate = 0.05,
                            n_estimators=1000,
                            max_depth=5,
                            random_state=0)

# Create the Pipeline
# Bundle preprocessing and modeling code in a pipeline
my_pipeline_xgb = Pipeline(steps=[('preprocessor', preprocessor),
                            ('model', xgb_model)
                             ])

# Multiply by -1 since sklearn calculates *negative* MAE
scores = -1 * cross_val_score(my_pipeline_xgb, X_selected_train, y,
                              cv=5,
                              scoring='neg_mean_absolute_error')

print("MAE score:\n", scores)
print("MAE mean: {}".format(scores.mean()))

MAE score:
 [0.27959029 0.27062043 0.22936801 0.27750011 0.24477281]
MAE mean: 0.2603703295586127


In [59]:
from sklearn.svm import SVC

SVC_model = SVC(C=30, kernel = 'poly', random_state=0)

# Create the Pipeline
# Bundle preprocessing and modeling code in a pipeline
my_pipeline_SVC = Pipeline(steps=[('preprocessor', preprocessor),
                            ('model', SVC_model)
                             ])

# Multiply by -1 since sklearn calculates *negative* MAE
scores = -1 * cross_val_score(my_pipeline_SVC, X_selected_train, y,
                              cv=5,
                              scoring='neg_mean_absolute_error')

print("MAE score:\n", scores)
print("MAE mean: {}".format(scores.mean()))

MAE score:
 [0.22905028 0.20786517 0.17977528 0.20786517 0.14606742]
MAE mean: 0.1941246626074948


In [60]:
# Fit the best model
my_pipeline_SVC.fit(X_selected_train, y)

# Get predictions
preds = my_pipeline_SVC.predict(X_selected_test)

In [65]:
output = pd.DataFrame({'PassengerId': X_test.PassengerId,
                       'Survived': preds})

output.to_csv('submission.csv', index=False)

In [66]:
import os
os.chdir(r'../working')
from IPython.display import FileLink
FileLink(r'submission.csv')