# Titanic ML competition

https://www.kaggle.com/competitions/titanic/

In [84]:
import pandas as pd
from pathlib import Path

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [85]:
# Read the Kaggle training set from the working directory, then print a
# per-column summary (dtype and non-null count).
train_csv = Path('train.csv')
train = pd.read_csv(train_csv)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [86]:
# Preview the first three rows to sanity-check how the columns were parsed
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


**Data Definitions**

- `Survived`: Survival — 0 = No, 1 = Yes
- `Pclass`: Ticket class — 1 = 1st, 2 = 2nd, 3 = 3rd
- `Sex`: Sex
- `Age`: Age in years
- `SibSp`: # of siblings / spouses aboard the Titanic
- `Parch`: # of parents / children aboard the Titanic
- `Ticket`: Ticket number
- `Fare`: Passenger fare
- `Cabin`: Cabin number
- `Embarked`: Port of embarkation — C = Cherbourg, Q = Queenstown, S = Southampton

In [87]:
# Target labels: the 'Survived' column of the training frame
y = train['Survived']

### Baseline Model

We'll start with just a few features for the baseline model.

In [88]:
# Candidate features for the baseline. They live under their own name because
# this cell only inspects missingness: it deliberately does not assign
# `features` / `num_features` / `cat_features`, so the modelling variables
# are defined in exactly one place and cannot be left holding this
# exploratory (null-containing) selection.
candidate_features = ["Pclass", "Sex", "SibSp", "Parch", "Age"]

# Fraction of missing values per candidate column
train[candidate_features].isnull().mean()

Pclass    0.000000
Sex       0.000000
SibSp     0.000000
Parch     0.000000
Age       0.198653
dtype: float64

Rather than dealing with the Age nulls now, let's choose a simpler feature set for the baseline model and address the nulls later.

In [89]:
# Baseline feature set: three columns, none of them declared numeric,
# so every one ends up in the categorical list.
features = ["Pclass", "Sex", "SibSp"]
num_features = []
cat_features = [col for col in features if col not in num_features]

# Fraction of missing values per selected column
train[features].isna().mean()

Pclass    0.0
Sex       0.0
SibSp     0.0
dtype: float64

In [90]:
# Train/test split: hold out 20% of the rows for evaluation; the fixed
# random_state keeps the split reproducible across runs.
X = train[features]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [91]:
# Preprocessing: one-hot encode the categorical columns and standardise the
# numeric ones.
# handle_unknown='ignore' makes the encoder emit an all-zero row for a
# category value it did not see during fit instead of raising at predict
# time; a random 80% split is not guaranteed to contain every rare value
# (e.g. an uncommon SibSp count) that later shows up in the hold-out split
# or in the competition test set.
ohe = ColumnTransformer([
    ('ohe_features', OneHotEncoder(handle_unknown='ignore'), cat_features),
    ('scaled_num', StandardScaler(), num_features)
])

# Full model: preprocessing followed by a default logistic regression
lr_pipe = Pipeline([('ohe', ohe),
                    ('lr', LogisticRegression())])

lr_pipe.fit(X_train, y_train)

# Evaluate on the held-out split
predictions = lr_pipe.predict(X_test)

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.80      0.86      0.83       105
           1       0.78      0.70      0.74        74

    accuracy                           0.79       179
   macro avg       0.79      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179



In [92]:
# Score the competition test set with the fitted pipeline and write the
# predictions as a two-column CSV (PassengerId, Survived).
test_csv = Path('test.csv')
test = pd.read_csv(test_csv)
predictions = lr_pipe.predict(test[features])

submission_columns = {
    'PassengerId': test['PassengerId'],
    'Survived': predictions,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)