# Titanic - Machine Learning from Disaster

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

## Load datasets
Load all datasets for the competition from files.<br>
"train.csv" is the training data and "test.csv" is the test data.

### load train data

In [2]:
# Read the training set, using the passenger id column as the row index.
df_train = pd.read_csv(
    "../input/titanic/train.csv",
    index_col='PassengerId',
)
df_train

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### load test data

In [3]:
# Read the test set, using the passenger id column as the row index.
df_test = pd.read_csv(
    "../input/titanic/test.csv",
    index_col='PassengerId',
)
df_test

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...
1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


### join train and test data

In [4]:
# Stack the training rows on top of the test rows; columns present in only one
# of the two frames are filled with NaN for the rows of the other frame.
frames_to_stack = [df_train, df_test]
df_full = pd.concat(frames_to_stack)
df_full

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


## analyze data
Analyze the full data to select and create useful features.

### find all missing features

In [5]:
# Count the nulls of every column once, then keep the columns with at least one.
null_counts = df_full.isnull().sum()
missing_features = [name for name in df_full.columns if null_counts[name] > 0]
# The label column is not a feature, so it is excluded from the report.
missing_features.remove('Survived')
total_rows = df_full.shape[0]
for name in missing_features:
    print(name, '---', null_counts[name], '/', total_rows)

Age --- 263 / 1309
Fare --- 1 / 1309
Cabin --- 1014 / 1309
Embarked --- 2 / 1309


### calculate cardinality of categorical features

In [6]:
# Report the number of distinct values of every object-typed column.
# dropna=False keeps NaN as a distinct value, so a column containing missing
# entries reports one more than its number of real categories.
object_columns = [name for name in df_full.columns if df_full[name].dtype == object]
for name in object_columns:
    print(name, '---', df_full[name].nunique(dropna=False))

Name --- 1307
Sex --- 2
Ticket --- 929
Cabin --- 187
Embarked --- 4


## Split data
Split the data into input features and labels, then split the training data into training and validation sets.

### Split data into input and label data

In [7]:
# The label is the 'Survived' column; the inputs are every other training column.
label_data = df_train['Survived']
input_data = df_train.drop(columns='Survived')
# Work on a copy so that later cleaning steps leave df_test untouched.
x_test = df_test.copy()
for frame in (input_data, x_test):
    print(frame.shape)

(891, 10)
(418, 10)


### Split train data into train and validation data

In [8]:
# Hold out 20% of the labelled rows for validation; the fixed seed makes the
# split reproducible.
x_train, x_valid, y_train, y_valid = train_test_split(
    input_data,
    label_data,
    test_size=0.2,
    random_state=0,
)
x_train.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
141,3,"Boulos, Mrs. Joseph (Sultana)",female,,0,2,2678,15.2458,,C
440,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31.0,0,0,C.A. 18723,10.5,,S
818,2,"Mallet, Mr. Albert",male,31.0,1,1,S.C./PARIS 2079,37.0042,,C
379,3,"Betros, Mr. Tannous",male,20.0,0,0,2648,4.0125,,C
492,3,"Windelov, Mr. Einar",male,21.0,0,0,SOTON/OQ 3101317,7.25,,S


## Data cleaning
Clean the data so that the model can be trained on complete, numeric features.

### Drop features with many missing values

In [9]:
# Remove the listed columns from every split so all three keep the same columns.
missing_features = ['Age', 'Cabin']
x_train, x_valid, x_test = [
    frame.drop(columns=missing_features) for frame in (x_train, x_valid, x_test)
]
for frame in (x_train, x_valid, x_test):
    print(frame.columns)

Index(['Pclass', 'Name', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Embarked'],
      dtype='object')
Index(['Pclass', 'Name', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Embarked'],
      dtype='object')
Index(['Pclass', 'Name', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Embarked'],
      dtype='object')


### Drop high-cardinality categorical features

In [10]:
# An object-typed column counts as high-cardinality when the training split
# holds more than 10 distinct values in it (NaN is counted as a value).
high_cardinality_features = []
for column in x_train.columns:
    values = x_train[column]
    if values.dtype == object and len(values.unique()) > 10:
        high_cardinality_features.append(column)

# The decision is made on the training split and applied to all three splits.
x_train, x_valid, x_test = [
    frame.drop(columns=high_cardinality_features) for frame in (x_train, x_valid, x_test)
]
for frame in (x_train, x_valid, x_test):
    print(frame.columns)

Index(['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')
Index(['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')
Index(['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')


### Fill missing values (Fare)

In [11]:
# find all numerical features
numerical_features = [column for column in x_train.columns if x_train[column].dtype != object]
print(numerical_features)

# fill missing values
imputer = KNNImputer(n_neighbors=5)
x_train[numerical_features] = imputer.fit_transform(x_train[numerical_features])
x_valid[numerical_features] = imputer.transform(x_valid[numerical_features])
x_test[numerical_features] = imputer.transform(x_test[numerical_features])

['Pclass', 'SibSp', 'Parch', 'Fare']


### Define one-hot encoder for categorical features

In [12]:
# find all categorical features
categorical_features = [column for column in x_train.columns if x_train[column].dtype == object]
print(categorical_features)

# encode object columns
encoder = OneHotEncoder()
encoder.fit(x_train[categorical_features])
new_feature_names = encoder.get_feature_names_out(categorical_features)
print(new_feature_names)
print(encoder.categories_)

['Sex', 'Embarked']
['Sex_female' 'Sex_male' 'Embarked_C' 'Embarked_Q' 'Embarked_S'
 'Embarked_nan']
[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S', nan], dtype=object)]


### Encode categorical features

In [13]:
def _one_hot_frame(frame):
    """Return the one-hot encoding of `frame`'s categorical columns.

    The fitted `encoder` transforms the columns listed in
    `categorical_features`; the sparse result is densified and wrapped in a
    DataFrame that keeps `frame`'s index and uses `new_feature_names` as
    column labels.
    """
    dense = encoder.transform(frame[categorical_features]).toarray()
    return pd.DataFrame(dense, columns=new_feature_names, index=frame.index)


x_train_encoded = _one_hot_frame(x_train)
x_valid_encoded = _one_hot_frame(x_valid)
x_test_encoded = _one_hot_frame(x_test)
x_train_encoded.head()

Unnamed: 0_level_0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
141,1.0,0.0,1.0,0.0,0.0,0.0
440,0.0,1.0,0.0,0.0,1.0,0.0
818,0.0,1.0,1.0,0.0,0.0,0.0
379,0.0,1.0,1.0,0.0,0.0,0.0
492,0.0,1.0,0.0,0.0,1.0,0.0


### Replace categorical features with encoded values

In [14]:
# Swap the raw categorical columns for their one-hot encoded counterparts;
# the join aligns the encoded frame to each split on the shared index.
x_train = x_train.drop(columns=categorical_features).join(x_train_encoded)
x_valid = x_valid.drop(columns=categorical_features).join(x_valid_encoded)
x_test = x_test.drop(columns=categorical_features).join(x_test_encoded)

x_train.head()

Unnamed: 0_level_0,Pclass,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
141,3.0,0.0,2.0,15.2458,1.0,0.0,1.0,0.0,0.0,0.0
440,2.0,0.0,0.0,10.5,0.0,1.0,0.0,0.0,1.0,0.0
818,2.0,1.0,1.0,37.0042,0.0,1.0,1.0,0.0,0.0,0.0
379,3.0,0.0,0.0,4.0125,0.0,1.0,1.0,0.0,0.0,0.0
492,3.0,0.0,0.0,7.25,0.0,1.0,0.0,0.0,1.0,0.0


## Train and test model
Create a machine learning model, train it on the training data, and evaluate it on the validation data.

### Create a model

In [15]:
# Random forest with 150 trees; the fixed seed keeps training reproducible.
model = RandomForestClassifier(
    n_estimators=150,
    random_state=0,
)

### train model

In [16]:
# Fit on the training split only; the validation split stays unseen.
model.fit(X=x_train, y=y_train)

RandomForestClassifier(n_estimators=150, random_state=0)

### evaluate model

In [17]:
# Accuracy of the fitted model on the held-out validation split.
y_predict = model.predict(x_valid)
score = accuracy_score(y_true=y_valid, y_pred=y_predict)
print(score)

0.8435754189944135


## Submit result

In [18]:
# Predict the test set and write a two-column submission file: the passenger
# id (taken from the frame's index) and the predicted label.
y_test = model.predict(x_test)
submission_columns = {
    'PassengerId': x_test.index,
    'Survived': y_test,
}
result_data = pd.DataFrame(submission_columns)
result_data.to_csv("submission.csv", index=False)