In [311]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import optuna as opt

1. Create a model that predicts the age of a passenger on the Titanic based on the features in the dataset.
We start by creating DataFrames from the CSV files.

In [312]:
# Load the training data from CSV (path is relative to the notebook's working directory).
df_train = pd.read_csv("../data/train_data.csv")

In [313]:
# Load the test data from CSV (path is relative to the notebook's working directory).
df_test = pd.read_csv("../data/test_data.csv")

Exploring the data:
We assume that test_data is clean, so train_data has to be cleaned until it has the same structure as test_data.

In [314]:
# Inspect column dtypes and non-null counts of the raw training data.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1493 entries, 0 to 1492
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1493 non-null   int64  
 1   Survived     1493 non-null   int64  
 2   Pclass       1493 non-null   int64  
 3   Sex          1493 non-null   int64  
 4   SibSp        1493 non-null   int64  
 5   Parch        1493 non-null   int64  
 6   Ticket       1493 non-null   object 
 7   Fare         1493 non-null   object 
 8   Cabin        1493 non-null   int64  
 9   Embarked     1493 non-null   int64  
 10  Age          1488 non-null   float64
dtypes: float64(1), int64(8), object(2)
memory usage: 128.4+ KB


In [315]:
# Summary statistics of the numeric columns, used to spot implausible values.
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Cabin,Embarked,Age
count,1493.0,1493.0,1493.0,1493.0,1493.0,1493.0,1493.0,1493.0,1488.0
mean,597.697254,0.327528,2.261219,0.690556,0.184863,0.178835,5.994642,1.834561,215117.4
std,251.365652,0.469469,0.628908,0.462419,0.594308,0.592203,24.214966,0.538178,8295612.0
min,1.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,-32000.0
25%,377.0,0.0,2.0,0.0,0.0,0.0,-1.0,2.0,-35.0
50%,758.0,0.0,2.0,1.0,0.0,0.0,-1.0,2.0,16.0
75%,813.0,1.0,3.0,1.0,0.0,0.0,-1.0,2.0,24.0
max,891.0,1.0,3.0,1.0,5.0,6.0,133.0,2.0,320000000.0


# Removing implausible ages

In [316]:
# Keep only rows whose Age is strictly greater than 0 and strictly less than 110.
# Rows with a missing Age fail both comparisons, so they are dropped here as well.
df_train = df_train[(df_train["Age"] > 0) & (df_train["Age"] < 110)]

In [317]:
# Inspect column dtypes and non-null counts of the test data for comparison with the training data.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143 entries, 0 to 142
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  143 non-null    int64  
 1   Survived     143 non-null    int64  
 2   Pclass       143 non-null    int64  
 3   Sex          143 non-null    int64  
 4   SibSp        143 non-null    int64  
 5   Parch        143 non-null    int64  
 6   Ticket       143 non-null    int64  
 7   Fare         143 non-null    float64
 8   Cabin        143 non-null    int64  
 9   Embarked     143 non-null    int64  
 10  Age          143 non-null    float64
dtypes: float64(2), int64(9)
memory usage: 12.4 KB


In [318]:
# Summary statistics again, now that the Age filter has been applied.
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Cabin,Embarked,Age
count,803.0,803.0,803.0,803.0,803.0,803.0,803.0,803.0,803.0
mean,427.211706,0.58655,2.483188,0.443337,0.331258,0.315068,11.158157,1.714819,27.523873
std,217.210983,0.492759,0.774052,0.497089,0.771924,0.756755,30.982114,0.680778,12.739607
min,1.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,0.42
25%,327.5,0.0,2.0,0.0,0.0,0.0,-1.0,2.0,22.0
50%,377.0,1.0,3.0,0.0,0.0,0.0,-1.0,2.0,22.0
75%,582.5,1.0,3.0,1.0,0.0,0.0,-1.0,2.0,33.0
max,891.0,1.0,3.0,1.0,5.0,6.0,133.0,2.0,80.0


## Feature engineering: remove NaN and strings from Fare. 

In [319]:
# Drop every row that contains a missing value, then drop exact duplicate rows.
# NOTE(review): the original (unfinished) note here said the intent was to drop rows
# only for missing Age and fill the other columns instead - confirm which is wanted.
df_train = df_train.dropna().drop_duplicates()

## Encoding categorical columns

In [320]:
# Replace the Ticket values by the integer codes of their pandas categories.
# NOTE(review): Ticket in the test set is already int64; whether these codes match
# the encoding used there cannot be verified from this notebook - confirm.
df_train["Ticket"]=df_train["Ticket"].astype("category").cat.codes

In [321]:
# Helper for cleaning numeric columns stored as text: convert a value to float,
# mapping anything that cannot be converted to NaN.
def toFloat(x):
    """Convert ``x`` to a float, returning NaN when the conversion fails.

    Parameters
    ----------
    x : object
        Value to convert, e.g. a number stored as a string.

    Returns
    -------
    float
        ``float(x)`` on success, otherwise ``np.nan``.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures become NaN. A bare ``except`` would also
        # swallow KeyboardInterrupt/SystemExit and hide unrelated errors.
        return np.nan

In [322]:
# Convert Fare element-wise with toFloat; entries that cannot be converted become NaN.
df_train["Fare"] = df_train["Fare"].apply(toFloat)

In [323]:
# Fill the missing Fare values with the mean of the non-missing Fare values.
df_train["Fare"] = df_train["Fare"].fillna(df_train["Fare"].mean())

In [324]:
# Age is the regression target; every other column is used as a feature.
y_train = df_train["Age"]
X_train = df_train.drop(columns=["Age"])

In [325]:
# Check dtypes and non-null counts of the cleaned training data.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 541 entries, 0 to 1492
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  541 non-null    int64  
 1   Survived     541 non-null    int64  
 2   Pclass       541 non-null    int64  
 3   Sex          541 non-null    int64  
 4   SibSp        541 non-null    int64  
 5   Parch        541 non-null    int64  
 6   Ticket       541 non-null    int16  
 7   Fare         541 non-null    float64
 8   Cabin        541 non-null    int64  
 9   Embarked     541 non-null    int64  
 10  Age          541 non-null    float64
dtypes: float64(2), int16(1), int64(8)
memory usage: 47.5 KB


In [326]:
# Create the baseline model: an XGBoost regressor with default hyperparameters.
model = xgb.XGBRegressor()

In [327]:
# Train the baseline model on the training set (X_train, y_train) by calling .fit().
model.fit(X_train, y_train)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [328]:
# Same split for the test data: Age is the target, all other columns are features.
y_test = df_test["Age"]
X_test = df_test.drop(columns=["Age"])

In [329]:
# Predict ages for the test features with the baseline model.
preds = model.predict(X_test)

In [330]:
# Mean squared error between the true test ages and the baseline predictions
# (a measure of prediction quality; lower is better).
mse = mean_squared_error(y_true=y_test, y_pred=preds)

In [331]:
# Display the baseline mean squared error.
mse

155.10307353681102

Root mean squared error (RMSE):

In [332]:
# Square root of the MSE, i.e. the error expressed in the same unit as Age.
np.sqrt(mse)

12.454038442883137

# Hyperparameter tuning

In [333]:
# Candidate values per hyperparameter; the randomized search below samples
# combinations from these lists.
params = {
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight": [1, 3, 5, 7],
    "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7],
    "n_estimators": [100, 200, 300, 400, 500, 900, 1100, 1500],
}

In [334]:
# Randomized hyperparameter search over `params`: 110 sampled candidates, each
# scored by 5-fold cross-validated negative MSE, run on all available CPU cores.
# random_state is fixed so the same candidates are sampled on every run; without
# it the selected hyperparameters (and the reported error) change between runs.
model2 = xgb.XGBRegressor()
random_search = RandomizedSearchCV(
    model2,
    param_distributions=params,
    n_iter=110,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    cv=5,
    random_state=42,
)

In [335]:
# Run the randomized hyperparameter search on the training data.
random_search.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Use the fitted search object itself as the tuned model.
# NOTE(review): presumably .predict() on it delegates to the best estimator refitted
# on the full training set (scikit-learn's default refit=True) - confirm.
model_new = random_search

In [None]:
# Predict ages for the test features with the tuned model.
preds2 = model_new.predict(X_test)

In [None]:
# Mean squared error between the true test ages and the tuned model's predictions.
mse_new = mean_squared_error(y_true=y_test, y_pred=preds2)

mse_new

130.22233915093219

## Root mean squared error (RMSE)

In [None]:
# Square root of the tuned model's MSE, for comparison with the baseline RMSE.
np.sqrt(mse_new)

11.41150030236744