In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from utils import display_dataset_info, evaluate_model, plot_metrics, plot_cv_accuracy, impute_mean, impute_median, impute_mode, drop_features

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import

In [2]:
# get train and test DataFrame
train_df = pd.read_csv('datasets/titanic_dataset/train.csv')
test_df = pd.read_csv('datasets/titanic_dataset/test.csv')

In [3]:
X = train_df.drop(columns=['Survived'])
y = train_df['Survived']

In [4]:
display_dataset_info(
    train_df,
    'Survived',
    features=True,
    target=True,
    null_count=True,
    class_dist=True,
    info=True,
    describe=True
)

Shape: (891, 12)

------------- [ Features ] -------------
• PassengerId
• Pclass
• Name
• Sex
• Age
• SibSp
• Parch
• Ticket
• Fare
• Cabin
• Embarked

-------------- [ Target ] --------------
• Survived

------------ [ Null Count ] ------------
             Null Count  Percentage
PassengerId           0        0.00
Survived              0        0.00
Pclass                0        0.00
Name                  0        0.00
Sex                   0        0.00
Age                 177       19.87
SibSp                 0        0.00
Parch                 0        0.00
Ticket                0        0.00
Fare                  0        0.00
Cabin               687       77.10
Embarked              2        0.22

-------- [ Class Distribution ] --------
• 0: 549 samples (61.62%)
• 1: 342 samples (38.38%)

--------------- [ Info ] ---------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
--

In [5]:
# calculate for IQR
# lower_bound=Q1-1.5*IQR
# upper_bound=Q3+1.5*IQR

In [6]:
# remove unnecessary features
features_to_remove = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train_df = drop_features(train_df, features_to_remove)

In [7]:
# encode categorical columns
categorical_cols = ['Pclass']
true_categorical_cols = ['Sex', 'Embarked']

onehot_encoder = OneHotEncoder(drop='first', sparse_output=False)
ordinal_encoder = OrdinalEncoder()

encoded_df = train_df.copy()

# Ordinal Encoder
encoded_df[categorical_cols] = ordinal_encoder.fit_transform(encoded_df[categorical_cols])

# One-Hot Encoder
onehot_encoded = pd.DataFrame(
    onehot_encoder.fit_transform(encoded_df[true_categorical_cols]),
    columns=onehot_encoder.get_feature_names_out(true_categorical_cols),
    index=encoded_df.index
)

encoded_df = pd.concat(
    [encoded_df.drop(columns=true_categorical_cols), onehot_encoded],
    axis=1)

In [8]:
# impute missing values
impute_median(train_df, 'Age')
impute_mode(train_df, 'Embarked')
display_dataset_info(train_df, null_count=True)

Shape: (891, 8)

------------ [ Null Count ] ------------
          Null Count  Percentage
Survived           0         0.0
Pclass             0         0.0
Sex                0         0.0
Age                0         0.0
SibSp              0         0.0
Parch              0         0.0
Fare               0         0.0
Embarked           0         0.0


In [15]:
print(encoded_df.head(10))

   Survived  Pclass   Age  SibSp  Parch     Fare  Sex_male  Embarked_Q  \
0         0     2.0  22.0      1      0   7.2500       1.0         0.0   
1         1     0.0  38.0      1      0  71.2833       0.0         0.0   
2         1     2.0  26.0      0      0   7.9250       0.0         0.0   
3         1     0.0  35.0      1      0  53.1000       0.0         0.0   
4         0     2.0  35.0      0      0   8.0500       1.0         0.0   
5         0     2.0   NaN      0      0   8.4583       1.0         1.0   
6         0     0.0  54.0      0      0  51.8625       1.0         0.0   
7         0     2.0   2.0      3      1  21.0750       1.0         0.0   
8         1     2.0  27.0      0      2  11.1333       0.0         0.0   
9         1     1.0  14.0      1      0  30.0708       0.0         0.0   

   Embarked_S  Embarked_nan  
0         1.0           0.0  
1         0.0           0.0  
2         1.0           0.0  
3         1.0           0.0  
4         1.0           0.0  
5    

In [10]:
# split into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression()),
])

In [14]:
print(train_df.head(10))

   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0         0       3    male  22.0      1      0   7.2500        S
1         1       1  female  38.0      1      0  71.2833        C
2         1       3  female  26.0      0      0   7.9250        S
3         1       1  female  35.0      1      0  53.1000        S
4         0       3    male  35.0      0      0   8.0500        S
5         0       3    male  28.0      0      0   8.4583        Q
6         0       1    male  54.0      0      0  51.8625        S
7         0       3    male   2.0      3      1  21.0750        S
8         1       3  female  27.0      0      2  11.1333        S
9         1       2  female  14.0      1      0  30.0708        C


In [12]:
param_grid = {
    'model__C': [0.01, 0.1, 1, 10],
    'model__penalty': ['l2'],
    'model__solver': ['lbfgs', 'newton-cg'],
}

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=1
)

grid.fit(X_train, y_train)

ValueError: 
All the 24 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 588, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ~~~~~~~~~~~~~~~~~~~~~~~~^
        cloned_transformer,
        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
        params=step_params,
        ^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\memory.py", line 326, in __call__
    return self.func(*args, **kwargs)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 921, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ~~~~~~~~^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\preprocessing\_data.py", line 894, in fit
    return self.partial_fit(X, y, sample_weight)
           ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\preprocessing\_data.py", line 930, in partial_fit
    X = validate_data(
        self,
    ...<4 lines>...
        reset=first_call,
    )
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\validation.py", line 2944, in validate_data
    out = check_array(X, input_name="X", **check_params)
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\validation.py", line 1055, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\_array_api.py", line 839, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\pandas\core\generic.py", line 2168, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: 'Newell, Miss. Marjorie'

--------------------------------------------------------------------------------
16 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 588, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ~~~~~~~~~~~~~~~~~~~~~~~~^
        cloned_transformer,
        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
        params=step_params,
        ^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\memory.py", line 326, in __call__
    return self.func(*args, **kwargs)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 921, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ~~~~~~~~^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\preprocessing\_data.py", line 894, in fit
    return self.partial_fit(X, y, sample_weight)
           ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\preprocessing\_data.py", line 930, in partial_fit
    X = validate_data(
        self,
    ...<4 lines>...
        reset=first_call,
    )
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\validation.py", line 2944, in validate_data
    out = check_array(X, input_name="X", **check_params)
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\validation.py", line 1055, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\_array_api.py", line 839, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "C:\Users\dreyyan\AppData\Local\Programs\Python\Python313\Lib\site-packages\pandas\core\generic.py", line 2168, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: 'Partner, Mr. Austen'


In [None]:
print("Best parameters:", grid.best_params_)
print("Best cross-validated accuracy:", grid.best_score_)

# extract best model
best_model = grid.best_estimator_