In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform

In [2]:
df = pd.read_csv("titanic.csv")
df = df.drop(columns=['PassengerId', 'Ticket', 'Name', 'Cabin']).dropna().reset_index(drop=True)
df.head(4)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S


In [3]:
for col in ['Age', 'Fare']:
    scaler = StandardScaler()
    scaler.fit(df[col].values.reshape(-1,1))
    df[col] = scaler.transform(df[col].values.reshape(-1,1))

for col in ['Sex', 'Embarked']:
    encoder = OneHotEncoder(handle_unknown='ignore')
    df[col] = encoder.fit_transform(df[col].values.reshape(-1,1)).toarray()

df.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0.0,-0.527669,1,0,-0.51638,0.0
1,1,1,1.0,0.577094,1,0,0.694046,1.0
2,1,3,1.0,-0.251478,0,0,-0.50362,0.0
3,1,1,1.0,0.369951,1,0,0.350326,0.0
4,0,3,0.0,0.369951,0,0,-0.501257,0.0
5,0,1,0.0,1.681856,0,0,0.326933,0.0
6,0,3,0.0,-1.908622,3,1,-0.255045,0.0
7,1,3,1.0,-0.18243,0,2,-0.442974,0.0
8,1,2,1.0,-1.08005,1,0,-0.084997,1.0
9,1,3,1.0,-1.770526,1,1,-0.337746,0.0


In [4]:
X = df.drop(columns=['Survived'])
y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    train_size=0.67, 
                                                    random_state=42)

In [5]:
def print_checks(model, name, X_train, y_train, X_test,  y_test):
    for X, y, label in zip([X_train, X_test], [y_train, y_test], ['train', 'test']):
        pred = model.predict(X)
        print(f'{name} {label}')
        print(f'MSE = {mean_squared_error(y,pred):.2f}')
        print(f'MAE = {mean_absolute_error(y,pred):.2f}')
        print(f'R2 = {r2_score(y,pred):.2f}')
        print()


knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
print_checks(knn, 'knn', X_train, y_train, X_test,  y_test)


lg = LogisticRegression(random_state=0)
lg.fit(X_train, y_train)
print_checks(lg, 'lg', X_train, y_train, X_test,  y_test)


knn train
MSE = 0.10
MAE = 0.21
R2 = 0.58

knn test
MSE = 0.18
MAE = 0.28
R2 = 0.26

lg train
MSE = 0.18
MAE = 0.18
R2 = 0.25

lg test
MSE = 0.22
MAE = 0.22
R2 = 0.11



In [6]:
params = {
    "n_neighbors": range(1, 40),
    "weights": ['uniform', 'distance'],
    "p": [1, 2]
}
knn=KNeighborsRegressor()
cv = RandomizedSearchCV(knn, params, n_jobs=-1, cv=100)
cv.fit(X_train, y_train)
print(cv.best_params_)

knn = KNeighborsRegressor(**cv.best_params_)
knn.fit(X_train, y_train)
print_checks(knn, 'knn', X_train, y_train, X_test,  y_test)

{'weights': 'uniform', 'p': 1, 'n_neighbors': 19}
knn train
MSE = 0.12
MAE = 0.25
R2 = 0.51

knn test
MSE = 0.17
MAE = 0.31
R2 = 0.31



In [11]:
params = {
    "solver": [ 'liblinear'],
    "penalty": ['l1', 'l2', 'elasticnet'],
    "C": loguniform(1e-5, 100)
}

lg = LogisticRegression(random_state=0)
cv = RandomizedSearchCV(lg, params, n_jobs=-1, cv=10)
cv.fit(X_train, y_train)
print(cv.best_params_)

lg = LogisticRegression(**cv.best_params_)
lg.fit(X_train, y_train)
print_checks(lg, 'lg', X_train, y_train, X_test,  y_test)

{'C': 22.36981313962816, 'penalty': 'l2', 'solver': 'liblinear'}
lg train
MSE = 0.18
MAE = 0.18
R2 = 0.25

lg test
MSE = 0.23
MAE = 0.23
R2 = 0.04



30 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/jupyter/lib64/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/jupyter/lib64/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/opt/jupyter/lib64/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 71, in _check_solver
    raise ValueError(
ValueError: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.

 0.60376773 0.81117021 0.78