**Consegna**: creare un modello di Machine Learning che preveda quali passeggeri siano sopravvissuti al naufragio del Titanic.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
import torch
# Load the training data; every missing value (in any column) is replaced by 0.
data = pd.read_csv('train.csv')
data = data.fillna(0)

# Columns to one-hot encode. 'Fare' was previously listed here, but it is a
# continuous amount (assuming the standard Kaggle Titanic schema): encoding it
# created one dummy column per distinct fare and threw away its ordering, so it
# is now kept as a plain numeric feature.
selected_features = ['Sex', 'Ticket', 'Cabin', 'Embarked']

# Convert the categorical variables to numbers using one-hot encoding.
X = pd.get_dummies(data, columns=selected_features, drop_first=True)
# 'Name' is a text column that is not encoded above, so it is removed.
X = X.drop(columns=['Name'])

**Task 1**: dopo essersi opportunamente ricavati X e y, creare una baseline.

In [None]:
# Target vector. It is read from `data` (the frame X was derived from) so that
# this cell can be re-run: once 'Survived' has been dropped from X, reading it
# from X again would raise a KeyError.
y = data['Survived']
# errors='ignore' makes the drop a no-op when the column is already gone.
X = X.drop(columns=['Survived'], errors='ignore')

In [None]:
# Hold out 30% of the rows for testing. The original line was missing the comma
# between X_test and y_train (a SyntaxError). random_state pins the shuffle so
# that every later cell sees the same split after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline: a random forest with default hyper-parameters. The seed makes the
# bootstrap / feature sampling repeatable, so the baseline score is stable
# across runs.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

In [None]:
# Score the baseline forest on the training split and on the held-out test
# split: predict both partitions first, then compute and report the accuracies.
y_train_pred, y_test_pred = (
    rf_classifier.predict(features) for features in (X_train, X_test)
)
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)

print(f'Accuracy on Training Set: {accuracy_train:.2f}')
print(f'Accuracy on Test Set: {accuracy_test:.2f}')

Accuracy on Training Set: 1.00
Accuracy on Validation Set: 0.81
Accuracy on Test Set: 0.81


**Task 2**: eseguire Hyperparameter Tuning con K-Fold Cross Validation sullo stesso modello usato nella baseline.

In [None]:
# Exhaustive search over tree depth and per-split feature count, scored with
# 5-fold cross-validation on the training split only.
param_grid = {'max_depth': [None, 3, 5, 10], 'max_features': ['sqrt', 'log2', None]}
# The forest is seeded so that differences between grid points come from the
# hyper-parameters rather than from run-to-run sampling noise.
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Keep the winning values under explicit names for the refit in the next cell.
best_max_depth = grid_search.best_params_['max_depth']
best_max_features = grid_search.best_params_['max_features']

In [None]:
# Display the (max_depth, max_features) pair selected by the grid search.
best_max_depth, best_max_features

(None, 'sqrt')

In [None]:
# Refit a forest on the whole training split with the tuned hyper-parameters.
# Seeded so the reported accuracies are reproducible.
rf_classifier = RandomForestClassifier(
    max_depth=best_max_depth,
    max_features=best_max_features,
    random_state=42,
)
rf_classifier.fit(X_train, y_train)

In [None]:
# Accuracy of the tuned forest: first on the training split, then on the test split.
y_train_pred = rf_classifier.predict(X_train)
y_test_pred = rf_classifier.predict(X_test)

accuracy_train, accuracy_test = (
    accuracy_score(truth, guess)
    for truth, guess in ((y_train, y_train_pred), (y_test, y_test_pred))
)

print(f'Accuracy on Training Set: {accuracy_train:.2f}')
print(f'Accuracy on Test Set: {accuracy_test:.2f}')

Accuracy on Training Set: 1.00
Accuracy on Validation Set: 0.79
Accuracy on Test Set: 0.81


**Task 3**: testare almeno altri 4 modelli ed eseguire l'Hyperparameter Tuning con K-Fold Cross Validation su ognuno di essi. Attenzione alla scelta di quali parametri tunare ed a quali e quanti valori si scelgono.

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Grid over the shrinkage rate and the fraction of rows sampled per boosting
# stage, scored with 5-fold cross-validation on the training split.
param_grid = {'learning_rate': [0.001, 0.01, 0.1], 'subsample': [0.5, 1]}
# subsample=0.5 draws a random half of the rows at every stage, so the
# estimator is seeded to keep the comparison between grid points repeatable.
grid_search = GridSearchCV(GradientBoostingClassifier(random_state=42), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Winning values, reused when the final model is built.
best_learning_rate = grid_search.best_params_['learning_rate']
best_subsample = grid_search.best_params_['subsample']

In [None]:
# Final gradient-boosting model with the tuned hyper-parameters, fitted on the
# training split. Seeded so that refits give the same model.
model1 = GradientBoostingClassifier(
    learning_rate=best_learning_rate,
    subsample=best_subsample,
    random_state=42,
)
model1.fit(X_train, y_train)

In [None]:
# Report how model1 performs on the training split and on the test split.
y_train_pred, y_test_pred = (
    model1.predict(features) for features in (X_train, X_test)
)
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)

print(f'Accuracy on Training Set: {accuracy_train:.2f}')
print(f'Accuracy on Test Set: {accuracy_test:.2f}')

Accuracy on Training Set: 0.83
Accuracy on Validation Set: 0.80
Accuracy on Test Set: 0.80


In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Grid over the learning rate and the boosting variant, scored with 5-fold
# cross-validation on the training split.
# NOTE(review): recent scikit-learn releases deprecate/remove 'SAMME.R' -
# confirm against the installed version before re-running.
param_grid = {'learning_rate': [0.001, 0.01, 0.1], 'algorithm': ['SAMME', 'SAMME.R']}
# Seed the booster so repeated searches rank the grid points identically.
grid_search = GridSearchCV(AdaBoostClassifier(random_state=42), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Winning values, reused when the final model is built.
best_learning_rate = grid_search.best_params_['learning_rate']
best_algorithm = grid_search.best_params_['algorithm']

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# Final AdaBoost model with the tuned hyper-parameters, fitted on the training
# split. Seeded for reproducible results.
model2 = AdaBoostClassifier(
    learning_rate=best_learning_rate,
    algorithm=best_algorithm,
    random_state=42,
)
model2.fit(X_train, y_train)

In [None]:
# Accuracy of model2 on the training split and on the test split.
y_train_pred = model2.predict(X_train)
y_test_pred = model2.predict(X_test)

accuracy_train, accuracy_test = (
    accuracy_score(truth, guess)
    for truth, guess in ((y_train, y_train_pred), (y_test, y_test_pred))
)

print(f'Accuracy on Training Set: {accuracy_train:.2f}')
print(f'Accuracy on Test Set: {accuracy_test:.2f}')

Accuracy on Training Set: 0.81
Accuracy on Validation Set: 0.81
Accuracy on Test Set: 0.78


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-fold cross-validated grid search for k-nearest-neighbours on the training split.
# NOTE(review): per the scikit-learn docs leaf_size tunes the speed/memory of
# the neighbour-search tree; it presumably does not change predictions, so a
# parameter such as `weights` or `p` may be a more useful axis - confirm.
param_grid = {
    'n_neighbors': [3, 5, 7],
    'leaf_size': [15, 30, 40],
}
grid_search = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Pull both winning values out of the fitted search in one step.
best_n_neighbors, best_leaf_size = (
    grid_search.best_params_[name] for name in ('n_neighbors', 'leaf_size')
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Final k-NN model with the tuned hyper-parameters.
model3 = KNeighborsClassifier(n_neighbors=best_n_neighbors, leaf_size=best_leaf_size)
# Fit on the training split only. The original call used the full (X, y), which
# includes the test rows: the model had then already seen every sample it was
# later evaluated on, so its test accuracy was not a held-out estimate.
model3.fit(X_train, y_train)

In [None]:
# Report model3's accuracy on the training split and on the test split.
y_train_pred, y_test_pred = (
    model3.predict(features) for features in (X_train, X_test)
)
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)

print(f'Accuracy on Training Set: {accuracy_train:.2f}')
print(f'Accuracy on Test Set: {accuracy_test:.2f}')

Accuracy on Training Set: 0.69
Accuracy on Validation Set: 0.57
Accuracy on Test Set: 0.70


In [None]:
from sklearn.linear_model import LogisticRegression

# Grid over the inverse regularisation strength C, scored with 5-fold
# cross-validation on the training split.
# 'elasticnet' was removed from the penalty list: the default solver does not
# support it (it needs solver='saga' plus an l1_ratio), so every one of those
# fits failed and only wasted search time. 'penalty' stays in the grid so that
# best_penalty is still defined for the cells that read it.
param_grid = {'penalty': ['l2'], 'C': [0.001, 0.01, 0.1]}
# max_iter is raised from the default because the recorded run stopped at the
# iteration limit on these unscaled features. Increase it further (or scale
# the features) if the convergence warning still appears.
grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Winning values, reused when the final model is built.
best_penalty = grid_search.best_params_['penalty']
best_c = grid_search.best_params_['C']

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
from sklearn.linear_model import LogisticRegression
# Final logistic-regression model with the tuned hyper-parameters, fitted on the
# training split. max_iter is raised because the recorded run stopped at the
# default iteration limit before converging; increase it further (or scale the
# features) if the warning persists.
model4 = LogisticRegression(penalty=best_penalty, C=best_c, max_iter=1000)
model4.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Accuracy of model4 on the training split and on the test split.
y_train_pred = model4.predict(X_train)
y_test_pred = model4.predict(X_test)

accuracy_train, accuracy_test = (
    accuracy_score(truth, guess)
    for truth, guess in ((y_train, y_train_pred), (y_test, y_test_pred))
)

print(f'Accuracy on Training Set: {accuracy_train:.2f}')
print(f'Accuracy on Test Set: {accuracy_test:.2f}')

Accuracy on Training Set: 0.83
Accuracy on Validation Set: 0.81
Accuracy on Test Set: 0.80


**Task 4**: costruire un Voting Classifier sui modelli creati e tunati al Task 3.

In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble over the four Task 3 models, fitted on the training split.
ensemble_members = [
    ('model1', model1),
    ('model2', model2),
    ('model3', model3),
    ('model4', model4),
]
voting = VotingClassifier(estimators=ensemble_members, voting='soft')
# fit() returns the estimator itself, so rebinding keeps the same object.
voting = voting.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Make predictions on the training set
y_train_pred = voting.predict(X_train)
accuracy_train = accuracy_score(y_train, y_train_pred)
print(f'Accuracy on Training Set: {accuracy_train:.2f}')

# Make predictions on the test set
y_test_pred = voting.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
print(f'Accuracy on Test Set: {accuracy_test:.2f}')

Accuracy on Training Set: 0.83
Accuracy on Validation Set: 0.78
Accuracy on Test Set: 0.81


**Task 5**: costruire un Voting Classifier sui modelli creati e tunati al Task 3 *a mano*, ovvero senza usare librerie esterne.


In [None]:
# The four fitted Task 3 models that will vote; print one per line as a sanity check.
classifiers = [model1, model2, model3, model4]
print(*classifiers, sep='\n')

GradientBoostingClassifier(learning_rate=0.01, subsample=1)
AdaBoostClassifier(learning_rate=0.1)
KNeighborsClassifier(leaf_size=15, n_neighbors=7)
LogisticRegression(C=0.1)


In [None]:
# One array of test-set predictions per classifier, in the same order as `classifiers`.
preds = [model.predict(X_test) for model in classifiers]

In [None]:
# Hand-rolled hard (majority) vote over the per-model predictions in `preds`.
# Assumes every prediction is a 0/1 label, so the sum of a sample's votes is
# the number of models that predicted class 1 - TODO confirm.
#
# The threshold is derived from the number of models instead of the literal 2
# used before, so the cell stays correct if a model is added or removed. With
# four models it is still "more than 2 votes"; an exact tie resolves to 0, as
# it did before.
n_models = len(preds)
definiteve_preds = [
    1 if sum(votes) > n_models / 2 else 0
    for votes in zip(*preds)  # one tuple of votes per test sample
]

In [None]:
# Accuracy of the hand-made voting predictions against the test labels.
accuracy_test = accuracy_score(y_true=y_test, y_pred=definiteve_preds)
print(f'Accuracy on Test Set: {accuracy_test:.2f}')

Accuracy on Test Set: 0.81


**Task 6**: eseguire Hyperparameter Tuning con K-Fold Cross Validation come nel task 2 ma questa volta *a mano*, ovvero senza usare librerie esterne. Per semplicità, considerare il numero di Folds (K) uguale a 2.

In [None]:
# NOTE(review): this is the library-based search again (5-fold GridSearchCV),
# not the manual 2-fold procedure Task 6 asks for, and the recorded run ended
# in a KeyboardInterrupt - consider removing this cell.
param_grid = {
    'max_depth': [None, 3, 5, 10],
    'max_features': ['sqrt', 'log2', None],
}
grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Pull both winning values out of the fitted search in one step.
best_max_depth, best_max_features = (
    grid_search.best_params_[name] for name in ('max_depth', 'max_features')
)

KeyboardInterrupt: ignored

In [None]:
# Search space for the manual cross-validation: candidate tree depths
# (None = unlimited) and per-split feature-count strategies.
param_grid = {
    'max_depth': [None, 3, 5, 10],
    'max_features': ['sqrt', 'log2', None],
}

In [None]:
# Manual 2-fold cross-validation over every (max_depth, max_features) pair.
#
# The two folds are created ONCE, before the search loops. Previously a fresh
# random split was drawn for every combination, so each candidate was scored on
# different data and the scores were not comparable. The split and the forests
# are seeded so the search gives the same answer on every run.
X_train_2, X_val, y_train_2, y_val = train_test_split(
    X_train, y_train, test_size=0.5, random_state=42
)
# Each half takes one turn as the fitting data while the other is held out.
folds = [
    (X_train_2, y_train_2, X_val, y_val),
    (X_val, y_val, X_train_2, y_train_2),
]

# Maps (max_depth, max_features) -> mean held-out accuracy over the folds.
combinations = {}
for depth in param_grid['max_depth']:
    for maxf in param_grid['max_features']:
        performance = []
        for X_fit, y_fit, X_holdout, y_holdout in folds:
            rf_classifier = RandomForestClassifier(
                max_depth=depth, max_features=maxf, random_state=42
            )
            rf_classifier.fit(X_fit, y_fit)
            performance.append(accuracy_score(y_holdout, rf_classifier.predict(X_holdout)))

        combinations[(depth, maxf)] = sum(performance) / len(performance)



In [None]:
# Pick the hyper-parameter pair with the highest mean accuracy; on ties the
# first pair inserted into `combinations` wins. The defaults are kept when
# `combinations` is empty or no score exceeds -1.
best_depth = best_max_feature = None
max_accuracy = -1
if combinations:
    top_pair = max(combinations, key=combinations.get)
    if combinations[top_pair] > max_accuracy:
        best_depth, best_max_feature = top_pair
        max_accuracy = combinations[top_pair]


In [None]:
# Display the (max_depth, max_features) pair selected by the manual search.
best_depth, best_max_feature

(3, None)