In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

In [2]:
df = pd.read_csv('Titanic.csv', delimiter = ',')
df = df.dropna()
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S


In [3]:
from sklearn.preprocessing import LabelEncoder
categories = df.select_dtypes(include=('object')).columns
for col in categories:
    df[col] = LabelEncoder().fit_transform(df[col])
df.dtypes
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,49,0,38.0,1,0,109,71.2833,72,0
3,4,1,1,70,0,35.0,1,0,31,53.1000,48,2
6,7,0,1,112,1,54.0,0,0,55,51.8625,117,2
10,11,1,3,148,0,4.0,1,1,120,16.7000,131,2
11,12,1,1,27,0,58.0,0,0,26,26.5500,43,2
...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,1,18,0,47.0,1,1,33,52.5542,91,2
872,873,0,1,35,1,33.0,0,0,89,5.0000,29,2
879,880,1,1,140,0,56.0,0,1,38,83.1583,61,0
887,888,1,1,75,0,19.0,0,0,10,30.0000,25,2


In [4]:
from sklearn.model_selection import train_test_split

X = df.drop('Survived', axis=1)
y = df['Survived']

x_train, x_test, y_train, y_test = train_test_split(X, y, shuffle=True)

In [5]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

models = {
    "SGDClassifier": {
        "model": make_pipeline(StandardScaler(), SGDClassifier(random_state=42)),
        "params": {
            'sgdclassifier__alpha': [0.0001, 0.001],
            'sgdclassifier__max_iter': [1000, 2000]
        }
    },
    "RandomForest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            'n_estimators': [50, 100],
            'max_depth': [5, 10]
        }
    },
    "XGBoost": {
        "model": XGBClassifier(random_state=42),
        "params": {
            'n_estimators': [50, 100],
            'max_depth': [3, 6],
            'learning_rate': [0.01, 0.1]
        }
    }
}


In [6]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Survived_Prediction")

results = []

for model_name, model_info in models.items():
    with mlflow.start_run(run_name=f"{model_name}_Opt"):

        gs = GridSearchCV(model_info["model"], model_info["params"], cv=5, scoring='accuracy')
        gs.fit(x_train, y_train)
        
        best_model = gs.best_estimator_
        y_pred = best_model.predict(x_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred)

        mlflow.log_params(gs.best_params_)
        mlflow.log_metrics({
            "Accuracy": accuracy,
            "F1": f1,
            "ROC_AUC": roc_auc
        })
        mlflow.sklearn.log_model(best_model, model_name.lower())

        results.append({
            'Model': model_name,
            'Accuracy': accuracy,
            'F1': f1,
            'ROC_AUC': roc_auc,
            'Best_Params': gs.best_params_
        })
        
        print(f"\n{model_name}:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"F1: {f1:.4f}")
        print(f"ROC-AUC: {roc_auc:.4f}")

2025/04/07 12:53:50 INFO mlflow.tracking.fluent: Experiment with name 'Survived_Prediction' does not exist. Creating a new experiment.



SGDClassifier:
Accuracy: 0.7174
F1: 0.7636
ROC-AUC: 0.7038
🏃 View run SGDClassifier_Opt at: http://127.0.0.1:5000/#/experiments/1/runs/e3e0afbb81a34978bea2f00a24d2803e
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1





RandomForest:
Accuracy: 0.6957
F1: 0.7586
ROC-AUC: 0.6731
🏃 View run RandomForest_Opt at: http://127.0.0.1:5000/#/experiments/1/runs/c145910f709749188b8ad74cfe6bc89f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1





XGBoost:
Accuracy: 0.7609
F1: 0.8000
ROC-AUC: 0.7481
🏃 View run XGBoost_Opt at: http://127.0.0.1:5000/#/experiments/1/runs/d897d7c4f4a44a759856c320d4b5cfb0
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


In [7]:

best_model_result = max(results, key=lambda x: x['F1'])

print("\n Лучшая модель:")
print(f"Название: {best_model_result['Model']}")
print(f"F1-score: {best_model_result['F1']:.4f}")
print(f"Accuracy: {best_model_result['Accuracy']:.4f}")
print(f"ROC-AUC: {best_model_result['ROC_AUC']:.4f}")
print(f"Параметры: {best_model_result['Best_Params']}")

print("\n Обоснование выбора:")
print("1. F1-score выбрана как основная метрика, так как она учитывает:")
print("   - Precision (точность предсказаний положительного класса)")
print("   - Recall (полноту охвата положительного класса)")
print(f"2. По метрикам ROC_AUC и Accuracy модель {best_model_result['Model']} также выигрывает")


 Лучшая модель:
Название: XGBoost
F1-score: 0.8000
Accuracy: 0.7609
ROC-AUC: 0.7481
Параметры: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}

 Обоснование выбора:
1. F1-score выбрана как основная метрика, так как она учитывает:
   - Precision (точность предсказаний положительного класса)
   - Recall (полноту охвата положительного класса)
2. По метрикам ROC_AUC и Accuracy модель XGBoost также выигрывает
