In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import mlflow 
from mlflow.models import infer_signature
from sklearn.preprocessing import OneHotEncoder


In [2]:
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

In [55]:
# feature engineering

In [59]:
data = pd.read_csv('../data/train.csv', index_col='PassengerId')
data['Embarked'] = data['Embarked'].fillna('S')
data['Fare'] = data['Fare'].fillna(data.groupby(['Pclass', 'Parch', 'SibSp']).Fare.median()[3][0][0])
data['Age'] = data['Age'].fillna(data.Age.median())
# Creating Deck column from the first letter of the Cabin column (M stands for Missing)
data['Deck'] = data['Cabin'].apply(lambda s: s[0] if pd.notnull(s) else 'M')
# Passenger in the T deck is changed to A
idx = data[data['Deck'] == 'T'].index
data.loc[idx, 'Deck'] = 'A'
data['Deck'] = data['Deck'].replace(['A', 'B', 'C'], 'ABC')
data['Deck'] = data['Deck'].replace(['D', 'E'], 'DE')
data['Deck'] = data['Deck'].replace(['F', 'G'], 'FG')
data.drop(['Cabin'], inplace=True, axis=1)
data['Family_Size'] = data['SibSp'] + data['Parch'] + 1
family_map = {1: 'Alone', 2: 'Small', 3: 'Small', 4: 'Small', 5: 'Medium', 6: 'Medium', 7: 'Large', 8: 'Large', 11: 'Large'}
data['Family_Size_Grouped'] = data['Family_Size'].map(family_map)
data['Ticket_Frequency'] = data.groupby('Ticket')['Ticket'].transform('count')
data['Title'] = data['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
data['Is_Married'] = 0
data['Is_Married'].loc[data['Title'] == 'Mrs'] = 1
data['Title'] = data['Title'].replace(['Miss', 'Mrs','Ms', 'Mlle', 'Lady', 'Mme', 'the Countess', 'Dona'], 'Miss/Mrs/Ms')
data['Title'] = data['Title'].replace(['Dr', 'Col', 'Major', 'Jonkheer', 'Capt', 'Sir', 'Don', 'Rev'], 'Dr/Military/Noble/Clergy')
cat_features = ['Pclass', 'Sex', 'Deck', 'Embarked', 'Title', 'Family_Size_Grouped']

encoder = OneHotEncoder(sparse=False)  # Set sparse=False to get a dense array
encoded_features = encoder.fit_transform(data[cat_features])
encoded_feature_names = encoder.get_feature_names_out(cat_features)

# Create a DataFrame from the encoded features and set the index to align with the original DataFrame
encoded_data = pd.DataFrame(encoded_features, columns=encoded_feature_names, index=data.index)

# Concatenate the original DataFrame with the new encoded features
data = pd.concat([data, encoded_data], axis=1)

# Optionally, drop the original categorical columns if they are no longer needed
data.drop(cat_features, axis=1, inplace=True)

drop_cols = [  'Family_Size',  'Name', 'Parch',   'SibSp', 'Ticket']
data.drop(drop_cols, axis=1, inplace=True)
data.to_csv('../data/train_cleaned.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Is_Married'].loc[data['Title'] == 'Mrs'] = 1


In [60]:
X = data.drop('Survived', axis=1)
y = data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
params = {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
lr = LogisticRegression(**params)
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [61]:
experiment_description = (
    "test mlflow na bazie setu titanic"
)

experiment_tags = {
    "project_name": "titanic-mlflow-test",
    "owner": "pkwiecien",
    "mlflow.note.content": experiment_description,
}

mlflow.create_experiment(name="fe", tags=experiment_tags)

mlflow.set_experiment("fe")

run_name = "fe_LR"

artifact_path = "LR2"

metrics = {
    "accuracy": accuracy,
}

signature = infer_signature(X_train, lr.predict(X_train))


with mlflow.start_run(run_name=run_name) as run:
    mlflow.log_params(params)
    mlflow.log_metrics(metrics)
    mlflow.sklearn.log_model(sk_model=lr, input_example=X_train, signature=signature, artifact_path=artifact_path,registered_model_name="lr2")

  inputs = _infer_schema(model_input) if model_input is not None else None
Successfully registered model 'lr2'.
2024/02/27 20:41:05 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lr2, version 1
Created version '1' of model 'lr2'.
