# MLFLOW - Deploying Machine Learning in Production

In this assignment you will write a script that trains models and uses `mlflow` to submit runs.

In [None]:
%%writefile ./new_data.json
{"age": {"0": 40, "1": 47},
 "balance": {"0": 580, "1": 3644},
 "campaign": {"0": 1, "1": 2},
 "contact": {"0": "unknown", "1": "unknown"},
 "day": {"0": 16, "1": 9},
 "default": {"0": "no", "1": "no"},
 "duration": {"0": 192, "1": 83},
 "education": {"0": "secondary", "1": "secondary"},
 "housing": {"0": "yes", "1": "no"},
 "job": {"0": "blue-collar", "1": "services"},
 "loan": {"0": "no", "1": "no"},
 "marital": {"0": "married", "1": "single"},
 "month": {"0": "may", "1": "jun"},
 "pdays": {"0": -1, "1": -1},
 "poutcome": {"0": "unknown", "1": "unknown"},
 "previous": {"0": 0, "1": 0}}

In [None]:
#Load all necessary libraries
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
import joblib
import json

# Read the semicolon-separated bank marketing data
bank = pd.read_csv('bank-full.csv', sep=';')

# Separate the predictors from the target column "y"
features = bank.drop(columns="y")
target = bank["y"]

# Hold out 10% of the rows for validation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.10, random_state=42
)

# Give both feature frames a fresh 0..n-1 index
X_train, X_test = (split.reset_index(drop=True) for split in (X_train, X_test))


Question 1: Create pre-processing function to be later used as part of the pipeline (custom transformer)

In [None]:
def transformations(df):
    """One-hot encode the categorical columns and z-score the numeric columns of ``df``.

    The column groups are taken from the module-level ``X_train`` and both
    encoders are re-fitted on ``X_train`` on every call, so ``X_train`` must
    be defined in the namespace where this function runs (including any
    process that later un-pickles a pipeline wrapping this function).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feature frame containing the same columns as ``X_train``.

    Returns
    -------
    pandas.DataFrame
        The one-hot columns followed by the standardized numeric columns,
        on a fresh default (0..n-1) index.
    """
    # Derive the schema from the training frame so every input is encoded
    # against the same set of columns.
    cat_cols = X_train.select_dtypes(['object']).columns
    num_cols = X_train.select_dtypes(['integer', 'float']).columns

    # One-hot encode categorical features.
    # The encoder keeps its default sparse output and is densified explicitly:
    # the `sparse=False` keyword was deprecated in scikit-learn 1.2 and removed
    # in 1.4, whereas `.toarray()` works on every release that also provides
    # `get_feature_names_out`.
    onehoter = OneHotEncoder(handle_unknown='ignore')
    onehoter.fit(X_train[cat_cols])
    df_onehot = pd.DataFrame(
        onehoter.transform(df[cat_cols]).toarray(),
        columns=onehoter.get_feature_names_out(cat_cols),
    )

    # Standardize numerical features with the training mean / std
    znormalizer = StandardScaler()
    znormalizer.fit(X_train[num_cols])
    df_norm = pd.DataFrame(
        znormalizer.transform(df[num_cols]), columns=num_cols
    )

    # Both frames carry a default RangeIndex of the same length, so a
    # column-wise concat lines the rows up positionally.
    return pd.concat([df_onehot, df_norm], axis=1)


Question 2: Creating a custom transformer from the previously defined function

In [None]:
# Expose `transformations` through the scikit-learn transformer API so it can
# serve as a pipeline step; input validation is switched off so the step
# receives the DataFrame unchanged.
pre_processing = FunctionTransformer(func=transformations, validate=False)


Question 3: Creating the pipeline and defining each of its two steps: (i) pre-processing; and (ii) model (logistic regression)

In [None]:
# Two-stage pipeline: feature engineering followed by the classifier.
# verbose=True makes the pipeline report each step while fitting.
pipeline = Pipeline(
    steps=[
        ('pre_processing', pre_processing),
        ('model', LogisticRegression()),
    ],
    verbose=True,
)


Question 4: Call `fit` and `predict` on the pipeline to make sure that it all works. Remember to pass them the **un-processed** (original) data, since the data processing should be built into the pipeline now.

In [None]:
# Configure the Logistic Regression estimator (the 'model' step) in place
pipeline.set_params(**{
    'model__C': 1.0,               # strength of regularization
    'model__solver': 'lbfgs',      # optimization algorithm
    'model__max_iter': 200,        # increase max_iter for convergence
    'model__fit_intercept': True,  # include intercept term
    'model__penalty': 'l2',        # regularization penalty type
})

# Train on the raw (un-processed) training data; pre-processing runs inside the pipeline
pipeline.fit(X_train, y_train)

# Score the training split first, then the validation split
y_train_pred, y_test_pred = (
    pipeline.predict(split) for split in (X_train, X_test)
)


Question 5: Evaluate your model by calculating the precision and recall.

In [None]:
# Create a function to evaluate the model performance using precision and recall
def eval_metrics(actual, pred, pos_label='yes'):
    """Return the precision and recall of ``pred`` against ``actual``.

    Parameters
    ----------
    actual : array-like
        Ground-truth class labels.
    pred : array-like
        Predicted class labels, in the same order as ``actual``.
    pos_label : str or int, default='yes'
        Label treated as the positive class. The default matches the
        previously hard-coded value, so existing two-argument calls behave
        exactly as before.

    Returns
    -------
    tuple
        ``(precision, recall)`` as returned by ``precision_score`` and
        ``recall_score``.
    """
    precision = precision_score(actual, pred, pos_label=pos_label)
    recall = recall_score(actual, pred, pos_label=pos_label)

    return precision, recall

# Precision and recall on the training and validation splits
precision_train, recall_train = eval_metrics(y_train, y_train_pred)
precision_test, recall_test = eval_metrics(y_test, y_test_pred)

# Fetch the Logistic Regression settings once and report the main ones
logreg_params = pipeline['model'].get_params()
reported_keys = ('C', 'solver', 'max_iter', 'fit_intercept', 'penalty')

print()
print('Main Parameters used in logistic regression are: C={}, solver={}, max_iter={}, fit_intercept={} and penalty={}'.format(
    *(logreg_params[key] for key in reported_keys)
))
print('Training Precision: {:.3f}, Recall: {:.3f}'.format(precision_train, recall_train))
print('Validation Precision: {:.3f}, Recall: {:.3f}'.format(precision_test, recall_test))


Question 6: Save your pipeline object using `joblib` as shown [here](https://sklearn.org/modules/model_persistence.html).

In [None]:
# Persist the fitted pipeline to disk with joblib.
# NOTE(review): the pre-processing step wraps a notebook-level function that
# reads the global X_train, so a separate scoring script presumably needs both
# defined before it can load and use this file — confirm.
joblib.dump(value=pipeline, filename='pipeline.pkl')


Question 7: Now write a **new script** for scoring: it loads the pipeline you saved in the last step, reads the data `../data/new_data.json` and converts it to a `pandas.DataFrame` object, and obtains predictions on it. The predictions should be stored as a `json` file `../data/new_preds.json`.

In [None]:
# Restore the pipeline saved in the previous step
pipeline = joblib.load('pipeline.pkl')

# Parse the new observations from JSON and turn them into a DataFrame
with open('./new_data.json') as fh:
    data = json.load(fh)
new_predictions = pd.DataFrame.from_dict(data)

# Score the raw observations and attach the predicted label as a new column
predicted_labels = pipeline.predict(new_predictions)
new_predictions['prediction'] = predicted_labels

# Persist the observations together with their predictions
new_predictions.to_json(path_or_buf='./new_preds.json', orient='columns')


In [None]:
# Load the stored predictions back from disk into a DataFrame
with open('./new_preds.json') as fh:
    data = json.load(fh)

new_pred_dataframe = pd.DataFrame.from_dict(data)

# Print the predicted label of each observation, then display the full frame
# (input features plus prediction) as the cell output
print(new_pred_dataframe.loc[:, 'prediction'])
new_pred_dataframe

Question 8: Create a new text cell in your Notebook: Complete a 50-100 word summary (or short description of your thinking in applying this week's learning to the solution) of your experience in this assignment. Include: What was your incoming experience with this model, if any? What steps did you take, and what obstacles did you encounter? How do you link this exercise to real-world machine learning problem-solving? (What steps were missing? What else do you need to learn?) This summary allows your instructor to know how you are doing and allot points for your effort in thinking and planning, and making connections to real-world work.

In this exercise I built a logistic regression pipeline for the bank marketing dataset. My past experience with this model was limited, so I carefully created preprocessing steps with one-hot encoding and scaling, then wrapped them in a FunctionTransformer. The pipeline structure simplified training and evaluation. The main obstacle was keeping track of categorical vs. numeric features. This mirrors real projects where reproducible pipelines and metrics are essential for deployment.
