## Load dataset

In [1]:
import os

In [2]:
import pandas as pd
import numpy as np

# Path of the input CSV (the file name suggests a sample of 2018 yellow-taxi trips).
taxi_csv_path = "/tmp/data/2018_Yellow_Taxi_Trip_Data_sample_better.csv"

# Load the whole file into the working frame and preview its first rows.
df = pd.read_csv(taxi_csv_path)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/tmp/data/2018_Yellow_Taxi_Trip_Data_sample_better.csv'

## Feature engineering


In [None]:
# Parse the raw pickup timestamps into datetime values, then rename the column
# to 'pickup_date', which is the name the helper functions below read.
raw_pickup_col = 'tpep_pickup_datetime'
df[raw_pickup_col] = pd.to_datetime(df[raw_pickup_col])
df.rename(columns={raw_pickup_col: 'pickup_date'}, inplace=True)

def extract_hour(row):
    """Return the pickup time of ``row`` as a fractional hour of the day.

    Reads ``row['pickup_date']`` and combines its ``hour`` and ``minute``
    attributes (e.g. 13:30 -> 13.5). Seconds are not taken into account.
    """
    pickup = row['pickup_date']
    minutes_as_hours = int(pickup.minute) / 60
    return float(int(pickup.hour) + minutes_as_hours)

def night_hour(row):
    """Return True when ``row['pickup_hour']`` is strictly below 4."""
    pickup_hour = row['pickup_hour']
    return pickup_hour < 4

def weekday(row):
    """Return the day-of-week index of ``row['pickup_date']``.

    Delegates to the value's own ``weekday()`` method; for ``datetime`` and
    pandas ``Timestamp`` values that is 0 for Monday through 6 for Sunday.

    This function used to be defined twice in a row with identical bodies;
    the second definition silently shadowed the first, so only one is kept.
    """
    return row['pickup_date'].weekday()

def is_weekend(row):
    """Return True when ``row['weekday']`` is greater than 4.

    With the Monday=0 convention produced by ``weekday`` this selects
    Saturday (5) and Sunday (6).
    """
    day_index = row['weekday']
    return day_index > 4

def is_airport_destination(row):
    """Return True when the trip's drop-off zone is one of the airport zones.

    The previous body was a copy of ``is_weekend`` (it tested
    ``row['weekday'] > 4``) and never inspected the destination. It now checks
    ``row['DOLocationID']`` against the same zone IDs this notebook uses when
    building the ``label`` column.
    """
    # presumably the taxi-zone IDs for Newark, JFK and LaGuardia -- TODO confirm
    airport_zone_ids = (1, 132, 138)
    return row['DOLocationID'] in airport_zone_ids

# Fractional hour of the day (e.g. 13.5 for 13:30), computed row by row.
df['pickup_hour'] = df.apply(extract_hour, axis=1)

# Cyclical encoding of the time of day. The hour must be mapped onto a full
# circle (2*pi radians per 24 hours) before taking sin/cos; feeding the raw
# hour in as radians is not periodic over a day, so 23:59 and 00:00 would end
# up far apart instead of adjacent.
HOURS_PER_DAY = 24
pickup_hour_angle = 2 * np.pi * df['pickup_hour'] / HOURS_PER_DAY
df['sin_pickup_hour'] = np.sin(pickup_hour_angle)
df['cos_pickup_hour'] = np.cos(pickup_hour_angle)

# Boolean / calendar features derived from the columns created above.
df['night_hours'] = df.apply(night_hour, axis=1)
df['weekday'] = df.apply(weekday, axis=1)
df['weekend'] = df.apply(is_weekend, axis=1)
df['passenger_count'] = df['passenger_count'].astype(int)

# Binary target: 1 when the drop-off zone ID is 1, 132 or 138, else 0.
df['label'] = df.apply(lambda row: 1 if row['DOLocationID'] in [1, 132, 138] else 0, axis=1)

# Keep only the model inputs and the target; every other column is dropped.
df = df[[
    'pickup_hour',
    'sin_pickup_hour',
    'cos_pickup_hour',
    'night_hours',
    'weekday',
    'weekend',
    'passenger_count',
    'label'
]]

df.head()

## Dataset stats


In [None]:
# Class balance of the binary target. minlength=2 guarantees two counts even
# when the data contains no positive examples; without it the tuple unpacking
# below raises ValueError in that case.
neg, pos = np.bincount(df['label'], minlength=2)
total = neg + pos
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(total, pos, 100 * pos / total))

# Summary statistics of every column. Kept as the last expression so the
# notebook actually renders the table; as the first statement of the cell its
# result was silently discarded.
df.describe()





In [None]:
# Create an MLflow experiment named "project2" through the CLI, with
# s3://mybucket as its artifact location.
# NOTE(review): the training cells in this notebook log to the experiments
# "project4" and "ccc", not "project2" -- confirm this experiment is still needed.
!mlflow experiments create --artifact-location s3://mybucket --experiment-name "project2"


In [None]:
## Track experiment

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
import mlflow.sklearn
import mlflow


# NOTE(review): this notebook contains a second training cell that repeats this
# code with a different experiment name; consider factoring both into one
# function that takes the experiment name as a parameter.

# Fixed seed so the train/test split, and therefore every logged metric, is
# reproducible across kernel restarts.
SPLIT_SEED = 42
train, test = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)

# Separate features from the target. The explicit float cast turns the boolean
# feature columns into 0.0/1.0 (the earlier `1 * values` trick did the same
# thing implicitly).
train_features = train.drop('label', axis=1)
test_features = test.drop('label', axis=1)
feature_names = train_features.columns.values
X_train = train_features.astype(float).values
Y_train = train['label'].values
X_test = test_features.astype(float).values
Y_test = test['label'].values

# Resolve the experiment on the tracking server, creating it on first use.
# The tracking server at this URI must already be running.
mlflow.set_tracking_uri("http://localhost:5000")
experiment_name = "project4"
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name)
else:
    experiment_id = experiment.experiment_id

# One MLflow run per inverse-regularisation strength C.
Cs = [1, 2, 10]
max_iter = 100
for c in Cs:
    # The context manager closes the run on exit (also when an exception is
    # raised), so no explicit mlflow.end_run() is needed inside the block.
    with mlflow.start_run(experiment_id=experiment_id) as run:
        mlflow.log_param('max_iterations', max_iter)
        mlflow.log_param('reg_parameter', c)

        # Reference: https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8
        logistic_reg = LogisticRegression(penalty='l2', max_iter=max_iter, C=c)
        logistic_reg.fit(X_train, Y_train)

        y_pred = logistic_reg.predict(X_test)

        mlflow.log_metric("accuracy score", accuracy_score(Y_test, y_pred))
        mlflow.log_metric("recall score", recall_score(Y_test, y_pred))
        mlflow.log_metric("precision score", precision_score(Y_test, y_pred))
        mlflow.log_metric("f1 score", f1_score(Y_test, y_pred))

        # Store the fitted model as a run artifact under "model".
        mlflow.sklearn.log_model(logistic_reg, "model")
        print(run.info.run_id)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
import mlflow.sklearn
import mlflow


# NOTE(review): this cell repeats the notebook's earlier training cell with a
# different experiment name; consider factoring both into one function that
# takes the experiment name as a parameter.

# Fixed seed so the train/test split, and therefore every logged metric, is
# reproducible across kernel restarts.
SPLIT_SEED = 42
train, test = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)

# Separate features from the target. The explicit float cast turns the boolean
# feature columns into 0.0/1.0 (the earlier `1 * values` trick did the same
# thing implicitly).
train_features = train.drop('label', axis=1)
test_features = test.drop('label', axis=1)
feature_names = train_features.columns.values
X_train = train_features.astype(float).values
Y_train = train['label'].values
X_test = test_features.astype(float).values
Y_test = test['label'].values

# Resolve the experiment on the tracking server, creating it on first use.
# The tracking server at this URI must already be running.
mlflow.set_tracking_uri("http://localhost:5000")
experiment_name = "ccc"
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name)
else:
    experiment_id = experiment.experiment_id

# One MLflow run per inverse-regularisation strength C.
Cs = [1, 2, 10]
max_iter = 100
for c in Cs:
    # The context manager closes the run on exit (also when an exception is
    # raised), so no explicit mlflow.end_run() is needed inside the block.
    with mlflow.start_run(experiment_id=experiment_id) as run:
        mlflow.log_param('max_iterations', max_iter)
        mlflow.log_param('reg_parameter', c)

        # Reference: https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8
        logistic_reg = LogisticRegression(penalty='l2', max_iter=max_iter, C=c)
        logistic_reg.fit(X_train, Y_train)

        y_pred = logistic_reg.predict(X_test)

        mlflow.log_metric("accuracy score", accuracy_score(Y_test, y_pred))
        mlflow.log_metric("recall score", recall_score(Y_test, y_pred))
        mlflow.log_metric("precision score", precision_score(Y_test, y_pred))
        mlflow.log_metric("f1 score", f1_score(Y_test, y_pred))

        # Store the fitted model as a run artifact under "model".
        mlflow.sklearn.log_model(logistic_reg, "model")
        print(run.info.run_id)


## Running mlflow server

In [None]:
# Invoke the sqlite3 command-line tool on the database file /tmp/example.db.
# NOTE(review): with no SQL argument sqlite3 normally starts an interactive
# shell, which may hang or exit immediately in a notebook cell -- confirm this
# cell does what is intended (e.g. creating the database file).
!sqlite3 /tmp/example.db

In [None]:
# Alternative invocation kept for reference: debug-level gunicorn logging,
# listening on all interfaces, SQLite backend store and a file-based artifact root.
#!mlflow server --gunicorn-opts "--log-level debug"  --host 0.0.0.0 --backend-store-uri sqlite:////tmp/example.db --default-artifact-root file:/tmp/0
# Start the MLflow tracking server with its default settings.
# NOTE(review): the training cells earlier in this notebook point at
# http://localhost:5000, so the server presumably has to be running before they
# are executed; this command also looks like it runs in the foreground and
# would block the kernel -- consider starting it from a separate terminal.
!mlflow server

Trigger promotion request on staging model

[Feature proposal - Event based notifications for model registry changes](https://github.com/mlflow/mlflow/issues/3015)
