## Load dataset

In [13]:
import pandas as pd
import numpy as np

df = pd.read_csv("2018_Yellow_Taxi_Trip_Data_sample.csv")
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,12/29/2018 01:37:28 AM,12/29/2018 01:48:12 AM,1,2.51,1,N,249,170,1,10.0,0.5,0.5,2.26,0.0,0.3,13.56
1,2,12/29/2018 01:51:24 AM,12/29/2018 01:58:17 AM,1,1.8,1,N,234,48,1,8.0,0.5,0.5,1.86,0.0,0.3,11.16
2,1,12/29/2018 01:32:41 AM,12/29/2018 01:50:13 AM,0,4.8,1,N,113,225,1,17.5,0.5,0.5,3.75,0.0,0.3,22.55
3,1,12/29/2018 01:32:20 AM,12/29/2018 01:47:57 AM,1,5.0,1,N,48,166,1,17.5,0.5,0.5,3.75,0.0,0.3,22.55
4,1,12/29/2018 01:16:34 AM,12/29/2018 01:33:09 AM,1,4.9,1,N,230,42,2,16.0,0.5,0.5,0.0,0.0,0.3,17.3


## Feature engineering


In [14]:
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
df.rename(columns={'tpep_pickup_datetime': 'pickup_date'}, inplace=True)

def extract_hour(row):
    return float(int(row['pickup_date'].hour) + int(row['pickup_date'].minute)/60)

def night_hour(row):
    return row['pickup_hour'] < 4

def weekday(row):
    return row['pickup_date'].weekday()

def weekday(row):
    return row['pickup_date'].weekday()

def is_weekend(row):
    return row['weekday']>4

def is_airport_destination(row):
    return row['weekday']>4

df['pickup_hour'] = df.apply (lambda row: extract_hour(row), axis=1)

df['sin_pickup_hour'] = np.sin(df['pickup_hour'])
df['cos_pickup_hour'] = np.cos(df['pickup_hour'])
df['night_hours'] = df.apply (lambda row: night_hour(row), axis=1)
df['weekday'] = df.apply (lambda row: weekday(row), axis=1)
df['weekend'] = df.apply (lambda row: is_weekend(row), axis=1)
df['passenger_count'] = df['passenger_count'].astype(int)
df['label'] = df.apply(lambda row: 1 if row['DOLocationID'] in [1, 132, 138] else 0, axis=1)
#df['label'] = df['label'].astype(int)

df = df[[
    'pickup_hour',
    'sin_pickup_hour',
    'cos_pickup_hour',
    'night_hours',
    'weekday',
    'weekend',
    'passenger_count',
    'label'
]]

df.head()

Unnamed: 0,pickup_hour,sin_pickup_hour,cos_pickup_hour,night_hours,weekday,weekend,passenger_count,label
0,1.616667,0.998948,-0.045854,True,5,True,1,0
1,1.85,0.961275,-0.27559,True,5,True,1,0
2,1.533333,0.999298,0.037454,True,5,True,0,0
3,1.533333,0.999298,0.037454,True,5,True,1,0
4,1.266667,0.954108,0.299463,True,5,True,1,0


## Dataset stats


In [15]:
df.describe()
neg, pos = np.bincount(df['label'])
total = neg + pos
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(total, pos, 100 * pos / total))





Examples:
    Total: 19999
    Positive: 803 (4.02% of total)



## track experiment

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
import mlflow.sklearn
import mlflow


train, test = train_test_split(df, test_size=0.2)
Y_train = train[["label"]]
X_train = train.drop('label', axis=1)
feature_names = X_train.columns.values
X_train = 1 * X_train.values
Y_train = Y_train.values.reshape(-1)

Y_test = test[["label"]]
X_test = test.drop('label', axis=1)
feature_names = X_test.columns.values
X_test = 1 * X_test.values
Y_test = Y_test.values.reshape(-1)

experiment_name = "nyc-taxi-predictor"
if not mlflow.get_experiment_by_name(experiment_name):
    mlflow.create_experiment(experiment_name)
experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id


Cs = [1, 2, 10]
for c in Cs:
    with mlflow.start_run(experiment_id=experiment_id):
        max_iter = 100
        mlflow.log_param('max_iterations', max_iter)
        mlflow.log_param('reg_parameter', c)

        logistic_reg = LogisticRegression(penalty='l2', max_iter=max_iter, C=c)
        logistic_reg.fit(X_train, Y_train)

        y_pred = logistic_reg.predict(X_test)

        mlflow.log_metric("accuracy score", accuracy_score(Y_test, y_pred))
        mlflow.log_metric("recall score", recall_score(Y_test, y_pred))
        mlflow.log_metric("precision score", precision_score(Y_test, y_pred))
        mlflow.log_metric("f1 score", f1_score(Y_test, y_pred))

        #artifact_path = mlflow.get_artifact_uri()

         # https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8

        mlflow.sklearn.log_model(logistic_reg, "model")
        print(mlflow.active_run().info.run_id)

        mlflow.end_run()




  _warn_prf(average, modifier, msg_start, len(result))


b164ab91c51c456798e793e8a3db5ca2


  _warn_prf(average, modifier, msg_start, len(result))


576c7a6965864607a19a84376fbf714d
c77bc0f5764f47a28d7171658dc5d2d4


  _warn_prf(average, modifier, msg_start, len(result))


## Running mlflow server

In [5]:
!sqlite3 /tmp/example.db

  and should_run_async(code)


SQLite version 3.30.0 2019-10-04 15:03:17
Enter ".help" for usage hints.
sqlite> 
sqlite> 

In [6]:
#!mlflow server --gunicorn-opts "--log-level debug"  --host 0.0.0.0 --backend-store-uri sqlite:////tmp/example.db --default-artifact-root file:/tmp/0
!mlflow server

UnboundLocalError: local variable 'child' referenced before assignment

Trigger promotion request on staging model

[Feature proposal - Event based notifications for model registry changes](https://github.com/mlflow/mlflow/issues/3015)
