In [20]:
from pandas import Index
import pandas as pd
import numpy as np
from modelexeclib.wrappers.lgbm import LGBMRegressor

class Model(object):
    HYPERPARAMETERS = [
        {'name':'num_leaves','type':'int', 'default_value': 2},
    ]

    def __init__(self, hyperparameters=None):
        hyperparameters = hyperparameters or {}
        # Read and convert hyperparameters

    def train(self):
        # Get training data
        from lyft_analysis.db import presto
        df = presto.DatabaseTool().query("""WITH eme AS (
  SELECT
    ride_id,
    feature_driver_distance_at_arrival_meters,
    feature_driver_distance_at_cancellation_meters,
    feature_dvr_cancellation_rate,
    feature_dvr_no_show_rate,
    feature_dvr_num_voice_calls_to_pax,
    feature_dvr_rides_28d,
    feature_dvr_sum_call_duration,
    feature_dvr_total_rides,
    feature_fixed_fare_amount,
    feature_gh6_total_rides,
    feature_has_waypoint,
    feature_hour_local,
    feature_hour_of_week_local,
    feature_hour_of_week_shifted_local,
    feature_hour_shifted_local,
    feature_is_scheduled_ride,
    feature_num_average_daily_rides_canceled,
    feature_num_rides_taken,
    feature_pax_avg_pickup_time_seconds,
    feature_pax_no_show_rate,
    feature_pax_num_voice_calls_to_driver,
    feature_pax_sms,
    feature_pax_sms_char_len,
    feature_pax_sum_call_duration,
    feature_pax_total_rides,
    feature_pax_unsuccessful_voice,
    feature_request_started_at_to_arrived_at_seconds,
    feature_seconds_since_arrival,
    feature_upfront_fare_amount
    FROM hive.default.event_model_executed
    WHERE ds >= '2020-02-04'
      AND ds < '2020-03-06'
      AND model = 'dummyfeatureloggingnoshowmodel'
),

dsi AS (
  SELECT
    ride_id,
    MAX(CAST(is_a1k AS INT)) AS pax_a1k
  FROM default.dimension_support_issues
  WHERE issue_started_at >= CAST('2020-02-04' AS TIMESTAMP)
    AND issue_started_at < CAST('2020-03-06' AS TIMESTAMP) + INTERVAL '7' DAY
    AND impacted_user = 'passenger'
  GROUP BY ride_id
)

SELECT
  erc.ride_id,
  feature_driver_distance_at_arrival_meters,
  feature_driver_distance_at_cancellation_meters,
  feature_dvr_cancellation_rate,
  feature_dvr_no_show_rate,
  feature_dvr_num_voice_calls_to_pax,
  feature_dvr_rides_28d,  
  feature_dvr_sum_call_duration,
  feature_dvr_total_rides,
  feature_fixed_fare_amount,
  feature_gh6_total_rides,
  feature_has_waypoint,
  feature_hour_local,
  feature_hour_of_week_local,
  feature_hour_of_week_shifted_local,
  feature_hour_shifted_local,
  feature_is_scheduled_ride,
  feature_num_average_daily_rides_canceled,
  feature_num_rides_taken,
  feature_pax_avg_pickup_time_seconds,
  feature_pax_no_show_rate,
  feature_pax_num_voice_calls_to_driver,
  feature_pax_sms,
  feature_pax_sms_char_len,
  feature_pax_sum_call_duration,
  feature_pax_total_rides,
  feature_pax_unsuccessful_voice,
  feature_request_started_at_to_arrived_at_seconds,
  feature_seconds_since_arrival,
  feature_upfront_fare_amount,
  CASE WHEN dsi.pax_a1k = 1 THEN TRUE ELSE FALSE END AS should_waive_fee

FROM default.event_cancels_process_canceled_ride erc
JOIN experimentation.latest_exposure le
  ON erc.passenger_lyft_id = le.user_lyft_id
  AND erc.ds >= '2020-02-04'
  AND erc.ds < '2020-03-06'
  AND erc.after_arrived = TRUE
  AND (erc.due_to_no_show = TRUE OR erc.canceling_party = 'passenger')
  AND erc.cancel_penalty > 0
  AND le.experiment = 'CP_SXP_PAC_NS_JointHoldout_2019Q4'
  AND erc.occurred_at > le.first_exposed_at
  AND le.variant = 'holdout'
JOIN eme 
  ON erc.ride_id = eme.ride_id
LEFT JOIN dsi
  ON erc.ride_id = dsi.ride_id
WHERE erc.ds >= '2020-02-04'
  AND erc.ds < '2020-03-06'""")
        print("retrieved data")

        from sklearn.model_selection import train_test_split
        train_dataset, test_dataset = train_test_split(df, test_size=0.33, random_state=42)
        train_features = Index(['feature_driver_distance_at_arrival_meters',
  'feature_driver_distance_at_cancellation_meters',
  'feature_dvr_cancellation_rate',
  'feature_dvr_no_show_rate',
  'feature_dvr_num_voice_calls_to_pax',
  'feature_dvr_rides_28d',  
  'feature_dvr_sum_call_duration',
  'feature_dvr_total_rides',
  'feature_fixed_fare_amount',
  'feature_gh6_total_rides',
  'feature_has_waypoint',
  'feature_hour_local',
  'feature_hour_of_week_local',
  'feature_hour_of_week_shifted_local',
  'feature_hour_shifted_local',
  'feature_is_scheduled_ride',
  'feature_num_average_daily_rides_canceled',
  'feature_num_rides_taken',
  'feature_pax_avg_pickup_time_seconds',
  'feature_pax_no_show_rate',
  'feature_pax_num_voice_calls_to_driver',
  'feature_pax_sms',
  'feature_pax_sms_char_len',
  'feature_pax_sum_call_duration',
  'feature_pax_total_rides',
  'feature_pax_unsuccessful_voice',
  'feature_request_started_at_to_arrived_at_seconds',
  'feature_seconds_since_arrival',
  'feature_upfront_fare_amount'], dtype='object')
        labels = Index(['should_waive_fee'])
        
        x_train = train_dataset[train_features]
        y_train = train_dataset[labels]
        
        x_test = test_dataset[train_features]
        y_test = test_dataset[labels]
        print("split data set")

        # Construct model using modelexeclib wrapper
        lgbm = LGBMRegressor(n_estimators=2)

        # Fit model
        lgbm.fit(x_train, y_train)
        print("model fit done")
        
        y_predict = lgbm.predict(x_test)
        print(y_predict)
        
        from sklearn.metrics import f1_score
        score = f1_score(y_test, y_predict.round(), average='weighted')
        print("f1 score computed {}".format(score))

        from lyftlearnclient.metrics import Metrics
        metrics = Metrics()
        metrics.emit('f1-score', score)

        # Return fitted model
        # return lgbm

    def init_predict(self):
        # type: (None) -> None
        # This will be called before batch_predict() calls, and called once before serving predict() calls, so any slow
        # operations to set up the model, e.g download weights from S3 or load model checkpoints should be done here.
        pass

    def predict(self, request_data):
        # type: (dict) -> (object):
        # Online prediction on a single sample.
        # The input dict will be parsed from the a REST POST request's json body
        # The output object must be json serializable (e.g. a python dictionary)
        pass

    def batch_predict(self):
        # type: (None) -> None
        # Fetch data to predict, run prediction, save results.
        pass

In [19]:
model = Model({x['name']: x['default_value'] for x in Model.HYPERPARAMETERS})
model.train()


retrieved data
split data set
model fit done
[0.29592739 0.2219032  0.25455321 ... 0.23480338 0.2219032  0.30006669]
f1 score computed


  'precision', 'predicted', average, warn_for)


f1-score=0.6351484574799411



<modelexeclib.wrappers.lgbm.LGBMRegressor at 0x7f633026d940>