In [1]:
from flytekit.configuration import set_flyte_config_file, platform
set_flyte_config_file("/Users/kumare/.ssh/notebook-production.config")
#set_flyte_config_file("notebook.config")

print("Connected to {}".format(platform.URL.get()))

def print_console_url(exc):
    print("http://{}/console/projects/{}/domains/{}/executions/{}".format(platform.URL.get(), exc.id.project, exc.id.domain, exc.id.name))

Connected to flyte.lyft.net


In [2]:
query="""WITH eme AS (
  SELECT
    ride_id,
    feature_driver_distance_at_arrival_meters,
    feature_driver_distance_at_cancellation_meters,
    feature_dvr_cancellation_rate,
    feature_dvr_no_show_rate,
    feature_dvr_num_voice_calls_to_pax,
    feature_dvr_rides_28d,
    feature_dvr_sum_call_duration,
    feature_dvr_total_rides,
    feature_fixed_fare_amount,
    feature_gh6_total_rides,
    feature_has_waypoint,
    feature_hour_local,
    feature_hour_of_week_local,
    feature_hour_of_week_shifted_local,
    feature_hour_shifted_local,
    feature_is_scheduled_ride,
    feature_num_average_daily_rides_canceled,
    feature_num_rides_taken,
    feature_pax_avg_pickup_time_seconds,
    feature_pax_no_show_rate,
    feature_pax_num_voice_calls_to_driver,
    feature_pax_sms,
    feature_pax_sms_char_len,
    feature_pax_sum_call_duration,
    feature_pax_total_rides,
    feature_pax_unsuccessful_voice,
    feature_request_started_at_to_arrived_at_seconds,
    feature_seconds_since_arrival,
    feature_upfront_fare_amount
    FROM event_model_executed
    WHERE ds >= '{{.inputs.start_date}}'
      AND ds < '{{.inputs.end_date}}'
      AND model = 'dummyfeatureloggingnoshowmodel'
),

dsi AS (
  SELECT
    ride_id,
    MAX(CAST(is_a1k AS INT)) AS pax_a1k
  FROM dimension_support_issues
  WHERE issue_started_at >= CAST('{{.inputs.start_date}}' AS TIMESTAMP)
    AND issue_started_at < CAST('{{.inputs.end_date}}' AS TIMESTAMP) + INTERVAL '7' DAY
    AND impacted_user = 'passenger'
  GROUP BY ride_id
)

SELECT
  erc.ride_id,
  feature_driver_distance_at_arrival_meters,
  feature_driver_distance_at_cancellation_meters,
  feature_dvr_cancellation_rate,
  feature_dvr_no_show_rate,
  feature_dvr_num_voice_calls_to_pax,
  feature_dvr_rides_28d,  
  feature_dvr_sum_call_duration,
  feature_dvr_total_rides,
  feature_fixed_fare_amount,
  feature_gh6_total_rides,
  feature_has_waypoint,
  feature_hour_local,
  feature_hour_of_week_local,
  feature_hour_of_week_shifted_local,
  feature_hour_shifted_local,
  feature_is_scheduled_ride,
  feature_num_average_daily_rides_canceled,
  feature_num_rides_taken,
  feature_pax_avg_pickup_time_seconds,
  feature_pax_no_show_rate,
  feature_pax_num_voice_calls_to_driver,
  feature_pax_sms,
  feature_pax_sms_char_len,
  feature_pax_sum_call_duration,
  feature_pax_total_rides,
  feature_pax_unsuccessful_voice,
  feature_request_started_at_to_arrived_at_seconds,
  feature_seconds_since_arrival,
  feature_upfront_fare_amount,
  CASE WHEN dsi.pax_a1k = 1 THEN TRUE ELSE FALSE END AS should_waive_fee

FROM event_cancels_process_canceled_ride erc
JOIN experimentation.latest_exposure le
  ON erc.passenger_lyft_id = le.user_lyft_id
  AND erc.ds >= '{{.inputs.start_date}}'
  AND erc.ds < '{{.inputs.end_date}}'
  AND erc.after_arrived = TRUE
  AND (erc.due_to_no_show = TRUE OR erc.canceling_party = 'passenger')
  AND erc.cancel_penalty > 0
  AND le.experiment = 'CP_SXP_PAC_NS_JointHoldout_2019Q4'
  AND erc.occurred_at > le.first_exposed_at
  AND le.variant = 'holdout'
JOIN eme 
  ON erc.ride_id = eme.ride_id
LEFT JOIN dsi
  ON erc.ride_id = dsi.ride_id
WHERE erc.ds >= '{{.inputs.start_date}}'
  AND erc.ds < '{{.inputs.end_date}}'"""

In [3]:
from flytekit.sdk.tasks import inputs
from flytekit.sdk.types import Types
from flytekit.common.tasks.presto_task import SdkPrestoTask

schema = Types.Schema([
('feature_driver_distance_at_arrival_meters', Types.Integer),
('feature_driver_distance_at_cancellation_meters', Types.Integer),
('feature_dvr_cancellation_rate', Types.Integer),
('feature_dvr_no_show_rate', Types.Integer),
('feature_dvr_num_voice_calls_to_pax', Types.Integer),
('feature_dvr_rides_28d',   Types.Integer),
('feature_dvr_sum_call_duration', Types.Integer),
('feature_dvr_total_rides', Types.Integer),
('feature_fixed_fare_amount', Types.Integer),
('feature_gh6_total_rides', Types.Integer),
('feature_has_waypoint', Types.Integer),
('feature_hour_local', Types.Integer),
('feature_hour_of_week_local', Types.Integer),
('feature_hour_of_week_shifted_local', Types.Integer),
('feature_hour_shifted_local', Types.Integer),
('feature_is_scheduled_ride', Types.Integer),
('feature_num_average_daily_rides_canceled', Types.Integer),
('feature_num_rides_taken', Types.Integer),
('feature_pax_avg_pickup_time_seconds', Types.Integer),
('feature_pax_no_show_rate', Types.Integer),
('feature_pax_num_voice_calls_to_driver', Types.Integer),
('feature_pax_sms', Types.Integer),
('feature_pax_sms_char_len', Types.Integer),
('feature_pax_sum_call_duration', Types.Integer),
('feature_pax_total_rides', Types.Integer),
('feature_pax_unsuccessful_voice', Types.Integer),
('feature_request_started_at_to_arrived_at_seconds', Types.Integer),
('feature_seconds_since_arrival', Types.Integer),
('feature_upfront_fare_amount', Types.Integer),
])

schema = Types.Schema()

presto = SdkPrestoTask(
    task_inputs=inputs(start_date=Types.String, end_date=Types.String),
    statement=query,
    output_schema=schema,
    catalog="hive",
    schema="default",
    discoverable=True,
    discovery_version="1",
)

In [13]:
exc = presto.register_and_launch("flyteexamples", "development", inputs={"start_date":"2020-04-07", "end_date":"2020-04-01"})
print_console_url(exc)

http://flyte.lyft.net/console/projects/flyteexamples/domains/development/executions/d42y9db6qz


In [14]:
exc.wait_for_completion()

In [15]:
exc.sync()
results = exc.outputs["results"]
results.download("/tmp/data", overwrite=True)
dfs = []
with results as reader:
    for df in reader.iter_chunks():
        dfs.append(df)

In [16]:
dfs

[]

In [5]:
from sklearn.model_selection import train_test_split
train_dataset, test_dataset = train_test_split(df, test_size=0.33, random_state=42)
        train_features = Index(['feature_driver_distance_at_arrival_meters',
  'feature_driver_distance_at_cancellation_meters',
  'feature_dvr_cancellation_rate',
  'feature_dvr_no_show_rate',
  'feature_dvr_num_voice_calls_to_pax',
  'feature_dvr_rides_28d',  
  'feature_dvr_sum_call_duration',
  'feature_dvr_total_rides',
  'feature_fixed_fare_amount',
  'feature_gh6_total_rides',
  'feature_has_waypoint',
  'feature_hour_local',
  'feature_hour_of_week_local',
  'feature_hour_of_week_shifted_local',
  'feature_hour_shifted_local',
  'feature_is_scheduled_ride',
  'feature_num_average_daily_rides_canceled',
  'feature_num_rides_taken',
  'feature_pax_avg_pickup_time_seconds',
  'feature_pax_no_show_rate',
  'feature_pax_num_voice_calls_to_driver',
  'feature_pax_sms',
  'feature_pax_sms_char_len',
  'feature_pax_sum_call_duration',
  'feature_pax_total_rides',
  'feature_pax_unsuccessful_voice',
  'feature_request_started_at_to_arrived_at_seconds',
  'feature_seconds_since_arrival',
  'feature_upfront_fare_amount'], dtype='object')
    
labels = Index(['should_waive_fee'])

x_train = train_dataset[train_features]
y_train = train_dataset[labels]

x_test = test_dataset[train_features]
y_test = test_dataset[labels]

IndentationError: unexpected indent (<ipython-input-5-3bbda51f0e3c>, line 3)

In [None]:
from flytekit.sdk.workflow import workflow_class, Input, Output