In [1]:
import azureml.dataprep as dprep
from IPython.display import display
dataset_root = "https://dprepdata.blob.core.windows.net/demo"

green_path = "/".join([dataset_root, "green-small/*"])
yellow_path = "/".join([dataset_root, "yellow-small/*"])

green_df_raw = dprep.read_csv(path=green_path, header=dprep.PromoteHeadersMode.GROUPED)
# auto_read_file automatically identifies and parses the file type, which is useful when you don't know the file type.
yellow_df_raw = dprep.auto_read_file(path=yellow_path)

In [None]:
display(green_df_raw.head(5))
display(yellow_df_raw.head(5))
display(green_df_raw.get_profile())
display(yellow_df_raw.get_profile())

In [None]:
all_columns = dprep.ColumnSelector(term=".*", use_regex=True)
drop_if_all_null = [all_columns, dprep.ColumnRelationship(dprep.ColumnRelationship.ALL)]
useful_columns = [
    "cost", "distance", "dropoff_datetime", "dropoff_latitude", "dropoff_longitude",
    "passengers", "pickup_datetime", "pickup_latitude", "pickup_longitude", "store_forward", "vendor"
]

In [None]:
green_df = (green_df_raw
    .replace_na(columns=all_columns)
    .drop_nulls(*drop_if_all_null)
    .rename_columns(column_pairs={
        "VendorID": "vendor",
        "lpep_pickup_datetime": "pickup_datetime",
        "Lpep_dropoff_datetime": "dropoff_datetime",
        "lpep_dropoff_datetime": "dropoff_datetime",
        "Store_and_fwd_flag": "store_forward",
        "store_and_fwd_flag": "store_forward",
        "Pickup_longitude": "pickup_longitude",
        "Pickup_latitude": "pickup_latitude",
        "Dropoff_longitude": "dropoff_longitude",
        "Dropoff_latitude": "dropoff_latitude",
        "Passenger_count": "passengers",
        "Fare_amount": "cost",
        "Trip_distance": "distance"
     })
    .keep_columns(columns=useful_columns))
green_df.head(5)

In [None]:
yellow_df = (yellow_df_raw
    .replace_na(columns=all_columns)
    .drop_nulls(*drop_if_all_null)
    .rename_columns(column_pairs={
        "vendor_name": "vendor",
        "VendorID": "vendor",
        "vendor_id": "vendor",
        "Trip_Pickup_DateTime": "pickup_datetime",
        "tpep_pickup_datetime": "pickup_datetime",
        "Trip_Dropoff_DateTime": "dropoff_datetime",
        "tpep_dropoff_datetime": "dropoff_datetime",
        "store_and_forward": "store_forward",
        "store_and_fwd_flag": "store_forward",
        "Start_Lon": "pickup_longitude",
        "Start_Lat": "pickup_latitude",
        "End_Lon": "dropoff_longitude",
        "End_Lat": "dropoff_latitude",
        "Passenger_Count": "passengers",
        "passenger_count": "passengers",
        "Fare_Amt": "cost",
        "fare_amount": "cost",
        "Trip_Distance": "distance",
        "trip_distance": "distance"
    })
    .keep_columns(columns=useful_columns))
yellow_df.head(5)

In [None]:
combined_df = green_df.append_rows([yellow_df])

In [None]:
decimal_type = dprep.TypeConverter(data_type=dprep.FieldType.DECIMAL)
combined_df = combined_df.set_column_types(type_conversions={
    "pickup_longitude": decimal_type,
    "pickup_latitude": decimal_type,
    "dropoff_longitude": decimal_type,
    "dropoff_latitude": decimal_type
})
combined_df.keep_columns(columns=[
    "pickup_longitude", "pickup_latitude",
    "dropoff_longitude", "dropoff_latitude"
]).get_profile()

In [None]:
latlong_filtered_df = (combined_df
    .drop_nulls(
        columns=["pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude"],
        column_relationship=dprep.ColumnRelationship(dprep.ColumnRelationship.ANY)
    )
    .filter(dprep.f_and(
        dprep.col("pickup_longitude") <= -73.72,
        dprep.col("pickup_longitude") >= -74.09,
        dprep.col("pickup_latitude") <= 40.88,
        dprep.col("pickup_latitude") >= 40.53,
        dprep.col("dropoff_longitude") <= -73.72,
        dprep.col("dropoff_longitude") >= -74.09,
        dprep.col("dropoff_latitude") <= 40.88,
        dprep.col("dropoff_latitude") >= 40.53
    )))
latlong_filtered_df.keep_columns(columns=[
    "pickup_longitude", "pickup_latitude",
    "dropoff_longitude", "dropoff_latitude"
]).get_profile()

In [None]:
latlong_filtered_df.keep_columns(columns='store_forward').get_profile()

In [None]:
replaced_stfor_vals_df = latlong_filtered_df.replace(columns="store_forward", find="0", replace_with="N").fill_nulls("store_forward", "N")

In [None]:
replaced_distance_vals_df = replaced_stfor_vals_df.replace(columns="distance", find=".00", replace_with=0).fill_nulls("distance", 0)
replaced_distance_vals_df = replaced_distance_vals_df.to_number(["distance"])

In [None]:
time_split_df = (replaced_distance_vals_df
    .split_column_by_example(source_column="pickup_datetime")
    .split_column_by_example(source_column="dropoff_datetime"))
time_split_df.head(5)

In [None]:
renamed_col_df = (time_split_df
    .rename_columns(column_pairs={
        "pickup_datetime_1": "pickup_date",
        "pickup_datetime_2": "pickup_time",
        "dropoff_datetime_1": "dropoff_date",
        "dropoff_datetime_2": "dropoff_time"
    }))
renamed_col_df.head(5)

In [None]:
renamed_col_df.get_profile()

In [None]:
transformed_features_df = (renamed_col_df
    .derive_column_by_example(
        source_columns="pickup_date",
        new_column_name="pickup_weekday",
        example_data=[("2009-01-04", "Sunday"), ("2013-08-22", "Thursday")]
    )
    .derive_column_by_example(
        source_columns="dropoff_date",
        new_column_name="dropoff_weekday",
        example_data=[("2013-08-22", "Thursday"), ("2013-11-03", "Sunday")]
    )

    .split_column_by_example(source_column="pickup_time")
    .split_column_by_example(source_column="dropoff_time")
    # The following two calls to split_column_by_example reference the column names generated from the previous two calls.
    .split_column_by_example(source_column="pickup_time_1")
    .split_column_by_example(source_column="dropoff_time_1")
    .drop_columns(columns=[
        "pickup_date", "pickup_time", "dropoff_date", "dropoff_time",
        "pickup_date_1", "dropoff_date_1", "pickup_time_1", "dropoff_time_1"
    ])

    .rename_columns(column_pairs={
        "pickup_date_2": "pickup_month",
        "pickup_date_3": "pickup_monthday",
        "pickup_time_1_1": "pickup_hour",
        "pickup_time_1_2": "pickup_minute",
        "pickup_time_2": "pickup_second",
        "dropoff_date_2": "dropoff_month",
        "dropoff_date_3": "dropoff_monthday",
        "dropoff_time_1_1": "dropoff_hour",
        "dropoff_time_1_2": "dropoff_minute",
        "dropoff_time_2": "dropoff_second"
    }))

transformed_features_df.head(5)

In [None]:
processed_df = transformed_features_df.drop_columns(columns=["pickup_datetime", "dropoff_datetime"])

In [None]:
type_infer = processed_df.builders.set_column_types()
type_infer.learn()
type_infer

In [None]:
type_converted_df = type_infer.to_dataflow()
type_converted_df.get_profile()

In [None]:
final_df = type_converted_df.filter(dprep.col("distance") > 0)
final_df = final_df.filter(dprep.col("cost") > 0)

In [2]:
import os

file_path = os.path.join(os.getcwd(), "./automated-ml-regression/nyctaxi.csv")
final_df.to_pandas_dataframe().to_csv(file_path)

NameError: name 'final_df' is not defined

In [3]:
import azureml.core
import pandas as pd
from azureml.core import Workspace, Datastore
import logging

In [4]:
subscription_id = os.getenv("SUBSCRIPTION_ID", default="da21a094-26a3-472f-991b-e2b11979af40")
resource_group = os.getenv("RESOURCE_GROUP", default="mlservices")
workspace_name = os.getenv("WORKSPACE_NAME", default="autoML")
from azureml.core import Workspace

try:
    ws = Workspace(subscription_id = subscription_id, resource_group = resource_group, workspace_name = workspace_name)
    # write the details of the workspace to a configuration file to the notebook library
    ws.write_config()
    print("Workspace configuration succeeded.")
except:
    print("Workspace not accessible. Change your parameters or create a new workspace.")

If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


Performing interactive authentication. Please follow the instructions on the terminal.
To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code F93FEWP7W to authenticate.
Interactive authentication successfully completed.
Wrote the config file config.json to: /home/nbuser/library/aml_config/config.json
Workspace configuration succeeded.


In [5]:
ws = Workspace.from_config()
# choose a name for the run history container in the workspace
experiment_name = 'automated-ml-regression'
# project folder
project_folder = './automated-ml-regression'

output = {}
output['SDK version'] = azureml.core.VERSION
output['Subscription ID'] = ws.subscription_id
output['Workspace'] = ws.name
output['Resource Group'] = ws.resource_group
output['Location'] = ws.location
output['Project Directory'] = project_folder
pd.set_option('display.max_colwidth', -1)
pd.DataFrame(data=output, index=['']).T

Found the config file in: /home/nbuser/library/aml_config/config.json


Unnamed: 0,Unnamed: 1
Location,westeurope
Project Directory,./automated-ml-regression
Resource Group,mlservices
SDK version,1.0.17
Subscription ID,da21a094-26a3-472f-991b-e2b11979af40
Workspace,autoML


In [10]:
%%writefile automated-ml-regression/train.py
import azureml.dataprep as dprep
from azureml.train.automl import AutoMLConfig

final_df = dprep.auto_read_file('nyctaxi.csv')
from sklearn.model_selection import train_test_split
from numpy import array

dflow_X = final_df.keep_columns(['pickup_weekday','pickup_hour', 'distance','passengers', 'vendor'])
dflow_y = final_df.keep_columns('cost')
X_df = dflow_X.to_pandas_dataframe()
y_df = dflow_y.to_pandas_dataframe()

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.15, random_state=223)

automl_settings = {
    "iteration_timeout_minutes" : 10,
    "iterations" : 30,
    "primary_metric" : 'normalized_root_mean_squared_error',
    "preprocess" : True,
    "verbosity" : logging.INFO,
    "n_cross_validations": 10
}

automated_ml_config = AutoMLConfig(task = 'regression',
                             debug_log = 'automated_ml_errors.log',
                             path = './automated-ml-regression',
                             X = X_train,
                             y = y_train.values.flatten(),
                             **automl_settings)


Overwriting automated-ml-regression/train.py


In [6]:
compute_targets = ws.compute_targets
for name, ct in compute_targets.items():
    print(name, ct.type, ct.provisioning_state)

cpucluster AmlCompute Succeeded


In [8]:
from azureml.train.estimator import Estimator
compute_target = ws.compute_targets['cpucluster']

exec_environment = Estimator(source_directory='./automated-ml-regression',
                             entry_script='train.py',              
                             compute_target=compute_target,
                             pip_packages=['scikit-learn','azureml-dataprep','azureml-train-automl'])

In [19]:
from azureml.core.run import Run
from azureml.core.experiment import Experiment

experiment = Experiment(workspace=ws, name='NYCTaxi-regression')
run = experiment.submit(exec_environment, show_output=True)
run.wait_for_completion(show_output = True)

RunId: NYCTaxi-regression_1555255586_5785527e

Streaming azureml-logs/60_control_log.txt

Streaming log file azureml-logs/60_control_log.txt
Streaming log file azureml-logs/80_driver_log.txt

Streaming azureml-logs/80_driver_log.txt



The experiment completed successfully. Finalizing run...
Logging experiment finalizing status in history service
Cleaning up all outstanding Run operations, waiting 300.0 seconds
1 items cleaning up...
Cleanup took 0.25090980529785156 seconds

Execution Summary
RunId: NYCTaxi-regression_1555255586_5785527e



{'runId': 'NYCTaxi-regression_1555255586_5785527e',
 'target': 'cpucluster',
 'status': 'Completed',
 'startTimeUtc': '2019-04-14T15:26:47.121252Z',
 'endTimeUtc': '2019-04-14T15:27:09.625548Z',
 'properties': {'azureml.runsource': 'experiment',
  'AzureML.DerivedImageName': 'azureml/azureml_0d3a9aba02af87953ef06be41b165f2a',
  'ContentSnapshotId': 'c2800697-b826-4f51-9613-42b2e884c591'},
 'runDefinition': {'script': 'train.py',
  'arguments': [],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'cpucluster',
  'dataReferences': {},
  'jobName': None,
  'autoPrepareEnvironment': True,
  'maxRunDurationSeconds': None,
  'nodeCount': 1,
  'environment': {'name': 'Experiment NYCTaxi-regression Environment',
   'version': 'auto_4f7e7cbf11c44920b95b52321644befd',
   'python': {'interpreterPath': 'python',
    'userManagedDependencies': False,
    'condaDependencies': {'name': 'project_environment',
     'dependencies': ['python=3.6.2',
     

In [20]:
from azureml.widgets import RunDetails
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [16]:
print(run.get_portal_url())

https://mlworkspace.azure.ai/portal/subscriptions/da21a094-26a3-472f-991b-e2b11979af40/resourceGroups/mlservices/providers/Microsoft.MachineLearningServices/workspaces/autoML/experiments/NYCTaxi-regression/runs/NYCTaxi-regression_1555254100_506d2a71


In [18]:
from azureml.widgets import RunDetails
from azureml.core.run import Run

experiment = Experiment (ws, 'NYCTaxi-regression')
run_id = 'NYCTaxi-regression_1555254100_506d2a71' #replace with run_ID
run = Run(experiment, run_id)
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [None]:
ds = ws.get_default_datastore()
print(ds.datastore_type, ds.account_name, ds.container_name)

In [None]:
import os
script_folder  = os.path.join(os.getcwd(), "NYCTaxi")
os.makedirs(script_folder, exist_ok=True)

In [None]:
import azureml.data
from azureml.data.azure_storage_datastore import AzureFileDatastore, AzureBlobDatastore

ds.upload(src_dir='data',
          target_path='Data',
          overwrite=True,
          show_progress=True)

In [12]:
import azureml.dataprep as dprep
final_df = dprep.auto_read_file('./automated-ml-regression/nyctaxi.csv')
final_df.get_profile()

Unnamed: 0,Type,Min,Max,Count,Missing Count,Not Missing Count,Percent missing,Error Count,Empty count,0.1% Quantile,1% Quantile,5% Quantile,25% Quantile,50% Quantile,75% Quantile,95% Quantile,99% Quantile,99.9% Quantile,Mean,Standard Deviation,Variance,Skewness,Kurtosis
Column1,FieldType.DECIMAL,0,6147,6148.0,0.0,6148.0,0.0,0.0,0.0,5.648,614.3,600.0,1536.5,3073.5,4610.5,5840.1,6086.02,6141.35,3073.5,1774.92,3150340.0,0.0,-1.20059
Column1_2,FieldType.DECIMAL,0,6147,6148.0,0.0,6148.0,0.0,0.0,0.0,5.648,614.3,600.0,1536.5,3073.5,4610.5,5840.1,6086.02,6141.35,3073.5,1774.92,3150340.0,0.0,-1.20059
Column1_1,FieldType.DECIMAL,0,6147,6148.0,0.0,6148.0,0.0,0.0,0.0,5.648,614.3,600.0,1536.5,3073.5,4610.5,5840.1,6086.02,6141.35,3073.5,1774.92,3150340.0,0.0,-1.20059
vendor,FieldType.DECIMAL,1,2,6148.0,98.0,6050.0,0.01594,0.0,0.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.95025,0.21745,0.0472846,-4.14047,15.146
pickup_weekday,FieldType.STRING,Friday,Wednesday,6148.0,0.0,6148.0,0.0,0.0,0.0,,,,,,,,,,,,,,
pickup_hour,FieldType.DECIMAL,0,23,6148.0,0.0,6148.0,0.0,0.0,0.0,0.0,3.0,3.0,9.86524,16.0,19.4021,22.8728,23.0,23.0,14.2731,6.59242,43.46,-0.693723,-0.570403
pickup_minute,FieldType.DECIMAL,0,59,6148.0,0.0,6148.0,0.0,0.0,0.0,0.0,5.15228,5.0,13.9968,29.3438,44.8825,56.6344,59.0,59.0,29.427,17.4333,303.921,0.0120999,-1.20981
pickup_second,FieldType.DECIMAL,0,59,6148.0,0.0,6148.0,0.0,0.0,0.0,0.0,5.43206,5.0,14.8605,29.9765,44.9634,56.9584,59.0,59.0,29.7443,17.3595,301.351,-0.0252399,-1.19616
dropoff_weekday,FieldType.STRING,Friday,Wednesday,6148.0,0.0,6148.0,0.0,0.0,0.0,,,,,,,,,,,,,,
dropoff_hour,FieldType.DECIMAL,0,23,6148.0,0.0,6148.0,0.0,0.0,0.0,0.0,2.49493,2.0,9.65682,16.0,19.75,22.9283,23.0,23.0,14.2105,6.71093,45.0365,-0.687292,-0.61951


In [None]:
from sklearn.model_selection import train_test_split
from numpy import array

dflow_X = final_df.keep_columns(['pickup_weekday','pickup_hour', 'distance','passengers', 'vendor'])
dflow_y = final_df.keep_columns('cost')
X_df = dflow_X.to_pandas_dataframe()
y_df = dflow_y.to_pandas_dataframe()

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.15, random_state=223)
# flatten y_train to 1d array
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

In [None]:
automl_settings = {
    "iteration_timeout_minutes" : 10,
    "iterations" : 30,
    "primary_metric" : 'normalized_root_mean_squared_error',
    "preprocess" : True,
    "verbosity" : logging.INFO,
    "n_cross_validations": 10
}

In [None]:
from azureml.train.automl import AutoMLConfig

# local compute
automated_ml_config = AutoMLConfig(task = 'regression',
                             debug_log = 'automated_ml_errors.log',
                             path = project_folder,
                             X = X_train,
                             y = y_train.values.flatten(),
                             **automl_settings)

In [None]:
from azureml.core.experiment import Experiment
experiment=Experiment(ws, experiment_name)
local_run = experiment.submit(automated_ml_config, show_output=True)

In [None]:
ds.as_mount()
ds.path('./data').as_download()