In [None]:
import pandas as pd
import pickle
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import mlflow


StatementMeta(, 6d697a1d-39da-425e-bb44-e5b6702e9437, 4, Finished, Available, Finished)

In [None]:
# Load at most 10,000 rows of the 2017 green-taxi trip table as the training
# sample, then render it for a quick visual check.
train_query = "SELECT * FROM silver_lakehouse.green_tripdata_2017 LIMIT 10000"
train_taxi_df = spark.sql(train_query)
display(train_taxi_df)

StatementMeta(, 6d697a1d-39da-425e-bb44-e5b6702e9437, 5, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, dbcd60fa-52d3-4893-95b7-6c162b0a8c76)

In [None]:
# Load at most 10,000 rows of the 2018 green-taxi trip table as the evaluation
# sample, then render it for a quick visual check.
eval_query = "SELECT * FROM silver_lakehouse.green_tripdata_2018 LIMIT 10000"
eval_taxi_df = spark.sql(eval_query)
display(eval_taxi_df)

StatementMeta(, 6d697a1d-39da-425e-bb44-e5b6702e9437, 6, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, b28df12a-0fe6-4cb0-91b9-734848846e17)

In [None]:
# Convert both Spark DataFrames to pandas DataFrames (training first, then
# evaluation) so the pandas-based steps that follow can use them.
train_df, eval_df = (
    spark_frame.toPandas() for spark_frame in (train_taxi_df, eval_taxi_df)
)

StatementMeta(, 6d697a1d-39da-425e-bb44-e5b6702e9437, 7, Finished, Available, Finished)

In [None]:
# Function to preprocess trip data
def prep_dataframe(df):
    """Add a trip-duration column and keep only trips lasting 1 to 60 minutes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``lpep_pickup_datetime`` and ``lpep_dropoff_datetime``
        (datetime columns, so that their difference supports ``.dt``) as well
        as ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the rows whose duration satisfies
        ``1 <= duration <= 60`` (minutes), with an added ``duration`` column
        and both location ID columns cast to the ``category`` dtype. The
        original row index is preserved. The input frame is not modified.
    """
    duration_minutes = (
        df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    ).dt.total_seconds() / 60

    # Inclusive on both ends, matching (duration >= 1) & (duration <= 60).
    in_range = duration_minutes.between(1, 60)

    # Work on an explicit copy of the filtered rows: assigning columns on a
    # filtered slice raises SettingWithCopyWarning, and writing `duration`
    # into `df` itself would silently alter the caller's frame.
    result = df.loc[in_range].copy()
    result['duration'] = duration_minutes[in_range]
    result['PULocationID'] = result['PULocationID'].astype('category')
    result['DOLocationID'] = result['DOLocationID'].astype('category')
    return result

StatementMeta(, 6d697a1d-39da-425e-bb44-e5b6702e9437, 8, Finished, Available, Finished)

In [None]:
# Run the shared preprocessing over the training frame first, then the
# evaluation frame, so both go through exactly the same steps.
final_train_df, final_eval_df = (
    prep_dataframe(frame) for frame in (train_df, eval_df)
)

StatementMeta(, 6d697a1d-39da-425e-bb44-e5b6702e9437, 9, Finished, Available, Finished)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['PULocationID'] = df['PULocationID'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DOLocationID'] = df['DOLocationID'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['PULocationID'] = df['PULocationID'].astype('category')
A value is trying to be set 

In [None]:
# Modelling configuration: the label column and the feature columns used by
# the cells that follow.
categorical_columns = ['PULocationID', 'DOLocationID']
target = 'duration'

# Make 'nyc-taxi-experiment' the active MLflow experiment for subsequent runs.
experiment_name = 'nyc-taxi-experiment'
mlflow.set_experiment(experiment_name)

StatementMeta(, 6d697a1d-39da-425e-bb44-e5b6702e9437, 10, Finished, Available, Finished)

In [None]:
# Summary statistics of the duration column in the preprocessed training frame.
final_train_df['duration'].describe()

StatementMeta(, 6d697a1d-39da-425e-bb44-e5b6702e9437, 11, Finished, Available, Finished)

count    8817.000000
mean       30.107219
std        14.169795
min         1.000000
25%        18.200000
50%        28.750000
75%        41.233333
max        59.983333
Name: duration, dtype: float64

In [None]:
# Vectorize categorical features
# DictVectorizer one-hot encodes a feature only when its values are strings;
# numeric values are passed through unchanged as a single numeric column.
# The location IDs are therefore cast to str first, so that each ID becomes
# its own indicator column instead of being treated as a magnitude.
train_records = final_train_df[categorical_columns].astype(str).to_dict(orient='records')
eval_records = final_eval_df[categorical_columns].astype(str).to_dict(orient='records')

dv = DictVectorizer(sparse=False)
# Fit the vocabulary on the training records only; IDs that appear only in the
# evaluation records are ignored by transform().
X_train = dv.fit_transform(train_records)
X_val = dv.transform(eval_records)

y_train = final_train_df[target].values
y_val = final_eval_df[target].values


StatementMeta(, 6d697a1d-39da-425e-bb44-e5b6702e9437, 12, Finished, Available, Finished)

In [None]:
# Inspect the training feature matrix.
display(X_train)

StatementMeta(, 6d697a1d-39da-425e-bb44-e5b6702e9437, 13, Finished, Available, Finished)

array([[226.,  74.],
       [138.,  74.],
       [239., 223.],
       ...,
       [ 90., 226.],
       [247.,  82.],
       [260., 244.]])

In [None]:
# Check which container type holds the training feature matrix.
type(X_train)

StatementMeta(, 6d697a1d-39da-425e-bb44-e5b6702e9437, 15, Finished, Available, Finished)

numpy.ndarray

In [None]:
# Dimensions of the training feature matrix.
X_train.shape

StatementMeta(, 6d697a1d-39da-425e-bb44-e5b6702e9437, 16, Finished, Available, Finished)

(8817, 2)

In [None]:
# Report how many feature columns the validation matrix has.
print("Dimensionality (number of columns):", X_val.shape[1])

StatementMeta(, 6d697a1d-39da-425e-bb44-e5b6702e9437, -1, Cancelled, , Cancelled)

In [None]:
from mlflow.models.signature import infer_signature

# Train a linear-regression baseline inside an MLflow run, evaluate it on the
# validation set, then log and register the fitted model.
with mlflow.start_run() as run:
    lr = LinearRegression()
    lr.fit(X_train, y_train)

    # Infer the signature from the model's inputs and its actual predictions,
    # so the recorded output schema reflects what the model returns.
    signature = infer_signature(X_train, lr.predict(X_train))

    # `squared=False` is deprecated/removed in recent scikit-learn releases;
    # taking the square root of the MSE gives the same RMSE on every version.
    lr_rmse = mean_squared_error(y_val, lr.predict(X_val)) ** 0.5
    print("Linear Regression RMSE:", lr_rmse)
    # Record the validation metric on the run so it is stored with the model.
    mlflow.log_metric("val_rmse", float(lr_rmse))

    print("log_model.")
    mlflow.sklearn.log_model(lr, "sklearn-model", signature=signature)
    print("Model saved in run_id=%s" % run.info.run_id)

    print("register_model.")
    mlflow.register_model(
        "runs:/{}/sklearn-model".format(run.info.run_id), "nyctaxi-sklearn"
    )

StatementMeta(, 6d697a1d-39da-425e-bb44-e5b6702e9437, -1, Cancelled, , Cancelled)