In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression  # this is your ALGORITHM
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import root_mean_squared_error

import mlflow
# MLflow configuration: which tracking server to talk to and which experiment
# subsequent runs are recorded under.
MLFLOW_TRACKING_URI = 'http://localhost:5000'
MLFLOW_EXPERIMENT_NAME = 'taxi'

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

# --- Load data -------------------------------------------------------------
# CAUTION: read_pickle can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
df = pd.read_pickle('../tripdata.pkl')

# --- Target ----------------------------------------------------------------
# Trip duration in minutes, derived from the pickup/dropoff timestamps.
df['duration'] = (
    df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
).dt.total_seconds() / 60

# --- Column pruning --------------------------------------------------------
# Columns that are not used as features or target below (the two timestamp
# columns are no longer needed once `duration` exists).
unused_columns = [
    'VendorID', 'store_and_fwd_flag', 'RatecodeID', 'passenger_count',
    'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tolls_amount',
    'improvement_surcharge', 'congestion_surcharge', 'tip_amount', 'ehail_fee',
    'total_amount', 'trip_type', 'lpep_pickup_datetime', 'lpep_dropoff_datetime',
]
df = df.drop(columns=unused_columns)

# --- Row filtering ---------------------------------------------------------
# Keep trips with a duration strictly between 1 and 60 minutes.
# `.copy()` detaches the filtered frame from the original so the column
# assignments below operate on an independent DataFrame and do not raise
# SettingWithCopyWarning.
df = df[(df.duration > 1) & (df.duration < 60)].copy()

# --- Features --------------------------------------------------------------
# Location IDs are cast to strings so DictVectorizer one-hot encodes them
# instead of treating them as numeric values.
categorical = ['PULocationID', 'DOLocationID']
df[categorical] = df[categorical].astype(str)

# Combined pickup/dropoff feature; not part of this first feature set but
# available on `df` for later cells.
df['pu_do'] = df['PULocationID'] + '_' + df['DOLocationID']

numerical = ['trip_distance']
train_dicts = df[categorical + numerical].to_dict(orient='records')

# --- Vectorize -------------------------------------------------------------
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
y_train = df['duration'].to_numpy()

# Train a Lasso model and record parameters, metrics and the fitted model in
# a single MLflow run. The `with` block closes the run when it exits.
with mlflow.start_run():
    mlflow.set_tag("developer", "Hank Hill")
    # NOTE(review): the data-loading code reads '../tripdata.pkl', while the
    # value logged here omits the '../' prefix — confirm this is intended.
    mlflow.log_param("train-data-path", "tripdata.pkl")

    alpha = 1
    mlflow.log_param("alpha", alpha)

    model = Lasso(alpha=alpha)
    model.fit(X_train, y_train)

    # Both metrics below are computed on the training data itself, so they
    # measure fit quality, not generalization.
    # The score was previously computed and thrown away (a bare expression
    # inside a `with` block is never displayed); keep it and log it instead.
    r2 = model.score(X_train, y_train)
    mlflow.log_metric("r2", r2)

    y_pred = model.predict(X_train)
    rmse = root_mean_squared_error(y_train, y_pred)
    mlflow.log_metric("rmse", rmse)

    mlflow.sklearn.log_model(model, "model")


In [None]:
# Ridge regression on the current training matrix; `fit` returns the
# estimator itself, so construction and fitting are chained.
lr = Ridge(alpha=0.1).fit(X_train, y_train)

# Score on the training data (displayed as the cell output).
lr.score(X_train, y_train)

In [None]:


# Switch the feature set to the combined pickup/dropoff column.
# NOTE: this cell rebinds `categorical`, refits the shared `dv` and overwrites
# `X_train`; any cell executed after it (including earlier cells that are
# re-run) sees the new feature matrix.
categorical = ['pu_do']
feature_columns = categorical + numerical
train_dicts = df[feature_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

# Ordinary least squares on the new feature matrix.
lr = LinearRegression().fit(X_train, y_train)

# Score on the training data (displayed as the cell output).
lr.score(X_train, y_train)



In [None]:

# Lasso with regularization strength 0.1 on whatever `X_train` currently
# holds; the first positional argument of Lasso is `alpha`, spelled out here.
lr = Lasso(alpha=0.1).fit(X_train, y_train)

# Score on the training data (displayed as the cell output).
lr.score(X_train, y_train)