In [0]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
# from sklearn.preprocessing import Imputer
import mlflow

In [0]:
# Select the MLflow experiment that the runs in this notebook are logged to.
# A non-default tracking store would have to be configured before this point,
# e.g.:
#   mlflow.set_tracking_uri("mysql+pymysql://<user>:<password>@localhost:3306/mlflow")
experiment_name = '/Users/muhammad.feroze@confiz.com/mlflow-test/boston-housing-experiment'
mlflow.set_experiment(experiment_name)

# Look the experiment up through the client API to obtain its id, which the
# training cells pass to mlflow.start_run().
client = mlflow.tracking.MlflowClient()
_experiment = client.get_experiment_by_name(experiment_name)
experiment_id = _experiment.experiment_id

In [0]:
# Location and format of the training data.
file_location = "/FileStore/tables/train-4.csv"
file_type = "csv"

# CSV reader settings, passed to Spark as strings.  They apply to CSV input;
# for other file types they are ignored.
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# `spark` is not defined in the visible cells; presumably it is the
# SparkSession supplied by the notebook runtime.
# NOTE(review): no `nullValue` option is set, so a textual marker such as "NA"
# would be read as an ordinary string rather than as missing — confirm against
# the data file.
spark_df = (
    spark.read
    .format(file_type)
    .options(inferSchema=infer_schema, header=first_row_is_header, sep=delimiter)
    .load(file_location)
)

# Collect the Spark DataFrame into pandas for the scikit-learn code that follows.
df = spark_df.toPandas()

In [0]:
# Inspect the column dtypes produced by the load step.  Left as the cell's
# last expression so the notebook renders it as the cell result, instead of
# routing it through print().
df.dtypes

In [0]:
# Minimum fraction of rows in which a feature must be non-null to be kept.
MIN_NON_NULL_FRACTION = 0.8

# Every column except the last is a candidate feature; the last is the target.
features = list(df.columns[:-1])
x = df[features]
y = df[df.columns[-1]]

# Drop sparsely populated features.  DataFrame.dropna documents `thresh` as an
# int, so the row count is rounded up; for integer non-null counts this keeps
# exactly the columns the float comparison kept.
thresh = int(np.ceil(len(x) * MIN_NON_NULL_FRACTION))
x = x.dropna(axis=1, thresh=thresh)

# Split the features by dtype.  'number' matches every integer/float width, so
# int64 and float64 columns are no longer silently discarded the way an
# ['int32', 'float32'] filter discards them.
x_nominal = x.select_dtypes(include=['object'])
x_numeric = x.select_dtypes(include=['number'])

# 'Id' (presumably a row identifier — confirm) is excluded from the numeric
# features.  errors='ignore' keeps the cell runnable when the column is absent
# or was not read as a numeric column.
x_numeric = x_numeric.drop(columns=['Id'], errors='ignore')

# Impute missing values: the most frequent value for nominal columns (row 0 of
# mode()), the column mean for numeric ones.
# NOTE(review): these statistics are computed on the full dataset, i.e. before
# the train/test split — confirm that is acceptable for the evaluation.
x_filled_nominal = x_nominal.fillna(x_nominal.mode().loc[0])
x_filled_numeric = x_numeric.fillna(x_numeric.mean())

In [0]:
# Integer-encode each nominal column on its own: an encoder is fitted per
# column, so the resulting codes are only meaningful within that column.
# The fitted encoders are not kept.
x_encoded_nominal = x_filled_nominal.apply(
    lambda column: LabelEncoder().fit_transform(column)
)

In [0]:
# Build the model matrix: encoded nominal columns first, followed by the
# imputed numeric columns, placed side by side and aligned on the row index.
x_final = pd.concat(
    [x_encoded_nominal, x_filled_numeric],
    axis="columns",
)

In [0]:
# Hold out 25% of the rows for evaluation.  The rows are shuffled first, and
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x_final,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

In [0]:
# Fit a LinearRegression model and record it as one MLflow run: the run logs
# the test-set RMSE and MSE and stores the fitted model under "lg_model".
with mlflow.start_run(experiment_id=experiment_id) as run:
    # run_id is read again by the model-registration cell.  RunInfo.run_uuid
    # is deprecated in favour of RunInfo.run_id.
    run_id = run.info.run_id
    print("run_id:",run_id)

    lm = LinearRegression()
    lm.fit(X_train.values, y_train.values)
    # Predict on a plain array too: the model was fitted without feature
    # names, and passing the DataFrame makes scikit-learn warn about the
    # mismatch.
    y_pred = lm.predict(X_test.values)

    # The `squared` argument of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6, so compute MSE once and derive
    # RMSE from it.
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    mlflow.log_metric('rmse', rmse)
    mlflow.log_metric('mse', mse)
    mlflow.sklearn.log_model(lm, "lg_model")

In [0]:
# Register the "lg_model" artifact of the run identified by `run_id` in the
# model registry.  `run_id` must still refer to the run that logged
# "lg_model": the Lasso training cell reassigns it, so run this cell before
# that one.
model_name = "test_linear_regression"
model_version = mlflow.register_model(
    model_uri="runs:/{}/lg_model".format(run_id),
    name=model_name,
)

In [0]:
from sklearn import linear_model

# Regularisation strength passed to Lasso and logged as a run parameter.
alpha = 0.01

# Fit a Lasso model and record it as one MLflow run: the run logs alpha, the
# test-set RMSE and MSE, and stores the fitted model under "model".
with mlflow.start_run(experiment_id=experiment_id) as run:
    # Note: this overwrites the run_id set by the linear-regression cell.
    # RunInfo.run_uuid is deprecated in favour of RunInfo.run_id.
    run_id = run.info.run_id
    print("run_id:",run_id)

    mlflow.log_param('alpha', alpha)
    reg = linear_model.Lasso(alpha=alpha)
    reg.fit(X_train.values, y_train.values)
    # Predict on a plain array too: the model was fitted without feature
    # names, and passing the DataFrame makes scikit-learn warn about the
    # mismatch.
    y_pred = reg.predict(X_test.values)

    # The `squared` argument of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6, so compute MSE once and derive
    # RMSE from it.
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    mlflow.log_metric('rmse', rmse)
    mlflow.log_metric('mse', mse)
    mlflow.sklearn.log_model(reg, "model")

In [0]:
# Display the RMSE currently held in the kernel, i.e. the value assigned by
# whichever training cell ran last.
rmse