## Machine Learning (ML) Pipelines

In [2]:
%sh

pip install mlflow



In [4]:

# Import libraries

from __future__ import print_function

from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.linear_model import LinearRegression,BayesianRidge,Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from statsmodels.api import OLS
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
import numpy as np
import pandas as pd


In [6]:
# File location and type
file_location = "/FileStore/tables/all_vars_for_zeroinf_analysis.csv"
file_type = "csv"

# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

## import all_vars_for_zeroinf_analysis  file 
# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

In [7]:
new_column_name_list= list(map(lambda x: x.replace(".", "_"), df.columns))

df = df.toDF(*new_column_name_list)

In [8]:
display(df)

county,confirmed_cases,confirmed_deaths,state,length_of_lockdown,cases,deaths,POP_ESTIMATE_2018,total_state_pop,Active_Physicians_per_100_000_Population__2018__AAMC_,Total_Active_Patient_Care_Physicians_per_100_000_Population__2018__AAMC_,Active_Primary_Care_Physicians_per_100_000_Population__2018__AAMC_,Active_Patient_Care_Primary_Care_Physicians_per_100_000_Population__2018__AAMC_,Active_General_Surgeons_per_100_000_Population__2018__AAMC_,Active_Patient_Care_General_Surgeons_per_100_000_Population__2018__AAMC_,Percentage_of_Active_Physicians_Who_Are_Female__2018__AAMC_,Percentage_of_Active_Physicians_Who_Are_International_Medical_Graduates__IMGs___2018__AAMC_,Percentage_of_Active_Physicians_Who_Are_Age_60_or_Older__2018__AAMC_,MD_and_DO_Student_Enrollment_per_100_000_Population__AY_2018_2019__AAMC_,Student_Enrollment_at_Public_MD_and_DO_Schools_per_100_000_Population__AY_2018_2019__AAMC_,Percentage_Change_in_Student_Enrollment_at_MD_and_DO_Schools__2008_2018__AAMC_,Percentage_of_MD_Students_Matriculating_In_State__AY_2018_2019__AAMC_,Total_Residents_Fellows_in_ACGME_Programs_per_100_000_Population_as_of_December_31__2018__AAMC_,Total_Residents_Fellows_in_Primary_Care_ACGME_Programs_per_100_000_Population_as_of_Dec__31__2018__AAMC_,Percentage_of_Residents_in_ACGME_Programs_Who_Are_IMGs_as_of_December_31__2018__AAMC_,Ratio_of_Residents_and_Fellows__GME__to_Medical_Students__UME___AY_2017_2018__AAMC_,Percent_Change_in_Residents_and_Fellows_in_ACGME_Accredited_Programs__2008_2018__AAMC_,Percentage_of_Physicians_Retained_in_State_from_Undergraduate_Medical_Education__UME___2018__AAMC_,All_Specialties__AAMC_,State_Local_Government_hospital_beds_per_1000_people__2019_,Non_profit_hospital_beds_per_1000_people__2019_,For_profit_hospital_beds_per_1000_people__2019_,Total_hospital_beds_per_1000_people__2019_,Total_nurse_practitioners__2019_,Total_physician_assistants__2019_,Total_Hospitals__2019_,Total_Primary_Care_Physicians__2019_,Surgery_specialists__2019_,Emergency_Medicine_specialists__2019_,Total_Specialist_Physicians__2019_,ICU_Beds,pop_fraction,Length_of_Life_rank,Quality_of_Life_rank,Health_Behaviors_rank,Clinical_Care_rank,Social___Economic_Factors_rank,Physical_Environment_rank,Adult_smoking_percentage,Adult_obesity_percentage,Excessive_drinking_percentage,Population_per_sq_mile,House_per_sq_mile,Share_of_Tests_with_Positive_COVID_19_Results,Number_of_Tests_with_Results_per_1_000_Population
Autauga,93,4,Alabama,26,1.672631787,0.071941152,55601,9775742,217.1,196.7,77.2,71.2,7.6,6.9,28.5,18.1,34.4,49.1,22.8,136.8,82.1,29.9,11.9,21.9,0.6,18.4,48.4,2.17149757,1.4,0.8,0.9,3.1,0.519039885,0.109454607,0.020663393,1.199499741,0.15425939,0.095133444,1.297497418,0.107911728,0.00568765,5.0,11.0,9.0,13.0,7.0,53.0,18,33,15,91.8,37.2,0.077853,27.8
Baldwin,231,7,Alabama,26,1.059526103,0.032106852,218022,9775742,217.1,196.7,77.2,71.2,7.6,6.9,28.5,18.1,34.4,49.1,22.8,136.8,82.1,29.9,11.9,21.9,0.6,18.4,48.4,2.17149757,1.4,0.8,0.9,3.1,0.519039885,0.109454607,0.020663393,1.199499741,0.15425939,0.095133444,1.297497418,0.233921347,0.022302348,3.0,2.0,5.0,6.0,3.0,5.0,17,31,18,114.6,65.5,0.077853,27.8
Barbour,69,1,Alabama,26,2.773200434,0.040191311,24881,9775742,217.1,196.7,77.2,71.2,7.6,6.9,28.5,18.1,34.4,49.1,22.8,136.8,82.1,29.9,11.9,21.9,0.6,18.4,48.4,2.17149757,1.4,0.8,0.9,3.1,0.519039885,0.109454607,0.020663393,1.199499741,0.15425939,0.095133444,1.297497418,0.200956553,0.002545178,23.0,55.0,54.0,41.0,60.0,28.0,22,42,13,31.0,13.4,0.077853,27.8
Bibb,46,1,Alabama,26,2.053571429,0.044642857,22400,9775742,217.1,196.7,77.2,71.2,7.6,6.9,28.5,18.1,34.4,49.1,22.8,136.8,82.1,29.9,11.9,21.9,0.6,18.4,48.4,2.17149757,1.4,0.8,0.9,3.1,0.519039885,0.109454607,0.020663393,1.199499742,0.15425939,0.095133444,1.297497417,0.0,0.002291386,49.0,14.0,32.0,18.0,43.0,22.0,19,38,16,36.8,14.4,0.077853,27.8
Blount,45,0,Alabama,26,0.778008299,0.0,57840,9775742,217.1,196.7,77.2,71.2,7.6,6.9,28.5,18.1,34.4,49.1,22.8,136.8,82.1,29.9,11.9,21.9,0.6,18.4,48.4,2.171497571,1.4,0.8,0.9,3.1,0.519039885,0.109454607,0.020663393,1.199499741,0.15425939,0.095133444,1.297497418,0.10373444,0.005916686,42.0,13.0,15.0,45.0,16.0,49.0,19,34,14,88.9,37.0,0.077853,27.8
Bullock,28,1,Alabama,26,2.761885974,0.098638785,10138,9775742,217.1,196.7,77.2,71.2,7.6,6.9,28.5,18.1,34.4,49.1,22.8,136.8,82.1,29.9,11.9,21.9,0.6,18.4,48.4,2.171497571,1.4,0.8,0.9,3.1,0.519039885,0.109454607,0.020663393,1.199499742,0.15425939,0.095133444,1.297497418,0.0,0.001037057,45.0,63.0,63.0,58.0,66.0,19.0,23,37,12,17.5,7.2,0.077853,27.8
Butler,230,6,Alabama,26,11.68699187,0.304878049,19680,9775742,217.1,196.7,77.2,71.2,7.6,6.9,28.5,18.1,34.4,49.1,22.8,136.8,82.1,29.9,11.9,21.9,0.6,18.4,48.4,2.17149757,1.4,0.8,0.9,3.1,0.519039885,0.109454607,0.020663393,1.199499741,0.15425939,0.095133444,1.297497418,0.355691057,0.002013146,64.0,56.0,57.0,56.0,55.0,45.0,22,43,12,27.0,12.8,0.077853,27.8
Calhoun,127,3,Alabama,26,1.111334739,0.026252002,114277,9775742,217.1,196.7,77.2,71.2,7.6,6.9,28.5,18.1,34.4,49.1,22.8,136.8,82.1,29.9,11.9,21.9,0.6,18.4,48.4,2.17149757,1.4,0.8,0.9,3.1,0.519039885,0.109454607,0.020663393,1.199499741,0.15425939,0.095133444,1.297497418,0.210016014,0.011689854,51.0,28.0,39.0,40.0,33.0,37.0,21,39,14,195.7,88.0,0.077853,27.8
Chambers,324,22,Alabama,26,9.638554217,0.654469731,33615,9775742,217.1,196.7,77.2,71.2,7.6,6.9,28.5,18.1,34.4,49.1,22.8,136.8,82.1,29.9,11.9,21.9,0.6,18.4,48.4,2.17149757,1.4,0.8,0.9,3.1,0.519039885,0.109454607,0.020663393,1.199499741,0.15425939,0.095133444,1.297497418,0.0,0.003438614,38.0,47.0,37.0,35.0,36.0,40.0,19,40,13,57.4,28.5,0.077853,27.8
Cherokee,24,0,Alabama,26,0.921942225,0.0,26032,9775742,217.1,196.7,77.2,71.2,7.6,6.9,28.5,18.1,34.4,49.1,22.8,136.8,82.1,29.9,11.9,21.9,0.6,18.4,48.4,2.17149757,1.4,0.8,0.9,3.1,0.519039885,0.109454607,0.020663393,1.199499741,0.15425939,0.095133444,1.297497417,0.0,0.002662918,56.0,10.0,7.0,33.0,18.0,27.0,17,35,14,46.9,29.4,0.077853,27.8


## Preprocess data

So what do we need to do to get our data ready for Machine Learning?

Recall our goal: We want to learn to predict the count of bike rentals (the cnt column). We refer to the count as our target "label".

Features: What can we use as features (info describing each row) to predict the ... label? We can use the rest of the columns, with a few exceptions:

In [11]:
df.printSchema()

In [12]:
# The following call takes all columns (df.columns) and casts them using Spark SQL to a numeric type (DoubleType).
from pyspark.sql.functions import col  # for indicating a column using a string in the line below
df = df.select([col(c).cast("double").alias(c) for c in df.columns])
df.printSchema()

In [13]:
from pyspark.sql.functions import col

df = df.select( col('length_of_lockdown'), col('confirmed_cases'), col('confirmed_deaths'),col('POP_ESTIMATE_2018'), col('POP_ESTIMATE_2018'), col('ICU_Beds'), col('Adult_obesity_percentage'),col('Quality_of_Life_rank'), col('Excessive_drinking_percentage'),col('Population_per_sq_mile'), col('Clinical_Care_rank'), col('ICU_Beds'), col('Adult_smoking_percentage'), col('Total_Specialist_Physicians__2019_'), col('Physical_Environment_rank'), col('Number_of_Tests_with_Results_per_1_000_Population'))

In [14]:
#df = df.drop("confirmed_cases")

#display(df)

### Split data into training and test sets

Our final data preparation step will split our dataset into separate training and test sets. We can train and tune our model as much as we like on the training set, as long as we do not look at the test set. After we have a good model (based on the training set), we can validate it on the held-out test set in order to know with high confidence our well our model will make predictions on future (unseen) data.

In [16]:
# Split the dataset randomly into 70% for training and 30% for testing.
train, test = df.randomSplit([0.7, 0.3])
print("We have %d training examples and %d test examples." % (train.count(), test.count()))


## Train a Machine Learning Pipeline

In [18]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer
featuresCols = df.columns
featuresCols.remove('confirmed_cases')
# This concatenates all feature columns into a single feature vector in a new column "rawFeatures".
vectorAssembler = VectorAssembler(inputCols=featuresCols, outputCol="rawFeatures")
# This identifies categorical features and indexes them.
vectorIndexer = VectorIndexer(inputCol="rawFeatures", outputCol="features", maxCategories=4)

In [19]:
from pyspark.ml.regression import GBTRegressor
# Takes the "features" column and learns to predict "deaths"
gbt = GBTRegressor(labelCol="confirmed_cases")

we wrap the model training stage within a CrossValidator stage. CrossValidator knows how to call the GBT algorithm with different hyperparameter settings. It will train multiple models and choose the best one, based on minimizing some metric. In this example, our metric is Root Mean Squared Error (RMSE).

In [21]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
# Define a grid of hyperparameters to test:
#  - maxDepth: max depth of each decision tree in the GBT ensemble
#  - maxIter: iterations, i.e., number of trees in each GBT ensemble
# In this example notebook, we keep these values small.  In practice, to get the highest accuracy, you would likely want to try deeper trees (10 or higher) and more trees in the ensemble (>100).
paramGrid = ParamGridBuilder()\
  .addGrid(gbt.maxDepth, [2, 5])\
  .addGrid(gbt.maxIter, [10, 100])\
  .build()
# We define an evaluation metric.  This tells CrossValidator how well we are doing by comparing the true labels with predictions.
evaluator = RegressionEvaluator(metricName="rmse", labelCol=gbt.getLabelCol(), predictionCol=gbt.getPredictionCol())
# Declare the CrossValidator, which runs model tuning for us.
cv = CrossValidator(estimator=gbt, evaluator=evaluator, estimatorParamMaps=paramGrid)

In [22]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, cv])


In [23]:
pipelineModel = pipeline.fit(train)

## Make predictions, and evaluate results 

Our final step will be to use our fitted model to make predictions on new data. 

We will use our held-out test set, but you could also use this model to make predictions on completely new data. For example, if we created some features data based on weather predictions for the next week, we could predict bike rentals expected during the next week!
We will also evaluate our predictions. Computing evaluation metrics is important for understanding the quality of predictions, as well as for comparing models and tuning parameters.

In [25]:
predictions = pipelineModel.transform(test)

In [26]:
display(predictions.select("confirmed_cases", "prediction", *featuresCols))

confirmed_cases,prediction,length_of_lockdown,confirmed_deaths,POP_ESTIMATE_2018,POP_ESTIMATE_2018.1,ICU_Beds,Adult_obesity_percentage,Quality_of_Life_rank,Excessive_drinking_percentage,Population_per_sq_mile,Clinical_Care_rank,ICU_Beds.1,Adult_smoking_percentage,Total_Specialist_Physicians__2019_,Physical_Environment_rank,Number_of_Tests_with_Results_per_1_000_Population
1.0,136.03546929180936,0.0,0.0,1445.0,1445.0,0.0,26.0,15.0,10.0,2.1,23.0,0.0,9.0,1.169211399,3.0,48.9
1.0,63.50329514130337,0.0,0.0,2283.0,2283.0,0.0,35.0,27.0,20.0,1.6,41.0,0.0,17.0,1.188037528,22.0,64.2
1.0,63.50329514130337,0.0,0.0,2742.0,2742.0,0.0,52.0,59.0,15.0,1.4,53.0,0.0,32.0,1.085878479,4.0,29.2
1.0,63.50329514130337,0.0,0.0,2806.0,2806.0,0.0,38.0,73.0,18.0,4.1,54.0,0.0,16.0,1.422819432,15.0,25.7
1.0,63.50329514130337,0.0,0.0,6958.0,6958.0,0.0,39.0,13.0,22.0,17.8,16.0,0.0,15.0,1.422819432,52.0,25.7
1.0,46.51262068015541,0.0,0.0,8037.0,8037.0,0.0,38.0,1.0,24.0,9.4,10.0,0.0,16.0,1.188037528,39.0,64.2
1.0,63.50329514130337,0.0,0.0,10772.0,10772.0,0.0,36.0,51.0,23.0,10.5,37.0,0.0,16.0,1.422819432,10.0,25.7
1.0,63.50329514130337,0.0,0.0,15449.0,15449.0,0.064729109,35.0,27.0,10.0,1.9,28.0,0.064729109,16.0,1.169211399,19.0,48.9
1.0,63.50329514130337,0.0,0.0,30997.0,30997.0,0.129044746,34.0,20.0,25.0,18.1,11.0,0.129044746,17.0,1.188037528,29.0,64.2
2.0,63.50329514130337,0.0,0.0,4165.0,4165.0,0.0,39.0,58.0,15.0,1.6,60.0,0.0,32.0,1.085878479,46.0,29.2


- Metrics: Manually viewing the predictions gives intuition about accuracy, but it can be useful to have a more concrete metric. Below, we compute an evaluation metric which tells us how well our model makes predictions on all of our data. In this case (for RMSE), lower is better. This metric does not mean much on its own, but it can be used to compare different models. (This is what CrossValidator does internally.)

In [28]:
rmse = evaluator.evaluate(predictions)
print("RMSE on our test set: %g" % rmse)

- Visualization: Plotting predictions vs. features can help us make sure that the model "understands" the input features and is using them properly to make predictions. Below, we can see that the model predictions are correlated with the hour of the day, just like the true labels were.
Note: For more expert ML usage, check out other Databricks guides on plotting residuals, which compare predictions vs. true labels.

In [30]:
display(predictions.select("confirmed_cases", "prediction"))

confirmed_cases,prediction
1.0,136.03546929180936
1.0,63.50329514130337
1.0,63.50329514130337
1.0,63.50329514130337
1.0,63.50329514130337
1.0,46.51262068015541
1.0,63.50329514130337
1.0,63.50329514130337
1.0,63.50329514130337
2.0,63.50329514130337


In [31]:
# Explicitly create a new run.
# This allows this cell to be run multiple times.
# If you omit mlflow.start_run(), then this cell could run once,
# but a second run would hit conflicts when attempting to overwrite the first run.
import mlflow
import mlflow.mleap
with mlflow.start_run():
  from pyspark.ml import Pipeline
  pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, cv])
  pipelineModel = pipeline.fit(train)
  mlflow.set_tag('data-teams-unite-personal-team', 'COVID-19 Mortality Risk Factors and economic policies') # Logs user-defined tags
  test_metric = evaluator.evaluate(pipelineModel.transform(test))
  mlflow.log_metric('test_' + evaluator.getMetricName(), test_metric) # Logs additional metrics
  mlflow.mleap.log_model(spark_model=pipelineModel.bestModel, sample_input=test, artifact_path='best-model') # Logs the best model via mleap


In [32]:
# Explicitly create a new run.
# This allows this cell to be run multiple times.
# If you omit mlflow.start_run(), then this cell could run once,
# but a second run would hit conflicts when attempting to overwrite the first run.
import mlflow
import mlflow
import mlflow.mleap
import pyspark
#import pyspark.ml.mleap.SparkUtil 
#import mlflow.mleap.SparkUtil 
import mlflow.mleap
with mlflow.start_run():
  cvModel = cv.fit(trainingData)
  mlflow.set_tag('owner_team', 'UX Data Science') # Logs user-defined tags
  test_metric = evaluator.evaluate(cvModel.transform(testData))
  mlflow.log_metric('testData_' + evaluator.getMetricName(), test_metric) # Logs additional metrics
  mlflow.mleap.log_model(spark_model=cvModel.bestModel, sample_input=testData, artifact_path='dbfs:/databricks/mlflow/2835302286394144') # Logs the best model via mleap



## Improving our model
You are not done yet! This section describes how to take this notebook and improve the results even more. Try copying this notebook into your Databricks account and extending it, and see how much you can improve the predictions.

There are several ways we could further improve our model:

- Expert knowledge: We may not be experts on bike sharing programs, but we know a few things we can use:
          - The confirmed_cases cannot be negative. GBTRegressor does not know that, but we could threshold the predictions to be >= 0 post-hoc.
          - The confirmed_casesis the sum of registered and casual rentals. These two counts may have different behavior. (Frequent cyclists and casual cyclists probably rent bikes for different reasons.) The --
 - best models for this dataset take this into account. Try training one GBT model for registered and one for casual, and then add their predictions together to get the full prediction.
Better tuning: To make this notebook run quickly, we only tried a few hyperparameter settings. To get the most out of our data, we should test more settings. Start by increasing the number of trees in our GBT model by setting maxIter=200; it will take longer to train but can be more accurate.
- Feature engineering: We used the basic set of features given to us, but we could potentially improve them. For example, we may guess that weather is more or less important depending on whether or not it is a workday vs. weekend. To take advantage of that, we could build a few feature by combining those two base features. MLlib provides a suite of feature transformers; find out more in the ML guide.