# Regression model for California housing using NimbusML

Regression is a type of supervised machine learning task. A regression model predicts continuous output values (such as numbers). In this example we are trying to predict the median house value (the `MedianHouseValue` column) of a neighborhood in California.

## Verify your NimbusML version

In [None]:
# Import NimbusML and report which version is installed.
import nimbusml
from nimbusml import FileDataStream, DataSchema

installed_version = nimbusml.__version__
print("nimbusml version: ", installed_version)

## Load your data

In [None]:
# Load the train and test CSV files as NimbusML FileDataStream objects.
# According to the original author these are faster than pandas DataFrames.
# Background reading: https://arxiv.org/pdf/1905.05715.pdf
train_csv_path = "../datasets/california-housing/california-housing-train.csv"
test_csv_path = "../datasets/california-housing/california-housing-test.csv"

# Both files are read with the same numeric_dtype setting.
ds_train = FileDataStream.read_csv(train_csv_path, numeric_dtype="R4")
ds_test = FileDataStream.read_csv(test_csv_path, numeric_dtype="R4")

# Preview the first five rows of the training data.
ds_train.head(5)

## Verify the schema of your data 

In [None]:
# Want to know more about the schema?
# https://docs.microsoft.com/en-us/nimbusml/concepts/schema

# Want to know more about data types in NimbusML?
# https://docs.microsoft.com/en-us/nimbusml/concepts/types

# Display the schema attribute of the training data stream.
ds_train.schema

## Data transformations pipeline for NimbusML model

In [None]:
# Build a preprocessing pipeline that consists of a single scaler.
from nimbusml import Pipeline
from nimbusml.preprocessing.normalization import MeanVarianceScaler

# API reference:
# https://docs.microsoft.com/en-us/python/api/nimbusml/nimbusml.preprocessing.normalization.meanvariancescaler?view=nimbusml-py-latest
# 'MedianHouseValue' is left out of the scaled columns: it is the column the
# regressors defined later in this notebook use as their label.
scaled_columns = [
    'Longitude',
    'Latitude',
    'HousingMedianAge',
    'TotalRooms',
    'TotalBedrooms',
    'Population',
    'TotalHouseholds',
    'MedianIncome',
]
mvs = MeanVarianceScaler() << scaled_columns

preprocess_pipeline = Pipeline([mvs])

## Make it a training pipeline

In [None]:
# Create the regressors and start the training pipeline as a copy of the
# transform pipeline. The trainer itself is appended in the next cell.
from nimbusml.linear_model import OrdinaryLeastSquaresRegressor
from nimbusml.linear_model import FastLinearRegressor
# Regressors you could choose from:
# OrdinaryLeastSquaresRegressor, FastLinearRegressor, OnlineGradientDescentRegressor, PoissonRegressionRegressor,
# GamRegressor, LightGbmRegressor, FastTreesRegressor, FastForestRegressor, FastTreesTweedieRegressor
# https://docs.microsoft.com/en-us/python/api/nimbusml/nimbusml.linear_model.fastlinearregressor?view=nimbusml-py-latest
# https://docs.microsoft.com/en-us/python/api/nimbusml/nimbusml.linear_model.ordinaryleastsquaresregressor?view=nimbusml-py-latest

feature_columns = [
    'Longitude',
    'Latitude',
    'HousingMedianAge',
    'TotalRooms',
    'TotalBedrooms',
    'Population',
    'TotalHouseholds',
    'MedianIncome',
]
label_column = 'MedianHouseValue'

# Each regressor receives its own copy of the column list, as in the original
# code where each call had a separate list literal.
olsr = OrdinaryLeastSquaresRegressor(feature=list(feature_columns), label=label_column)
flr = FastLinearRegressor(feature=list(feature_columns), label=label_column)

training_pipeline = preprocess_pipeline.clone()

In [None]:
# Append the FastLinearRegressor to the cloned training pipeline.
# NOTE(review): running this cell more than once calls append() again on the
# same pipeline object, presumably adding a second learner. Re-run the cell
# that clones the pipeline first if you need to repeat this step.
training_pipeline.append(flr)

## Fit both pipelines

In [None]:
# Fit both pipelines on the training data stream.
# With a FileDataStream the label is identified by column name. The previous
# value 'y' does not match any column used in this notebook; the regressor is
# configured with label='MedianHouseValue', so pass that same name here.
label_column_name = 'MedianHouseValue'
preprocess_pipeline.fit(ds_train, label_column_name)
training_pipeline.fit(ds_train, label_column_name)

print("preprocess pipeline:", preprocess_pipeline)
print("training pipeline:", training_pipeline)

## Observe the transformed data

In [None]:
# Look at the transformed data: apply the fitted preprocessing pipeline
# (the scaler only, no trainer) to the training data stream.
preprocess_pipeline.transform(ds_train)

## Measure the performance on the training and test set

In [None]:
# Calculate the metrics on the training set.
# https://docs.microsoft.com/en-us/nimbusml/concepts/metrics
# With output_scores=True the call is unpacked into a (metrics, scores) pair.
metrics, scores = training_pipeline.test(ds_train, output_scores=True)
#print(scores) # uncomment this if you want to look at the scores
# Display the metrics as the cell output.
metrics

In [None]:
# Calculate the same metrics on the held-out test set.
# This rebinds `metrics` and `scores` from the previous cell.
metrics, scores = training_pipeline.test(ds_test, output_scores=True)
#print(scores) # uncomment this if you want to look at the scores
# Display the metrics as the cell output.
metrics

## Basic Feature Engineering

In [None]:
# Load the same datasets again, this time as pandas DataFrames.
import pandas as pd
# pandas is the slower option here, but the data needs to be manipulated
# in order to add new feature columns.

train_csv_path = "../datasets/california-housing/california-housing-train.csv"
test_csv_path = "../datasets/california-housing/california-housing-test.csv"

df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

# Preview the first five rows of the training frame.
df_train.head(5)

In [None]:
# Create basic ratio features.
# These are deliberately simple; there are far fancier things we could do.
# The same columns are added to both the train and the test frame in one loop
# so the two cannot drift apart.
for frame in (df_train, df_test):
    frame['NumRooms'] = frame['TotalRooms'] / frame['TotalHouseholds']
    frame['NumBedrooms'] = frame['TotalBedrooms'] / frame['TotalHouseholds']
    frame['PersonsPerHouse'] = frame['Population'] / frame['TotalHouseholds']
    # Fixed: this was computed from 'TotalBedrooms', which made it an exact
    # duplicate of 'BedroomsPerPerson'. It now uses 'TotalRooms'.
    # NOTE(review): both columns divide a total by 'PersonsPerHouse', as in the
    # original code; confirm that this denominator is the intended one.
    frame['RoomsPerPerson'] = frame['TotalRooms'] / frame['PersonsPerHouse']
    frame['BedroomsPerPerson'] = frame['TotalBedrooms'] / frame['PersonsPerHouse']

# Some other, less simple features:

#cal_df['room_value'] = cal_df['median_house_value'] / cal_df['num_rooms'] #   these features have the predict target, duh of course they're good
#cal_df['bedroom_value'] = cal_df['median_house_value'] / cal_df['num_bedrooms']

#cal_df['room_value'] = cal_df['median_house_value'] / 3  #silly example to show perfect prediction because of target variable bleeding
#cal_df['bedroom_value'] = cal_df['median_house_value'] / 3

#cal_df['long_diff_from_sfo'] = abs(cal_df['longitude'] - -122.451183)
#cal_df['lat_diff_from_sfo'] = abs(cal_df['latitude'] - 37.761345)
#cal_df['long_diff_from_sjc'] = abs(cal_df['longitude'] - -121.879819)
#cal_df['lat_diff_from_sjc'] = abs(cal_df['latitude'] - 37.325021)
#cal_df['long_diff_from_sd'] = abs(cal_df['longitude'] - -117.146320)
#cal_df['lat_diff_from_sd'] = abs(cal_df['latitude'] - 32.753505)
#cal_df['long_diff_from_la'] = abs(cal_df['longitude'] - -118.393758)
#cal_df['lat_diff_from_la'] = abs(cal_df['latitude'] - 34.038300)

#cal_df['long_diff_from_avg'] = cal_df['longitude'] - cal_df['longitude'].mean()
#cal_df['lat_diff_from_avg'] = cal_df['latitude'] - cal_df['latitude'].mean()

In [None]:
# Verify everything looks good: preview the first five rows of the training
# frame, which should now include the engineered columns.
df_train.head(5)

In [None]:
# Caution: this is a destructive cell. It removes the target column from both
# frames, so running it a second time without reloading the data fails because
# the column no longer exists.
# pop() returns the column and deletes it from the frame in one step.
df_train_y = df_train.pop('MedianHouseValue')
df_test_y = df_test.pop('MedianHouseValue')

In [None]:
# Create another pipeline that also uses the engineered features.
# NOTE(review): 'NumBedrooms' is created in the feature-engineering cell but is
# not listed here. Confirm whether leaving it out is intentional.
new_feature_columns = [
    'Longitude',
    'Latitude',
    'HousingMedianAge',
    'TotalRooms',
    'TotalBedrooms',
    'Population',
    'TotalHouseholds',
    'MedianIncome',
    'NumRooms',
    'PersonsPerHouse',
    'RoomsPerPerson',
    'BedroomsPerPerson',
]

# The scaler and the regressor each get their own copy of the column list,
# as in the original code where each had a separate list literal.
new_mvs = MeanVarianceScaler() << list(new_feature_columns)
new_preprocess_pipeline = Pipeline([new_mvs])

# No label is configured on this regressor; the target is passed separately
# when the pipeline is fitted and tested.
new_flr = FastLinearRegressor(feature=list(new_feature_columns))

new_training_pipeline = new_preprocess_pipeline.clone()
new_training_pipeline.append(new_flr)

In [None]:
# Fit the new pipelines on the pandas training frame, with the target passed
# as a separate series. The order is unchanged: preprocess pipeline first,
# then the training pipeline.
for pipe in (new_preprocess_pipeline, new_training_pipeline):
    pipe.fit(df_train, df_train_y)

print("preprocess pipeline:", new_preprocess_pipeline)
print("training pipeline:", new_training_pipeline)

In [None]:
# Evaluate the metrics of the new pipeline on the training set.
# https://docs.microsoft.com/en-us/nimbusml/concepts/metrics
# The features and the target are passed separately because they are pandas
# objects here.
metrics, scores = new_training_pipeline.test(df_train, df_train_y, output_scores=True)
#print(scores) # uncomment this if you want to look at the scores
# Display the metrics as the cell output.
metrics

In [None]:
# Evaluate the metrics of the new pipeline on the held-out test set.
# This rebinds `metrics` and `scores` from the previous cell.
metrics, scores = new_training_pipeline.test(df_test, df_test_y, output_scores=True)
#print(scores) # uncomment this if you want to look at the scores
# Display the metrics as the cell output.
metrics

## Visualize the pipeline

In [None]:
### Create the pipeline diagram.
### If this does not work:
###   - make sure you've installed graphviz https://graphviz.gitlab.io/download/
###   - make sure you've run set_path_graphviz.bat
from nimbusml.utils.exports import img_export_pipeline
# Note: this draws the first training pipeline (fitted on ds_train), not the
# later pipeline that uses the engineered features.
figure = img_export_pipeline(training_pipeline,ds_train)
# Display the figure as the cell output.
figure

## Save the model

In [None]:
# Save the first training pipeline (the one fitted on ds_train) to a zip file.
model_file_path = "../models/ml_net_california_housing_python.zip"

print("Saving the ML.NET Model as a file...")
training_pipeline.save_model(model_file_path)

saved_message = "The model was saved to: {}".format(model_file_path)
print(saved_message)