# Lesson 3: Model Deployment Workflow

## Dataset Principles

Your tasks for this exercise are:

Create a dataframe with your features and target arrays from make_regression.
Create a 60% Train / 20% Validation / 20% Test dataset group using the train_test_split method.
Confirm the datasets are the correct size by outputting their shape.
Save the three datasets to CSV

In [2]:
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

In [None]:
# Synthetic regression data: 1000 samples with 5 feature columns (only 2 of
# them informative) and a single target column. random_state pins the output.
regression_dataset = make_regression(
    n_samples=1000,
    n_features=5,
    n_informative=2,
    n_targets=1,
    random_state=0,
)

In [None]:
# Element 0 of the dataset holds the features, element 1 the target values.
df = pd.DataFrame(regression_dataset[0]).assign(target=regression_dataset[1])
df.head()

In [None]:
# Step 1: hold out 20% of the full dataset as the test set
# (remaining: 0.8 | test: 0.2).
df_train, df_test = train_test_split(df, test_size=0.2, random_state=0)

# Step 2: carve the validation set out of the remaining 80%, NOT out of the
# full dataframe, so validation rows can never overlap the test rows.
# 0.25 of the remaining 0.8 equals 0.2 of the full dataset.
df_train, df_val = train_test_split(df_train, test_size=0.25, random_state=0)
# Final dataset sizes: train: 0.6, validation: 0.2, test: 0.2

In [None]:
# Output each shape to confirm the size of train/validation/test
print(f"Train: {df_train.shape}")
print(f"Validation: {df_val.shape}")
print(f"Test: {df_test.shape}")

In [None]:
# Write each split to its own CSV file, leaving the index out of the file.
for csv_path, split_df in (
    ('train.csv', df_train),
    ('validation.csv', df_val),
    ('test.csv', df_test),
):
    split_df.to_csv(csv_path, index=False)

## Data Cleaning and Feature Engineering 

In [None]:
import pandas as pd

# Small mixed-type example table: an animal label (string), a numeric value
# (float), and a month-year date stored as a string.
df = pd.DataFrame(
    {
        "animal": ["cat", "cat", "dog", "bird", "dog", "dog", "cat", "bird", "bird"],
        "value": [1.0, 0.5, 0.2, 3.3, 5.7, 0.0, 1.9, 2.4, 2.4],
        "date": [
            "3-2021",
            "1-2021",
            "5-2021",
            "3-2021",
            "1-2021",
            "2-2021",
            "4-2021",
            "4-2021",
            "5-2021",
        ],
    }
)
df.info()


In [None]:
# Change the animal column to a categorical dtype.
# Assign the whole column rather than using `df.loc[:, "animal"] = ...`:
# in pandas 2.x a `.loc` assignment writes the values into the existing
# column in place, so the column would keep its original dtype.
df["animal"] = df["animal"].astype("category")
df.info()

Normalizing Data
Transforms numerical data to have a specific range of values.
The transformed data typically has zero mean, meaning its average is 0.

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the "value" column, then return the rescaled column.
scaler = StandardScaler().fit(df[["value"]])
scaler.transform(df[["value"]])

Parsing Data Types
Pandas to_datetime() method will parse datetime strings
Converts strings to datetime objects


In [None]:
# The date strings look like "3-2021" (month-year). Passing the format
# explicitly avoids relying on pandas' format inference.
pd.to_datetime(df["date"], format="%m-%Y")

One-hot Encoding
Required for models that only take numerical data
Pandas has a one-hot encoding function, `pd.get_dummies()`
Converts categorical data to many feature columns

In [None]:
# One indicator column per distinct animal, each named with an "animal" prefix.
pd.get_dummies(df["animal"], prefix="animal")

## Exercise: Data Cleansing and Feature Engineering

Apply these changes to the data.csv dataset.

Load data.csv into a dataframe.
Output the table info to see if there are any null values.
Remove all null values from the dataframe.
Change the date column from an object to a datetime64[ns] type.
Change the weather column to a category type.
Split the date column into year, month, and day columns, and one-hot encode the weather column.
Normalize the columns from the all_features list so each feature has a zero mean.
Create and save the cleaned dataframe, as well as the train/validation/test dataframes to CSV.

In [None]:
import random
from datetime import datetime
import pandas as pd
import numpy as np

from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the raw exercise data from data.csv (a relative path).
# The file is the dataset created by 02_exercise_dataset_creation.ipynb.
df = pd.read_csv("data.csv")

In [None]:
# Output general info about the table. Compare each column's non-null count
# with the total number of rows to spot the features that contain null values.
df.info()

In [None]:
# Count the null values in each column.
df.isna().sum()

In [None]:
# Drop every row that contains a null value.
# dropna() returns a new dataframe, so the result has to be assigned back;
# otherwise df still holds the nulls.
df = df.dropna()
df.head()

In [None]:
# Change the date column to a datetime.
# Whole-column assignment is used instead of `df.loc[:, col] = ...` because a
# `.loc` assignment in pandas 2.x sets values in place and keeps the old
# dtype, which would break the `.dt` accessor used in the next cell.
df["date"] = pd.to_datetime(df["date"])
# Change the weather column to a category
df["weather"] = df["weather"].astype("category")
df.head()

In [None]:
# Break the datetime column into separate year, month, and day columns.
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["day"] = df["date"].dt.day

In [None]:
# Turn the weather category into one indicator column per value; each new
# column name carries the `weather` prefix.
weather_one_hot_df = pd.get_dummies(df["weather"], prefix="weather")
weather_one_hot_df.head()

In [None]:
# Copy every one-hot column into the main dataframe under the same name.
one_hot_columns = list(weather_one_hot_df.columns)
df[one_hot_columns] = weather_one_hot_df
df.head()

In [None]:
# Verify that our table info now shows no nulls and the correct Dtypes
df.info()

In [None]:
# Alternative to dropping rows: impute the missing values instead.
# Use ONE of the two options below.

# Option 1: fill the nulls in each numeric column with that column's median.
# fillna() needs a scalar/dict/Series of fill values, not a function.
df = df.fillna(df.median(numeric_only=True))

# Option 2: forward-fill, i.e. propagate the last valid value downwards.
# df = df.ffill()

In [None]:
# Feature columns fed to the scaler, grouped by origin: raw features, date
# parts, and one-hot weather columns. Adjust the names if you called your
# columns something different in the cells above.
all_features = (
    ["feature0", "feature1", "feature2"]
    + ["year", "month", "day"]
    + ["weather_cloudy", "weather_rainy", "weather_sunny"]
)

In [None]:
# Table summary of the feature columns; check the `mean` row, which is not
# zero for many of our columns before standardizing.
df[all_features].describe()

In [None]:
# Standardize feature values to have a zero mean.
# `all_features` is already a list, so it is used directly as the column
# selector; wrapping it in another pair of brackets raises an error.
# Whole-column assignment lets the columns take the float dtype of the
# scaled values.
scaler = StandardScaler()
df[all_features] = scaler.fit_transform(df[all_features])

In [None]:
# Hold out 20% of the full dataset as the test set -> train: 0.8 | test: 0.2
df_train, df_test = train_test_split(df, test_size=0.2, random_state=0)

# Split the remaining training rows again; 0.25 of 0.8 is 0.2 of the full
# dataset -> train: 0.6 | validation: 0.2
df_train, df_val = train_test_split(df_train, test_size=0.25, random_state=0)

# Final dataset sizes: train: 0.6, validation: 0.2, test: 0.2

In [None]:
# Show the (rows, columns) shape of every split to confirm the sizes.
print(
    f"Train: {df_train.shape}",
    f"Validation: {df_val.shape}",
    f"Test: {df_test.shape}",
    sep="\n",
)

In [None]:
# Persist the cleaned table and each of its splits as CSV files, leaving the
# index out of the files.
for csv_path, frame in (
    ("data_clean.csv", df),
    ("train.csv", df_train),
    ("validation.csv", df_val),
    ("test.csv", df_test),
):
    frame.to_csv(csv_path, index=False)

## Model Training 

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# Three samples with three features each, plus one target value per sample.
df = pd.DataFrame([[5, 3.4, 6], [1, 0.4, 10], [2, 0.1, 1]])
target = [0, 1, 1]

# Create the estimator, then fit it on the features and target.
reg = LinearRegression()
reg.fit(df, target)

# Score the model with its default metric.
# Expected output: 1.0
print(reg.score(df, target))

# Predict the targets for the same rows.
print(reg.predict(df))

https://scikit-learn.org/stable/modules/neighbors.html guide to all models
https://towardsdatascience.com/20-popular-machine-learning-metrics-part-1-classification-regression-evaluation-metrics-1ca3e282a2ce  Model evaluation
https://scikit-learn.org/stable/modules/classes.html#sklearn-metrics-metrics  scikit-learn API Reference

## Exercise: Model Training and Evaluation

In [None]:
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Synthetic regression data: 10000 samples, 10 features (5 informative), one
# target, no bias term, and noise of 40 applied to the output.
regression_dataset = make_regression(
    n_samples=10000,
    n_features=10,
    n_informative=5,
    n_targets=1,
    bias=0,
    noise=40,
    random_state=0,
)

In [None]:
# Features (element 0) become the columns; the target (element 1) is appended.
df = pd.DataFrame(regression_dataset[0]).assign(target=regression_dataset[1])

In [None]:
# Hold out 20% of the full dataset as the test set -> train: 0.8 | test: 0.2
df_train, df_test = train_test_split(df, test_size=0.2, random_state=0)

# Split the remaining training rows (not the full dataframe, which would let
# validation rows overlap the test set); 0.25 of 0.8 is 0.2 of the full
# dataset -> train: 0.6 | validation: 0.2
df_train, df_val = train_test_split(df_train, test_size=0.25, random_state=0)

# Final dataset sizes: train: 0.6, validation: 0.2, test: 0.2

In [None]:
# Show the (rows, columns) shape of every split to confirm the sizes.
print(
    f"Train: {df_train.shape}",
    f"Validation: {df_val.shape}",
    f"Test: {df_test.shape}",
    sep="\n",
)

In [None]:
# Train the linear model on the 10 feature columns (labelled 0-9) and the
# target column of the training split. fit() returns the fitted estimator,
# which must not be called again as a function.
reg = LinearRegression().fit(df_train[list(range(10))], df_train["target"])

In [None]:
# Score the model on the validation split; the default metric is r2.
# Every column except "target" is a feature column.
reg.score(df_val.drop(columns="target"), df_val["target"])

In [None]:
# After the model has been tuned against the validation split, report its
# final score on the held-out test split.
# Every column except "target" is a feature column.
reg.score(df_test.drop(columns="target"), df_test["target"])

## Hyperparameter Tuning 

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: every hyperparameter left at its default value.
model = DecisionTreeClassifier()

# Same estimator with two hyperparameters set explicitly.
model = DecisionTreeClassifier(criterion="entropy", max_depth=10)

Hyperparameter sources
https://scikit-learn.org/stable/modules/classes.html#hyper-parameter-optimizers 
https://scikit-learn.org/stable/modules/grid_search.html
https://neptune.ai/blog/hyperparameter-tuning-in-python-complete-guide

## Exercise: Diabetes Model

Load the diabetes dataset into a dataframe.
Check the table summary to show that indeed the mean is zero for all features.
Split the dataset into train, validation, and test sets
Use a linear regression Ridge model to fit and score:
Fit and score on the whole dataset
Fit on train, score on validation, using default model
Fit on train, score on validation, using hyperparameters model
Fit on train, score on test, using hyperparameterized model
Plot all scores in a bar graph

In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge

In [None]:
# Load in the diabetes dataset that ships with scikit-learn
diabetes = datasets.load_diabetes()

In [None]:
# List the entries of the loaded dataset; 'data', 'feature_names' and
# 'target' are the ones used in the cells below.
diabetes.keys()

In [None]:
# Dataframe of the diabetes `data`, with columns named after `feature_names`
# and the target appended as the last column.
df = pd.DataFrame(
    diabetes["data"], columns=diabetes["feature_names"]
).assign(target=diabetes["target"])

In [None]:
# Describe df using the table summary and check the `mean` row.
# No need to normalize: the feature means are already near zero.
df.describe()

In [None]:
# Pairwise correlation between all columns of df, target included.
df.corr()

In [None]:
# Hold out 20% of the full dataset as the test set -> train: 0.8 | test: 0.2
df_train, df_test = train_test_split(df, test_size=0.2, random_state=0)

# Split the remaining training rows (not the full dataframe, which would let
# validation rows overlap the test set); 0.25 of 0.8 is 0.2 of the full
# dataset -> train: 0.6 | validation: 0.2
df_train, df_val = train_test_split(df_train, test_size=0.25, random_state=0)

# Final dataset sizes: train: 0.6, validation: 0.2, test: 0.2

In [None]:
# Reference point: a default Ridge model fitted AND scored on the entire
# dataset.
reg = Ridge()
reg.fit(df[diabetes['feature_names']], df['target'])
all_df_score = reg.score(df[diabetes['feature_names']], df['target'])
all_df_score

In [None]:
# Default Ridge model fitted on the training split only.
# The validation split is what we use to score the model.
reg = Ridge()
reg.fit(df_train[diabetes['feature_names']], df_train['target'])
val_df_score = reg.score(df_val[diabetes['feature_names']], df_val['target'])
val_df_score

In [None]:
# Ridge model with non-default alpha, solver, and max_iter, fitted on the
# training split and scored on the validation split.
reg_h = Ridge(alpha=0.01, solver="saga", max_iter=10000)
reg_h.fit(df_train[diabetes['feature_names']], df_train['target'])
val_df_h_score = reg_h.score(df_val[diabetes['feature_names']], df_val['target'])
val_df_h_score

In [None]:
# Score the hyperparameterized model (reg_h) on the held out test dataset.
# The test split is not used for fitting in the cells above.
test_df_h_score = reg_h.score(df_test[diabetes["feature_names"]], df_test["target"])
test_df_h_score

In [None]:
# Bar plot of all scores from each model fit: all_df_score, val_df_score,
# val_df_h_score, test_df_h_score.
# The plotting interface lives in matplotlib.pyplot, not in the top-level
# matplotlib package.
import matplotlib.pyplot as plt

scores = pd.Series(
    {
        "all_df_score": all_df_score,
        "val_df_score": val_df_score,
        "val_df_h_score": val_df_h_score,
        "test_df_h_score": test_df_h_score,
    }
)
scores.plot(kind="bar", legend=False, title="R2 Score of Ridge Model")
plt.show()

## Lesson 4: Algorithms and Tools 

Linear models 

Additional resources

https://docs.aws.amazon.com/sagemaker/latest/dg/linear-learner.html

https://scikit-learn.org/stable/modules/linear_model.html

In [None]:
#Code Example of Linear Model

import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression

# Four samples: two numeric features and a 0/1 target.
df = pd.DataFrame(
    [[1, 2, 0], [3, 4, 1], [5, 6, 0], [7, 8, 1]],
    columns=["num", "amount", "target"],
)

# Regression model: create, fit, then score on the same rows.
reg = LinearRegression()
reg.fit(df[["num", "amount"]], df["target"])
reg.score(df[["num", "amount"]], df["target"])

# Classification model: create, fit, then score on the same rows.
clf = LogisticRegression()
clf.fit(df[["num", "amount"]], df["target"])
clf.score(df[["num", "amount"]], df["target"])

## Exercise: Linear Models

You're tasked with completing the following steps:

Load in the wine dataset from scikit learn.
For the wine dataset, create a train and test split, 80% train / 20% test.
Create a LogisticRegression model with these hyper parameters: random_state=0, max_iter=10000
Evaluate the model with the test dataset
Load the diabetes dataset from scikit learn
For the Diabetes dataset, create a train and test split, 80% train / 20% test.
Create an SGDRegressor model with these hyper parameters: random_state=0, max_iter=10000
Evaluate the model with the test dataset

In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDRegressor

Linear Classifier

In [None]:
# Load in the wine dataset that ships with scikit-learn, then list its
# entries ('data', 'feature_names' and 'target' are used below)
wine = datasets.load_wine()
wine.keys()

In [None]:
# Dataframe of the wine `data`, with columns named after `feature_names`
# and the target appended as the last column.
df = pd.DataFrame(
    wine["data"], columns=wine["feature_names"]
).assign(target=wine["target"])

In [None]:
# Split your data with these ratios: train: 0.8 | test: 0.2
# random_state=0 makes the split reproducible between runs.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=0)

In [None]:
# Logistic regression with the hyperparameters from the requirements
# (random_state=0, max_iter=10000), fitted on the training split.
# Remember we use the test dataset to score the model.
clf = LogisticRegression(random_state=0, max_iter=10000)
clf.fit(df_train[wine['feature_names']], df_train['target'])
clf.score(df_test[wine["feature_names"]], df_test["target"])

Linear Regression

In [None]:
# Load in the diabetes dataset that ships with scikit-learn
diabetes = datasets.load_diabetes()

In [None]:
# Create the diabetes dataframe (columns named after `feature_names`, plus
# the target) and split it: train: 0.8 | test: 0.2.
# These names must exist before the model below can be fitted.
dfd = pd.DataFrame(diabetes["data"], columns=diabetes["feature_names"])
dfd["target"] = diabetes["target"]
dfd_train, dfd_test = train_test_split(dfd, test_size=0.2, random_state=0)

# SGD regressor with the hyperparameters from the requirements
# (random_state=0, max_iter=10000), fitted on the training split.
# Remember we use the test dataset to score the model.
reg = SGDRegressor(random_state=0, max_iter=10000).fit(
    dfd_train[diabetes["feature_names"]], dfd_train["target"]
)
reg.score(dfd_test[diabetes["feature_names"]], dfd_test["target"])

## Tree Based Model

decision tree   https://scikit-learn.org/stable/modules/tree.html#tree
random forest    https://scikit-learn.org/stable/modules/ensemble.html#forests-of-randomized-trees
hierarchical     https://en.wikipedia.org/wiki/Hierarchical_clustering
feature selection  https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection

## XGBoost Model: Tree based model

XGBoost is a tree-based model with a different implementation compared to other tree-based models. The ensemble it creates is comprised of weak learners, meaning that each tree in the ensemble can barely make accurate predictions. But with enough of these models, it actually creates an ensemble that usually outperforms random forests. A benefit to XGBoost is that it provides a way to highly optimize the models. This is done by offering a large number of hyperparameters to tune. The downside to all these optimizations, and with ensemble models in general, is that they are hard to interpret.

https://xgboost.readthedocs.io/en/latest/prediction.html

https://arxiv.org/abs/1603.02754

In [None]:
# Code Example of XGBoost
import pandas as pd
import xgboost as xgb

# Four samples: two numeric features and a 0/1 target.
df = pd.DataFrame(
    [[1, 2, 0], [3, 4, 1], [5, 6, 0], [7, 8, 1]],
    columns=["num", "amount", "target"]
)
# xgboost trains on its own DMatrix container: features plus label.
df_xgb = xgb.DMatrix(
    df[["num", "amount"]], label=df["target"]
)
params = {"eval_metric": "logloss", "objective": "binary:hinge"}
bst = xgb.train(params, df_xgb)

bst.predict(df_xgb)

# Expected output (kept as a comment so the cell does not try to execute it):
# array([0., 1., 0., 1.], dtype=float32)


Exercise: XGBoost

In [None]:
# Wine dataset as a dataframe: columns named after `feature_names`, with the
# target appended as the last column.
wine = datasets.load_wine()
df = pd.DataFrame(
    wine["data"], columns=wine["feature_names"]
).assign(target=wine["target"])

# Split with these ratios: train: 0.8 | test: 0.2
df_train, df_test = train_test_split(df, test_size=0.2, random_state=0)

# Wrap each split in a DMatrix: feature columns as data, target as label.
dtrain = xgb.DMatrix(df_train[wine["feature_names"]], label=df_train["target"])
dtest = xgb.DMatrix(df_test[wine["feature_names"]], label=df_test["target"])


In [None]:
# accuracy_score is imported here so this cell runs without relying on an
# import that only appears in a later section.
from sklearn.metrics import accuracy_score

# Train a multi-class classifier on the training split.
# num_class is taken from the dataset itself so it always matches the number
# of target classes.
# Remember we use the test dataset to score the model.
params = {
    "max_depth": 10,
    "eta": 1,
    "objective": "multi:softmax",
    "num_class": len(wine["target_names"]),
}
num_round = 100

bst = xgb.train(params, dtrain, num_round)

# xgboost is not scikit learn, so you'll need to do predictions using their API
preds = bst.predict(dtest)

# Accuracy score using scikit learn function for classification metric
accuracy_score(df_test["target"], preds)

# Plot the importance of the features based on fitted trees
xgb.plot_importance(bst)

XGBoost Regression Ex

In [None]:
# Load in the diabetes dataset
diabetes = datasets.load_diabetes()

# Create the diabetes `data` dataset as a dataframe and name the columns with `feature_names`
dfd = pd.DataFrame(diabetes["data"], columns=diabetes["feature_names"])

# Include the target as well
dfd["target"] = diabetes["target"]

# Split your data with these ratios: train: 0.8 | test: 0.2
dfd_train, dfd_test = train_test_split(dfd, test_size=0.2, random_state=0)

# Load your train/test dataframe into DMatrix (feature columns as data,
# target as label)
dtrain = xgb.DMatrix(dfd_train[diabetes["feature_names"]], label=dfd_train["target"])
dtest = xgb.DMatrix(dfd_test[diabetes["feature_names"]], label=dfd_test["target"])

In [None]:
# Train a squared-error regressor on the training split with the
# hyperparameters below.
# Remember we use the test dataset to score the model.
param = {
    "max_depth": 2,
    "eta": 0.03,
    "gamma": 0.09,
    "colsample_bytree": 0.5,
    "objective": "reg:squarederror",
}
num_round = 100

bst = xgb.train(param, dtrain, num_boost_round=num_round)

In [None]:
# r2_score is imported here so this cell runs without relying on an import
# that only appears in a later section.
from sklearn.metrics import r2_score

# xgboost is not scikit learn, so you'll need to do predictions using their API
preds = bst.predict(dtest)

# R2 score using scikit learn function for regression metric
r2_score(dfd_test["target"], preds)

# Plot the importance of the features based on fitted trees
xgb.plot_importance(bst)

## AutoGluon Model

AutoGluon is a framework that automates the processing, creating, and tuning of ML models. It is part of a class of tools called AutoML, which automates the machine learning workflow. In AutoGluon, the main parameters you set are the target value for a dataset and how long to train for. It will automate everything else, trying a variety of models and parameters up to the time limit. Because of its ease of use, it is a quick way to create a baseline model. Another benefit of AutoGluon is that it tries so many different models. Because it provides metrics on all of them, you can see how models you would otherwise not have tried perform on your data.

https://auto.gluon.ai/stable/index.html

https://en.wikipedia.org/wiki/Automated_machine_learning

https://github.com/windmaple/awesome-AutoML

In [None]:
#Code Example of AutoGluon

import pandas as pd
from autogluon.tabular import TabularPredictor

# Four samples: two numeric features and a 0/1 target.
df = pd.DataFrame(
    [[1, 2, 0], [3, 4, 1], [5, 6, 0], [7, 8, 1]],
    columns=["num", "amount", "target"],
)

# Name the label column, then fit with a 60 second time limit using the
# "best_quality" preset.
predictor = TabularPredictor(label="target")
predictor = predictor.fit(
    train_data=df,
    time_limit=60,
    presets="best_quality",
)

# Output a summary of the created models.
predictor.fit_summary()

# Evaluate the fitted predictor on the same dataframe.
performance = predictor.evaluate(df)

Exercise: AutoGluon

Load in the wine dataset from scikit learn.
For the wine dataset, create a train and test split, 80% train / 20% test.
Create an AutoGluon Classifier model with these hyper parameters:

time_limit: 120
presets: best_quality
Output the model table summary

Evaluate the trained model on the test dataset
Load the diabetes dataset from scikit learn
For the Diabetes dataset, create a train and test split, 80% train / 20% test.
Create an AutoGluon Regression model with these hyper parameters:

eval_metric: r2
time_limit: 120
presets: best_quality
Output the model table summary

Evaluate the trained model on the test dataset


In [None]:
# Notebook shell commands: upgrade pip, setuptools and wheel first, then
# install mxnet (below 2.0.0), bokeh 2.0.1, and finally autogluon.
!pip install -U pip
!pip install -U setuptools wheel
!pip install -U "mxnet<2.0.0" bokeh==2.0.1
!pip install autogluon --no-cache-dir

Setup
Open up Sagemaker Studio
Notebook should be using a ml.t3.medium instance (2 vCPU + 4 GiB)
Notebook should be using kernel: Python 3 (MXNet 1.8 Python 3.7 CPU Optimized)

In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.metrics import r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularDataset, TabularPredictor

AutoGluon Classifier

In [None]:
# Wine dataset as a dataframe: columns named after `feature_names`, with the
# target appended as the last column.
wine = datasets.load_wine()
df = pd.DataFrame(
    wine["data"], columns=wine["feature_names"]
).assign(target=wine["target"])

# Split with these ratios: train: 0.8 | test: 0.2
df_train, df_test = train_test_split(df, test_size=0.2, random_state=0)

In [None]:
# Fit AutoGluon on the training split with the hyperparameters from the
# requirements (120 second time limit, best_quality preset).
# No problem type is passed here; only the label column is named.
# Remember we use the test dataset to score the model.
predictor = TabularPredictor(label="target").fit(
    train_data=df_train,
    time_limit=120,
    presets="best_quality",
)

In [None]:
# Evaluate the model's performance on the test dataset
performance = predictor.evaluate(df_test)

In [None]:
# Bar chart of each model's `score_val` from the leaderboard, one bar per
# model, to compare performance.
model_scores = predictor.leaderboard(silent=True)
model_scores.plot(kind="bar", x="model", y="score_val")

AutoGluon Regression

In [None]:
# Diabetes dataset as a dataframe: columns named after `feature_names`, with
# the target appended as the last column.
diabetes = datasets.load_diabetes()
dfd = pd.DataFrame(
    diabetes["data"], columns=diabetes["feature_names"]
).assign(target=diabetes["target"])

# Split with these ratios: train: 0.8 | test: 0.2
dfd_train, dfd_test = train_test_split(dfd, test_size=0.2, random_state=0)

In [None]:
# Fit AutoGluon on the training split with the hyperparameters from the
# requirements (r2 eval metric, 120 second time limit, best_quality preset).
# Remember we use the test dataset to score the model
# The problem type is passed explicitly as "regression" in this cell.
predictor = TabularPredictor(label="target", problem_type="regression", eval_metric="r2"
                            ).fit(train_data=dfd_train, time_limit=120, presets="best_quality",)

In [None]:
# Output the fit summary of the training run for the regression predictor
predictor.fit_summary()

In [None]:
# Evaluate the model's performance on the diabetes test dataset
performance = predictor.evaluate(dfd_test)

In [None]:
# Bar chart of each model's `score_val` from the leaderboard, one bar per
# model, to compare performance.
model_scores = predictor.leaderboard(silent=True)
model_scores.plot(kind="bar", x="model", y="score_val")

Sagemaker Jumpstart : https://docs.aws.amazon.com/sagemaker/latest/dg/studio-jumpstart.html