<a href="https://colab.research.google.com/github/visiont3lab/iot-app/blob/master/notebooks/Regressione_Steps.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Widgets
# Text Widget: https://ipywidgets.readthedocs.io/en/latest/examples/Widget%20List.html
# print(dir(widget))
import ipywidgets as widgets
from ipywidgets import GridspecLayout, AppLayout, Button, Layout,Output

# Machine Learning
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV,train_test_split,KFold,cross_val_score
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model
from sklearn.metrics import mean_squared_error,mean_absolute_error,  median_absolute_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.linear_model import RANSACRegressor, SGDRegressor, HuberRegressor, TheilSenRegressor,LinearRegression,Lasso, Ridge,ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor,GradientBoostingRegressor,ExtraTreesRegressor,RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn import datasets

# Dataset
import pandas as pd

# Math
import numpy as np

# Visualization
%load_ext google.colab.data_table
from google.colab import data_table
import seaborn as sn
import matplotlib.pyplot as plt
import random
import plotly.graph_objects as go
import plotly.express as px

# Saving
import pickle
import json

  import pandas.util.testing as tm


In [None]:
def plot_fig(Ys, names, title="Train Results"):
    """Build a dark-themed Plotly line chart that overlays several series.

    Args:
        Ys: Sequence of series to draw, e.g. ``[Y_real, Y_pred]``. The length
            of the first series defines the shared x axis.
        names: Legend labels, paired positionally with ``Ys`` (pairing stops
            at the shorter of the two, as ``zip`` does).
        title: Text centred above the chart. Defaults to the previously
            hard-coded "Train Results".

    Returns:
        The assembled ``go.Figure``. With an empty ``Ys`` the figure carries
        the layout only and no traces.
    """
    # Sample indices 0..N-1. np.linspace(0, N, N, dtype=int) truncated its
    # fractional steps, so one index was skipped and the last point sat on N.
    n = np.arange(len(Ys[0])) if len(Ys) else np.arange(0)

    fig = go.Figure()
    for yh, nm in zip(Ys, names):
        fig.add_trace(go.Scatter(x=n, y=yh, mode='lines', name=nm))

    fig.update_layout(
        hovermode="x",
        paper_bgcolor="rgb(0,0,0)",
        plot_bgcolor="rgb(10,10,10)",
        title=dict(
            x=0.5,
            text=title,
            font=dict(
                size=20,
                color="rgb(255,255,255)",
            ),
        ),
    )
    return fig

def validate(Y_test, Y_pred, name):
    """Compute regression error metrics for one set of predictions.

    Args:
        Y_test: Ground-truth target values.
        Y_pred: Predicted values, aligned with ``Y_test``.
        name: Label of the evaluated split (e.g. "Training"); echoed back in
            the result so several reports can be told apart.

    Returns:
        A dict with the keys ``"name"``, ``"mse"``, ``"rmse"``, ``"mae"`` and
        ``"medae"``. Previously the metrics were computed and then discarded.
    """
    mse = mean_squared_error(Y_test, Y_pred)
    return {
        "name": name,
        "mse": mse,
        "rmse": np.sqrt(mse),
        "mae": mean_absolute_error(Y_test, Y_pred),
        "medae": median_absolute_error(Y_test, Y_pred),
    }

def compare_models(X, Y, folds=5, seed=5, metric="neg_mean_squared_error"):
    """Cross-validate a set of regressors and return a box plot of their scores.

    Every candidate is wrapped in a pipeline of StandardScaler, PCA (keeping
    98% of the variance) and the regressor itself. The pipeline is refitted
    on the training part of each fold, so the held-out part never influences
    the scaler or the PCA. Fitting them once on the whole dataset, as was
    done before, leaks test information into every fold.

    Args:
        X: Feature matrix.
        Y: Target values aligned with the rows of ``X``.
        folds: Number of K-fold splits (5 keeps 20% of the rows for testing
            in each round, 10 keeps 10%).
        seed: Random state of the shuffled K-fold split.
        metric: Scoring name handed to ``cross_val_score``.

    Returns:
        A ``go.Figure`` with one box trace per model, built from that model's
        per-fold scores.
    """
    # Candidate regressors. RANSACRegressor and SGDRegressor are deliberately
    # not part of the comparison (they were disabled in the original list).
    models = {
        "Linear": LinearRegression(),
        "Huber": HuberRegressor(max_iter=1000),
        "TheilSen": TheilSenRegressor(),
        "Ridge": Ridge(),
        "Lasso": Lasso(),
        "ElasticNet": ElasticNet(),
        "KNN": KNeighborsRegressor(n_neighbors=5),
        "DecisionTree": DecisionTreeRegressor(),
        "SVR": SVR(gamma="auto"),
        "AdaBoost": AdaBoostRegressor(n_estimators=50),
        "GradientBoost": GradientBoostingRegressor(n_estimators=100),
        "RandomForest": RandomForestRegressor(n_estimators=100),
        "ExtraTrees": ExtraTreesRegressor(n_estimators=100),
    }

    # An integer random_state makes every split() call yield the same folds,
    # so one splitter can be shared and all models see identical partitions.
    k_fold = KFold(n_splits=folds, random_state=seed, shuffle=True)

    fig = go.Figure()
    for model_name, model in models.items():
        candidate = Pipeline([
            ("sc", StandardScaler()),
            ("pca", PCA(n_components=0.98)),
            ("reg", model),
        ])
        results = cross_val_score(candidate, X, Y, cv=k_fold, scoring=metric)
        fig.add_trace(go.Box(y=results, name=model_name, boxpoints='all'))
    return fig

def train(X,Y,selected="Linear", modelName='best_model.sav'):
    """Fit the chosen regressor on all of (X, Y), pickle it and plot the fit.

    Args:
        X: Feature matrix used for fitting.
        Y: Target values aligned with the rows of ``X``.
        selected: Key of the regressor to train (one of the names in the
            table below).
        modelName: Path of the file the fitted pipeline is pickled to.

    Returns:
        The figure produced by ``plot_fig`` comparing ``Y`` with the
        predictions of the fitted pipeline on the same ``X``.

    Raises:
        KeyError: If ``selected`` is not a known model name.
    """
    # Constructors rather than instances: only the requested estimator is
    # built. Huber needs a non-default iteration budget, hence the lambda.
    model_factories = {
        "Linear": LinearRegression,
        "Huber": lambda: HuberRegressor(max_iter=1000),
        "TheilSen": TheilSenRegressor,
        "Ridge": Ridge,
        "Lasso": Lasso,
        "ElasticNet": ElasticNet,
        "KNN": KNeighborsRegressor,
        "DecisionTree": DecisionTreeRegressor,
        "SVR": SVR,
        "AdaBoost": AdaBoostRegressor,
        "GradientBoost": GradientBoostingRegressor,
        "RandomForest": RandomForestRegressor,
        "ExtraTrees": ExtraTreesRegressor,
    }
    best_model = model_factories[selected]()

    # Single-step regression pipeline.
    # NOTE(review): compare_models scores candidates on scaled, PCA-reduced
    # features, whereas this pipeline fits on the raw ones - confirm that the
    # difference is intended.
    pipeline = Pipeline([
        ("reg", best_model),
    ])
    pipeline.fit(X, Y)

    # Predictions on the training data itself (there is no held-out split).
    Y_pred = pipeline.predict(X)

    # The context manager closes the file even if pickling fails.
    with open(modelName, 'wb') as model_file:
        pickle.dump(pipeline, model_file)

    return plot_fig([Y, Y_pred], ["Train Real", "Train Predicted"])

def apply_model(X,modelName='best_model.sav'):
    """Load a pickled model from disk and return its predictions for ``X``.

    Args:
        X: Input passed unchanged to the loaded object's ``predict`` method.
        modelName: Path of the pickle file to load.

    Returns:
        Whatever ``predict(X)`` of the unpickled object returns.

    Raises:
        FileNotFoundError: If ``modelName`` does not exist.
    """
    # NOTE: unpickling runs arbitrary code embedded in the file; only load
    # model files that come from a trusted source.
    with open(modelName, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    return loaded_model.predict(X)

# Ipywidget

In [None]:
# Step 1 - load the dataset and preview it.
# `table` is the output area that hosts the interactive data preview.
table = Output()


def data_btn_eventhandler(obj):
    """Load the Boston housing data, publish it as X/Y and show the table.

    Click callback of the "1. Load Data" button. It rebinds the module-level
    names ``X`` (all columns but the last) and ``Y`` (the last column,
    ``target_price``) that the other button handlers read.

    Args:
        obj: The clicked button (unused).
    """
    global X, Y
    table.clear_output()

    # NOTE(review): load_boston was removed in scikit-learn 1.2 - confirm the
    # scikit-learn version this notebook is expected to run on.
    boston = datasets.load_boston()
    frame = pd.DataFrame(boston.data, columns=boston.feature_names)
    frame["target_price"] = boston.target
    frame = frame.dropna()

    X, Y = frame.iloc[:, 0:-1].values, frame.iloc[:, -1].values

    #df = pd.read_csv("https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-andamento-nazionale/dpc-covid19-ita-andamento-nazionale.csv")
    preview = data_table.DataTable(frame, include_index=False, num_rows_per_page=10)
    with table:
        display(preview)


data_btn = Button(
    description="1. Load Data",
    button_style="info",
    layout=Layout(height='auto', width='auto'),
)
data_btn.on_click(data_btn_eventhandler)

# Step 2 - compare the candidate models.
# Output area that receives the cross-validation box plot.
compare_models_image = Output()


def compare_btn_eventhandler(obj):
    """Run the model comparison on the loaded data and show the box plot.

    Click callback of the "2. Find Best Model" button. It reads the
    module-level ``X`` and ``Y``; when they do not exist yet, a hint is shown
    in the output area instead of raising NameError inside the callback.

    Args:
        obj: The clicked button (unused).
    """
    compare_models_image.clear_output()

    # X and Y are only created by the load-data handler.
    namespace = globals()
    if "X" not in namespace or "Y" not in namespace:
        with compare_models_image:
            print("No data loaded yet: click '1. Load Data' first.")
        return

    fig_compare_models = compare_models(X, Y)
    with compare_models_image:
        display(fig_compare_models)


compare_btn = Button(description="2. Find Best Model", button_style="info", layout=Layout(height='auto', width='auto'))
compare_btn.on_click(compare_btn_eventhandler)

# Step 3 - train the selected model.
# Output area that receives the real-vs-predicted training plot.
train_image = Output()


def train_btn_eventhandler(obj):
    """Train the model picked in the radio buttons and show the fit plot.

    Click callback of the "3. Train" button. The model key comes from
    ``train_model_choice`` and the pickle path from ``model_name_text``. It
    reads the module-level ``X`` and ``Y``; when they do not exist yet, a
    hint is shown in the output area instead of raising NameError inside the
    callback.

    Args:
        obj: The clicked button (unused).
    """
    train_image.clear_output()

    # X and Y are only created by the load-data handler.
    namespace = globals()
    if "X" not in namespace or "Y" not in namespace:
        with train_image:
            print("No data loaded yet: click '1. Load Data' first.")
        return

    chosen_model = train_model_choice.value
    fig_train = train(X, Y, selected=chosen_model, modelName=model_name_text.value)
    with train_image:
        display(fig_train)


train_btn = Button(description="3. Train", button_style="info", layout=Layout(height='auto', width='50%'))
train_btn.on_click(train_btn_eventhandler)

# Output area that receives the real-vs-predicted plot of the saved model.
apply_model_image = Output()


def apply_model_btn_eventhandler(obj):
    """Apply the saved model to a random 80% sample of freshly loaded data.

    Click callback of the "Indipendente Apply" button. The dataset is loaded
    here and the features/target are taken from that local frame, so the
    handler no longer depends on the ``X``/``Y`` globals set by the load-data
    button (the frame used to be built and then ignored).

    Args:
        obj: The clicked button (unused).
    """
    apply_model_image.clear_output()

    # NOTE(review): load_boston was removed in scikit-learn 1.2 - confirm the
    # scikit-learn version this notebook is expected to run on.
    boston = datasets.load_boston()
    df = pd.DataFrame(boston.data, columns=boston.feature_names)
    df["target_price"] = boston.target
    df = df.dropna()
    features = df.iloc[:, 0:-1].values
    target = df.iloc[:, -1].values

    # Only the test part of the split is needed; it is not seeded, so every
    # click evaluates a different sample.
    _, X_test, _, Y_test = train_test_split(features, target, test_size=0.80, shuffle=True)

    model_path = model_name_text.value
    try:
        Y_pred = apply_model(X_test, modelName=model_path)
    except FileNotFoundError:
        # The model file only exists after a training run.
        with apply_model_image:
            print("Model file '{}' not found: train a model first.".format(model_path))
        return

    fig_res = plot_fig([Y_test, Y_pred], ["Real", "Predicted"])
    with apply_model_image:
        display(fig_res)


apply_model_btn = Button(description="Indipendente Apply", button_style="info", layout=Layout(height='auto', width='auto'))
apply_model_btn.on_click(apply_model_btn_eventhandler)

# Regressors offered in the UI. The selected label is passed to train() as
# `selected`, so each entry has to be a key of its model table.
model_list = [
    "Linear",
    "Huber",
    "TheilSen",
    "Ridge",
    "Lasso",
    "ElasticNet",
    "KNN",
    "DecisionTree",
    "SVR",
    "AdaBoost",
    "GradientBoost",
    "RandomForest",
    "ExtraTrees",
]
train_model_choice = widgets.RadioButtons(options=model_list)

#dataset_url = widgets.Textarea(value='', placeholder='Paste CSV Url Dataset', disabled=False,layout=Layout(height='auto', width='auto'))
# File name used both for saving the trained model and for loading it again.
model_name_text = widgets.Textarea(
    value='best_model.sav',
    placeholder='Model Name',
    disabled=False,
    layout=Layout(height='auto', width='50%'),
)

# v0: training row (model picker, train button, model file name).
v0 = widgets.HBox([train_model_choice, train_btn, model_name_text])

# v1: left-hand control column (30% wide) with the buttons in workflow order.
v1 = widgets.VBox(
    [data_btn, compare_btn, v0, apply_model_btn],
    layout=Layout(height='auto', width='30%'),
)

# v2: right-hand result column (70% wide) stacking the four output areas.
v2 = widgets.VBox(
    [table, compare_models_image, train_image, apply_model_image],
    layout=Layout(height='auto', width='70%'),
)

# Render controls and results side by side.
display(widgets.HBox([v1, v2]))

HBox(children=(VBox(children=(Button(button_style='info', description='1. Load Data', layout=Layout(height='au…