## Deployment.ipynb
<hr>

### File containing all the deployment code

This notebook contains the contents of each file required to deploy the model, which predicts a volcano's time_to_eruption, on the Heroku platform.

The required files are listed below, each with its content in the corresponding code cell.

#### 1) app.py
Contains the main streamlit python app code

In [None]:
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
from helper import *
from model import *
from predict import *


def _add_vertical_space(lines):
    """Emit `lines` empty text elements, used as vertical padding on the page."""
    for _ in range(lines):
        st.text("")


def main():
    """
    Build the Streamlit page.

    The page shows a title image and an "About" sidebar (read from
    ``about.txt``), usage instructions and a CSV uploader. Once a file is
    uploaded it displays the first rows, the predicted time to eruption
    (raw value and hours/minutes) and, on request, a plot of every column.
    """
    st.set_page_config(layout="wide")

    st.title("Volcanic Eruption Prediction from Seismic Signals")
    image_url = "https://images.unsplash.com/photo-1519901416153-b3ea11f3fcc6?ixlib=rb-1.2.1&ixid=MXwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHw%3D&auto=format&fit=crop&w=1350&q=80"
    st.image(image_url, width=800)

    st.sidebar.header("About")

    # Explicit encoding so the text does not depend on the host locale.
    with open('about.txt', 'r', encoding='utf-8') as f:
        about_txt = f.read()

    st.sidebar.write(about_txt)

    _add_vertical_space(3)

    exp = st.beta_expander("Instructions : ", expanded=True)
    with exp:
        # Written explicitly instead of relying on Streamlit "magic" bare
        # strings, which silently vanish when magic is disabled.
        st.write("1. Please upload a csv file in the file uploader.")
        st.write("2. File should contain the 10-D sensor data.")
        st.write("3. Visualize the time series by toggling appropriate options.")

    _add_vertical_space(4)

    st.subheader("File Uploader :")
    csv = st.file_uploader("Upload a segment file containing the seismic data from 10 sensors", type=['csv'])

    if csv is not None:
        _add_vertical_space(3)
        st.subheader("First few entries of the file : ")
        csv_file = pd.read_csv(csv)
        st.write(csv_file.head())
        pred = predict_time_to_erupt(csv_file)

        _add_vertical_space(2)

        col_1, col_2 = st.beta_columns(2)
        col_2.subheader('Visualize')
        visualize = col_2.radio('', ['No', 'Yes'])

        col_1.subheader("Time to eruption")
        col_1.text("{} centi-seconds".format(str(pred)))
        hm = ms_hm(pred)
        col_1.text('Which is approximately {} hours {} minutes'.format(hm['hours'], hm['minutes']))

        if visualize == 'Yes':
            with st.spinner(text='Please wait while we plot'):
                axes = csv_file.plot(subplots=True, layout=(5, 2), figsize=(20, 10), title="Sensor data for the given segment")
                # With subplots=True pandas returns an array of Axes that all
                # belong to one Figure; render that Figure explicitly rather
                # than whatever pyplot currently considers "current".
                fig = axes.ravel()[0].get_figure()
                try:
                    _add_vertical_space(2)
                    st.pyplot(fig)
                finally:
                    # Streamlit does not clear an explicitly passed figure, so
                    # close it here; otherwise every rerun keeps one more
                    # figure alive in the server process.
                    plt.close(fig)


if __name__ == "__main__":
    main()

#### 1.1. helper.py
Contains a small helper function to convert centiseconds to hours and minutes

In [None]:
def ms_hm(value):
    """
    Convert a duration given in units of 10 milliseconds (centiseconds)
    into whole hours and minutes.

    Parameters
    ----------
    value : int or float
        Duration in centiseconds.

    Returns
    -------
    dict
        ``{'hours': int, 'minutes': int}``. Fractions of a second and left-over
        seconds are discarded. For a negative duration both fields are
        truncated toward zero (and are therefore <= 0) instead of mixing
        truncated hours with floor-modulo minutes.
    """
    # centiseconds -> milliseconds -> whole seconds (int() truncates toward zero)
    total_secs = int((10 * value) / 1000)

    # Work on the magnitude and re-apply the sign so hours and minutes agree.
    sign = -1 if total_secs < 0 else 1
    hours, leftover_secs = divmod(abs(total_secs), 3600)
    minutes = leftover_secs // 60

    return {'hours': sign * hours,
            'minutes': sign * minutes}

#### 1.2. model.py
This file contains the model definition (the final model that will be used for deployment)

In [None]:
import numpy as np

class EnsembleRegressor():
    """
    Custom two-stage ensemble regressor.

    Training procedure (see ``fit``):
        * the train set is split into D1 and D2 (``oob_size`` is the share
          given to D2, 50-50 by default)
        * ``n_learners`` subsets d1, d2, ... dk are drawn from D1; each one
          holds ``max_samples * len(D1)`` rows picked with ``random.sample``,
          i.e. WITHOUT replacement inside a subset
        * one DecisionTreeRegressor is trained on each of the k subsets
          (k can be considered a hyperparameter)
        * the set-aside D2 is passed through the k trained trees to obtain a
          k-dimensional feature set
        * a meta-learner is trained on that feature set with the D2 targets.
          The meta-learner is the actual model; the base learners act as
          feature extractors.

    Parameters
    ----------
    n_learners : int
        Number of base decision trees.
    meta_learner : None, str or estimator
        ``None``/``'decision_tree'``, ``'random_forest'``, ``'xgboost'``,
        ``'svr'``, ``'kernel_ridge'`` or ``'bayesian_ridge'``; alternatively
        any object exposing ``fit`` and ``predict``.
    oob_size : float
        Fraction of the training data held out as D2.
    max_sample_ratio : float or None
        Fraction of D1 drawn for each base learner (0.2 when ``None``).
    meta_rs : bool
        When true, tune the meta-learner with ``RandomizedSearchCV``.
    meta_params : dict or None
        Parameter distributions for the search; required when ``meta_rs``.

    Raises
    ------
    ValueError
        If ``meta_learner`` is not recognised, or ``meta_rs`` is set without
        a ``meta_params`` dictionary.
    """

    def __init__(self, n_learners = 10, meta_learner = None, oob_size=0.5, max_sample_ratio=None, meta_rs=False, meta_params=None):
        # Imported lazily so that merely importing this module (e.g. to
        # unpickle an already trained model) does not require every optional
        # meta-learner backend to be importable.
        from sklearn.tree import DecisionTreeRegressor

        self.n_learners = n_learners
        self.oob_size = oob_size
        self.max_samples = max_sample_ratio if max_sample_ratio is not None else 0.2
        self.tree_list = [DecisionTreeRegressor() for _ in range(self.n_learners)]

        self.meta_rs = meta_rs
        self.meta_learner = self._build_meta_learner(meta_learner)

        if self.meta_rs:
            if not isinstance(meta_params, dict):
                raise ValueError("Hyperparameter Search Mode requires a dictionary of parameters")
            from sklearn.model_selection import RandomizedSearchCV
            self.meta_params = meta_params
            self.rs_obj = RandomizedSearchCV(self.meta_learner, self.meta_params, cv=5, n_iter=3, n_jobs=3)

    @staticmethod
    def _build_meta_learner(meta_learner):
        """Return a fresh meta-learner for the given name, or the given estimator itself."""
        if meta_learner is None or (isinstance(meta_learner, str) and meta_learner == 'decision_tree'):
            from sklearn.tree import DecisionTreeRegressor
            return DecisionTreeRegressor()

        if isinstance(meta_learner, str):
            if meta_learner == 'random_forest':
                from sklearn.ensemble import RandomForestRegressor
                return RandomForestRegressor()
            if meta_learner == 'xgboost':
                from xgboost import XGBRegressor
                return XGBRegressor()
            if meta_learner == 'svr':
                from sklearn.svm import SVR
                return SVR()
            if meta_learner == 'kernel_ridge':
                from sklearn.kernel_ridge import KernelRidge
                return KernelRidge()
            if meta_learner == 'bayesian_ridge':
                from sklearn.linear_model import BayesianRidge
                return BayesianRidge()
        elif hasattr(meta_learner, 'fit') and hasattr(meta_learner, 'predict'):
            # A ready-made estimator instance was supplied.
            return meta_learner

        # Fail here instead of leaving the attribute unset and crashing later in fit().
        raise ValueError("Unknown meta_learner: {!r}".format(meta_learner))

    def _create_sample(self, X, y, fraction):
        """
        Draw ``int(fraction * len(X))`` distinct rows of X with their targets.

        pandas inputs are converted to NumPy first so that the drawn integers
        are always used as positions (a Series would treat them as labels).
        """
        import random

        X = X.to_numpy() if hasattr(X, 'to_numpy') else X
        y = y.to_numpy() if hasattr(y, 'to_numpy') else y
        indices = random.sample(range(len(X)), int(fraction*len(X)))
        return X[indices], y[indices]

    def fit(self, X, y):
        """
        Train the base trees on subsets of D1 and the meta-learner on their
        predictions for D2. Returns ``self``.
        """
        from sklearn.model_selection import train_test_split

        X_D1, X_D2, y_D1, y_D2 = train_test_split(X, y, test_size=self.oob_size)

        D2_predlist = []
        for tree in self.tree_list[:self.n_learners]:
            X_temp, y_temp = self._create_sample(X_D1, y_D1, self.max_samples)
            tree.fit(X_temp, y_temp)
            D2_predlist.append(tree.predict(X_D2))

        # One column per base learner: shape (len(D2), n_learners).
        new_feature_set = np.stack(D2_predlist, axis=1)

        if self.meta_rs:
            self.rs_obj.fit(new_feature_set, y_D2)
            self.meta_learner = self.rs_obj.best_estimator_

        # Kept unconditional as in the original flow (after a search this
        # fits the selected estimator once more on the same data).
        self.meta_learner.fit(new_feature_set, y_D2)
        return self

    def predict(self, X):
        """Predict by feeding the base trees' predictions for X to the meta-learner."""
        base_preds = [tree.predict(X) for tree in self.tree_list[:self.n_learners]]
        new_feature_set = np.stack(base_preds, axis=1)
        return self.meta_learner.predict(new_feature_set)

#### 1.3. predict.py
Contains the two final functions: prediction & scoring

In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import mean_absolute_error as mae
from model import *

# Pickled estimator that is loaded for every prediction.
_MODEL_PATH = 'custEnsemblexgb.pkl'

# Process-wide cache so the pickle is read once instead of on every call.
_ESTIMATOR_CACHE = {}


def _extract_features(seg_df):
    """
    Turn one segment into a single feature row of shape (1, 8 * n_columns).

    Missing values are replaced by 0; then, for every column in order, the
    row receives: std, min, max, the 0.3/0.6/0.8/0.9 quantiles and kurtosis.
    The order must stay fixed because the model is fed positional features.
    """
    seg_df = seg_df.fillna(0)

    each_row = []
    for each_column in seg_df.columns:
        series = seg_df[each_column]
        each_row.extend([
            series.std(),
            series.min(),
            series.max(),
            series.quantile(.3),
            series.quantile(.6),
            series.quantile(.8),
            series.quantile(.9),
            series.kurt(),
        ])

    return np.array(each_row).reshape(1, -1)


def _load_estimator():
    """Return the trained estimator, unpickling it on first use only."""
    if _MODEL_PATH not in _ESTIMATOR_CACHE:
        # NOTE: pickle executes code while loading; only ever point
        # _MODEL_PATH at a model file that ships with the application.
        with open(_MODEL_PATH, 'rb') as f:
            _ESTIMATOR_CACHE[_MODEL_PATH] = pickle.load(f)
    return _ESTIMATOR_CACHE[_MODEL_PATH]


def predict_time_to_erupt(seg_df):
    """
    Predict the time to eruption for one segment.

    Parameters
    ----------
    seg_df : pandas.DataFrame
        Segment data; each column is summarised into 8 statistics.

    Returns
    -------
    The single value predicted by the pickled estimator for this segment.
    """
    features = _extract_features(seg_df)
    preds = _load_estimator().predict(features)
    return preds[0]


def return_mae(seg_df, y):
    """
    Return the mean absolute error between the prediction for one segment
    and its true target.

    Parameters
    ----------
    seg_df : pandas.DataFrame
        Segment data, as for ``predict_time_to_erupt``.
    y : scalar or one-element array-like
        True target value of the segment.
    """
    features = _extract_features(seg_df)
    preds = _load_estimator().predict(features)

    # scikit-learn metrics need array-likes; a bare scalar is rejected, so
    # wrap both sides as 1-D arrays and pass them as (y_true, y_pred).
    return mae(np.atleast_1d(y), np.atleast_1d(preds[0]))

#### 2) setup.sh
In this file we specify the commands that create the `~/.streamlit` directory and write the server initialization parameters to its config file.

In [None]:
# Create the per-user Streamlit configuration directory (no error if it exists).
mkdir -p ~/.streamlit/

# Write the server settings. A here-document is used instead of
# `echo "...\n..."` because whether echo expands "\n" differs between
# shells; this form produces real newlines everywhere.
# $PORT is expanded by the shell when the file is written.
cat > ~/.streamlit/config.toml <<EOF
[server]
port = $PORT
enableCORS = false
headless = true
EOF

#### 3) Procfile
Required to start the web app: it runs the setup script and then the Streamlit app

In [None]:
web: sh setup.sh && streamlit run app.py

#### 4) runtime.txt 
For specifying the python version, for environment creation

In [None]:
python-3.7.10

#### 5) requirements.txt
For specifying the version requirement for other dependencies

In [None]:
streamlit==0.80.0
pandas==1.1.0
numpy==1.18.5
scikit-learn==0.22.2.post1
matplotlib==3.3.0
xgboost==1.1.1