# -------------------------------------- Sentosa Starter Pack ------------------------------------

### Table of contents

* [0. Initial Setup](#0.InitialSetup)

* [1. Basic-Utils](#1.Basic-Utils)

* [2. Sentosa-Utils](#2.Sentosa-Utils)

* [3. Machine Learning-Utils](#3.MachineLearning-Utils)

* [4. How to use the utils](#4.Launcher)
   

### 0. Initial Setup <a class="anchor" id="0.InitialSetup"></a>

In [5]:
import datetime
import statistics
from math import sin, cos, sqrt, atan2, radians

import descartes
import folium
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import pylab
import seaborn as sns

#Machine learning packages
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.preprocessing import StandardScaler

In [1]:
# Notebook-wide settings consumed by the utility functions below.
config = dict(
    # Columns that get_tidy_dates converts to datetime.
    TIDY_DATES=['Open Timestamp'],
    # Name of the column being predicted.
    TARGET_VAR="",
    # Identifier column that must not be used as a model feature.
    ID_VAR="",
    # Columns to exclude from model training.
    DROP_LIST=[],
)

### 1. Basic-Utils <a class="anchor" id="1.Basic-Utils"></a>

In [21]:
def get_tidy_dates(df, config):
    """
    Parse the configured date columns into datetime values
    Inputs      : df     = dataframe holding the raw date strings
                : config = settings dict; config['TIDY_DATES'] lists the columns to parse
    outputs     : the same dataframe object, with the listed columns converted in place
    """

    date_columns = config['TIDY_DATES']

    # Day/month/year, then the AM/PM marker, then a 12-hour clock time.
    parsed = df[date_columns].apply(pd.to_datetime, format='%d/%m/%Y %p %I:%M:%S')
    df[date_columns] = parsed

    return df

In [18]:
def get_hist(df,X,HUE=None,TITLE=None):
    """
    Draw a step-style histogram for one column of a dataframe
    Inputs      : df     = dataframe containing the column to plot
                : X      = name of the column whose distribution is drawn
                : HUE    = [Optional] category column used to split the histogram into series
                : TITLE  = [Optional] chart title
    outputs     : None; the histogram is drawn on a new 14x6 matplotlib figure
    """

    plt.figure(figsize=(14, 6))

    histogram_options = {'data': df, 'x': X, 'hue': HUE, 'element': 'step'}
    sns.histplot(**histogram_options)

    plt.title(TITLE, fontsize=15)
    

In [22]:
def get_line(df,X,Y,HUE=None,TITLE=None):
    """
    Draw a line chart of one column against another
    Inputs      : df     = dataframe containing the columns to plot
                : X      = column name for the x-axis (typically a date column)
                : Y      = column name for the y-axis
                : HUE    = [Optional] category column used to split the chart into several lines
                : TITLE  = [Optional] chart title
    outputs     : None; the chart is drawn on a new 20x6 matplotlib figure
    """

    plt.figure(figsize=(20, 6))

    line_options = {'data': df, 'x': X, 'y': Y, 'hue': HUE}
    sns.lineplot(**line_options)

    plt.title(TITLE, fontsize=15)
    

In [23]:
def get_bar(df,X,HUE=None,TITLE=None):
    """
    Draw a count (bar) chart for one column of a dataframe
    Inputs      : df     = dataframe containing the column to plot
                : X      = name of the column whose value counts are drawn
                : HUE    = [Optional] category column used to split the bars into series
                : TITLE  = [Optional] chart title
    outputs     : None; the chart is drawn on a new 12x5 matplotlib figure
    """

    plt.figure(figsize=(12, 5))

    bar_options = {'data': df, 'x': X, 'hue': HUE, 'color': 'Salmon'}
    sns.countplot(**bar_options)

    plt.title(TITLE, fontsize=15)

In [24]:
def plot_scatter(x,y):
    """
    Draw a scatter plot of two sequences of values
    Inputs      : x      = values for the x-axis
                : y      = values for the y-axis
    outputs     : None; the scatter plot is drawn on a new 14x6 matplotlib figure
    """

    plt.figure(figsize=(14, 6))
    plt.scatter(x, y)

    # Axis captions are the literal strings "x" and "y".
    for set_label, caption in ((plt.xlabel, "x"), (plt.ylabel, "y")):
        set_label(caption)

### 2. Sentosa-Utils <a class="anchor" id="2.Sentosa-Utils"></a>

In [27]:
def get_time_of_day(df):

    """
    Label every row with a time-of-day bucket derived from 'Open Timestamp'
    Inputs      : df      = dataframe with a datetime column named 'Open Timestamp'
    outputs     : the same dataframe, with a new 'TIME_OF_DAY' column

    Buckets by hour: Morning 7-11, Afternoon 12-15, Evening 16-22, Night otherwise.
    """

    hour = df['Open Timestamp'].dt.hour

    # The buckets do not overlap, so the order of the conditions is irrelevant.
    conditions = [
        (hour >= 16) & (hour < 23),
        (hour >= 7) & (hour < 12),
        (hour >= 12) & (hour < 16),
    ]
    labels = ["Evening", "Morning", "Afternoon"]

    df['TIME_OF_DAY'] = np.select(conditions, labels, default="Night")

    return df

In [3]:
def get_zone_attraction_df(df):

    """
    Summarise attractions per zone and score each zone on a 1-10 scale
    Inputs      : df      = attraction_df with columns 'Zone', 'Lat', 'Long', 'Sector' and 'Places/ Location'
    outputs     : zone_df, one row per zone with its mean Lat/Long, the count of places per
                  sector, the weighted 'Total_attraction' and 'Total_attraction_Normalized' (1-10)

    Weights: Attraction 0.4, F&B 0.25, Hotel 0.25, Transport 0.10.
    Zones with no place in any of these sectors get Total_attraction = 0; their per-sector
    count columns stay NaN after the left merge.
    """

    sectors = ['Attraction', 'F&B', 'Hotel', 'Transport']

    # Zone centroid = mean coordinate of every place in the zone.
    zone_df = df.groupby(by='Zone')[['Lat', 'Long']].mean().reset_index()

    sector_counts = df[df.Sector.isin(sectors)].pivot_table(
        index=['Zone'], columns='Sector', values='Places/ Location',
        aggfunc='count', fill_value=0)
    # A sector absent from the data produces no pivot column; add it as zeros so the
    # weighted sum below cannot raise KeyError.
    zone_attraction_df = sector_counts.reindex(columns=sectors, fill_value=0).reset_index()

    zone_attraction_df['Total_attraction'] = (
        zone_attraction_df['Attraction'] * 0.4
        + zone_attraction_df['F&B'] * 0.25
        + zone_attraction_df['Hotel'] * 0.25
        + zone_attraction_df['Transport'] * 0.10
    )

    zone_df = pd.merge(zone_df, zone_attraction_df, on='Zone', how='left')
    zone_df['Total_attraction'] = zone_df['Total_attraction'].fillna(0)

    # Min-max rescale to [1, 10]. When every zone has the same total the span is zero
    # and the division would give NaN, so all zones get the lower bound instead.
    total = zone_df['Total_attraction']
    lowest = total.min()
    span = total.max() - lowest
    if span > 0:
        zone_df['Total_attraction_Normalized'] = (total - lowest) / span * (10 - 1) + 1
    else:
        zone_df['Total_attraction_Normalized'] = 1.0

    return zone_df

In [None]:
def get_zone_map(df):

    """
    Plot the zones on a folium map centred on [1.249, 103.83]
    Inputs      : df      = zone_df with columns 'Lat', 'Long', 'Zone' and 'Total_attraction'
    outputs     : folium map with one dark-green circle per zone; the circle radius is
                  Total_attraction * 15 and the popup shows the zone name
    """

    zone_map = folium.Map(location=[1.249, 103.83], zoom_start=14.458)

    # Iterate row values positionally: the previous `lat[i]` lookups were label based
    # and failed (or picked the wrong row) whenever df's index was not 0..n-1.
    for lat, long, zone, total in zip(df.Lat, df.Long, df.Zone, df.Total_attraction):
        folium.Circle(
            location=[lat, long],
            popup=zone,
            radius=total * 15,
            color='darkgreen',
            fill=True,
            fill_color='darkgreen',
        ).add_to(zone_map)

    #zone_map.save('map.html')
    return zone_map


In [None]:
def get_bus_zone_distance(df1,df2):

    """
    Compute the haversine distance in metres from every bus stop to every zone and tag
    each stop with its closest zone
    Inputs      : df1      = bus_df with one row per bus stop and 'Lat' / 'Long' columns
                : df2      = zone_df with 'Zone', 'Lat', 'Long', 'Attraction', 'F&B',
                             'Total_attraction' and 'Total_attraction_Normalized' columns
    outputs     : new dataframe = df1 plus one distance column per zone (named after the
                  zone), a 'Zone' column holding the nearest zone, and that zone's
                  attraction columns merged in

    Note: the distance columns and 'Zone' are also written onto the df1 object passed in.
    """

    R = 6371  # earth radius in km; distances are converted to metres below

    bus_lat_rad = np.radians(df1.Lat)

    # zip() walks df2 positionally, so a filtered or re-indexed zone frame works too
    # (the earlier `df2.Lat[i]` lookups were label based).
    for zone, zone_lat, zone_long in zip(df2.Zone, df2.Lat, df2.Long):

        dlat = np.radians(df1.Lat - zone_lat)
        dlong = np.radians(df1.Long - zone_long)

        a = np.sin(dlat / 2)**2 + np.cos(np.radians(zone_lat)) * np.cos(bus_lat_rad) * np.sin(dlong / 2)**2
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
        distance = R * c
        df1[zone] = np.round(distance, 3) * 1000

    # The nearest zone is the distance column with the smallest value in each row.
    df1['Zone'] = df1[list(df2.Zone.unique())].idxmin(axis=1)

    zone_columns = ['Zone', 'Attraction', 'F&B', 'Total_attraction', 'Total_attraction_Normalized']
    df1 = pd.merge(df1, df2[zone_columns], on='Zone', how='left')

    return df1

In [None]:
def get_attractiveness_factor_busstop(df):

    """
    Score each bus stop by combining guest demand with the attractiveness of its zone
    Inputs      : df      = bus_df with 'Total_Guests' and 'Total_attraction_Normalized' columns
    outputs     : the same dataframe, with 'Total_Guests_Normalized' and 'Popularity_factor'
                  columns added (both rescaled to 1-10)

    Popularity_factor = 0.6 * normalized guests + 0.4 * normalized attractions, then
    rescaled to 1-10 once more.
    """

    def _rescale_one_to_ten(values):
        # Min-max rescale onto the interval [1, 10].
        lowest = values.min()
        highest = values.max()
        return ((values - lowest) / (highest - lowest)) * (10 - 1) + 1

    df['Total_Guests_Normalized'] = _rescale_one_to_ten(df.Total_Guests)

    guest_part = 0.6 * df['Total_Guests_Normalized']
    attraction_part = 0.4 * df['Total_attraction_Normalized']
    df['Popularity_factor'] = guest_part + attraction_part

    df['Popularity_factor'] = _rescale_one_to_ten(df.Popularity_factor)

    return df

In [None]:
def get_time_related_features(df):
    """
    Add calendar / seasonality features derived from the 'Open Timestamp' column
    Inputs      : df      = Sentosa dataframe with a datetime 'Open Timestamp' column
    outputs     : the same dataframe with these columns added: MONTH_OF_YEAR, DAY_OF_MONTH,
                  WEEK_OF_YEAR (ISO week), YEAR, QUARTER_OF_YEAR, IS_YEAR_START (Jan-Feb),
                  IS_YEAR_END (Nov-Dec) and TIME_OF_DAY

    TIME_OF_DAY buckets by hour: Night 22-5, Morning 6-13, Evening 14-21.
    Note: these boundaries differ from the four buckets written by get_time_of_day, and
    both functions write the same 'TIME_OF_DAY' column.
    """

    stamps = df['Open Timestamp']

    #time seasonality
    df['MONTH_OF_YEAR'] = stamps.dt.month
    df['DAY_OF_MONTH'] = stamps.dt.day
    df['WEEK_OF_YEAR'] = stamps.dt.isocalendar().week
    df['YEAR'] = stamps.dt.year
    df['QUARTER_OF_YEAR'] = stamps.dt.quarter
    df['IS_YEAR_START'] = np.where(df.MONTH_OF_YEAR < 3, 1, 0)
    df['IS_YEAR_END'] = np.where(df.MONTH_OF_YEAR > 10, 1, 0)

    # Kept as a local series so no scratch column is left behind on the caller's frame.
    hour = stamps.dt.hour

    # Night wraps around midnight, so the two conditions are joined with OR; the
    # previous AND (hour >= 22 and hour < 6) could never be true.
    is_night = (hour >= 22) | (hour < 6)
    is_morning = (hour >= 6) & (hour < 14)
    df['TIME_OF_DAY'] = np.where(is_night, "Night", np.where(is_morning, "Morning", "Evening"))

    return df

#### 2.1 For Network Maps of Bus and Routes

In [None]:
def get_network_nodes_edges(bus_df,sdc_df):

    """
    Build the node and edge tables for the bus network graph
    Inputs      : bus_df      = bus stops with 'Bus Stops' and 'Popularity_factor' columns
                : sdc_df      = ridership data with 'Bus Stop', 'next_stop' and 'Total_Guests' columns
    outputs     : (nodes_df, edge_df)
                  nodes_df = unique ('Bus Stops', 'Popularity_factor') pairs
                  edge_df  = one row per from/to stop pair, 'weight' = summed guests rescaled to 0-10
    """

    nodes_df = bus_df[['Bus Stops', 'Popularity_factor']].drop_duplicates().copy()

    # Only rows that carried at least one guest contribute to an edge.
    has_guests = sdc_df['Total_Guests'] >= 1
    guest_flows = (
        sdc_df[has_guests]
        .groupby(by=['Bus Stop', 'next_stop'])['Total_Guests']
        .sum()
        .reset_index()
    )

    column_names = {'Bus Stop': 'from', 'next_stop': 'to', 'Total_Guests': 'weight'}
    edge_df = guest_flows.rename(columns=column_names)

    # Drop self-loops (a stop linked to itself).
    edge_df = edge_df[edge_df['from'] != edge_df['to']]

    lightest = edge_df.weight.min()
    heaviest = edge_df.weight.max()
    edge_df['weight'] = (edge_df.weight - lightest) / (heaviest - lightest) * 10

    return nodes_df, edge_df


In [None]:
def get_pos_dict(df):

    """
    Map every bus stop to its plotting position for the network graph
    Inputs      : df          = dataframe with 'Bus Stops', 'Long' and 'Lat' columns
    outputs     : dictionary {bus stop name: numpy array [Long, Lat]}
    """

    unique_stops = df[['Bus Stops', 'Long', 'Lat']].drop_duplicates().copy()

    # Longitude first: it is the horizontal (x) coordinate on the plot.
    unique_stops['pos'] = list(zip(unique_stops.Long, unique_stops.Lat))

    records_by_stop = (
        unique_stops[['Bus Stops', 'pos']]
        .set_index('Bus Stops')
        .to_dict('index')
    )

    return {stop: np.asarray(record['pos']) for stop, record in records_by_stop.items()}

In [2]:
def plot_network_graph(nodes_df,edge_df,pos_dict):

    """
    Draw the bus network as a directed graph
    Inputs      : nodes_df      = node table with 'Bus Stops' and 'Popularity_factor' columns
                : edge_df       = edge table with 'from', 'to' and 'weight' columns
                : pos_dict      = {bus stop name: position} used to place each node
    outputs     : None; the graph is shown on a 30x30 figure. Node size scales with
                  Popularity_factor and edge width with the edge weight.
    """

    graph = nx.from_pandas_edgelist(edge_df, 'from', 'to', edge_attr='weight', create_using=nx.DiGraph())
    edge_weights = nx.get_edge_attributes(graph, 'weight')

    plt.figure(figsize=(30, 30))

    stop_names = nodes_df['Bus Stops']

    nx.draw_networkx_nodes(
        graph,
        pos_dict,
        nodelist=stop_names,
        node_size=nodes_df['Popularity_factor'] * 10**3,
        cmap=plt.cm.Greys,
        alpha=0.4,
    )
    nx.draw_networkx_edges(
        graph,
        pos_dict,
        edgelist=edge_weights.keys(),
        width=list(edge_weights.values()),
        edge_color='dimgray',
        alpha=1,
    )
    # Every node is labelled with its own stop name.
    nx.draw_networkx_labels(
        graph,
        pos=pos_dict,
        labels=dict(zip(stop_names, stop_names)),
        font_color='black',
        font_size=14,
    )

    plt.box(False)
    plt.show()
    

In [None]:
def get_busstops_guests_hourly(df):

    """
    Total the guests per hour, route, bus stop and weekend flag
    Inputs      : df          = cleaned ridership data with 'Open Timestamp' (datetime), 'Route',
                                'Bus Stop', 'WEEKEND', 'Total_Guests' and 'TIME_OF_DAY' columns
    outputs     : df_hourly, new dataframe with one row per observed hour / route / stop /
                  weekend combination, the summed 'Total_Guests' and an 'HOUR_OF_DAY' column

    Rows whose TIME_OF_DAY is not Morning, Afternoon or Evening are excluded.
    """

    # Filter the frame that was passed in; the previous code read the notebook-level
    # `sdc_df` here and silently ignored the argument.
    daytime_rows = df[df.TIME_OF_DAY.isin(['Morning', 'Afternoon', 'Evening'])]

    # Lower-case 'h' is the hourly alias; upper-case 'H' is deprecated in recent pandas.
    grouping = [pd.Grouper(key='Open Timestamp', freq='h'), 'Route', 'Bus Stop', 'WEEKEND']
    df_hourly = daytime_rows.groupby(grouping)[['Total_Guests']].sum().reset_index()

    df_hourly['HOUR_OF_DAY'] = df_hourly['Open Timestamp'].dt.hour

    return df_hourly

In [None]:
def get_busstops_guests_hourly_distribution(df, bus_route, bus_stop, weekend):

    """
    Summarise and plot the hourly distribution of guests at one bus stop on one route
    Inputs      : df          = df_hourly with 'Route', 'Bus Stop', 'WEEKEND', 'HOUR_OF_DAY'
                                and 'Total_Guests' columns
                : bus_route   = route to plot
                : bus_stop    = bus stop to plot
                : weekend     = WEEKEND flag value to plot (0 = weekday, 1 = weekend)
    outputs     : df_hourly_distribution, per route / stop / weekend / hour statistics
                  (mean, count, std, min, max) plus 'lower_bound' and 'upper_bound', the
                  95% confidence interval of the mean (mean +/- 1.96 * std / sqrt(count))

    A 3x5 grid of histograms (hours 7 to 21) is shown for the requested stop.
    """

    group_keys = ['Route', 'Bus Stop', 'WEEKEND', 'HOUR_OF_DAY']

    #hourly distribution for bus in all routes and all time of day
    df_hourly_distribution = (
        df.groupby(by=group_keys)['Total_Guests']
        .agg(['mean', 'count', 'std', 'min', 'max'])
        .reset_index()
    )

    # Standard error uses sqrt(count). The previous `count**1/2` parsed as
    # (count**1)/2, i.e. count/2, which made the interval far too narrow.
    margin = 1.96 * df_hourly_distribution['std'] / np.sqrt(df_hourly_distribution['count'])
    df_hourly_distribution['lower_bound'] = (df_hourly_distribution['mean'] - margin).round(2)
    df_hourly_distribution['upper_bound'] = (df_hourly_distribution['mean'] + margin).round(2)

    # Both values below are the same for every subplot, so compute them once.
    stop_rows = df[(df['Route'] == bus_route) & (df['Bus Stop'] == bus_stop) & (df['WEEKEND'] == weekend)]
    stop_stats = df_hourly_distribution[
        (df_hourly_distribution['Route'] == bus_route)
        & (df_hourly_distribution['Bus Stop'] == bus_stop)
    ]
    # Shared x-axis limit = largest hourly total seen at this stop on this route.
    max_lim = stop_stats['max'].max()

    #hourly distribution plot for a specific bus stop in a given route
    fig, axes = plt.subplots(ncols=5, nrows=3, figsize=(15, 8))

    for hour, ax in zip(range(7, 22), axes.flat):
        hour_guests = stop_rows[stop_rows.HOUR_OF_DAY == hour]['Total_Guests']
        sns.histplot(hour_guests, ax=ax, kde=True).set_title(
            bus_stop + " - Hour of day: " + str(hour) + ":00")

        # With no matching rows max_lim is NaN, which set_xlim rejects.
        if pd.notna(max_lim) and max_lim > 0:
            ax.set_xlim(0, max_lim)

    plt.tight_layout()
    plt.show()

    return df_hourly_distribution


### 3. Machine Learning-Utils <a class="anchor" id="3.MachineLearning-Utils"></a>

In [9]:
def drop_features(model_df, config):
    """
    Drop the columns that must not be used for prediction
    Inputs      : model_df = dataframe from which columns are dropped
                : config   = settings dict; config['DROP_LIST'] lists the columns to drop
    outputs     : the same dataframe object with the listed columns removed in place

    Columns in DROP_LIST that are not present in the dataframe are skipped.
    """
    # Select the columns that actually exist up front instead of catching every
    # exception: the previous bare `except:` hid real errors and, after the first
    # missing column, applied the remaining drops to a copy only.
    present = [column for column in dict.fromkeys(config['DROP_LIST'])
               if column in model_df.columns]

    if present:
        model_df.drop(columns=present, inplace=True)

    return model_df

In [11]:
def _as_column_list(columns):
    """Return a column spec (None, "", one name, or an iterable of names) as a list."""
    if columns is None or (isinstance(columns, str) and columns == ""):
        return []
    if isinstance(columns, str):
        return [columns]
    return list(columns)


def prepare_model_data(model_df,config):
    """
    Split the model data into predictor and target variables
    Inputs      : model_df = dataframe holding predictors, target and ID columns
                : config   = settings dict; config['TARGET_VAR'] names the target and
                             config['ID_VAR'] the identifier column(s). Each may be a
                             single column name or a list of names; an empty ID_VAR
                             means there is no ID column to remove.
    outputs     : (X, y) where X is model_df without the target and ID columns and
                  y is model_df[config['TARGET_VAR']]
    """
    # Normalise both settings to lists before combining them: `+` on two plain
    # strings would glue them into a single, nonexistent column name.
    excluded = _as_column_list(config['TARGET_VAR']) + _as_column_list(config['ID_VAR'])

    X = model_df.drop(excluded, axis=1)
    y = model_df[config['TARGET_VAR']]
    return X, y
  

In [12]:
def train_test_data(X,y,config):
    """
    Split the data into training and test sets sequentially (no shuffling)
    Inputs      : X         = dataframe with all the predictor variables
                : y         = dataframe with the target variable
                : config    = settings dict; the optional key 'TEST_SIZE' sets the share of
                              rows placed in the test set (default 0.3)
    outputs     : X_train, X_test, y_train, y_test

    Because shuffle=False, the test set is the last block of rows.
    """
    # Fall back to the historical 30% split when config carries no TEST_SIZE
    # (or is not a dict at all).
    test_size = 0.3
    if isinstance(config, dict):
        test_size = config.get('TEST_SIZE', 0.3)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False)

    return X_train, X_test, y_train, y_test

In [14]:
def get_scaled_data(X,y,X_train, X_test, y_train, y_test):
    """
    Standardise the predictor and target data for model training
    Inputs      : X       = predictor/independent variables (full data set)
                : y       = target/dependent variable (full data set)
                : X_train = training dataset for predictor variables
                : X_test  = test dataset for predictor variables
                : y_train = training dataset for target variable
                : y_test  = test dataset for target variable

    outputs     : scalerX, scalery, X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled
                  The scaled frames keep the input column names but get a fresh 0..n-1 index.

    The target may be passed as a Series or a DataFrame; a Series is treated as a
    one-column DataFrame.
    """

    def _as_frame(data):
        # StandardScaler needs 2-D input and the code below reads `.columns`.
        return data.to_frame() if isinstance(data, pd.Series) else data

    y, y_train, y_test = (_as_frame(part) for part in (y, y_train, y_test))

    # NOTE(review): both scalers are fitted on the full X / y, so test-set statistics
    # leak into the scaling. Fitting on X_train / y_train would avoid that but changes
    # the numbers callers currently get - confirm before switching.
    scalerX = StandardScaler().fit(X)
    scalery = StandardScaler().fit(y)

    X_train_scaled = pd.DataFrame(scalerX.transform(X_train), columns=X_train.columns)
    X_test_scaled = pd.DataFrame(scalerX.transform(X_test), columns=X_test.columns)

    y_train_scaled = pd.DataFrame(scalery.transform(y_train), columns=y_train.columns)
    y_test_scaled = pd.DataFrame(scalery.transform(y_test), columns=y_test.columns)

    return scalerX, scalery, X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled

In [17]:
def get_scaled_transform(scalery,y_pred_scaled):

    """
    Convert scaled predictions back to the original value range
    Inputs      : scalery       = fitted scaler of the target variable, as returned by 'get_scaled_data'
                : y_pred_scaled = predictions expressed on the scaled range
    outputs     : predictions expressed on the original range
    """

    return scalery.inverse_transform(y_pred_scaled)


In [15]:
def train_regression_model(X_train,y_train):
    """
    Fit a linear regression model on the training data
    Inputs      : X_train      = training data, independent variables only
                : y_train      = training data, target variable only
    outputs     : the fitted LinearRegression model, usable for prediction
    """

    regressor = LinearRegression()
    regressor.fit(X_train, y_train)

    return regressor


In [4]:
def initiate_param_grid():
    """
    [Optional Function]
    Build the hyperparameter grid used to tune the tree based model
    outputs     : dictionary {parameter name: list of candidate values}
    """

    def _int_steps(start, stop, num):
        # Evenly spaced values truncated to int; with num=1 only `start` is produced.
        return [int(value) for value in np.linspace(start=start, stop=stop, num=num)]

    return {
        'n_estimators': _int_steps(200, 300, 2),
        'max_depth': _int_steps(10, 15, 2),
        'learning_rate': [0.1],
        'min_samples_split': _int_steps(100, 150, 1),
        'min_samples_leaf': _int_steps(60, 150, 1),
        'max_features': ['sqrt'],
        'subsample': list(np.linspace(start=0.5, stop=1.0, num=1)),
    }

In [6]:
def train_classification_model(X_train,y_train,param_grid):
    """
    Tune a gradient boosting classifier with a cross-validated grid search

    Inputs: X_train    = predictor variables used to fit the model
          : y_train    = target variable column (flattened to 1-D before fitting)
          : param_grid = hyperparameter grid, e.g. from initiate_param_grid

    Output: model      = the fitted GridSearchCV object
    """

    base_classifier = GradientBoostingClassifier()
    search = GridSearchCV(base_classifier, param_grid=param_grid, n_jobs=-1)

    flat_target = y_train.values.ravel()
    model = search.fit(X_train, flat_target)

    return model


In [None]:
def get_feature_importance(model, feature_names=None):

    """
    [Optional Function only for tree based models]
    Get feature importance for the trained tree based model
    Inputs      : model         = fitted grid search (anything exposing 'best_estimator_')
                                  or a fitted estimator exposing 'feature_importances_'
                : feature_names = [Optional] names used as the index of the result; when
                                  omitted, the estimator's 'feature_names_in_' is used if it
                                  has one, otherwise a default 0..n-1 index
    outputs     : dataframe with one row per feature and the importance in column 0
    """

    # Accept both a search wrapper and a bare estimator.
    estimator = getattr(model, 'best_estimator_', model)

    # The feature names come from the argument or from the estimator itself rather
    # than from a notebook-level `X`, which need not exist or match the model.
    if feature_names is None:
        feature_names = getattr(estimator, 'feature_names_in_', None)

    variable_importance = pd.DataFrame(estimator.feature_importances_, index=feature_names)

    return variable_importance

In [16]:
def get_prediction(model,X_test):
    """
    Predict on the test data with a trained model
    Inputs      : model        = trained model exposing a 'predict' method
                : X_test       = test data set of independent variables
    outputs     : predicted values returned by the model
    """

    return model.predict(X_test)

In [7]:
def get_model_statistics(y_test,y_pred):
    """
    [Optional Function only for regression models]
    Get MSE, MAE and MAPE for the model based on actual vs predicted values on the test data set
    Inputs      : y_test        = actual target values from the test data set
                : y_pred        = predicted target values for the test data set
    outputs     : dataframe with columns 'Measure' and 'Value', one row each for MSE, MAE and MAPE

    MAPE is expressed in percent. Rows whose actual value is 0 make it infinite or NaN.
    """
    #generating various regression model evaluation stats
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    # Flatten both inputs so Series, one-column DataFrames and (n, 1) arrays are
    # compared element by element and the result is always a single number
    # (np.mean over a DataFrame can return a per-column Series instead).
    actual = np.asarray(y_test, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100

    #Appending the results to a dataframe
    model_results = [['MSE', mse], ['MAE', mae], ['MAPE', mape]]
    model_statistics_df = pd.DataFrame(model_results, columns=['Measure', 'Value'])

    return model_statistics_df

In [None]:
def get_confusion_metrics(y_test,y_pred):
    """
    [Optional Function only for classification models]
    Get the confusion matrix for the model based on actual vs predicted values on the test data set
    Inputs      : y_test        = actual target values from the test data set
                : y_pred        = predicted target values for the test data set
    outputs     : confusion matrix in percent; rows are the actual classes, columns the
                  predicted classes, and every column sums to 100

    The matrix is also drawn as a heatmap on a new 12x4 figure.
    """

    # y_test supplies the rows and y_pred the columns, so the axis names must say
    # 'Actual' for rows and 'Predicted' for columns (they were swapped before).
    # The values themselves are unchanged.
    confusion_matrix = pd.crosstab(
        y_test, y_pred,
        rownames=['Actual'], colnames=['Predicted'],
        normalize='columns',
    ) * 100

    plt.figure(figsize=(12, 4))
    sns.heatmap(confusion_matrix, annot=True, fmt='g', cmap='Blues', linewidths=1, linecolor='lightgrey')

    return confusion_matrix
    


In [None]:
def get_classification_report(y_test, y_pred):
    """
    [Optional Function only for classification models]
    Print the classification report for the model based on actual vs predicted values
    Inputs      : y_test        = actual target values from the test data set
                : y_pred        = predicted target values for the test data set
    outputs     : None; the report (precision, recall, etc.) is printed with 2 decimal digits
    """

    # Call the imported scikit-learn function directly; the earlier
    # `metrics.classification_report` referred to a module that was never imported.
    print(classification_report(y_test, y_pred, digits=2))
    
    

### 4. How to use the utils <a class="anchor" id="4.Launcher"></a>

In [None]:
#--------------------------------------------NON EXHAUSTIVE USAGE LIST---------------------------------------------------------
# End-to-end example of how the utilities above are chained together.
# The statements run top to bottom and each one relies on the variables created before it.


#---------------------------------------------------------READ DATA------------------------------------------------------------

# Read the bus ridership data (the CSV is expected in the working directory).
sdc_df=pd.read_csv("Bus_Ridership_Jan-Mar21_type2.csv")

# Read the list of buses in Sentosa, their routes and the sequence of stops in each route.
bus_df=pd.read_csv("bus_list.csv")


#---------------------------------------------------------PROCESS DATA---------------------------------------------------------

# Use "get_tidy_dates" to convert the date columns to datetime;
# update config['TIDY_DATES'] with the list of columns to be converted.
sdc_df=get_tidy_dates(sdc_df, config)

# Add the 'TIME_OF_DAY' column, which "get_busstops_guests_hourly" filters on.
sdc_df=get_time_of_day(sdc_df)


#------------------------------------------------------- Visualization/Heuristics Model ---------------------------------------

# Create the hourly-level guests dataframe.
df_hourly=get_busstops_guests_hourly(sdc_df)
# Plot the hourly guests distribution for the given route and bus stop (weekend=0 means weekdays).
df_hourly_distribution=get_busstops_guests_hourly_distribution(df_hourly, 'Bus Route A', 'Siloso Point',weekend=0)


#------------------------------------------------------ML Model Build----------------------------------------------------------

# Use "prepare_model_data" to divide the dataframe into dependent and independent variables.
# NOTE(review): model_df is not created anywhere in the visible notebook - it has to be
# built beforehand, and config['TARGET_VAR'] / config['ID_VAR'] must be filled in.
X,y=prepare_model_data(model_df,config)

# Use "train_test_data" to split the entire data set into training and testing sets.
X_train, X_test, y_train, y_test=train_test_data(X,y,config)

# Use "train_regression_model" to build a model using the training data set created above.
model=train_regression_model(X_train,y_train)

# Use "get_prediction" to predict on the test data set with the trained model.
y_pred=get_prediction(model,X_test)

# Use "get_model_statistics" to evaluate the regression model with MSE, MAE and MAPE.
model_statistics_df=get_model_statistics(y_test,y_pred)