# Regression @ H3 Level


In [1]:
import os
import sys

# hex2cec
sys.path.insert(0, "/Users/max/Development/green-last-mile/hex2vec")
from src.data.load_data import load_city_tag, load_filter, load_processed_dataset
from src.data.make_dataset import h3_to_polygon


In [2]:
import numpy as np
import plotly.graph_objects as go
import pandas as pd
import geopandas as gpd
import h3
import gcsfs


## Read in the DataFrame


In [3]:
# reading the hex2vec dataframe
tagged_df = pd.read_pickle(
    "/Users/max/Downloads/shared_navish_dataframes_ALL_Boston_all_loc_route_w_tags.pkl"
)


In [4]:
## Remove Planned Service Time Outliers
# q_99 = tagged_df["planned_service_time_sum"].quantile(0.99)
# tagged_df = tagged_df.loc[tagged_df["planned_service_time_sum"] < q_99]


### Remove Columns without Count & Only for Tags that Hex2Vec Likes

In [5]:
SELECTED_TAGS = [
    "aeroway",
    "amenity",
    "building",
    "healthcare",
    "historic",
    "landuse",
    "leisure",
    "military",
    "natural",
    "office",
    "shop",
    "sport",
    "tourism",
    "water",
    "waterway"
]

# tagged_df.loc[*tagged_df.columns[df.columns.str.startswith(tuple(select_tags))]

### Load the Filter and Create a List of Tag Columns


In [6]:
# find the tag columns. This is hacky.
from pathlib import Path

tag_filter = load_filter(
    Path("/Users/max/Development/green-last-mile/hex2vec/filters/from_wiki.json")
)

filtered_tags = [
    f"{tag}_{sub}"
    for tag, subs in tag_filter.items()
    for sub in subs
    # if f"{tag}_{sub}" in tagged_df.columns
]


In [7]:
tag_columns = tagged_df.columns.intersection(filtered_tags)
tag_columns

Index(['amenity_animal_boarding', 'amenity_animal_breeding',
       'amenity_animal_shelter', 'amenity_arts_centre', 'amenity_atm',
       'amenity_baby_hatch', 'amenity_baking_oven', 'amenity_bank',
       'amenity_bar', 'amenity_bbq',
       ...
       'waterway_pressurised', 'waterway_river', 'waterway_riverbank',
       'waterway_soakhole', 'waterway_stream', 'waterway_tidal_channel',
       'waterway_turning_point', 'waterway_water_point', 'waterway_waterfall',
       'waterway_weir'],
      dtype='object', length=970)

In [8]:
amazon_columns = tagged_df.columns.difference(filtered_tags)
amazon_columns

Index(['city', 'city_zone', 'departure_datetime_mean', 'executor_capacity_cm3',
       'h3', 'has_window_max', 'height_mean', 'lat', 'lng',
       'package_id_nunique', 'planned_service_time_max',
       'planned_service_time_mean', 'planned_service_time_min',
       'planned_service_time_sum', 'route_id', 'status', 'volume_mean',
       'volume_sum', 'width_mean', 'zone_id'],
      dtype='object')

### Remove columns that are all 0

In [9]:
# remove columns that sum to 0
drop_col = tag_columns[(tagged_df[tag_columns] == 0).all(axis=0)]
drop_col

Index(['amenity_animal_breeding', 'amenity_baby_hatch', 'amenity_baking_oven',
       'amenity_boat_sharing', 'amenity_brothel', 'amenity_crematorium',
       'amenity_dog_toilet', 'amenity_embassy', 'amenity_funeral_hall',
       'amenity_gambling',
       ...
       'water_oxbow', 'water_reflecting_pool', 'waterway_fairway',
       'waterway_lock_gate', 'waterway_pressurised', 'waterway_riverbank',
       'waterway_soakhole', 'waterway_tidal_channel', 'waterway_turning_point',
       'waterway_water_point'],
      dtype='object', length=349)

In [10]:
tagged_df = tagged_df[tagged_df.columns.difference(drop_col)]
tag_columns = tag_columns.difference(drop_col)

### Remove Rows with all 0s

In [11]:
print(tagged_df.shape[0])
tagged_df = tagged_df.loc[(tagged_df[tag_columns] != 0).any(axis=1)]
print(tagged_df.shape[0])

136746
136746


## Prepare Data for Regression

In [12]:
tagged_df["planned_service_time_sum_log"] = np.log(tagged_df.planned_service_time_sum)
tagged_df.drop('h3', axis=1, inplace=True)
tagged_df["h3_9"] = tagged_df[["lat", "lng"]].apply(lambda x: h3.geo_to_h3(*x, 9), axis=1, raw=True)

### Group the Data on H3 Level

In [13]:
tagged_df = tagged_df.groupby("h3_9").agg({
    "planned_service_time_sum_log": "mean",
    "h3_9": "count",
    # **{
    #     tag: "first"
    #     for tag in tag_columns
    # }
})


### Filter for only H3 with > X Data Points


In [14]:
tagged_df = tagged_df.loc[tagged_df["h3_9"] > 20].copy()
tagged_df.drop("h3_9", axis=1, inplace=True)

In [15]:
print(tagged_df.shape)
tagged_df.head()

(1081, 1)


Unnamed: 0_level_0,planned_service_time_sum_log
h3_9,Unnamed: 1_level_1
892a30281afffff,4.35676
892a3028c23ffff,4.363356
892a3028c27ffff,4.017006
892a3028c33ffff,4.327947
892a3028c37ffff,4.56702


In [16]:
embedding_df = pd.read_pickle("/Users/max/Development/green-last-mile/hex2vec/boston_embedding.pkl")
embedding_df.columns = [f"embedding_{i}" for i in range(embedding_df.shape[1])]
embedding_df.head()

Unnamed: 0_level_0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_40,embedding_41,embedding_42,embedding_43,embedding_44,embedding_45,embedding_46,embedding_47,embedding_48,embedding_49
h3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
892a302120bffff,0.002941,-0.533973,-0.275134,0.489117,-0.304751,-0.324361,0.311838,-0.19766,-0.039428,-0.296189,...,0.212496,-0.228159,0.171335,-0.090209,-0.028852,-0.076019,0.199774,-0.17684,0.187204,-0.538518
892a302120fffff,0.002941,-0.533973,-0.275134,0.489117,-0.304751,-0.324361,0.311838,-0.19766,-0.039428,-0.296189,...,0.212496,-0.228159,0.171335,-0.090209,-0.028852,-0.076019,0.199774,-0.17684,0.187204,-0.538518
892a302122bffff,0.002941,-0.533973,-0.275134,0.489117,-0.304751,-0.324361,0.311838,-0.19766,-0.039428,-0.296189,...,0.212496,-0.228159,0.171335,-0.090209,-0.028852,-0.076019,0.199774,-0.17684,0.187204,-0.538518
892a3021243ffff,0.002941,-0.533973,-0.275134,0.489117,-0.304751,-0.324361,0.311838,-0.19766,-0.039428,-0.296189,...,0.212496,-0.228159,0.171335,-0.090209,-0.028852,-0.076019,0.199774,-0.17684,0.187204,-0.538518
892a3021247ffff,0.002941,-0.533973,-0.275134,0.489117,-0.304751,-0.324361,0.311838,-0.19766,-0.039428,-0.296189,...,0.212496,-0.228159,0.171335,-0.090209,-0.028852,-0.076019,0.199774,-0.17684,0.187204,-0.538518


In [17]:
tagged_df = tagged_df.merge(embedding_df, left_index=True, right_index=True)

## Create a Feature from Center of Boston

In [18]:
Boston_Center = 42.36221987328882, -71.05721434196927

In [19]:
# tagged_df['dist_2_center'] = tagged_df.reset_index()['h3_9'].apply(lambda x: h3.point_dist(h3.h3_to_geo(x), Boston_Center)).values

In [20]:
import plotly.figure_factory as ff

ff.create_distplot(
    [np.exp(tagged_df.planned_service_time_sum_log), ],
    ['data'],
    show_rug=False
)


In [21]:
import numpy as np

ff.create_distplot(
    [tagged_df.planned_service_time_sum_log, ],
    ['data'],
    show_rug=False,
    bin_size=0.2
)


### Convert the Tag Columns to Binary (if desired)


In [22]:
# tagged_df[tag_columns] = tagged_df[tag_columns].gt(0).astype(np.short)


## Regression Model


In [23]:
from sklearn.model_selection import train_test_split

np.random.seed(22)

# getting rid of all the non-numeral data
df_train, df_test = train_test_split(
    tagged_df.copy(), train_size=0.8, test_size=0.2, random_state=900
)


In [24]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler


scaler = MinMaxScaler()
scaler.fit(tagged_df)

df_train = pd.DataFrame(
    scaler.transform(df_train), index=df_train.index, columns=df_train.columns
)
df_test = pd.DataFrame(
    scaler.transform(df_test), index=df_test.index, columns=df_test.columns
)


In [25]:
# Dividing data into X and y variables
y_train = df_train["planned_service_time_sum_log"]
X_train = df_train[df_train.columns.difference(["planned_service_time_sum_log"])]


In [26]:
# RFE
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, LogisticRegression
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor


In [27]:
from statsmodels.tools.tools import pinv_extended


def build_model(X, y):
    X = sm.add_constant(X)  # Adding the constant
    model = sm.OLS(y, X)
    lm = model.fit()
    # lm = model.fit_regularized(L1_wt=1)  # fitting the l1 model
    # pinv_wexog,_ = pinv_extended(model.wexog)
    # normalized_cov_params = np.dot(pinv_wexog, np.transpose(pinv_wexog))
    # summary = sm.regression.linear_model.OLSResults(model,lm.params,normalized_cov_params)
    # print(lm.summary())  # model summary
    print(lm.summary())
    return X, lm


def checkVIF(X):
    vif = pd.DataFrame()
    vif["Features"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    vif["VIF"] = round(vif["VIF"], 2)
    vif = vif.sort_values(by="VIF", ascending=False)
    return vif


In [33]:
X_train_new, lm = build_model(X_train.astype("float"), y_train.astype("float"))
insig_columns = lm.pvalues.loc[lm.pvalues > 0.05]


                                 OLS Regression Results                                 
Dep. Variable:     planned_service_time_sum_log   R-squared:                       0.446
Model:                                      OLS   Adj. R-squared:                  0.411
Method:                           Least Squares   F-statistic:                     12.89
Date:                          Thu, 08 Sep 2022   Prob (F-statistic):           8.27e-73
Time:                                  07:54:38   Log-Likelihood:                 595.86
No. Observations:                           852   AIC:                            -1090.
Df Residuals:                               801   BIC:                            -847.6
Df Model:                                    50                                         
Covariance Type:                      nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------

In [35]:
X_train_new = X_train[
     X_train_new.columns.difference([*insig_columns.index] + ['const'])
]


In [37]:
X_train_new, lm = build_model(X_train_new.astype("float"), y_train.astype("float"))
insig_columns = lm.pvalues.loc[lm.pvalues > 0.05]


                                 OLS Regression Results                                 
Dep. Variable:     planned_service_time_sum_log   R-squared:                       0.204
Model:                                      OLS   Adj. R-squared:                  0.200
Method:                           Least Squares   F-statistic:                     43.48
Date:                          Thu, 08 Sep 2022   Prob (F-statistic):           6.00e-40
Time:                                  07:55:10   Log-Likelihood:                 441.79
No. Observations:                           852   AIC:                            -871.6
Df Residuals:                               846   BIC:                            -843.1
Df Model:                                     5                                         
Covariance Type:                      nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------

In [39]:
pvalues = lm.pvalues
imp_pvalues = pvalues.nsmallest(16)

fig = go.Figure(data=[go.Bar(y=imp_pvalues.index, x=imp_pvalues, orientation="h")])
fig.update_layout(
    xaxis_title="p-values",
    yaxis_title="Features",
    title="Most Significant features for OLS Model",
)

fig.show()
# fig.update_layout(barmode='horizontal')


In [40]:
X_train_new = X_train_new[
    X_train_new.columns.difference([*insig_columns.index] + ['const'])
]


In [41]:
X_train_new, lm = build_model(X_train_new.astype("float"), y_train.astype("float"))
insig_columns = lm.pvalues.loc[lm.pvalues > 0.05]

                                 OLS Regression Results                                 
Dep. Variable:     planned_service_time_sum_log   R-squared:                       0.204
Model:                                      OLS   Adj. R-squared:                  0.200
Method:                           Least Squares   F-statistic:                     54.27
Date:                          Thu, 08 Sep 2022   Prob (F-statistic):           9.41e-41
Time:                                  07:56:56   Log-Likelihood:                 441.57
No. Observations:                           852   AIC:                            -873.1
Df Residuals:                               847   BIC:                            -849.4
Df Model:                                     4                                         
Covariance Type:                      nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------

In [33]:
# Calculating the Variance Inflation Factor
# checkVIF(X_train_new)


In [42]:
pvalues = lm.pvalues
imp_pvalues = pvalues.nsmallest(16)

fig = go.Figure(data=[go.Bar(y=imp_pvalues.index, x=imp_pvalues, orientation="h")])
fig.update_layout(
    xaxis_title="p-values",
    yaxis_title="Features",
    title="Most Significant features for OLS Model",
)

fig.show()
# fig.update_layout(barmode='horizontal')


In [43]:
# X_train_new = X_train[X_train.columns.difference(['h3_11_mean', 'h3_10_mean', 'h3_10_mean', 'h3_8_mean', 'h3_7_mean', 'h3_6_mean', *tag_columns])]
X_train_new = X_train_new[
    X_train_new.columns.difference([*insig_columns.index] + ['const'])
]


In [44]:
X_train_new

Unnamed: 0,embedding_18,embedding_21,embedding_32,embedding_36
892a3068e83ffff,0.308690,0.621181,0.412619,0.720224
892a33da89bffff,0.305033,0.486293,0.327142,0.629022
892a30658abffff,0.666155,0.487871,0.061944,0.708672
892a331710bffff,0.407955,0.393253,0.210981,0.397654
892a306dbabffff,0.451045,0.547022,0.242418,0.631707
...,...,...,...,...
892a306680bffff,0.746859,0.590655,0.183491,0.435813
892a306ac63ffff,0.366009,0.573224,0.475490,0.859559
892a3065063ffff,0.464851,0.812385,0.318301,0.699847
892a33926afffff,0.868042,0.103382,0.367710,0.422136


In [45]:
lm = sm.OLS(y_train, X_train_new).fit()
y_train_price = lm.predict(X_train_new)


In [46]:
# Plot the histogram of the error terms
fig = go.Figure()

fig.add_trace(
    go.Histogram(
        x=(y_train - y_train_price), name="Error Terms", marker_color="indianred"
    )
)


In [47]:
# Dividing into X and y
y_test = df_test["planned_service_time_sum_log"].astype("float")
X_test = df_test[df_test.columns.difference(["planned_service_time_sum_log"])].astype(
    "float"
)


In [48]:
X_train_new.columns

Index(['embedding_18', 'embedding_21', 'embedding_32', 'embedding_36'], dtype='object')

In [49]:
def get_sigf_cols_df(df):
    # Creating X_test_new dataframe by dropping variables from X_test
    # df = sm.add_constant(df)
    df["const"] = 1
    df = df[X_train_new.columns]
    return df


# Adding a constant variable
X_test_new = get_sigf_cols_df(X_test)
# Full_data_h3 = get_sigf_cols_df(Full_data_h3)


In [50]:
# Making predictions
y_pred = lm.predict(X_test_new)


In [51]:
from sklearn.metrics import r2_score

r2_score(y_test, y_pred)


-0.08593802248474813

In [52]:
# EVALUATION OF THE MODEL
# plot y_test vs. y_pred
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=y_test,
        y=y_pred,
        mode="markers",
        marker=dict(
            color="rgba(152, 0, 0, .8)",
        ),
    )
)

fig.update_layout(
    title="OLS",
    xaxis_title="y_test",
    yaxis_title="y_pred",
    # yaxis_range=[0, 10],
    height=600,
    width=600,
)

#  make the plotly figure square
fig.update_xaxes(scaleanchor="y", scaleratio=1)


### Evaluating the OLS Error


In [51]:
df_test_w_pred = df_test.copy()
tagged_df_test = tagged_df.loc[df_test.index].copy(deep=True)


#### Creating Q5-Q95 windows


In [52]:
# for level in levels:
#     tagged_df_test[f"h3_{level}"] = tagged_df_test[["lat", "lng"]].apply(
#         lambda x: h3.geo_to_h3(*x, level), axis=1, raw=True
#     )
#     grouper = tagged_df_test.groupby(f"h3_{level}")["planned_service_time_sum_log"]
#     tagged_df_test[f"h3_{level}_count"] = grouper.transform("count")
#     tagged_df_test[f"h3_{level}_95_diff"] = grouper.transform(
#         lambda df: np.exp(df.quantile(0.95)) - np.exp(df.quantile(0.5))
#     )
#     tagged_df_test[f"h3_{level}_5_diff"] = grouper.transform(
#         lambda df: np.exp(df.quantile(0.5)) - np.exp(df.quantile(0.05))
#     )
#     # tagged_df[f"h3_{level}_std"] = grouper.transform("std")
#     # tagged_df[f"h3_{level}_std"].fillna(0, inplace=True)
#     # tagged_df_test.drop(f"h3_{level}", axis=1, inplace=True)
tagged_df_test['h3_9_95'] = np.exp(tagged_df_test.planned_service_time_sum_log.quantile(0.95))
tagged_df_test['h3_9_95'] = np.exp(tagged_df_test.planned_service_time_sum_log.quantile(0.05))

#### Selecting Location where Prediction is outside 5% and 95% quantile


In [53]:
df_test_w_pred["planned_service_time_sum_log"] = y_pred
df_test_w_pred = pd.DataFrame(
    scaler.inverse_transform(df_test_w_pred),
    index=df_test_w_pred.index,
    columns=df_test_w_pred.columns,
)
df_test_transformed = pd.DataFrame(
    scaler.inverse_transform(df_test.copy()),
    index=df_test.index,
    columns=df_test.columns,
)
tagged_df_test["predicted_planned_service_time_sum_log"] = df_test_w_pred[
    "planned_service_time_sum_log"
]


#### Reverse the Log Transformation


In [54]:
tagged_df_test[
    ["planned_service_time_sum", "predicted_planned_service_time_sum",]
] = tagged_df_test[
    ["planned_service_time_sum_log", "predicted_planned_service_time_sum_log",]
].apply(
    lambda x: np.exp(x)
)


In [None]:
error_loc = tagged_df_test.loc[
        (
            (
                tagged_df_test["predicted_planned_service_time_sum"]
                - tagged_df_test["planned_service_time_sum"]
            )
            > tagged_df_test["h3_9_95_diff"]
        )
        | (
            (
                tagged_df_test["planned_service_time_sum"]
                - tagged_df_test["predicted_planned_service_time_sum"]
            )
            > tagged_df_test["h3_9_5_diff"]
        )
].copy()

error_loc['error'] = (error_loc['planned_service_time_sum'] - error_loc['predicted_planned_service_time_sum']).abs()

error_loc.head()


In [None]:
import plotly.express as px


In [None]:
from geojson import Feature, FeatureCollection


def hexagons_dataframe_to_geojson(
    df_hex, hex_id_field, geometry_field, value_field, file_output=None
):
    list_features = []

    for i, row in df_hex.iterrows():
        feature = Feature(
            geometry=row[geometry_field],
            id=row[hex_id_field],
            properties={"value": row[value_field]},
        )
        list_features.append(feature)

    return FeatureCollection(list_features)


In [None]:
hex_geo_json = hexagons_dataframe_to_geojson(
    h3_9_gpd,
    "h3_9",
    "geometry",
    "h3_9",
)


In [None]:
# h3_9_gpd["planned_service_time_sum"] = h3_9_gpd["planned_service_time_sum_log"].apply(
#     lambda x: np.exp(x)
# )

# filter for only the hexagons that have a value
h3_9_gpd = h3_9_gpd.loc[h3_9_gpd["h3_9"].isin(error_loc["h3_9"].unique())]


In [None]:
import dotenv
dotenv.load_dotenv()


fig = px.choropleth_mapbox(
    h3_9_gpd,
    geojson=hex_geo_json,
    locations="h3_9",
    # color="blue",
    # color_continuous_scale=px.colors.sequential.Plasma,
    # range_color=(0, 1),
    # mapbox_style="carto-positron",
    # zoom=11,
    center={"lat": error_loc.lat.mean(), "lon": error_loc.lng.mean()},
    opacity=0.4,
    labels={"planned_service_time_sum"},
)

fig.add_trace(
    go.Scattermapbox(
        lat=h3_9_gpd.geometry.centroid.y,
        lon=h3_9_gpd.geometry.centroid.x,
        text=h3_9_gpd[["planned_service_time_sum_5", "planned_service_time_sum_95"]].apply(lambda x: f"[{x[0]:.2f}, {x[1]:.2f}]", axis=1),
        textfont={"color":"white","size":20, "family":"Courier New"},
        mode="text",
        # name="Bergeron"
    )
)

fig.add_trace(
    px.scatter_mapbox(
        error_loc,
        lat="lat",
        lon="lng",
        color="error",
        size="error",
        color_continuous_scale=px.colors.sequential.Greens_r,
        size_max=50,
        zoom=10,
        hover_data=["planned_service_time_sum", "predicted_planned_service_time_sum", ],
        # mapbox_style="open-street-map",
    ).data[0]
)

fig.update_mapboxes(
    go.layout.Mapbox(
        accesstoken=os.environ.get("MAPBOX_KEY"),
        style="mapbox://styles/max-schrader/cl6lhvrfw001516pkh3s6iv7l",
        bearing=0,
        zoom=10,
    )
)

fig.show()


- Creating the entire dataset for prediction:


In [None]:
# np.random.seed(0)

# # getting rid of all the non-numeral data
# tagged_df_test = tagged_df.drop(['lat', 'lng', 'route_id','has_window_max', 'city','city_zone', 'zone_id', 'status', 'departure_datetime_mean',
#                                 'h3', 'planned_service_time_min', 'planned_service_time_max', 'planned_service_time_mean','planned_service_time_sum',
#                                 'planned_service_time_sum_grp_avg', 'planned_service_time_sum_grp_sum','hex_predicted_service_time_sum',
#                                  'discrepency_sum','hex_predicted_service_time_avg','discrepency_avg', 'h3_predicted_service_time_avg', 'h3_predicted_service_time_sum'], axis=1)
# tagged_df_test = pd.DataFrame(
#     scaler.fit_transform(tagged_df_test), index=tagged_df_test.index, columns=tagged_df_test.columns)


# # Now let's use our model to make predictions.
# for_col_df = X_train_new.drop('const', axis=1)
# # Creating X_test_new dataframe by dropping variables from X_test
# tagged_X_test_new = tagged_df_test[for_col_df.columns]

# # Adding a constant variable
# tagged_X_test_new = sm.add_constant(tagged_X_test_new)


# # Making predictions
# tagged_df_hex_sum['h3_predicted_service_time_' + f'{type}'] = lm.predict(Full_data_h3)
# tagged_df['hex_predicted_service_time_'+ f'{type}'] = lm.predict(tagged_X_test_new)

# # Calculating Discrepancies
# tagged_df['discrepency_'+ f'{type}'] = tagged_df['planned_service_time_sum'] - tagged_df['hex_predicted_service_time_'+ f'{type}']

# # For hexagonal level prediction by just using hexagonal fetaures which are summed and averaged
# tagged_df = tagged_df.join(tagged_df_hex_sum.set_index('h3')['h3_predicted_service_time_' + f'{type}'], on='h3')

# # save this dataframe
# pd.to_pickle(tagged_df, '/gcsmount-notebook/navish/dataframes/hexagonal_regression_pred_df.pkl')


In [None]:
# def scatter(x,y, fig):
#     plt.subplot(5,2,fig)
#     plt.scatter(tagged_df[x], tagged_df[y])
#     # plt.suptitle('OLS')
#     plt.xlabel(x)
#     plt.ylabel(y)

# plt.figure(figsize=(10,20))

# scatter('h3_predicted_service_time_'+ f'{type}', 'planned_service_time_sum', 1)
# scatter('h3_predicted_service_time_'+ f'{type}', 'planned_service_time_sum_grp_'+ f'{type}', 2)
# scatter('hex_predicted_service_time_'+ f'{type}', 'planned_service_time_sum', 3)
# scatter('hex_predicted_service_time_'+ f'{type}', 'planned_service_time_sum_grp_'+ f'{type}', 4)


# plt.tight_layout()


In [None]:
# print(lm.summary())


## Regression using other algorithms


In [64]:
np.random.seed(0)

# getting rid of all the non-numeral data
df_train, df_test = train_test_split(
    tagged_df
    # tagged_df_hex.drop(
    #     [
    #         "lat",
    #         "lng",
    #         "has_window_max",
    #         "h3",
    #         "planned_service_time_min",
    #         "planned_service_time_max",
    #         "planned_service_time_mean",
    #     ],
    #     axis=1,
    # ),
    # train_size=0.8,
    # test_size=0.2,
    # random_state=100,
)

scaler = StandardScaler()
scaler.fit(df_train)

# num_vars = ['height', 'width', 'volume', 'intraday_sec', 'planned_service_time']
# df_train[num_vars] = scaler.fit_transform(df_train[num_vars])

df_train = pd.DataFrame(
    scaler.fit_transform(df_train), index=df_train.index, columns=df_train.columns
)

# Dividing data into X and y variables
y_train = df_train.pop("planned_service_time_sum_log")
X_train = df_train.reset_index(drop=True)


In [65]:
# Scaling the test set

# If we want to only scale certain features
# num_vars = ['height','width', 'city_zone_2_num', 'month']
# df_test[num_vars] = scaler.fit_transform(df_test[num_vars])

df_test = pd.DataFrame(
    scaler.fit_transform(df_test), index=df_test.index, columns=df_test.columns
)

# Dividing data into X and y variables
y_test = df_test.pop("planned_service_time_sum_log")
X_test = df_test.reset_index(drop=True)


1. AdaBoost


In [66]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

AdaBoost = AdaBoostRegressor(
    random_state=0,
    base_estimator=DecisionTreeRegressor(max_depth=3),
    loss="linear",
    n_estimators=2000,
)
AdaBoost.fit(X_train, y_train)


In [67]:
AdaBoost.score(X_train, y_train)


0.5200002608270458

In [68]:
AdaBoost.score(X_test, y_test)


0.3852156026696908

In [70]:
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=y_test,
        y=y_pred,
        mode="markers",
        marker=dict(
            color="rgba(152, 0, 0, .8)",
        ),
    )
)

fig.update_layout(
    title="AdaBoost",
    xaxis_title="y_test",
    yaxis_title="y_pred",
    # yaxis_range=[0, 10],
)


3. ExtraTrees


In [71]:
from sklearn.ensemble import ExtraTreesRegressor

ET = ExtraTreesRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
ET.fit(X_train, y_train)


In [72]:
ET.score(X_train, y_train)


0.9995409805942922

In [73]:
ET.score(X_test, y_test)


0.4648903674160545

In [74]:
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=y_test,
        y=y_pred,
        mode="markers",
        marker=dict(
            color="rgba(152, 0, 0, .8)",
        ),
    )
)

fig.update_layout(
    title="ET",
    xaxis_title="y_test",
    yaxis_title="y_pred",
    # yaxis_range=[0, 10],
)


4. GradientBoost


In [75]:
from sklearn.ensemble import GradientBoostingRegressor

GB = GradientBoostingRegressor(
    random_state=0, learning_rate=0.1, loss="huber", n_estimators=20
)
GB.fit(X_train, y_train)


In [76]:
GB.score(X_train, y_train)


0.4897715796763037

In [77]:
GB.score(X_test, y_test)


0.40110853618449216

5. RandomForest


In [78]:
from sklearn.ensemble import RandomForestRegressor

RF = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
RF.fit(X_train, y_train)


In [79]:
RF.score(X_train, y_train)


0.927153538987866

In [80]:
RF.score(X_test, y_test)


0.43775957514056585

### Seeing the Feature Importance


In [None]:
# !pip install shap


In [None]:
import shap


In [None]:
def plot_importance(name, regr):
    sorted_idx = regr.feature_importances_.argsort()
    plt.barh(
        X_train.columns[sorted_idx[-25:]], regr.feature_importances_[sorted_idx[-25:]]
    )
    plt.xlabel(f"{name} Feature Importance")


In [None]:
from sklearn.inspection import permutation_importance


def perm_importance(name, regr):
    perm_importance = permutation_importance(regr, X_test, y_test)
    sorted_idx = perm_importance.importances_mean.argsort()
    plt.barh(
        X_train.columns[sorted_idx[-20:]],
        perm_importance.importances_mean[sorted_idx[-20:]],
    )
    plt.xlabel(f"{name} Permutation Importance")


#### Visualisation using SHAP


In [None]:
def shap_plots(name, regr, plot_type="dot"):

    explainer = shap.TreeExplainer(regr, check_additivity=False)
    shap_values = explainer.shap_values(X_test)

    # Feature value ('dot') or just feature importance ('bar)
    shap.summary_plot(shap_values[:20], X_test, plot_type)


#### Let's compare now!


1. AdaBoost


In [None]:
shap_plots("AdaBoost", AdaBoost)


In [None]:
# perm_importance('AdaBoost', AdaBoost)


In [None]:
# shap_plots('AdaBoost', AdaBoost, 'dot')


2. ExtraTrees


In [None]:
plot_importance("ET", ET)


In [None]:
shap_plots("ET", ET, "dot")


3. GradientBoost


In [None]:
plot_importance("GB", GB)


In [None]:
shap_plots("GB", GB, "dot")


4. RandomForest


In [None]:
plot_importance("RF", RF)


In [None]:
# shap_plots('RF', RF, 'dot')


##### Plotting the within zone discrepencies on Test Data


In [None]:
df_test.head()
