This notebook's purpose is to illustrate how a model might be saved/exported and used in a web server to serve up predictions. 

Model building, feature engineering, and the model's final metrics are not the focus of this document.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [4]:
# Collapse the geolocation table to one row per zip-code prefix by averaging
# its numeric columns. numeric_only=True keeps this working on pandas >= 2.0,
# where mean() raises TypeError on non-numeric columns instead of silently
# dropping them as older releases did.
# NOTE(review): geo_df is loaded outside this excerpt; the later merges use its
# geolocation_lat / geolocation_lng columns, so those are presumably numeric.
geo_df = geo_df.groupby(['geolocation_zip_code_prefix']).mean(numeric_only=True)

In [5]:
# Look up geolocation data for each customer's zip-code prefix.
# how='left' keeps every main_df row even when geo_df has no match.
main_df = pd.merge(
    main_df,
    geo_df,
    how='left',
    left_on='customer_zip_code_prefix',
    right_on='geolocation_zip_code_prefix',
)

In [6]:
# Give the freshly merged coordinate columns a customer_ prefix.
main_df = main_df.rename(
    columns={
        'geolocation_lat': 'customer_geolocation_lat',
        'geolocation_lng': 'customer_geolocation_lng',
    }
)

In [7]:
# Look up geolocation data for each seller's zip-code prefix.
# how='left' keeps every main_df row even when geo_df has no match.
main_df = pd.merge(
    main_df,
    geo_df,
    how='left',
    left_on='seller_zip_code_prefix',
    right_on='geolocation_zip_code_prefix',
)

In [8]:
# Give the freshly merged coordinate columns a seller_ prefix.
main_df = main_df.rename(
    columns={
        'geolocation_lat': 'seller_geolocation_lat',
        'geolocation_lng': 'seller_geolocation_lng',
    }
)

In [9]:
# Keep all rows, and only the columns whose name does not start with "Unnamed".
# NOTE(review): presumably these are stray index columns from a CSV round trip - confirm.
main_df = main_df.loc[
    :, ~main_df.columns.str.contains('^Unnamed')
]

In [10]:
def haversine_vectorize(lon1, lat1, lon2, lat2, earth_radius=6367):
    """Great-circle distance between two sets of points (haversine formula).

    Works element-wise, so each coordinate may be a scalar, a NumPy array or
    a pandas Series, as long as the four inputs broadcast against each other.

    Parameters
    ----------
    lon1, lat1 : longitude / latitude of the first point(s), in degrees.
    lon2, lat2 : longitude / latitude of the second point(s), in degrees.
    earth_radius : sphere radius; the result is in the same unit.
        Defaults to 6367 (kilometres); pass 3958 for miles.

    Returns
    -------
    Distance along the sphere's surface, shaped like the broadcast inputs.
    Missing (NaN) coordinates yield NaN.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    newlon = lon2 - lon1
    newlat = lat2 - lat1

    haver_formula = np.sin(newlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(newlon / 2.0) ** 2

    # Mathematically the term lies in [0, 1]; clip so floating-point rounding
    # for near-antipodal points cannot push it past 1 and turn arcsin into NaN.
    haver_formula = np.clip(haver_formula, 0.0, 1.0)

    # Central angle between the points, in radians.
    dist = 2 * np.arcsin(np.sqrt(haver_formula))
    return earth_radius * dist

In [11]:
# Distance from each seller's coordinates to the matching customer's.
main_df['customer_seller_distance'] = haversine_vectorize(
    lon1=main_df['seller_geolocation_lng'],
    lat1=main_df['seller_geolocation_lat'],
    lon2=main_df['customer_geolocation_lng'],
    lat2=main_df['customer_geolocation_lat'],
)

In [14]:
# Combine both states into one "<customer>-<seller>" label per row.
main_df['customer_seller_state'] = (
    main_df['customer_state']
    + '-'
    + main_df['seller_state']
)

In [16]:
#main_df['delivery_date_difference'] = pd.to_datetime(main_df['order_estimated_delivery_date']) - pd.to_datetime(main_df['order_delivered_customer_date'])
#main_df['delivery_date_difference'] = main_df['delivery_date_difference'].dt.days

In [17]:
# Whole days elapsed between the purchase timestamp and delivery to the customer.
main_df['delivery_days'] = (
    pd.to_datetime(main_df['order_delivered_customer_date'])
    - pd.to_datetime(main_df['order_purchase_timestamp'])
).dt.days

In [18]:
# One indicator column per distinct customer-seller state pair.
one_hot_state = pd.get_dummies(data=main_df['customer_seller_state'])

In [19]:
# Append the state-pair indicator columns, aligned on the row index.
main_df = main_df.join(other=one_hot_state)

In [20]:
# One indicator column per purchase month, named month_<value>.
one_hot_month = pd.get_dummies(
    data=main_df['order_purchase_month'],
    prefix='month',
)

In [21]:
# Append the month indicator columns, aligned on the row index.
main_df = main_df.join(other=one_hot_month)

In [22]:
# Discard every row that has a missing value in any column.
main_df = main_df.dropna(axis=0, how='any')

In [24]:
# Regression target.
y = main_df['delivery_days']

# Predictor columns, in order: distance, freight value, the twelve month
# indicators, then the state-pair indicators (one group per line block,
# keyed by the state code before the dash).
x1 = main_df[[
    'customer_seller_distance', 'freight_value',
    'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6',
    'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12',
    # AC
    'AC-BA', 'AC-GO', 'AC-MA', 'AC-MG', 'AC-MS', 'AC-PR', 'AC-RJ', 'AC-RS', 'AC-SC', 'AC-SP',
    # AL
    'AL-AM', 'AL-BA', 'AL-CE', 'AL-DF', 'AL-GO', 'AL-MA', 'AL-MG', 'AL-PB', 'AL-PE', 'AL-PR',
    'AL-RJ', 'AL-RN', 'AL-RS', 'AL-SC', 'AL-SP',
    # AM
    'AM-BA', 'AM-CE', 'AM-GO', 'AM-MA', 'AM-MG', 'AM-PR', 'AM-RJ', 'AM-RS', 'AM-SC', 'AM-SP',
    # AP
    'AP-BA', 'AP-GO', 'AP-MA', 'AP-MG', 'AP-PE', 'AP-PR', 'AP-RJ', 'AP-RS', 'AP-SC', 'AP-SP',
    # BA
    'BA-BA', 'BA-CE', 'BA-DF', 'BA-ES', 'BA-GO', 'BA-MA', 'BA-MG', 'BA-MS', 'BA-MT', 'BA-PB',
    'BA-PE', 'BA-PR', 'BA-RJ', 'BA-RN', 'BA-RO', 'BA-RS', 'BA-SC', 'BA-SP',
    # CE
    'CE-BA', 'CE-CE', 'CE-DF', 'CE-ES', 'CE-GO', 'CE-MA', 'CE-MG', 'CE-MT', 'CE-PB', 'CE-PE',
    'CE-PI', 'CE-PR', 'CE-RJ', 'CE-RN', 'CE-RS', 'CE-SC', 'CE-SP',
    # DF
    'DF-BA', 'DF-CE', 'DF-DF', 'DF-ES', 'DF-GO', 'DF-MA', 'DF-MG', 'DF-MS', 'DF-MT', 'DF-PB',
    'DF-PE', 'DF-PR', 'DF-RJ', 'DF-RN', 'DF-RS', 'DF-SC', 'DF-SP',
    # ES
    'ES-BA', 'ES-CE', 'ES-DF', 'ES-ES', 'ES-GO', 'ES-MA', 'ES-MG', 'ES-MT', 'ES-PB', 'ES-PE',
    'ES-PR', 'ES-RJ', 'ES-RO', 'ES-RS', 'ES-SC', 'ES-SE', 'ES-SP',
    # GO
    'GO-BA', 'GO-CE', 'GO-DF', 'GO-ES', 'GO-GO', 'GO-MA', 'GO-MG', 'GO-MS', 'GO-MT', 'GO-PB',
    'GO-PE', 'GO-PR', 'GO-RJ', 'GO-RS', 'GO-SC', 'GO-SE', 'GO-SP',
    # MA
    'MA-AM', 'MA-BA', 'MA-CE', 'MA-DF', 'MA-ES', 'MA-GO', 'MA-MA', 'MA-MG', 'MA-MT', 'MA-PB',
    'MA-PE', 'MA-PI', 'MA-PR', 'MA-RJ', 'MA-RS', 'MA-SC', 'MA-SP',
    # MG
    'MG-AM', 'MG-BA', 'MG-CE', 'MG-DF', 'MG-ES', 'MG-GO', 'MG-MA', 'MG-MG', 'MG-MS', 'MG-MT',
    'MG-PA', 'MG-PB', 'MG-PE', 'MG-PI', 'MG-PR', 'MG-RJ', 'MG-RN', 'MG-RS', 'MG-SC', 'MG-SP',
    # MS
    'MS-BA', 'MS-CE', 'MS-DF', 'MS-GO', 'MS-MA', 'MS-MG', 'MS-MT', 'MS-PE', 'MS-PR', 'MS-RJ',
    'MS-RS', 'MS-SC', 'MS-SP',
    # MT
    'MT-BA', 'MT-CE', 'MT-DF', 'MT-ES', 'MT-GO', 'MT-MA', 'MT-MG', 'MT-MS', 'MT-MT', 'MT-PB',
    'MT-PE', 'MT-PR', 'MT-RJ', 'MT-RS', 'MT-SC', 'MT-SP',
    # PA
    'PA-BA', 'PA-CE', 'PA-DF', 'PA-ES', 'PA-GO', 'PA-MA', 'PA-MG', 'PA-MT', 'PA-PB', 'PA-PE',
    'PA-PR', 'PA-RJ', 'PA-RN', 'PA-RS', 'PA-SC', 'PA-SE', 'PA-SP',
    # PB
    'PB-BA', 'PB-DF', 'PB-GO', 'PB-MA', 'PB-MG', 'PB-MT', 'PB-PB', 'PB-PE', 'PB-PI', 'PB-PR',
    'PB-RJ', 'PB-RN', 'PB-RS', 'PB-SC', 'PB-SP',
    # PE
    'PE-BA', 'PE-CE', 'PE-DF', 'PE-ES', 'PE-GO', 'PE-MA', 'PE-MG', 'PE-MT', 'PE-PB', 'PE-PE',
    'PE-PI', 'PE-PR', 'PE-RJ', 'PE-RS', 'PE-SC', 'PE-SE', 'PE-SP',
    # PI
    'PI-BA', 'PI-CE', 'PI-DF', 'PI-ES', 'PI-GO', 'PI-MA', 'PI-MG', 'PI-PE', 'PI-PI', 'PI-PR',
    'PI-RJ', 'PI-RS', 'PI-SC', 'PI-SP',
    # PR
    'PR-BA', 'PR-CE', 'PR-DF', 'PR-ES', 'PR-GO', 'PR-MA', 'PR-MG', 'PR-MS', 'PR-MT', 'PR-PA',
    'PR-PE', 'PR-PI', 'PR-PR', 'PR-RJ', 'PR-RN', 'PR-RO', 'PR-RS', 'PR-SC', 'PR-SE', 'PR-SP',
    # RJ
    'RJ-BA', 'RJ-CE', 'RJ-DF', 'RJ-ES', 'RJ-GO', 'RJ-MA', 'RJ-MG', 'RJ-MS', 'RJ-MT', 'RJ-PB',
    'RJ-PE', 'RJ-PR', 'RJ-RJ', 'RJ-RN', 'RJ-RO', 'RJ-RS', 'RJ-SC', 'RJ-SE', 'RJ-SP',
    # RN
    'RN-BA', 'RN-CE', 'RN-DF', 'RN-ES', 'RN-GO', 'RN-MA', 'RN-MG', 'RN-MT', 'RN-PB', 'RN-PE',
    'RN-PR', 'RN-RJ', 'RN-RN', 'RN-RS', 'RN-SC', 'RN-SP',
    # RO
    'RO-BA', 'RO-CE', 'RO-ES', 'RO-GO', 'RO-MG', 'RO-MT', 'RO-PE', 'RO-PR', 'RO-RJ', 'RO-RS',
    'RO-SC', 'RO-SP',
    # RR
    'RR-BA', 'RR-MG', 'RR-PR', 'RR-SP',
    # RS
    'RS-BA', 'RS-CE', 'RS-DF', 'RS-ES', 'RS-GO', 'RS-MA', 'RS-MG', 'RS-MS', 'RS-MT', 'RS-PE',
    'RS-PR', 'RS-RJ', 'RS-RN', 'RS-RO', 'RS-RS', 'RS-SC', 'RS-SP',
    # SC
    'SC-BA', 'SC-CE', 'SC-DF', 'SC-ES', 'SC-GO', 'SC-MA', 'SC-MG', 'SC-MS', 'SC-MT', 'SC-PE',
    'SC-PI', 'SC-PR', 'SC-RJ', 'SC-RN', 'SC-RS', 'SC-SC', 'SC-SP',
    # SE
    'SE-BA', 'SE-CE', 'SE-DF', 'SE-MA', 'SE-MG', 'SE-PB', 'SE-PE', 'SE-PR', 'SE-RJ', 'SE-RO',
    'SE-RS', 'SE-SC', 'SE-SP',
    # SP
    'SP-BA', 'SP-CE', 'SP-DF', 'SP-ES', 'SP-GO', 'SP-MA', 'SP-MG', 'SP-MS', 'SP-MT', 'SP-PA',
    'SP-PB', 'SP-PE', 'SP-PI', 'SP-PR', 'SP-RJ', 'SP-RN', 'SP-RO', 'SP-RS', 'SP-SC', 'SP-SE',
    'SP-SP',
    # TO
    'TO-BA', 'TO-DF', 'TO-ES', 'TO-GO', 'TO-MA', 'TO-MG', 'TO-PE', 'TO-PR', 'TO-RJ', 'TO-RS',
    'TO-SC', 'TO-SP',
]]

# NOTE(review): all twelve month_* indicators are included alongside the
# constant added here, so the design matrix is linearly dependent (the
# indicators sum to the constant). Consider dropping one reference level per
# indicator group before relying on the individual coefficients.
x = sm.add_constant(x1)
results = sm.OLS(endog=y, exog=x).fit()
results.summary()

0,1,2,3
Dep. Variable:,delivery_days,R-squared:,0.281
Model:,OLS,Adj. R-squared:,0.279
Method:,Least Squares,F-statistic:,105.2
Date:,"Thu, 11 Nov 2021",Prob (F-statistic):,0.0
Time:,20:34:08,Log-Likelihood:,-399280.0
No. Observations:,114127,AIC:,799400.0
Df Residuals:,113703,BIC:,803500.0
Df Model:,423,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,8.5629,0.292,29.366,0.000,7.991,9.134
customer_seller_distance,0.0044,0.000,28.137,0.000,0.004,0.005
freight_value,0.0399,0.002,23.592,0.000,0.037,0.043
month_1,2.0907,0.084,25.019,0.000,1.927,2.254
month_2,4.0906,0.082,49.704,0.000,3.929,4.252
month_3,3.2105,0.077,41.668,0.000,3.059,3.361
month_4,0.3306,0.079,4.197,0.000,0.176,0.485
month_5,-0.4109,0.075,-5.516,0.000,-0.557,-0.265
month_6,-1.6286,0.078,-20.832,0.000,-1.782,-1.475

0,1,2,3
Omnibus:,119237.025,Durbin-Watson:,1.814
Prob(Omnibus):,0.0,Jarque-Bera (JB):,21194553.85
Skew:,4.896,Prob(JB):,0.0
Kurtosis:,69.039,Cond. No.,1.14e+18


In [25]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x1,
    y,
    test_size=0.2,
    random_state=0,
)

In [26]:
# Ordinary least-squares model with scikit-learn's default settings.
regressor = LinearRegression()
# Fit on the training split only.
regressor.fit(X=X_train, y=y_train)

LinearRegression()

In [28]:
# Predictions for the held-out rows, compared against y_test below.
y_pred = regressor.predict(X=X_test)

In [29]:
# Hold-out error metrics; RMSE is the square root of the MSE.
# NOTE(review): the saved output shows errors several orders of magnitude
# larger than any plausible delivery time - possibly a few test rows whose
# indicator columns never occur in the training split; worth investigating.
print(
    'Mean Absolute Error:',
    metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred),
)
print(
    'Mean Squared Error:',
    metrics.mean_squared_error(y_true=y_test, y_pred=y_pred),
)
print(
    'Root Mean Squared Error:',
    np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)),
)

Mean Absolute Error: 208928.05160641158
Mean Squared Error: 184549471125596.34
Root Mean Squared Error: 13584898.6424484


In [None]:
# The regressor is fitted on every column of X_train, so a prediction row must
# supply all of those features in the same order; a bare two-value row such as
# [[1.0, 2.0]] is rejected by predict() because the feature count differs.
# Build a full-width, all-zero row and fill in one example order. A caller
# (e.g. the web server) has to construct its input the same way.
sample_order = pd.DataFrame(0.0, index=[0], columns=X_train.columns)
sample_order.loc[0, 'customer_seller_distance'] = 1.0  # same values as the original two-value row
sample_order.loc[0, 'freight_value'] = 2.0
sample_order.loc[0, ['month_1', 'SP-SP']] = 1.0  # switch on one month and one state-pair indicator
regressor.predict(sample_order)

In [None]:
import pickle

# Serialise the fitted regressor to disk for later loading.
# The with-block closes the file even if pickle.dump raises, which the manual
# open()/close() pair did not guarantee.
# NOTE(review): the file is named "classifier.pkl" although it holds a
# regressor; the name is kept because consumers may already depend on it.
with open("classifier.pkl", 'wb') as pickle_out:
    pickle.dump(regressor, pickle_out)