# Train and Predict

In [1]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from pprint import pprint
from sklearn.model_selection import train_test_split

from util import load_prediction_todo, diff_timestamps, load_prediction_todo_as_df
from lib.util.dataframe_preprocessor import DataframePreprocessor
from lib.util.df_ops import mat2df, dump
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from numpy import isnan
from sklearn.preprocessing import OneHotEncoder

%load_ext autoreload
%autoreload 2

## Preprocess

In [2]:
# Load ./data/historical_data.csv into the project's DataframePreprocessor
# wrapper; every preprocessing step below goes through this object.
df_preprocessor = DataframePreprocessor.init_from_file("./data/historical_data.csv")

Data Imported: (197428, 16)


### Force type for columns

In [3]:
# Cast the two timestamp columns and the category column to str, in the same
# order as before. The following cell filters on the literal string "nan".
for column_name in ("created_at", "actual_delivery_time", "store_primary_category"):
    df_preprocessor.force_column_dtype(column_name, str)

In [4]:
def _is_present(value):
    """Return True unless ``value`` equals the string "nan"."""
    return value != "nan"


# Apply the same predicate to both timestamp columns, creation time first.
for timestamp_col in ("created_at", "actual_delivery_time"):
    df_preprocessor.filter_rows_by_condition(timestamp_col, _is_present)

Shape Updated from (197428, 16) -> (197428, 16) | 0 Rows Removed.
Shape Updated from (197428, 16) -> (197421, 16) | 7 Rows Removed.


In [5]:
# Pair each order's creation time with its delivery time and pass the pair to
# diff_timestamps (project helper, not shown; presumably returns the elapsed
# seconds -- confirm against util.py).
_timestamps = df_preprocessor.get_dataframe()
created_at, actual_delivery_time = (
    _timestamps["created_at"],
    _timestamps["actual_delivery_time"],
)
delivery_seconds = list(map(diff_timestamps, created_at, actual_delivery_time))

In [6]:
# Sanity check: number of computed durations (should equal the row count).
len(delivery_seconds)

197421

### Add generated column: delivery_seconds

In [7]:
# Swap the two raw timestamp columns for the derived duration column.
# The columns are removed first and the new column is appended afterwards.
df_preprocessor.remove_cols(["created_at", 'actual_delivery_time'], kind="name")
df_preprocessor.append_column("delivery_seconds", delivery_seconds)

Shape Updated from (197421, 16) -> (197421, 14) | 2 Columns Removed.


In [8]:
# Report the current shape and preview the first rows after the column swap.
df_preprocessor.report_shape()
df_preprocessor.peek_head()

Shape of Data: (197421, 15)


Unnamed: 0,market_id,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration,delivery_seconds
0,1.0,1845,american,1.0,4,3441,4,557,1239,33.0,14.0,21.0,446,861.0,3779.0
1,2.0,5477,mexican,2.0,1,1900,1,1400,1400,1.0,2.0,2.0,446,690.0,4024.0
2,3.0,5477,,1.0,1,1900,1,1900,1900,1.0,0.0,0.0,446,690.0,1781.0
3,3.0,5477,,1.0,6,6900,5,600,1800,1.0,1.0,2.0,446,289.0,3075.0
4,3.0,5477,,1.0,3,3900,3,1100,1600,6.0,6.0,9.0,446,650.0,2390.0


In [9]:
def label_encode(col):
    """Fit a fresh LabelEncoder on one column of the preprocessor's dataframe.

    Args:
        col: Name of the column to encode.

    Returns:
        Tuple ``(encoder, transformed)``: the fitted encoder and the values
        returned by its ``fit_transform`` call on that column.
    """
    column_values = df_preprocessor.get_dataframe()[col]
    fitted = LabelEncoder()
    codes = fitted.fit_transform(column_values)
    return fitted, codes

def transform_categorical_col(col):
    """Replace a column of the preprocessor's dataframe with its encoded form.

    The column is label-encoded, removed from the dataframe, and the encoded
    values are appended back under the same name.

    Args:
        col: Name of the categorical column.

    Returns:
        The fitted encoder, so the same mapping can be applied to other data.
    """
    fitted_encoder, codes = label_encode(col)
    df_preprocessor.remove_cols([col], kind="name")
    df_preprocessor.append_column(col, codes)
    return fitted_encoder

### Encode Categorical Columns

In [10]:
# Keep the fitted encoder: the prediction data is transformed with it later on.
store_cat_encoder = transform_categorical_col("store_primary_category")

Shape Updated from (197421, 15) -> (197421, 14) | 1 Columns Removed.


In [11]:
# Keep only rows that have a target value. The explicit .copy() makes `df` an
# independent frame: it is filled in place further down, and doing that on the
# result of a boolean-mask selection can trigger pandas' SettingWithCopyWarning.
df = df_preprocessor.get_dataframe()
df = df[~np.isnan(df["delivery_seconds"])].copy()
df.shape

(197414, 15)

### Fill NaN with default values

In [12]:
def df_mean(df, col):
    """Return the mean of ``df[col]`` with NaN entries excluded.

    Args:
        df: DataFrame holding the column.
        col: Name of a numeric column.

    Returns:
        Mean of the non-NaN values in the column.
    """
    column = df[col]
    missing = np.isnan(column)
    return np.mean(column[~missing])

def df_median(df, col):
    """Return the median of ``df[col]`` with NaN entries excluded.

    Args:
        df: DataFrame holding the column.
        col: Name of a numeric column.

    Returns:
        Median of the non-NaN values in the column.
    """
    column = df[col]
    missing = np.isnan(column)
    return np.median(column[~missing])

def df_mode(df, col):
    """Return the most frequent non-missing value of ``df[col]``.

    Missing values (NaN/None) are dropped before counting. Otherwise each NaN
    is counted as its own key, and NaN could be returned as the "mode" when
    it happens to come first among equally frequent values.

    Args:
        df: DataFrame holding the column.
        col: Name of the column.

    Returns:
        The value with the highest count; ties go to the value seen first.

    Raises:
        IndexError: If the column has no non-missing values.
    """
    from collections import Counter

    counts = Counter(df[col].dropna())
    return counts.most_common(1)[0][0]

In [13]:
# Model inputs, listed in the column order used for the feature matrix.
train_features = [
    # store
    "store_primary_category",
    # order contents
    "total_items",
    "subtotal",
    "num_distinct_items",
    "min_item_price",
    "max_item_price",
    # marketplace load
    "total_onshift_dashers",
    "total_busy_dashers",
    "total_outstanding_orders",
    # upstream duration estimates
    "estimated_order_place_duration",
    "estimated_store_to_consumer_driving_duration",
]

In [14]:
# Collect the feature columns that contain at least one NaN, then list them.
nan_cols = [name for name in train_features if any(np.isnan(df[name]))]
for name in nan_cols:
    print(name)

total_onshift_dashers
total_busy_dashers
total_outstanding_orders
estimated_store_to_consumer_driving_duration


In [15]:
# Columns whose missing entries are replaced by the column median.
median_filled_cols = [
    "total_onshift_dashers",
    "total_busy_dashers",
    "total_outstanding_orders",
    "estimated_order_place_duration",
    "estimated_store_to_consumer_driving_duration",
]

# order_protocol gets its most common value instead of a median. All defaults
# are computed over the full frame `df`.
default_vals = {"order_protocol": df_mode(df, "order_protocol")}
for col_name in median_filled_cols:
    default_vals[col_name] = df_median(df, col_name)

default_vals

{'order_protocol': 1.0,
 'total_onshift_dashers': 37.0,
 'total_busy_dashers': 34.0,
 'total_outstanding_orders': 41.0,
 'estimated_order_place_duration': 251.0,
 'estimated_store_to_consumer_driving_duration': 544.0}

In [16]:
def fill_df_with_default(df):
    """Fill missing values of ``df`` in place using the ``default_vals`` mapping.

    ``default_vals`` is the module-level dict of per-column fill values.

    Args:
        df: DataFrame to modify. It is mutated in place; nothing is returned.
    """
    df.fillna(value=default_vals, inplace=True)

In [17]:
# Impute missing values in place, then split the frame into the feature matrix
# and the target column. `.values` replaces `DataFrame.as_matrix()`, which was
# deprecated in pandas 0.23 and removed in pandas 1.0.
fill_df_with_default(df)
X = df[train_features].values
y = df["delivery_seconds"]

## Train a regression model

In [18]:
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split the same across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [19]:
# Fit regression model
clf = LinearRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE: %.4f" % mse)

MSE: 1117761902.9413


### Analyze feature coefficients

In [20]:
# Pair each feature with its fitted coefficient and rank the pairs by absolute
# coefficient size, largest first.
features_scored = sorted(
    zip(train_features, clf.coef_),
    key=lambda scored: -abs(scored[1]),
)
for name, weight in features_scored:
    print("{:50}: {:.2f}".format(name, weight))

num_distinct_items                                : 15.03
total_items                                       : -4.89
total_onshift_dashers                             : -2.47
total_outstanding_orders                          : 2.33
total_busy_dashers                                : -1.93
estimated_order_place_duration                    : 0.96
store_primary_category                            : 0.76
max_item_price                                    : 0.11
estimated_store_to_consumer_driving_duration      : 0.10
min_item_price                                    : 0.02
subtotal                                          : 0.01


# Predict



In [21]:
# Load the rows to predict and drop five columns that are not model inputs.
unused_cols = ["market_id", "order_protocol", "platform", "store_id", "created_at"]
prediction_df = load_prediction_todo_as_df()
prediction_df.drop(unused_cols, axis=1, inplace=True)
print(prediction_df.shape)

(54778, 12)


In [22]:
# Impute with the defaults computed earlier, then encode the store category
# with the encoder fitted earlier. "NA" entries are rewritten to "nan" first
# (presumably the label missing categories received when the training column
# was cast to str -- confirm).
fill_df_with_default(prediction_df)
normalized_categories = [
    "nan" if category == "NA" else category
    for category in prediction_df["store_primary_category"]
]
encoded = store_cat_encoder.transform(normalized_categories)
prediction_df["store_primary_category"] = encoded

In [23]:
# Preview the model inputs for the first few prediction rows.
prediction_df[train_features].head()

Unnamed: 0,store_primary_category,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration
0,50,5.0,7500.0,4.0,800.0,1800.0,4.0,4.0,4.0,446.0,670.0
1,50,5.0,7100.0,4.0,800.0,1500.0,4.0,1.0,1.0,446.0,446.0
2,70,4.0,4500.0,2.0,750.0,1500.0,9.0,7.0,6.0,446.0,504.0
3,50,1.0,1700.0,1.0,1400.0,1400.0,3.0,3.0,3.0,446.0,687.0
4,39,2.0,3150.0,2.0,1525.0,1625.0,4.0,4.0,4.0,446.0,528.0


In [24]:
# Predict from the same feature columns, in the same order, as used for
# fitting. `.values` replaces `DataFrame.as_matrix()`, which was deprecated in
# pandas 0.23 and removed in pandas 1.0.
seconds_predicted = clf.predict(prediction_df[train_features].values)

In [25]:
# Sanity check: number of predictions (should equal the prediction row count).
len(seconds_predicted)

54778

### Generate output tsv and dump

In [26]:
# Pair every delivery id with its predicted duration. int() truncates the
# prediction toward zero. The loop variable is named `delivery_id` rather than
# `id` so the builtin id() is not shadowed for the rest of the kernel session.
deliver_ids = prediction_df["delivery_id"]
rows = [
    [delivery_id, int(seconds)]
    for delivery_id, seconds in zip(deliver_ids, seconds_predicted)
]

output_df = mat2df(np.array(rows), ["delivery_id", "predicted_delivery_seconds"])

In [27]:
# Write the predictions to ./output/predictions.tsv via the project's dump helper.
dump(output_df, save_path="./output/predictions.tsv")

Data Dumped to ./output/predictions.tsv
