In [2]:
import pandas as pd
import pyarrow
import nltk

In [3]:
# Fetch the NLTK English word list into ../data/.
nltk.download('words', download_dir='../data/')
# Downloading into a custom directory does not make NLTK search it, so register
# the directory explicitly; otherwise the corpus load below only succeeds when
# the word list also happens to be installed in one of NLTK's default locations.
if '../data/' not in nltk.data.path:
    nltk.data.path.append('../data/')
# Set of known English words, used for membership tests when filtering text.
words = set(nltk.corpus.words.words())

[nltk_data] Downloading package words to ../data/...
[nltk_data]   Package words is already up-to-date!


### Read dataset

In [4]:
# List the data directory to check which files are available locally.
! ls -la ../data/

total 43440
drwxr-xr-x   4 huangjunming  staff       128 May 24 14:05 [34m.[m[m
drwxr-xr-x  11 huangjunming  staff       352 May 25 15:47 [34m..[m[m
drwxr-xr-x   4 huangjunming  staff       128 May 24 14:05 [34mcorpora[m[m
-rw-r--r--   1 huangjunming  staff  22237983 May 20 21:59 products.parquet.gz


In [5]:
# Load the product dataset from the parquet file, explicitly using the pyarrow engine.
df = pd.read_parquet("../data/products.parquet.gz", engine='pyarrow')

In [6]:
# Preview five randomly drawn rows (no random_state is set, so the rows differ between runs).
df.sample(5)

Unnamed: 0,product_id,vendor_id,vendor_geohash,product_name,product_description,order_count
325178,325178,1247,w21zg1k,Takoyaki with Cheese,Octopus Balls with Cheese,2
419213,419213,4800,w21z9kq,Poached You Mai Vegetable,,4
595633,595633,383,w21zerf,Kotex Slim Overnight Wing Sanitary Pad - 35cm ...,,14
506708,506708,11007,w21zmp0,Mutton Briyani,,7
256534,256534,912,w23b1s5,Double Filet-O-Fish® Upsized Meal,For serious fish lovers. That's two white-fish...,2


In [7]:
# (number of rows, number of columns) of the raw dataset.
df.shape

(633148, 6)

### Remove non-English text from product names and descriptions, and drop invalid items

In [8]:
def remove_non_english(in_text):
    """
        Keep only the tokens of a text that look like English words.

        The text is split with ``nltk.wordpunct_tokenize``. A token is kept when
        its lower-cased form is in the module-level ``words`` set, or when it
        consists solely of ASCII letters. The kept tokens are re-joined with
        single spaces, so a text with no surviving token becomes ''.

        Non-string input (``None``, a float ``NaN`` or any other missing-value
        marker) is returned unchanged instead of being tokenized.
    """
    # Missing values may arrive as None or as NaN depending on how the column
    # was loaded; neither can be tokenized, so hand them back untouched.
    if not isinstance(in_text, str):
        return in_text
    # bytes.isalpha() is only true for ASCII letters, so encoding first rejects
    # tokens that contain non-ASCII characters.
    return ' '.join([w for w in nltk.wordpunct_tokenize(in_text)
                     if w.lower() in words or w.encode().isalpha()])

In [9]:
# Remove non-English text from both free-text columns
df['product_name'] = df['product_name'].apply(remove_non_english)
df['product_description'] = df['product_description'].apply(remove_non_english)

# Remove rows without a product name: either the name is missing altogether,
# or it became '' because none of its tokens survived the filter above.
df = df.loc[df['product_name'].notna() & (df['product_name'] != '')]

In [10]:
# Preview five random rows again to inspect the effect of the text cleaning.
df.sample(5)

Unnamed: 0,product_id,vendor_id,vendor_geohash,product_name,product_description,order_count
622967,622967,6098,w21ze4y,Plain Prata,,20
196798,196798,6780,w21zbg7,Masala Tea,Hot,1
392118,392118,9074,w21z7k9,Mini Strudel,,3
79466,79466,6372,w23b673,DODO YTF Sambal Chilli Paste Chilled g,,1
214941,214941,9332,w21z64s,Melvados Hot Spicy Tortilla Chips g,,1


### Data Analysis

In [11]:
# Headline statistics of the cleaned dataset.
n_rows = df.shape[0]
n_product = df['product_name'].nunique()
n_location = df['vendor_geohash'].nunique()
n_vendor = df['vendor_id'].nunique()
# Use the vectorised Series reductions; the Python built-ins min()/max() would
# iterate over the whole column element by element.
min_orders, max_orders = df['order_count'].min(), df['order_count'].max()
print(f"There are in total {n_rows} transactions with "
      f"{n_product} unique menus and {n_vendor} unique vendors " 
      f"in {n_location} differenct locations with monthly "
      f"orders range from {min_orders} to {max_orders}")

There are in total 632825 transactions with 115685 unique menus and 13312 unique vendors in 2587 differenct locations with monthly orders range from 1 to 1386


## Problem Definitions
The problem can be defined as follows: given a food delivery transaction dataset, provide a vendor with new menu suggestions that have the highest sales potential. For example, if vendor A is similar to vendor B (e.g. both are Chinese restaurants), a menu that sells well at vendor B is likely to have high potential for vendor A as well, so it makes a good menu suggestion for vendor A.

This is quite similar to the classic movie recommendation problem: the vendor_id plays the role of the user_id, the menu_id plays the role of the movie_id, and the monthly order count is the rating that the vendor gives to the menu. It can therefore be treated as a machine-learning recommendation task, and we can use a collaborative filtering algorithm to solve it.

### Menu Recommendation Model Building

In [12]:
from typing import List, Tuple

import numpy as np

from surprise import accuracy
from surprise import AlgoBase, CoClustering, KNNBasic, KNNWithMeans, KNNWithZScore, NMF, SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import KFold

#### Utility functions for menu suggestions

In [13]:
def get_new_menus(meta_df: pd.DataFrame, vendor_id: int) -> np.ndarray:
    """
        Return the ids of all menus that the given vendor does not sell yet

        The result keeps the order in which the menu ids first appear in
        meta_df['menu_id'].
    """
    menu_ids = meta_df['menu_id']
    catalogue = menu_ids.unique()
    # Menus already offered by this vendor.
    already_sold = menu_ids[meta_df['vendor_id'] == vendor_id].unique()
    # invert=True selects the catalogue entries that are NOT already sold.
    return catalogue[np.isin(catalogue, already_sold, invert=True)]

def make_menu_suggestions(model: AlgoBase, 
                          vendor_id: int, 
                          meta_df: pd.DataFrame, 
                          menu_meta: pd.DataFrame, 
                          n: int) -> pd.DataFrame:
    """
        Make n menu suggestions which have the top orders potential

        Every menu the vendor does not sell yet (see get_new_menus) is scored
        with model.test, ranked by the estimate in descending order, joined to
        menu_meta on the menu id and cut to the first n rows. The suggestions
        are printed and returned.

        Returns a DataFrame indexed by menu id ('iid') holding the column 'est'
        plus the columns of menu_meta. It is empty when the vendor already
        sells every menu found in meta_df.
    """
    candidates = get_new_menus(meta_df, vendor_id)
    if len(candidates) == 0:
        # Without predictions there is no 'est' column to sort on, which would
        # raise a KeyError below; report the situation and return an empty frame.
        print(f"Vendor {vendor_id} already sells every known menu, nothing to suggest.")
        return pd.DataFrame(columns=['est', *menu_meta.columns],
                            index=pd.Index([], name='iid'))

    # The third tuple element is the rating slot; 0.0 is passed because no
    # observed order count exists for a menu the vendor has never sold.
    test_set = [(vendor_id, menu_id, 0.0) for menu_id in candidates]
    predictions = pd.DataFrame(model.test(test_set)).sort_values(by=['est'], ascending=False)
    suggestions = predictions.set_index('iid')[['est']].join(menu_meta).head(n)
    print(f"The top {n} new menu suggestions for vendor {vendor_id} are as followed:")
    for ind, (indx, row) in enumerate(suggestions.iterrows()):
        print(f"{ind+1}. menu id: {indx}, estimated monthly orders: {np.ceil(row['est'])}, menu name: {row['product_name']}.")
    return suggestions

#### Model related functions

In [19]:
def get_models(model_name: str) -> AlgoBase:
    """
        Return a new, unfitted model object for the given model name

        Supported names: "CoClustering", "SVD", "KNNBasic", "KNNWithMeans",
        "KNNWithZScore" and "NMF". Each model is built with its default
        parameters.

        Raises KeyError if model_name is not one of the supported names.
    """
    # Map names to classes (not instances) so that only the requested
    # algorithm is constructed on each call.
    model_classes = {
        "CoClustering": CoClustering,
        "SVD": SVD,
        "KNNBasic": KNNBasic,
        "KNNWithMeans": KNNWithMeans,
        "KNNWithZScore": KNNWithZScore,
        "NMF": NMF
    }
    try:
        model_class = model_classes[model_name]
    except KeyError:
        raise KeyError(f"Unknown model {model_name!r}; "
                       f"expected one of {sorted(model_classes)}") from None
    return model_class()

def model_selection(models: List[str], 
                    data: Dataset, 
                    n_folds: int) -> Tuple[str, float, dict]:
    """
        Select the best model based on the cross validation experiment errors

        Each model name is evaluated with cross_validation on data using
        n_folds folds; the model with the lowest mean error wins.

        Returns a tuple (best model name, its mean cv error, dict mapping
        every model name to its mean cv error).

        Raises ValueError if models is empty.
    """
    if not models:
        # min() over an empty dict would fail with an unrelated-looking message.
        raise ValueError("models must contain at least one model name")
    errors = {model: cross_validation(model, data, n_folds) for model in models}
    best_model = min(errors, key=errors.get)
    best_model_error = errors[best_model]
    print(f"Best model is {best_model} with cv error of {best_model_error}")
    return best_model, best_model_error, errors

def cross_validation(model_name: str, 
                    data: Dataset, 
                    n_folds: int) -> float:
    """
        Run a k-fold experiment for one model and return its mean RMSE

        The same model object is re-fitted on the training part of every fold
        and scored on the held-out part of that fold.
    """
    print(f"Processing cv for model {model_name} with {n_folds} folds.")
    splitter = KFold(n_splits=n_folds)
    candidate = get_models(model_name)
    fold_rmse = []
    for fold_no, (trainset, testset) in enumerate(splitter.split(data), start=1):
        print(f"Processing fold {fold_no}")
        candidate.fit(trainset)
        held_out_predictions = candidate.test(testset)
        # accuracy.rmse also prints the score because verbose=True.
        fold_rmse.append(accuracy.rmse(held_out_predictions, verbose=True))
    return np.mean(fold_rmse)

def retrain_model(model_name: str, data: Dataset) -> AlgoBase:
    """
        Fit a fresh instance of the named model on the full dataset

        No data is held out: the trainset is built from all of data.
    """
    fresh_model = get_models(model_name)
    fresh_model.fit(data.build_full_trainset())
    return fresh_model

#### Data preprocessing for model

In [20]:
# Give a menu id to each unique product name (the integer code of the name as a category)
df = df.assign(menu_id=df.product_name.astype('category').cat.codes)

# Get the map between menu id and product name
menu_meta = df.groupby(by=['menu_id'])[['product_name']].first()

# Prepare the dataframe for the recommendation model
meta_df = df[['vendor_id', 'menu_id', 'order_count']]

# Observed range of the order counts, computed with the vectorised Series
# reductions rather than the element-by-element Python built-ins.
min_orders, max_orders = df['order_count'].min(), df['order_count'].max()

In [21]:
# A reader is still needed, but only the rating_scale param is required;
# here it spans the observed minimum and maximum order counts.
reader = Reader(rating_scale=(min_orders, max_orders))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(meta_df, reader)


In [22]:
# Names of the candidate algorithms; each one must be a key known to get_models.
models_list = ['SVD', 'CoClustering', 'KNNBasic', 'KNNWithMeans', 'KNNWithZScore', 'NMF']

In [23]:
# Select the best model based on the cross validation experiment (3 folds per model).
# best_m is the winning model name, best_e its mean cv error, errors the per-model errors.
best_m, best_e, errors = model_selection(models_list, data, 3)

Processing cv for model SVD with 3 folds.
Processing fold 1
RMSE: 1345.1261
Processing fold 2
RMSE: 1344.9618
Processing fold 3
RMSE: 1343.8459
Processing cv for model CoClustering with 3 folds.
Processing fold 1
RMSE: 17.8695
Processing fold 2
RMSE: 18.4623
Processing fold 3
RMSE: 18.8413
Processing cv for model KNNBasic with 3 folds.
Processing fold 1
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 18.7267
Processing fold 2
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 17.7277
Processing fold 3
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 18.0003
Processing cv for model KNNWithMeans with 3 folds.
Processing fold 1
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 16.3905
Processing fold 2
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 17.5636
Processing fold 3
Computing the msd similarity matrix...
Done computing similarity mat

In [24]:
# Retrain the best model (best_m, chosen by model_selection) with all the data as trainset
model = retrain_model(best_m, data)

Computing the msd similarity matrix...
Done computing similarity matrix.


### Make some suggestions with the best model and see the performance

In [25]:
# Vendor 1164 sells Asian food; the suggested menus are expected to be similar Asian dishes with the best sales potential.
vendor = 1164
menus_exist = df.loc[df['vendor_id'] == vendor, 'product_name'].unique()
print(f"Menu that vendor {vendor} is selling: {menus_exist}")
suggests = make_menu_suggestions(model, vendor, meta_df, menu_meta, 3)

Menu that vendor 1164 is selling: ['Shanghai Fried Rice With Grilled Chicken'
 'Egg Fried Rice With Grilled Chicken' 'Fresh Ramen With Seasoned Prawns'
 'Egg Fried Rice With Braised Beef' 'Fresh Ramen With Grilled Chicken'
 'Shanghai Fried Rice With Braised Beef' 'Egg Fried Rice'
 'Fresh Udon With Grilled Chicken' 'Egg Fried Rice With Seasoned Prawns'
 'Truffle Fried Rice MAX' 'Truffle Fried Rice With Smoked Turkey Pastrami'
 'Truffle Fried Rice With Braised Beef' 'Truffle Fried Rice'
 'Egg Fried Rice HE With Braised Beef' 'Truffle Udon With Grilled Chicken'
 'Egg Fried Rice HE' 'Shanghai Fried Rice HE With Braised Beef'
 'Truffle Udon With Smoked Turkey Pastrami'
 'Shanghai Fried Rice HE With Grilled Chicken'
 'Fresh Ramen HE With Seasoned Prawns'
 'Truffle Fried Rice With Seasoned Prawns'
 'Fresh Udon HE With Seasoned Prawns'
 'Fresh Ramen With Smoked Turkey Pastrami'
 'Fresh Udon With Smoked Turkey Pastrami'
 'Egg Fried Rice With Smoked Turkey Pastrami'
 'Truffle Fried Rice With Gri

In [26]:
# Vendor 58 is a cafe that offers coffee, breakfast and desserts; the suggested menus are expected to be similar food with the best sales potential.
vendor = 58
menus_exist = df.loc[df['vendor_id'] == vendor, 'product_name'].unique()
print(f"Menu that vendor {vendor} is selling: {menus_exist}")
suggests = make_menu_suggestions(model, vendor, meta_df, menu_meta, 3)

Menu that vendor 58 is selling: ['Pancakes' 'Chicago Cheesecake' 'Eggs Ben' 'Red Velvet Sliced Cake'
 'Chocolate of a Thousand Leaves' 'Red Velvet Ice Blended'
 'Pandan Gula Melaka Cake' 'The Original Mocha Ice Blended'
 'New York Chicken Sausage Puff' 'Pure Dark Chocolate Ice Blended'
 'Skinny Triple Chocolate Muffin' 'Red Velvet Hot Cocoa'
 'Speculoos Ice Blended' 'Inches Tortilla Pizza'
 'Belgian Dark Chocolate Layered Cake' 'Hazelnut Americano' 'Double Latte'
 'Classic Green' 'Herbal Infusion Tea' 'Fruit Swedish Berries'
 'Moroccan Mint Latte' 'Italian Espresso Capsule' 'Food for Thought'
 'African Sunrise Tea Latte' 'Swedish Berries Tin Loose Leaf'
 'Earl Grey Tea Latte' 'Red Velvet Cake' 'Special Dutch Chocolate Powder'
 'Today s Brew Coffee' 'Sparkling Cold Brew Lemon'
 'Buster s Weekly Cheesecake' 'Earl Grey Tin' 'Caramel Latte'
 'Sparkling Cold Brew Passion Fruit' 'Cold Brew Vanilla Bean Latte'
 'Cold Brew Passion Fruit' 'Cold Brew' 'Americano'
 'English Breakfast Tea Latte' '

### Future Improvement

If I were given more time, I would try the following steps to further improve the accuracy of the solution:
1. The current model selection has no hyper-parameter tuning step. As a further improvement, a grid search or random search over the hyper-parameters (various similarity measures, error metrics, and so on) could be conducted.

2. More features can be included. The location of a vendor can also be important for measuring vendor similarity.

3. More data. Increasing the scale of the dataset can reduce the sparsity of the data and thus lead to better recommendations.

4. More complicated models (state of the art).

5. Instead of collaborative filtering, we can also try a content-based recommendation algorithm. In this case, we can use the product name and description to measure the similarity between menus, and recommend menus that are very similar to the best-selling menus a vendor currently has.

6. More granular data cleaning. We can keep menu meta information such as the weight, the number of pieces, etc.