## CustomData to try

In [177]:
custom_dict = {'UserID':[1, 1, 1, 1, 1, 2, 2, 3],
               'OrderID':[100, 100, 100, 200, 200, 300, 400, 500],
               'ItemID':[10, 11, 11, 12, 15, 15, 16, 17],
               'Description':[' McCombo 12 with Double Cheeseburger with bacons',
                             'McToast, hashbrow_ 4 and_ a drink_ of choice ',
                             'Happy Meal-with Pancakes and a drink of choice',
                             'Double File-o-Fishs',
                             'Cheeseburgers_',
                             'Cheeseburgers_',
                             'McCombo with_ Caesar_ Roll',
                             'Happy Meal-with Pancakes and a drink of choice'],
              'Count' :[1, 1, 1, 3, 4, 1, 2, 1]}
custom_data = pd.DataFrame(custom_dict)

## Loading and previewing data

In [178]:
import pandas as pd
import numpy as np
import spacy
import re
from tqdm import tqdm

In [179]:
custom_data = pd.read_csv('userItems.csv')
custom_data = custom_data.sample(frac=0.001, random_state=3)
custom_data.shape

(2513, 6)

In [65]:
nlp = spacy.load('en_core_web_sm')

def normalize(text):
    text = re.sub(r'[^a-zA-Z ]', '', text) # remove all non latin characters 
    text = nlp(text.lower())
    tokens = [token.lemma_ for token in text if len(token) > 2 and  # leave raw tokens if length > 2
                                                not token.is_stop and # and if token is not a stop word
                                                token.pos_ != 'ADJECTIVE'] # and it is a NOT AN ANJECTIVE
    tokens = list(set(tokens))  # delete repeating words
    return ' '.join(tokens) if tokens else ' ' #returning clear text description if any words left, else empty str
    
def tokenize(text):
    return text.split()

In [66]:
tqdm.pandas()
custom_data['Description'] = custom_data['Description'].progress_apply(normalize)
custom_data.dropna(inplace=True)
custom_data.head(5)

100%|██████████| 2513/2513 [00:44<00:00, 56.56it/s]


Unnamed: 0.1,Unnamed: 0,UserID,OrderID,ItemID,Number,Description
2490850,2490850,10239,197368,4236,1.0,cocacola
463944,463944,249486,730032,50290,1.0,chicken spicy village wing potato basket
51013,51013,139161,704642,2461,1.0,msize village potato lsize
2184155,2184155,157924,960227,50267,1.0,chefburger sesame chicken iceberg fillet sauce...
1425810,1425810,156095,490098,27686,1.0,chicken mayonnaise fillet boil draniki cheese


### Save clear description data and dict ItemID->categories and dict category->ItemIds

In [180]:
# custom_data.to_csv('userItemsSampleClear.csv')
custom_data = pd.read_csv('userItemsSampleClear.csv')

In [181]:
dic = pd.Series(custom_data['Description'].values, index=custom_data['ItemID']).to_dict()
dictItemIdCat = {key: list(map(str, value.split())) for key, value in dic.items()}

In [182]:
def invert_dict(d): 
    inverse = dict() 
    for key in d: 
        # Go through the list that is saved in the dict:
        for item in d[key]:
            # Check if in the inverted dict the key exists
            if item not in inverse: 
                # If not create a new list
                inverse[item] = [key] 
            else: 
                inverse[item].append(key) 
    return inverse

In [183]:
import csv
def save_dict(name, dic):
    w = csv.writer(open(name, 'w'))
    for key, val in dic.items():
        w.writerow([key, val])

def read_dict(name):
    with open(name) as csv_file:
        reader = csv.reader(csv_file)
        mydict = dict(reader)
        return mydict

In [184]:
dictCatItemId = invert_dict(dictItemIdCat)

In [185]:
save_dict('dictItemIdCat.csv', dictItemIdCat)
save_dict('dictCatItemId.csv', dictCatItemId)

## Vectorizing

In [186]:
custom_data = pd.read_csv('userItemsSampleClear.csv')
dictItemIdCat = read_dict('dictItemIdCat.csv')
dictCatItemId = read_dict('dictCatItemId.csv')

In [187]:
custom_data.head(3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,UserID,OrderID,ItemID,Number,Description
0,2490850,2490850,10239,197368,4236,1.0,cocacola
1,463944,463944,249486,730032,50290,1.0,chicken spicy village wing potato basket
2,51013,51013,139161,704642,2461,1.0,msize village potato lsize


In [188]:
from sklearn.feature_extraction.text import CountVectorizer 
vectorizer = CountVectorizer()
categories_data = vectorizer.fit_transform(custom_data['Description'])
categories_voc = vectorizer.vocabulary_

In [189]:
custom_data['categories'] = [np.array(row) for row in categories_data.toarray()]
custom_data['categories'] = custom_data['categories'] * custom_data['Number']
custom_data.drop(['Description', 'Number', 'ItemID', 'Unnamed: 0', 'Unnamed: 0.1'],
                 axis=1, inplace=True)

In [190]:
custom_data.head(3)

Unnamed: 0,UserID,OrderID,categories
0,10239,197368,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,249486,730032,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,139161,704642,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


## GroupBy UserID, OrderID, ItemID

In [191]:
cat = sorted(categories_voc.items(), key=lambda kv:kv[1])
cat = [c[0] for c in cat]
custom_data[cat] = pd.DataFrame(custom_data['categories'].values.tolist(), index = custom_data.index)
custom_data.drop('categories', axis=1, inplace=True)

In [192]:
categorized_custom_data =custom_data.groupby(['UserID', 'OrderID']).sum().groupby('UserID').mean()

In [193]:
categorized_custom_data.head(5)

Unnamed: 0_level_0,aburi,achik,achikchuchuk,adana,addition,adjarian,adjika,adygha,adyghe,age,...,yellow,yoghurt,yolk,york,young,yum,zero,zest,zira,zucchini
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3058,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Trying new user

Assume our user has made some order

In [204]:
customers_data = ['Basket 16 spicy wings 432/135g  16 chicken wings Chefburger 176g  bun with sesame fillet',
                 'chicken mayonnaise fillet boil draniki cheese Chefburger 176g  bun with sesame  chicken fillet']

In [206]:
customer_order = customers_data[0]
norm_customer_order = normalize(customer_order)
vec_customer_order = vectorizer.transform([norm_customer_order]).toarray()

In [207]:
database = categorized_custom_data.values

In [208]:
from sklearn.metrics import pairwise_distances
def get_close_users(matrix, new_user, top_n=3, metric='euclidean'):
    dist = pairwise_distances(matrix, new_user.reshape(1,-1), metric=metric)    
    dist = dist.flatten()
    idx = np.argsort(dist)[:top_n]    
    return matrix[idx]

def predict(new_user, closest, top_n=3):
    answer = []
    missing = np.where(new_user == 0)
    #missing = np.array([x[1] for x in missing])
    missing = missing[1]
    columns_max = closest.max(axis=0)
    idx = np.argsort(columns_max)
    unsorted = np.intersect1d(missing, idx)
    
    buf = np.in1d(idx, missing)[::-1]
    for i,item in enumerate(buf):
        if item == True:
            answer.append(idx[::-1][i])
    return answer[:top_n]

In [209]:
closest = get_close_users(database, vec_customer_order)
indexes = predict(vec_customer_order, closest)
indexes

[853, 1174, 1221]

In [210]:
reccomended = [[key for key in categories_voc.keys() if categories_voc[key] == x] for x in indexes]
reccomended = [r[0] for r in reccomended]
print('Ordered dish and description: ', customer_order)
print('Reccomended additional categories: ', reccomended)

Ordered dish and description:  Basket 16 spicy wings 432/135g  16 chicken wings Chefburger 176g  bun with sesame fillet
Reccomended additional categories:  ['potato', 'village', 'zucchini']


In [211]:
dictCatItemId[reccomended[0]]

'[50290, 2461, 70725, 46613, 70727, 12244, 20759, 12348, 22424, 69811, 50269, 48866, 20704, 28510, 18262, 35153, 15615, 50296, 752, 50291, 5152, 2324, 32356, 4773, 20876, 9583, 18225, 10939, 35115, 2317, 37699, 4223, 2330, 72154, 75058, 2320, 35147, 50308, 33225, 1129, 44831, 46620, 13475, 46615, 2261, 21632, 48877, 38500, 35141, 6070, 2425, 11094, 4760, 24638, 2329, 38457, 38333, 46610, 56206, 12176, 23005, 2122, 14005, 37838, 5591, 12036, 50769, 35140, 42184, 56277, 48965, 15194, 1860, 4774, 31875, 4499, 6844, 28296, 2318, 20949, 9381, 2323, 10138, 38430, 20894, 58540, 48874, 56869, 9629, 74528, 16005, 10457, 46624, 74426, 11101, 35126, 40517, 27375, 11132, 11092, 8332, 28044, 55085, 7209, 24809, 52217, 23017, 1461, 38541]'