In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
# Read the scraped tea table from disk; each row's 'flavors' cell is a single
# comma-separated string of flavor names (see the preview below).
tea_df = pd.read_csv("scraper/clean_data.csv")

# Preview the first few flavor strings.
tea_df.loc[:, "flavors"].head()

0    Floral, Rainforest, Honeysuckle, Orchids, Pepp...
1    Dark Chocolate, Chocolate, Dates, Malt, Orchid...
2              Honey, Lemon, Nuts, Flowers, Lemongrass
3    Cocoa, Dark Chocolate, Malt, Vanilla, Apple, A...
4    Sweet, Mineral, Nutty, Roasted, Salty, Butter,...
Name: flavors, dtype: object

### Create a list of unique flavors

In [3]:
# Gather every distinct flavor name. Each tea's flavor string is split on
# commas, and each piece is stripped and title-cased so that spelling variants
# differing only in case or padding collapse into one entry.
flavors = {
    piece.strip().title()
    for flavor_text in tea_df['flavors']
    for piece in flavor_text.split(",")
}

# Alphabetical order gives every flavor a stable position for indexing later.
ordered_flavors = sorted(flavors)
number_of_flavors = len(ordered_flavors)

print("Total number of flavors:", number_of_flavors)

Total number of flavors: 358


### Indexing flavors

In [4]:
# Two lookup tables over the alphabetically sorted flavor list:
#   flavor_to_index: flavor name -> position in ordered_flavors
#   index_to_flavor: position    -> flavor name
flavor_to_index = {flavor: position for position, flavor in enumerate(ordered_flavors)}
index_to_flavor = dict(enumerate(ordered_flavors))

# Sanity check. The label and the lookup share one variable so they cannot
# drift apart (the label used to say 5 while the lookup used 4).
example_index = 4
print("Index of Floral:", flavor_to_index["Floral"],
      "\nFlavor with index {}:".format(example_index), index_to_flavor[example_index])

Index of Floral: 118 
Flavor with index 5: Apple


### Co-occurrence Matrix for Flavors

In [5]:
# Symmetric count matrix: entry [a, b] is the number of times flavors a and b
# were listed together in a tea's flavor string.
co_occurence = np.zeros([number_of_flavors, number_of_flavors])

for tea in tea_df['flavors']:
    # Normalise and look up each flavor once per tea instead of once per pair.
    tea_indices = [flavor_to_index[name.strip().title()] for name in tea.split(",")]
    # Visit each unordered pair of list positions exactly once and update both
    # halves of the matrix. A flavor that is listed twice in the same tea
    # contributes once per occurrence to each of its pairs.
    for position, first in enumerate(tea_indices):
        for second in tea_indices[position + 1:]:
            co_occurence[first, second] += 1
            co_occurence[second, first] += 1

# A flavor does not complement itself: clear the diagonal (it is only touched
# when a tea repeats a flavor).
np.fill_diagonal(co_occurence, 0)

print("Co-occurence shape:", co_occurence.shape)
co_occurence[:5]

Co-occurence shape: (358, 358)


array([[ 0.,  0.,  3., ...,  0.,  0.,  0.],
       [ 0.,  0., 11., ...,  0.,  2.,  1.],
       [ 3., 11.,  0., ...,  1.,  3.,  3.],
       [ 2.,  9., 26., ...,  0.,  1.,  1.],
       [ 2., 15., 57., ...,  2.,  9.,  2.]])

### Get the top K co-occurring flavors for a flavor picked at random from a list of flavors

In [6]:
def get_complements(flavors_query, top_x):
    """Return the flavors that co-occur most often with one query flavor.

    One flavor is drawn at random from ``flavors_query``. Its row of the
    module-level ``co_occurence`` matrix is ranked from highest to lowest
    count, and the names of the first ``top_x`` flavors are returned.

    Args:
        flavors_query: Sequence of flavor names. The chosen name is stripped
            and title-cased before lookup, which is the same normalisation
            applied to the keys of ``flavor_to_index``.
        top_x: Maximum number of flavor names to return.

    Returns:
        A list of at most ``top_x`` flavor names, strongest co-occurrence
        first. Flavors with a zero count are not filtered out, so they can
        appear when fewer than ``top_x`` flavors co-occur with the chosen
        one. An empty list is returned when ``flavors_query`` is empty.

    Raises:
        KeyError: If the chosen flavor is not a key of ``flavor_to_index``.
    """
    # Nothing to draw from: report "no complements" instead of letting
    # random.choice fail on an empty sequence.
    if len(flavors_query) == 0:
        return []

    random_flavor = random.choice(flavors_query).strip().title()
    print("Random flavor:", random_flavor)

    counts = co_occurence[flavor_to_index[random_flavor]]
    # argsort is ascending, so reverse it to put the largest counts first.
    ranked = np.argsort(counts)[::-1]
    # Slice before translating so only the requested names are looked up.
    return [index_to_flavor[i] for i in ranked[:top_x]]

# Demo: get_complements picks one of the two query flavors at random, so the
# printed result can differ between runs.
demo_query = ['Apple', 'Orange']
demo_complements = get_complements(demo_query, 5)
print("Top 5 co-occurred flavors: ", demo_complements)

Random flavor: Apple
Top 5 co-occurred flavors:  ['Sweet', 'Fruity', 'Cinnamon', 'Floral', 'Spices']
