In [11]:
import pandas as pd
import numpy as np
import random
import json

In [3]:
# Load the cleaned scrape output and preview the flavor column, which holds
# one comma-separated string of flavor tags per tea.
tea_df = pd.read_csv("scraper/clean_data.csv")
tea_df["features_flavors"].head(5)

0    Floral, Rainforest, Honeysuckle, Orchids, Pepp...
1    Dark Chocolate, Chocolate, Dates, Malt, Orchid...
2    Honey, Lemon, Nuts, Flowers, Lemongrass, Delic...
3    Cocoa, Dark Chocolate, Malt, Vanilla, Apple, A...
4    Sweet, Mineral, Nutty, Roasted, Salty, Butter,...
Name: features_flavors, dtype: object

### Create a list of unique flavors

In [5]:
# Collect every distinct flavor tag across all teas. Each tag is trimmed and
# title-cased first, so variants such as " floral" and "Floral" collapse into
# a single entry.
flavors = set()
for tea in tea_df['features_flavors']:
    flavors.update(tag.strip().title() for tag in tea.split(","))

# Sorted so that the flavor order is deterministic from run to run.
number_of_flavors = len(flavors)
ordered_flavors = sorted(flavors)

print("Total number of flavors:", number_of_flavors)

Total number of flavors: 1446


### Indexing flavors

In [6]:
# Build the two lookup tables between a flavor name and its position in the
# sorted flavor list.
flavor_to_index = {flavor: position for position, flavor in enumerate(ordered_flavors)}
index_to_flavor = {position: flavor for position, flavor in enumerate(ordered_flavors)}

# Sanity check. The label is derived from the same variable that is used for
# the lookup, so the printed index can never disagree with the flavor shown
# (the message used to say "index 5" while index 4 was being looked up).
sample_index = 4
print("Index of Floral:", flavor_to_index["Floral"],
      "\nFlavor with index {}:".format(sample_index), index_to_flavor[sample_index])

Index of Floral: 461 
Flavor with index 5: Accent


In [12]:
# Persist the flavor-name -> index lookup as JSON.
with open("features_to_index.json", "w") as json_file:
    json.dump(flavor_to_index, json_file)

In [13]:
# Persist the index -> flavor-name lookup as JSON. JSON object keys are always
# strings, so the integer indices are written as "0", "1", ...; convert them
# back with int() after loading this file.
with open("index_to_features.json", "w") as json_file:
    json.dump(index_to_flavor, json_file)

### Co-occurrence Matrix for Flavors

In [10]:
# Symmetric flavor-by-flavor matrix: entry [a][b] counts the pairs of tags
# (a, b) that were listed together on the same tea.
co_occurence = np.zeros([number_of_flavors, number_of_flavors])

for tea in tea_df['features_flavors']:
    # Normalise each tag once per tea, the same way the index keys were built.
    tea_indices = [flavor_to_index[tag.strip().title()] for tag in tea.split(",")]
    for position, first in enumerate(tea_indices):
        # Only pairs at distinct positions are visited: a tag paired with
        # itself would only touch the diagonal, which is cleared below.
        for second in tea_indices[position + 1:]:
            co_occurence[first][second] += 1
            co_occurence[second][first] += 1

# A flavor is not counted as co-occurring with itself.
np.fill_diagonal(co_occurence, 0)

np.savetxt("co_occurence_features.txt", co_occurence)
print("Co-occurence shape:", co_occurence.shape)
co_occurence[:5]

Co-occurence shape: (1446, 1446)


array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [17]:
# Also dump the matrix as the text form of a nested Python list.
# The temporary is deliberately NOT called `list`: binding that name at
# notebook level shadows the built-in for every later (or re-run) cell, where
# any `list(...)` call would then fail with "'list' object is not callable".
co_occurence_rows = co_occurence.tolist()
with open("co_oc_features.txt", "w") as output:
    output.write(str(co_occurence_rows))

### Get the top K co-occurring flavors for one flavor picked at random from a given list

In [8]:
def get_complements(flavors_query, top_x):
    """Return up to ``top_x`` flavors that most often co-occur with one query flavor.

    One flavor is drawn at random from ``flavors_query`` and the flavors with
    the highest counts in its row of the module-level ``co_occurence`` matrix
    are returned, most frequent first. The lookup tables ``flavor_to_index``
    and ``index_to_flavor`` are read from module scope as well.

    Args:
        flavors_query: Iterable of flavor-name strings. Names are trimmed and
            title-cased, matching how the index keys were built; names that
            are not in ``flavor_to_index`` are skipped.
        top_x: Maximum number of flavors to return.

    Returns:
        A list of flavor names. It is shorter than ``top_x`` when fewer
        flavors co-occur with the chosen one, and empty when the query
        contains no known flavor.
    """
    # Normalise the query exactly like the index keys so that e.g. " apple "
    # resolves to "Apple", then drop names the index has never seen.
    known = [name for name in (raw.strip().title() for raw in flavors_query)
             if name in flavor_to_index]
    if not known:
        # random.choice raises on an empty sequence; nothing to recommend.
        return []

    random_flavor = random.choice(known)
    print("Random flavor:", random_flavor)

    counts = co_occurence[flavor_to_index[random_flavor]]
    ranked = np.argsort(counts)[::-1]  # highest count first
    # Zero-count entries (including the chosen flavor itself, whose diagonal
    # cell is 0) never co-occurred, so they are not padded into the result.
    return [index_to_flavor[i] for i in ranked if counts[i] > 0][:top_x]

# Demo query; the result depends on which of the two flavors is drawn at random.
top_five = get_complements(['Apple', 'Orange'], 5)
print("Top 5 co-occurred flavors: ", top_five)

Random flavor: Orange
Top 5 co-occurred flavors:  ['Strong', 'Black', 'Floral', 'Malt', 'Sweet']
