In [1]:
import pandas
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

%matplotlib inline

In [2]:
# Read the cleaned scraper output (relative path) into a DataFrame
tea_df = pandas.read_csv(filepath_or_buffer="../data/scraper/clean_data.csv")

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
tea_df.describe()

Unnamed: 0,id,reviewCount,ratingValue,wantIt,ownIt
count,15641.0,15641.0,15641.0,15641.0,15641.0
mean,46950.915862,8.716386,77.140272,3.252222,9.334697
std,24948.396619,22.517419,5.680778,11.43988,31.174744
min,2.0,1.0,39.0,0.0,0.0
25%,27788.0,1.0,75.0,0.0,1.0
50%,48504.0,2.0,78.0,0.0,2.0
75%,70482.0,7.0,81.0,2.0,6.0
max,83585.0,671.0,89.0,315.0,979.0


# Vectorize tea flavors

In [4]:
# Extract the flavor string of every tea from the DataFrame as a plain Python list
flavors = tea_df["flavors"].values.tolist()

def tokenize(text):
    """Lowercase a flavor string and split it into flavor names on ", "."""
    lowered = text.lower()
    pieces = lowered.split(", ")
    return pieces

In [5]:
# Build a bag-of-flavors vectorizer that uses our comma-based tokenizer.
# The corpus is handed to fit_transform(), not to the constructor: the first
# constructor parameter of CountVectorizer is `input` ('filename' | 'file' |
# 'content'), so passing the flavor list there is a misuse that fails with a
# TypeError on scikit-learn releases whose constructor is keyword-only.
count_vec = CountVectorizer(tokenizer=tokenize)

# Dense count matrix: one row per tea, one column per distinct flavor
tea_by_flavors = count_vec.fit_transform(flavors).toarray()

# Newer scikit-learn exposes get_feature_names_out(); fall back to the older
# get_feature_names() only when the new accessor is unavailable.
if hasattr(count_vec, "get_feature_names_out"):
    _flavor_names = count_vec.get_feature_names_out()
else:
    _flavor_names = count_vec.get_feature_names()

# Lookup from column index to flavor name
index_to_flavor = {i: str(v) for i, v in enumerate(_flavor_names)}

In [6]:
# Sanity check on the vectorized matrix: 15641 total teas (rows), 359 total flavors (columns)
expected_shape = (15641, 359)
assert tea_by_flavors.shape == expected_shape

# Sample usage

In [7]:
# CAUTION: Access a tea by the default integer index created by pandas, not by its id.
# NOTE: We should probably renumber the ids so that they are consecutive.
#       They currently are not, because ids were assigned before incomplete rows were filtered out.
#       Once that is fixed, ids can safely be used to access teas, provided that tea_df is sorted
#       by id ***PRIOR*** to vectorizing.

def flavors_from_vector(vectorized_flaves, index_lookup=None):
    """Return the names of the flavors present in one tea's flavor vector.

    Parameters
    ----------
    vectorized_flaves : array-like of int
        One row of the tea-by-flavor count matrix; entry ``i`` is the
        count of flavor ``i`` for that tea.
    index_lookup : mapping of int -> str, optional
        Maps a column index to its flavor name. When omitted, the
        notebook-level ``index_to_flavor`` dictionary is used.

    Returns
    -------
    list of str
        Flavor names for every non-zero entry, in column order.
    """
    if index_lookup is None:
        index_lookup = index_to_flavor
    # Treat any non-zero count as "present": CountVectorizer stores occurrence
    # counts, so a flavor listed more than once has a value above 1 and an
    # equality test against 1 would silently drop it.
    present = np.flatnonzero(np.asarray(vectorized_flaves))
    return [index_lookup[int(i)] for i in present]

# Row 100 is a positional (integer) index into the matrix / DataFrame, NOT a tea id
flaves100 = tea_by_flavors[100]
decoded_flaves100 = flavors_from_vector(flaves100)  # flavor names recovered from the vector

print("Results: \n" + str(flaves100))
print("\nFlavors from results: \n" + str(decoded_flaves100))

# Original flavor string of the same row, for comparison with the decoded names
csv_flaves100 = tea_df.iloc[100]["flavors"]
print("\nFrom csv: \n" + csv_flaves100)

Results: 
[0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1
 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1
 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0]

Flavors from results: 
['apricot', 'baked bread', 'bitter', 'broth', 'brown sugar', 'butter', 'cacao', 'cantaloupe', 'caramel', 'chestnut', 'chocolate', 'cocoa', 'creamy', 'dark bittersweet', 'dark chocolate', 'dried fruit', 'earth', 'floral', 'gardenias', 'gr

## Explanation of results
Every element in the array represents a flavor. A `1` in position `i` means that the tea contains flavor `i`. The name of that flavor can then be looked up in the `index_to_flavor` map: `index_to_flavor[i]`.