In [2]:
import pandas as pd
import datetime
import numpy as np

In [4]:
# Load the raw sales records and derive the ISO week number of every sale.
# NOTE(review): the CSV path is absolute and machine-specific; adjust it to
# your environment before re-running the notebook.
df = pd.read_csv('/home/trungdung/Downloads/Data_Final.csv')
df["Date_Time"] = pd.to_datetime(df["Date_Time"], format="%Y-%m-%d %H:%M:%S")
# `Series.dt.week` was deprecated in pandas 1.1 and removed in pandas 2.0;
# `dt.isocalendar().week` is the supported replacement. The cast keeps the
# plain int64 dtype (assumes no missing timestamps -- TODO confirm).
df["Week"] = df["Date_Time"].dt.isocalendar().week.astype("int64")

In [5]:
def sales_share(col_name, data_frame):
    """
    Calculate the weekly sales share of each value found in col_name.

    Unit sales are summed per (Week, value) pair, spread into one column per
    value (missing combinations count as 0) and every row is divided by its
    own total, so each week's row adds up to 1.

    :param col_name: name of the column whose values are compared
    :param data_frame: input data frame; must contain the columns
                       'Week', 'Unit_sales' and col_name
    :return: data frame indexed by week; columns are a two-level index
             ('Unit_sales', <value>) and the data are the sales shares
    """
    weekly_units = data_frame.groupby(['Week', col_name])['Unit_sales'].sum()
    # Keyword arguments are required here: pandas >= 2.0 no longer accepts
    # positional arguments in DataFrame.pivot. `values` is deliberately left
    # out so the result keeps the ('Unit_sales', <value>) column levels that
    # callers read with `columns.get_level_values(1)`.
    units_by_week = (
        weekly_units.to_frame()
        .reset_index()
        .pivot(index='Week', columns=col_name)
        .fillna(0)
    )
    return units_by_week.div(units_by_week.sum(axis=1), axis=0)

# Weekly sales shares of each "Brand" value (one row per week).
item_sale_share = sales_share(col_name="Brand", data_frame=df)
item_sale_share

Unnamed: 0_level_0,Unit_sales,Unit_sales
Brand,MainStream,Niche
Week,Unnamed: 1_level_2,Unnamed: 2_level_2
10,0.714286,0.285714
11,0.729167,0.270833
12,0.507937,0.492063
13,0.842593,0.157407


In [4]:
def attribute_sim_df(attribute_name, data_frame):
    """
    Build the pairwise similarity matrix of the values of attribute_name,
    derived from their weekly sales shares.

    - more than 2 values: similarity is the correlation coefficient of the
      weekly share series (np.corrcoef)
    - exactly 2 values: the off-diagonal similarity is 2 * the standard
      deviation of the first value's weekly share (the original author notes
      that the correlation is -1 in this case); the diagonal is 1
    - otherwise (a single value): a 1x1 frame holding 1

    :param attribute_name: name of attribute/column to calculate similarity
    :param data_frame: input data frame
    :return: data frame whose index and columns are the unique values of
             attribute_name and whose data are the similarities
    """
    shares = sales_share(attribute_name, data_frame)
    # Level 1 of the share columns carries the attribute values themselves.
    labels = shares.columns.get_level_values(1).tolist()
    n_values = len(labels)

    if n_values > 2:
        # Rows of the transposed frame are the per-value share series.
        sim_data = np.corrcoef(shares.transpose())
    elif n_values == 2:
        pair_sim = 2 * np.std(shares, axis=0).tolist()[0]
        sim_data = np.array([[1, pair_sim], [pair_sim, 1]])
    else:
        sim_data = [1]

    return pd.DataFrame(sim_data, columns=labels, index=labels)

# Pairwise similarity between items, based on their weekly sales shares.
item_sim = attribute_sim_df(attribute_name="Item_ID", data_frame=df)
item_sim

Unnamed: 0,Yo000,Yo001,Yo002,Yo003,Yo004,Yo005,Yo006,Yo007,Yo008,Yo009
Yo000,1.0,-0.88914,-0.32165,0.607633,-0.399842,0.452544,0.328531,-0.358216,-0.390496,-0.114709
Yo001,-0.88914,1.0,0.207374,-0.879208,0.329386,0.005708,-0.471205,-0.065115,0.730258,0.138081
Yo002,-0.32165,0.207374,1.0,0.19884,-0.713396,-0.304701,0.76197,0.642015,0.338696,-0.89873
Yo003,0.607633,-0.879208,0.19884,1.0,-0.463035,-0.387284,0.736002,0.521077,-0.747278,-0.412112
Yo004,-0.399842,0.329386,-0.713396,-0.463035,1.0,-0.226014,-0.895382,-0.184979,-0.242279,0.94726
Yo005,0.452544,0.005708,-0.304701,-0.387284,-0.226014,1.0,-0.205791,-0.91223,0.567226,0.024737
Yo006,0.328531,-0.471205,0.76197,0.736002,-0.895382,-0.205791,1.0,0.563744,-0.126392,-0.919778
Yo007,-0.358216,-0.065115,0.642015,0.521077,-0.184979,-0.91223,0.563744,1.0,-0.4042,-0.432106
Yo008,-0.390496,0.730258,0.338696,-0.747278,-0.242279,0.567226,-0.126392,-0.4042,1.0,-0.268747
Yo009,-0.114709,0.138081,-0.89873,-0.412112,0.94726,0.024737,-0.919778,-0.432106,-0.268747,1.0


In [5]:
def post_process_sim_val(sim_val):
    """
    Post-process a single similarity value.

    Values below 0.99 are negated and floored at 0 (so positive values become
    0 and negative values become their magnitude); values of 0.99 or more,
    such as the diagonal of a similarity matrix, are returned unchanged.

    :param sim_val: input similarity value
    :return: post-processed value
    """
    # Keep the `<` comparison: it leaves NaN untouched, exactly like the
    # unchanged branch does for values >= 0.99.
    return max(0, -sim_val) if sim_val < 0.99 else sim_val
        
def post_process_sim(sim_df):
    """
    Post-process similarity (convert similarity to dissimilarity).

    Frames with at most two rows (binary or single-valued attributes) are
    returned unchanged; larger frames get post_process_sim_val applied to
    every cell.

    :param sim_df: similarity data frame of item/attribute
    :return: Post-processed similarity data frame
    """
    n_values = sim_df.shape[0]
    # In case of binary attribute values, nothing to change
    if n_values <= 2:
        return sim_df
    # DataFrame.applymap is deprecated since pandas 2.1; mapping each column
    # with Series.map is the element-wise equivalent on old and new pandas.
    return sim_df.apply(lambda column: column.map(post_process_sim_val))

# Show the item similarity matrix after post-processing.
post_process_sim(sim_df=item_sim)

Unnamed: 0,Yo000,Yo001,Yo002,Yo003,Yo004,Yo005,Yo006,Yo007,Yo008,Yo009
Yo000,1.0,0.88914,0.32165,0.0,0.399842,0.0,0.0,0.358216,0.390496,0.114709
Yo001,0.88914,1.0,0.0,0.879208,0.0,0.0,0.471205,0.065115,0.0,0.0
Yo002,0.32165,0.0,1.0,0.0,0.713396,0.304701,0.0,0.0,0.0,0.89873
Yo003,0.0,0.879208,0.0,1.0,0.463035,0.387284,0.0,0.0,0.747278,0.412112
Yo004,0.399842,0.0,0.713396,0.463035,1.0,0.226014,0.895382,0.184979,0.242279,0.0
Yo005,0.0,0.0,0.304701,0.387284,0.226014,1.0,0.205791,0.91223,0.0,0.0
Yo006,0.0,0.471205,0.0,0.0,0.895382,0.205791,1.0,0.0,0.126392,0.919778
Yo007,0.358216,0.065115,0.0,0.0,0.184979,0.91223,0.0,1.0,0.4042,0.432106
Yo008,0.390496,0.0,0.0,0.747278,0.242279,0.0,0.126392,0.4042,1.0,0.268747
Yo009,0.114709,0.0,0.89873,0.412112,0.0,0.0,0.919778,0.432106,0.268747,1.0


In [6]:
def attribute_item_sim(attribute_name, data_frame):
    """
    Calculate item-attribute similarity.

    For every pair of items (each item paired with itself and with every
    later item) two numbers are collected: the post-processed similarity of
    the two items, and the post-processed similarity of their values of
    attribute_name. The result is the correlation coefficient between those
    two lists.

    :param attribute_name: attribute to calculate with "Item_ID"
    :param data_frame: input data frame
    :return: item-attribute similarity; 1.0 when either list holds a single
             distinct value (e.g. only one item or one attribute value)
    """
    # Post-processed item-item and value-value similarity matrices.
    item_corr = post_process_sim(attribute_sim_df('Item_ID', data_frame))
    attribute_corr = post_process_sim(attribute_sim_df(attribute_name, data_frame))

    # One row per Item_ID holding that item's attribute value
    # (the max over the item's rows).
    item_to_attribute = (
        data_frame.loc[:, ['Item_ID', attribute_name]].groupby('Item_ID').max()
    )
    item_ids = item_to_attribute.index.tolist()
    attribute_values = item_to_attribute.iloc[:, 0].tolist()

    item_sim = []
    attribute_sim = []
    for pos, (first_id, first_value) in enumerate(zip(item_ids, attribute_values)):
        # Pair the item with itself and with every item that follows it.
        for second_id, second_value in zip(item_ids[pos:], attribute_values[pos:]):
            item_sim.append(item_corr.at[first_id, second_id])
            attribute_sim.append(attribute_corr.at[first_value, second_value])

    # in case there's only one item or attribute value
    if len(set(item_sim)) == 1 or len(set(attribute_sim)) == 1:
        return 1.0
    return np.corrcoef([item_sim, attribute_sim])[0, 1]

In [7]:
# Item-attribute similarity of every candidate attribute on the full data set.
for attribute in ("Size", "Organic", "Brand", "Mild", "Flavor"):
    print(attribute + " :", attribute_item_sim(attribute, df))

Size : 0.38255673972922344
Organic : 0.26198126747036
Brand : 0.062374235302752545
Mild : 0.4897732545474556
Flavor : 0.34491652526130007


In [8]:
# The item-"Mild" similarity is the largest one above, so "Mild" is taken as
# the most significant attribute; its values are "Mild" and "Non-mild".
# Total units sold for each of those values:
df['Unit_sales'].groupby(df['Mild']).sum()

Mild
Mild        254
Non-mild    132
Name: Unit_sales, dtype: int64

In [9]:
# Sub-section "Mild": rows whose Mild attribute equals "Mild".
df_mild = df[df['Mild'].eq('Mild')]
df_mild

Unnamed: 0,Date_Time,Store_ID,User_ID,Item_ID,Unit_sales,Category,Size,Organic,Brand,Mild,Flavor,Week
0,2018-03-10 06:00:00,1,11,Yo008,4,Yogurt,Large,Organic,MainStream,Mild,Special,10
1,2018-03-21 06:00:00,1,12,Yo008,3,Yogurt,Large,Organic,MainStream,Mild,Special,12
3,2018-03-10 06:00:00,1,7,Yo004,3,Yogurt,Medium,Organic,MainStream,Mild,Non-flavored,10
6,2018-03-24 06:00:00,1,12,Yo006,2,Yogurt,Medium,Organic,Niche,Mild,MainStream,12
7,2018-03-28 06:00:00,1,1,Yo003,1,Yogurt,Small,Non-organic,Niche,Mild,Special,13
12,2018-03-14 06:00:00,1,7,Yo004,5,Yogurt,Medium,Organic,MainStream,Mild,Non-flavored,11
13,2018-03-21 06:00:00,1,3,Yo000,4,Yogurt,Large,Organic,Niche,Mild,Non-flavored,12
14,2018-03-28 06:00:00,1,19,Yo008,4,Yogurt,Large,Organic,MainStream,Mild,Special,13
15,2018-03-24 06:00:00,1,19,Yo003,3,Yogurt,Small,Non-organic,Niche,Mild,Special,12
17,2018-03-31 06:00:00,1,8,Yo006,5,Yogurt,Medium,Organic,Niche,Mild,MainStream,13


In [10]:
# Item-attribute similarity of the remaining attributes inside the Mild sub-section.
for attribute in ("Size", "Organic", "Brand", "Flavor"):
    print(attribute + " :", attribute_item_sim(attribute, df_mild))

Size : 0.6043098264883451
Organic : 0.19203978215362547
Brand : -0.018570923182703886
Flavor : 0.5712849172745973


In [11]:
# Within the Mild sub-section the item-Size similarity is the largest,
# so Size is the most significant attribute of this sub-section.
# Total units sold per Size value:
df_mild['Unit_sales'].groupby(df_mild['Size']).sum()

Size
Large     75
Medium    92
Small     87
Name: Unit_sales, dtype: int64

In [12]:
# Sub-section Mild/Large.
df_mild_large = df_mild[df_mild['Size'].eq("Large")]
df_mild_large

Unnamed: 0,Date_Time,Store_ID,User_ID,Item_ID,Unit_sales,Category,Size,Organic,Brand,Mild,Flavor,Week
0,2018-03-10 06:00:00,1,11,Yo008,4,Yogurt,Large,Organic,MainStream,Mild,Special,10
1,2018-03-21 06:00:00,1,12,Yo008,3,Yogurt,Large,Organic,MainStream,Mild,Special,12
13,2018-03-21 06:00:00,1,3,Yo000,4,Yogurt,Large,Organic,Niche,Mild,Non-flavored,12
14,2018-03-28 06:00:00,1,19,Yo008,4,Yogurt,Large,Organic,MainStream,Mild,Special,13
23,2018-03-14 06:00:00,1,7,Yo008,1,Yogurt,Large,Organic,MainStream,Mild,Special,11
29,2018-03-07 06:00:00,2,14,Yo008,4,Yogurt,Large,Organic,MainStream,Mild,Special,10
30,2018-03-21 06:00:00,1,1,Yo008,4,Yogurt,Large,Organic,MainStream,Mild,Special,12
34,2018-03-14 06:00:00,1,13,Yo000,4,Yogurt,Large,Organic,Niche,Mild,Non-flavored,11
35,2018-03-14 06:00:00,1,15,Yo000,1,Yogurt,Large,Organic,Niche,Mild,Non-flavored,11
38,2018-03-07 06:00:00,1,19,Yo008,3,Yogurt,Large,Organic,MainStream,Mild,Special,10


In [13]:
# In the Mild/Large sub-section every item-attribute similarity is 1
# (up to floating-point rounding), so there is no significant attribute left
# and Mild/Large is a leaf node. As the table above shows, it holds two kinds
# of items, each with its own attribute values.
for attribute in ("Organic", "Brand", "Flavor"):
    print(attribute + " :", attribute_item_sim(attribute, df_mild_large))

Organic : 1.0
Brand : 0.9999999999999998
Flavor : 0.9999999999999998


In [14]:

# Sub-section Mild/Medium.
df_mild_medium = df_mild[df_mild['Size'].eq("Medium")]
df_mild_medium

Unnamed: 0,Date_Time,Store_ID,User_ID,Item_ID,Unit_sales,Category,Size,Organic,Brand,Mild,Flavor,Week
3,2018-03-10 06:00:00,1,7,Yo004,3,Yogurt,Medium,Organic,MainStream,Mild,Non-flavored,10
6,2018-03-24 06:00:00,1,12,Yo006,2,Yogurt,Medium,Organic,Niche,Mild,MainStream,12
12,2018-03-14 06:00:00,1,7,Yo004,5,Yogurt,Medium,Organic,MainStream,Mild,Non-flavored,11
17,2018-03-31 06:00:00,1,8,Yo006,5,Yogurt,Medium,Organic,Niche,Mild,MainStream,13
19,2018-03-07 06:00:00,2,1,Yo006,1,Yogurt,Medium,Organic,Niche,Mild,MainStream,10
20,2018-03-31 06:00:00,1,2,Yo004,2,Yogurt,Medium,Organic,MainStream,Mild,Non-flavored,13
22,2018-03-31 06:00:00,1,2,Yo004,3,Yogurt,Medium,Organic,MainStream,Mild,Non-flavored,13
28,2018-03-31 06:00:00,1,14,Yo006,5,Yogurt,Medium,Organic,Niche,Mild,MainStream,13
32,2018-03-07 06:00:00,1,7,Yo004,4,Yogurt,Medium,Organic,MainStream,Mild,Non-flavored,10
36,2018-03-17 06:00:00,1,14,Yo006,3,Yogurt,Medium,Organic,Niche,Mild,MainStream,11


In [15]:
# In the Mild/Medium sub-section every item-attribute similarity is 1,
# so there is no significant attribute left and Mild/Medium is a leaf node.
# As the table above shows, it holds two kinds of items, each with its own
# attribute values.
# Fix: "Organic" used to be computed on df_mild_large (copy-paste slip from
# the Mild/Large cell); all three attributes now use df_mild_medium.
for attribute in ("Organic", "Brand", "Flavor"):
    print(attribute + " :", attribute_item_sim(attribute, df_mild_medium))

Organic : 1.0
Brand : 1.0
Flavor : 1.0


In [16]:
# Sub-section Mild/Small.
df_mild_small = df_mild[df_mild['Size'].eq("Small")]
df_mild_small

Unnamed: 0,Date_Time,Store_ID,User_ID,Item_ID,Unit_sales,Category,Size,Organic,Brand,Mild,Flavor,Week
7,2018-03-28 06:00:00,1,1,Yo003,1,Yogurt,Small,Non-organic,Niche,Mild,Special,13
15,2018-03-24 06:00:00,1,19,Yo003,3,Yogurt,Small,Non-organic,Niche,Mild,Special,12
18,2018-03-21 06:00:00,1,5,Yo003,1,Yogurt,Small,Non-organic,Niche,Mild,Special,12
25,2018-03-31 06:00:00,1,15,Yo003,4,Yogurt,Small,Non-organic,Niche,Mild,Special,13
26,2018-03-21 06:00:00,1,17,Yo003,1,Yogurt,Small,Non-organic,Niche,Mild,Special,12
27,2018-03-17 06:00:00,1,7,Yo003,1,Yogurt,Small,Non-organic,Niche,Mild,Special,11
31,2018-03-14 06:00:00,1,20,Yo003,5,Yogurt,Small,Non-organic,Niche,Mild,Special,11
33,2018-03-10 06:00:00,1,15,Yo001,3,Yogurt,Small,Organic,MainStream,Mild,MainStream,10
41,2018-03-28 06:00:00,2,6,Yo003,1,Yogurt,Small,Non-organic,Niche,Mild,Special,13
46,2018-03-10 06:00:00,1,19,Yo003,4,Yogurt,Small,Non-organic,Niche,Mild,Special,10


In [17]:
# In the Mild/Small sub-section every item-attribute similarity is 1,
# so there is no significant attribute left and Mild/Small is a leaf node.
# As the table above shows, it holds two kinds of items, each with its own
# attribute values.
for attribute in ("Organic", "Brand", "Flavor"):
    print(attribute + " :", attribute_item_sim(attribute, df_mild_small))

Organic : 1.0
Brand : 1.0
Flavor : 1.0


In [18]:
# Sub-section "Non-mild": rows whose Mild attribute equals "Non-mild".
df_non_mild = df[df['Mild'].eq('Non-mild')]
df_non_mild

Unnamed: 0,Date_Time,Store_ID,User_ID,Item_ID,Unit_sales,Category,Size,Organic,Brand,Mild,Flavor,Week
2,2018-03-28 06:00:00,1,20,Yo005,2,Yogurt,Medium,Organic,MainStream,Non-mild,Special,13
4,2018-03-24 06:00:00,1,15,Yo007,2,Yogurt,Large,Non-organic,MainStream,Non-mild,MainStream,12
5,2018-03-14 06:00:00,1,1,Yo005,3,Yogurt,Medium,Organic,MainStream,Non-mild,Special,11
8,2018-03-07 06:00:00,2,8,Yo007,1,Yogurt,Large,Non-organic,MainStream,Non-mild,MainStream,10
9,2018-03-17 06:00:00,1,3,Yo005,5,Yogurt,Medium,Organic,MainStream,Non-mild,Special,11
10,2018-03-28 06:00:00,1,4,Yo009,3,Yogurt,Large,Non-organic,MainStream,Non-mild,Non-flavored,13
11,2018-03-21 06:00:00,1,1,Yo002,1,Yogurt,Small,Non-organic,MainStream,Non-mild,Special,12
16,2018-03-17 06:00:00,2,7,Yo009,3,Yogurt,Large,Non-organic,MainStream,Non-mild,Non-flavored,11
21,2018-03-24 06:00:00,1,4,Yo007,4,Yogurt,Large,Non-organic,MainStream,Non-mild,MainStream,12
24,2018-03-21 06:00:00,1,16,Yo007,3,Yogurt,Large,Non-organic,MainStream,Non-mild,MainStream,12


In [19]:
# Item-attribute similarity of the remaining attributes inside the Non-mild sub-section.
for attribute in ("Size", "Brand", "Organic", "Flavor"):
    print(attribute + " :", attribute_item_sim(attribute, df_non_mild))

Size : 0.5052202305226957
Brand : 1.0
Organic : 0.4352731405679618
Flavor : 0.46243958567570204


In [20]:
# The author selects Size as the significant attribute of the Non-mild
# sub-section (largest item-attribute similarity among the attributes that
# still vary; Brand prints 1.0 above).
# Total units sold per Size value:
df_non_mild['Unit_sales'].groupby(df_non_mild['Size']).sum()

Size
Large     79
Medium    34
Small     19
Name: Unit_sales, dtype: int64

In [21]:
# Sub-section Non-mild/Small: every displayed row is the same item,
# so this sub-section is a leaf node.
df_non_mild_small = df_non_mild[df_non_mild['Size'].eq("Small")]
df_non_mild_small

Unnamed: 0,Date_Time,Store_ID,User_ID,Item_ID,Unit_sales,Category,Size,Organic,Brand,Mild,Flavor,Week
11,2018-03-21 06:00:00,1,1,Yo002,1,Yogurt,Small,Non-organic,MainStream,Non-mild,Special,12
42,2018-03-28 06:00:00,1,4,Yo002,3,Yogurt,Small,Non-organic,MainStream,Non-mild,Special,13
43,2018-03-07 06:00:00,2,3,Yo002,4,Yogurt,Small,Non-organic,MainStream,Non-mild,Special,10
53,2018-03-28 06:00:00,1,4,Yo002,1,Yogurt,Small,Non-organic,MainStream,Non-mild,Special,13
74,2018-03-31 06:00:00,1,17,Yo002,2,Yogurt,Small,Non-organic,MainStream,Non-mild,Special,13
82,2018-03-24 06:00:00,1,19,Yo002,2,Yogurt,Small,Non-organic,MainStream,Non-mild,Special,12
91,2018-03-07 06:00:00,1,20,Yo002,5,Yogurt,Small,Non-organic,MainStream,Non-mild,Special,10
112,2018-03-10 06:00:00,1,15,Yo002,1,Yogurt,Small,Non-organic,MainStream,Non-mild,Special,10


In [22]:
# Sub-section Non-mild/Medium: every displayed row is the same item,
# so this sub-section is a leaf node.
df_non_mild_medium = df_non_mild[df_non_mild['Size'].eq("Medium")]
df_non_mild_medium

Unnamed: 0,Date_Time,Store_ID,User_ID,Item_ID,Unit_sales,Category,Size,Organic,Brand,Mild,Flavor,Week
2,2018-03-28 06:00:00,1,20,Yo005,2,Yogurt,Medium,Organic,MainStream,Non-mild,Special,13
5,2018-03-14 06:00:00,1,1,Yo005,3,Yogurt,Medium,Organic,MainStream,Non-mild,Special,11
9,2018-03-17 06:00:00,1,3,Yo005,5,Yogurt,Medium,Organic,MainStream,Non-mild,Special,11
49,2018-03-31 06:00:00,1,17,Yo005,3,Yogurt,Medium,Organic,MainStream,Non-mild,Special,13
60,2018-03-21 06:00:00,1,17,Yo005,2,Yogurt,Medium,Organic,MainStream,Non-mild,Special,12
65,2018-03-24 06:00:00,1,19,Yo005,1,Yogurt,Medium,Organic,MainStream,Non-mild,Special,12
69,2018-03-31 06:00:00,1,9,Yo005,5,Yogurt,Medium,Organic,MainStream,Non-mild,Special,13
73,2018-03-21 06:00:00,1,10,Yo005,5,Yogurt,Medium,Organic,MainStream,Non-mild,Special,12
96,2018-03-07 06:00:00,1,17,Yo005,3,Yogurt,Medium,Organic,MainStream,Non-mild,Special,10
117,2018-03-28 06:00:00,1,6,Yo005,5,Yogurt,Medium,Organic,MainStream,Non-mild,Special,13


In [23]:
# Sub-section Non-mild/Large, treated as a leaf node (end of the tree).
# NOTE(review): the original note says every item here is the same, but the
# displayed rows contain two items (Yo007 and Yo009) that differ in Flavor;
# the leaf status presumably rests on the similarities computed in the
# following cell -- confirm.
df_non_mild_large = df_non_mild[df_non_mild['Size'].eq("Large")]
df_non_mild_large

Unnamed: 0,Date_Time,Store_ID,User_ID,Item_ID,Unit_sales,Category,Size,Organic,Brand,Mild,Flavor,Week
4,2018-03-24 06:00:00,1,15,Yo007,2,Yogurt,Large,Non-organic,MainStream,Non-mild,MainStream,12
8,2018-03-07 06:00:00,2,8,Yo007,1,Yogurt,Large,Non-organic,MainStream,Non-mild,MainStream,10
10,2018-03-28 06:00:00,1,4,Yo009,3,Yogurt,Large,Non-organic,MainStream,Non-mild,Non-flavored,13
16,2018-03-17 06:00:00,2,7,Yo009,3,Yogurt,Large,Non-organic,MainStream,Non-mild,Non-flavored,11
21,2018-03-24 06:00:00,1,4,Yo007,4,Yogurt,Large,Non-organic,MainStream,Non-mild,MainStream,12
24,2018-03-21 06:00:00,1,16,Yo007,3,Yogurt,Large,Non-organic,MainStream,Non-mild,MainStream,12
44,2018-03-31 06:00:00,1,19,Yo007,3,Yogurt,Large,Non-organic,MainStream,Non-mild,MainStream,13
47,2018-03-10 06:00:00,1,1,Yo007,5,Yogurt,Large,Non-organic,MainStream,Non-mild,MainStream,10
48,2018-03-17 06:00:00,1,1,Yo009,4,Yogurt,Large,Non-organic,MainStream,Non-mild,Non-flavored,11
52,2018-03-10 06:00:00,1,5,Yo007,5,Yogurt,Large,Non-organic,MainStream,Non-mild,MainStream,10


In [24]:
# Item-attribute similarity of the remaining attributes inside Non-mild/Large.
for attribute in ("Brand", "Organic", "Flavor"):
    print(attribute + " :", attribute_item_sim(attribute, df_non_mild_large))

Brand : 1.0
Organic : 1.0
Flavor : 0.9999999999999999
