In [1]:
import pandas as pd
import json
from tqdm import tqdm
import ast
import numpy as np
from collections import deque
from pandas import DataFrame

In [2]:
def str2list(x):
    if len(x) > 0:
        if x[0] != '[':
            list = [x]
        else: 
            list = ast.literal_eval(x)
            if len(list) == 0: list = ['Empty']
    else: list = ['Empty']

    return list

In [None]:
basic_info = pd.read_csv('/opt/ml/wine/data/basic_info_total.csv', encoding='utf-8-sig')
with open('/opt/ml/wine/code/feature_map/item2idx.json','r') as f:
    item2idx = json.load(f)

In [None]:
item_data = pd.read_csv('/opt/ml/wine/server/data/item_df_allfeature.csv', encoding='utf-8-sig')

In [None]:
wine_df = pd.read_csv('/opt/ml/wine/data/wine_df.csv', encoding='utf-8-sig')

In [None]:
wine_df['grape'].fillna('[]', inplace = True)
wine_df['grape'] = wine_df['grape'].apply(lambda x: str2list(x))

In [None]:
wine_df['item_id'] = wine_df['url'].map(item2idx)
wine_df = wine_df[wine_df['item_id'].isna()==False]
wine_df['item_id'] = wine_df['item_id'].astype(int).astype('category')

In [None]:
basic_info['item_id'] = basic_info['url'].map(item2idx)
basic_info = basic_info[basic_info['item_id'].isna()==False]
basic_info['item_id'] = basic_info['item_id'].astype(int).astype('category')

In [None]:
basic_info.set_index('item_id', inplace= True)
item_data.set_index('item_id', inplace= True)

In [None]:
no_key = 0
for item_id in tqdm(item_data.index):
    try:
        item_data.loc[item_id,'winery'] = basic_info.loc[item_id,'winery']
        item_data.loc[item_id,'grape'] = basic_info.loc[item_id,'grapes']
        item_data.loc[item_id,'country'] = basic_info.loc[item_id,'country']
        item_data.loc[item_id,'region'] = basic_info.loc[item_id,'region1']
        item_data.loc[item_id,'region2'] = basic_info.loc[item_id,'region2']
        item_data.loc[item_id,'region3'] = basic_info.loc[item_id,'region3']
        item_data.loc[item_id,'region4'] = basic_info.loc[item_id,'region4']
        item_data.loc[item_id,'wine_style'] = basic_info.loc[item_id,'wine_style']
        item_data.loc[item_id,'allergens'] = basic_info.loc[item_id,'allergens']
        item_data.loc[item_id,'alcohol content'] = basic_info.loc[item_id,'alcohol content']
    except KeyError:
        try:
            item_data.loc[item_id,'grape'] = wine_df.loc[item_id,'grape']
        except:
            no_key +=1
print(no_key)

In [None]:
item_data.rename(columns = {'region':'region1'}, inplace= True)

In [None]:
item_data['item_id'] = item_data.index

In [None]:
item_data.to_csv('/opt/ml/wine/data/item_df_allfeature.csv',index = False, encoding='utf-8-sig')

In [None]:
item_data.vectors

In [None]:
with open('/opt/ml/wine/data/sample_vectors.json','r') as f: 
    sample_vec = json.load(f)

In [None]:
def get_item_vector(df, vector_path):
    with open(vector_path,'r') as f: 
        vectors = json.load(f)
    vector_list = []

    for id in tqdm(item_data.item_id):
        id = str(id)
        if id in vectors.keys():
            vector_list.append(np.array(vectors[id]))
        else:
            vector_list.append(None)
    df['vectors'] = vector_list
    return df


In [None]:
import re

def keep_only_english(text):
    try:
        english_text = re.sub(r'[^a-zA-Z\s]', '', text)
        return english_text.lower().strip()
    except: return None

In [None]:
def find_vectors(columns_name : deque, grouped_vectors: DataFrame):
    column, name = columns_name.popleft()
    if name is not None:
        grouped_vectors = grouped_vectors.query(f"{column} == '{name}'")
        if columns_name:
            vector = find_vectors(columns_name, grouped_vectors.drop(column,axis = 1))
        else: 
            vector = grouped_vectors.vectors.mean()
            
        return vector
    else:
        return grouped_vectors.vectors.mean()
    

In [None]:
def fill_vectors(df : DataFrame, vector_path: str, )
    df = get_item_vector(df, vector_path)

    for col in ['country','region1', 'winetype', 'wine_style']:
        df[col] = df[col].apply(keep_only_english)
    
    vector_item = df[df.vectors.isna()==False]

    grouped_vectors = vector_item.groupby([
        'country',
        'region1', 
        'winetype',
        'wine_style'
    ]).agg({'vectors': 'mean'}).reset_index()

    non_vectors = item_data[df['vectors'].isna()==True]
    non_vectors_cols = list(grouped_vectors.columns)
    non_vectors_cols.append('item_id')
    non_vectors = non_vectors.loc[:, non_vectors_cols]

    vectors = []
    for index, row in tqdm(non_vectors.iterrows()):
        columns_name = deque()

        for column, name in zip(row.keys(), row.values):
            if column != 'vectors':
                columns_name.append((column, name))
            else: break
        vector = find_vectors(columns_name, grouped_vectors)
        vectors.append(vector)
    
    non_vectors['vectors'] = vectors
    not_filled = non_vectors[non_vectors['vectors'].isna()==True]
    filled = non_vectors[non_vectors['vectors'].isna()==False]
    mean_vector = filled['vectors'].mean()
    
    not_filled['vectors'] = [ mean_vector for _ in range(len(not_filled))]
    filled_total = pd.concat([filled, not_filled], axis=0)

    no_vectors = item_data[item_data['vectors'].isna()==True]
    no_vectors.drop('vectors',axis = 1, inplace = True)
    yes_vectors = item_data[item_data['vectors'].isna()==False]

    no_vectors.reset_index(drop = True, inplace = True)
    filled_total.reset_index(drop = True, inplace = True)
    no_vectors = no_vectors.sort_values(by='item_id', ascending=False)
    filled_total = filled_total.sort_values(by='item_id', ascending=False)
    no_vectors.set_index('item_id', inplace = True)
    no_vectors['item_id'] = no_vectors.index
    yes_vectors['item_id'] = yes_vectors.index

    filled_total.set_index('item_id', inplace = True)
    filled_total['item_id'] = filled_total.index

    no_vectors['vectors'] = filled_total['vectors']
    no_vectors.reset_index(drop = True, inplace = True)
    yes_vectors.reset_index(drop = True, inplace = True)

    item_data_with_vectors = pd.concat([no_vectors,yes_vectors], axis=0)

    item_data_with_vectors = item_data_with_vectors.sort_values(by='item_id', ascending=True).reset_index(drop = True)
    return item
d



In [None]:
item_data_with_vectors = item_data_with_vectors.sort_values(by='item_id', ascending=True).reset_index(drop = True)


In [None]:
item_data_with_vectors.to_csv('/opt/ml/wine/data/item_data_sample_vec.csv', index = False, encoding = 'utf-8-sig')

In [3]:
item_data_with_vectors = pd.read_csv('/opt/ml/wine/data/item_data_sample_vec.csv', encoding = 'utf-8-sig')

In [4]:
def string2array(x):
    x = x.replace('\n', '').strip('[]')
    x_list = [float(i) for i in x.split(' ') if len(i) != 0]
    return np.array(x_list)

In [5]:
item_data_with_vectors['vectors'] = item_data_with_vectors['vectors'].apply(string2array)

In [6]:
wine_vectors = []
for vector in item_data_with_vectors['vectors']:
    wine_vectors.append(vector)
wine_vectors = np.array(wine_vectors)

In [41]:
item_to_fill = item_data_with_vectors.iloc[83,:]


In [16]:
item_vector = item_to_fill.vectors
None_col = list(item_to_fill.index[item_to_fill.isna()])

item_data_with_vectors.set_index('item_id', inplace=True)
item_data_with_vectors['item_id'] = item_data_with_vectors.index
item_ids = list(item_data_with_vectors['item_id'])
vector_dimension = item_vector.shape[0]

index = faiss.IndexFlatL2(vector_dimension)
index = faiss.IndexIDMap2(index)
index.add_with_ids(wine_vectors, item_ids)



In [32]:
# Faiss expects the query vectors to be normalized
to_search = np.expand_dims(item_vector, axis=0)
to_search = np.ascontiguousarray(to_search.astype(np.float32))
faiss.normalize_L2(to_search)

k = index.ntotal
distances, searched_wine_ids = index.search(to_search, k=30)




In [39]:
result = []
for ids, dists in zip(searched_wine_ids[0], distances[0]): 
    result.append((ids, dists))

sim_items = item_data_with_vectors.loc[[x[0] for x in result], :]
sim_items['distance'] = 0

for id, dist in result: sim_items.loc[id, 'distance'] = 1/dist

In [43]:
None_col = ['wine_style','grape']

In [72]:
def count_pairing(data : pd.DataFrame):
    dict = defaultdict(float)
    for pairings, dist in zip(data.pairing, data.distance):
        for menu in pairings.split(' '):
            dict[menu] += dist
    return max(dict , key=lambda k: dict[k])

In [101]:
def count_grape(data : pd.DataFrame):
    dict = defaultdict(float)
    for grapes, dist in zip(data.grape, data.distance):
        try: grapes = ast.literal_eval(grapes)
        except: pass

        for grape in grapes:
            dict[grape.lower()] += dist

    return [max(dict , key=lambda k: dict[k])]

def count_most(data : pd.DataFrame, column):
    dict = defaultdict(float)
    for feat, dist in zip(data[column], data.distance):
        dict[feat] += dist
    return max(dict , key=lambda k: dict[k])


In [102]:
from collections import defaultdict
def most_close(sim_items : DataFrame):
    result = {}
    for col in sim_items.columns:
        if col == 'pairing':
            result[col] = count_pairing(sim_items)
        elif col == 'grape':
            result[col] = count_grape(sim_items)
        else:
            result[col] = count_most(sim_items, col)
    return result


In [103]:
import faiss
from pandas import Series

wine_vectors = []
for vector in item_data_with_vectors['vectors']:
    wine_vectors.append(vector)
wine_vectors = np.array(wine_vectors)


def find_most_sim_item(df : DataFrame, to_fill_item_id: int, wine_vectors : np.array):

    ###index should be wine_id/item_id
    item_to_fill = item_data_with_vectors.loc[to_fill_item_id,:]

    item_vector = item_to_fill.vectors
    None_col = list(item_to_fill.index[item_to_fill.isna()])
    
    None_col.append('distance')

    item_data_with_vectors.set_index('item_id', inplace=True)
    item_data_with_vectors['item_id'] = item_data_with_vectors.index
    item_ids = list(item_data_with_vectors['item_id'])
    vector_dimension = item_vector.shape[0]

    index = faiss.IndexFlatL2(vector_dimension)
    index = faiss.IndexIDMap2(index)
    index.add_with_ids(wine_vectors, item_ids)

    # Faiss expects the query vectors to be normalized
    to_search = np.expand_dims(item_vector, axis=0)
    to_search = np.ascontiguousarray(to_search.astype(np.float32))
    faiss.normalize_L2(to_search)

    k = index.ntotal
    distances, searched_wine_ids = index.search(to_search, k=20)

    result = []
    for ids, dists in zip(searched_wine_ids[0], distances[0]): 
        result.append((ids, dists))

    sim_items = item_data_with_vectors.loc[[x[0] for x in result], :]
    sim_items['distance'] = 0
    
    for id, dist in result: sim_items.loc[id, 'distance'] = 1/dist

    sim_items = sim_items.loc[:, None_col]
    to_fill = most_close(sim_items)
    
    for col, val in to_fill.items():
        item_data_with_vectors.loc[to_fill_item_id, col] = val

    return item_data_with_vectors
    

In [97]:
item_to_fill = item_data_with_vectors.loc[0,:]

item_vector = item_to_fill.vectors
None_col = list(item_to_fill.index[item_to_fill.isna()])

In [104]:
find_most_sim_item(item_data_with_vectors, 0 , wine_vectors)

Unnamed: 0_level_0,country,region1,winery,winetype,grape,name,vintage,house,price,wine_rating,...,Soft,Acidic,Fizzy,Gentle,region2,region3,region4,vectors,distance,item_id
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,france,bordeaux,château tour de biot,redwine,"['merlot', 'cabernet_sauvignon', 'cabernet_fra...",Bordeaux 2018,2018.0,Château_Tour_de_Biot,18.99,3.4,...,0.00,0.00,0.00,0.00,Piemonte,Barbaresco,,"[0.516273478, 0.692428665, 0.381556366, 0.0450...",0.005957,0
1,italy,northernitaly,pio cesare,sparklingwine,['Moscato Bianco'],Moscato d'Asti 2020,2020.0,Pio_Cesare,21.99,4.2,...,0.00,0.00,0.00,0.00,Piemonte,Moscato_d'Asti,,"[0.39246925, 0.55693234, 0.41948521, 0.4692937...",,1
2,france,beaujolais,g. descombes,redwine,['Gamay'],Régnié 2020,2020.0,G._Descombes,31.99,4.0,...,0.00,0.00,0.00,0.00,Régnié,,,"[0.45079248, 0.51020185, 0.48914464, 0.4918099...",,2
3,unitedstates,california,ava grace,roswine,['Pinot Gris'],Rosé 2019,2019.0,AVA_Grace,8.99,3.6,...,0.00,0.00,0.00,0.00,,,,"[0.51873369, 0.42762921, 0.48912925, 0.5379725...",,3
4,france,champagne,Louis_Dumont,sparklingwine,Chardonnay PinotNoir PinotMeunier,Brut Rosé Champagne N.V.,,Louis_Dumont,,3.7,...,5.88,94.12,94.12,5.88,,,,"[0.49408021, 0.52453716, 0.4605697, 0.47825829...",,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74917,spain,jerezxrssherry,Lustau,fortifiedwine,['Cabernet Sauvignon'],La Ina Fino N.V.,,Lustau,,3.5,...,0.00,0.00,0.00,0.00,,,,"[0.49900747, 0.50112946, 0.50321889, 0.5041905...",,74917
74918,spain,andaluca,lustau,redwine,['Nebbiolo'],Barbaresco Masseria 2016,2016.0,Vietti,208.33,4.1,...,33.04,66.96,0.00,0.00,Jerez-Xérès-Sherry,,,"[0.49900747, 0.50112946, 0.50321889, 0.5041905...",,74918
74919,italy,northernitaly,vietti,redwine,['nebbiolo'],Gevrey Chambertin Premier Cru Bel Air 2014,2014.0,Domaine_Taupenot-Merme,333.00,4.3,...,0.00,0.00,0.00,0.00,Piemonte,Barbaresco,,"[0.48767044, 0.48793782, 0.51979431, 0.5030191...",,74919
74920,france,bourgogne,domaine taupenot-merme,redwine,['Cabernet Franc'],Morey-Saint-Denis 2019,2019.0,Domaine_des_Lambrays,160.00,4.4,...,0.00,0.00,0.00,0.00,Côte_de_Nuits,Gevrey-Chambertin,Gevrey-Chambertin_Premier_Cru,"[0.41090245, 0.32463001, 0.55826734, 0.8391787...",,74920
