In [None]:
import pandas as pd
import numpy as np
import joblib
import ast
from joblib import Parallel, delayed
import json
import os
from tqdm import tqdm

In [None]:
# Load the raw wine table from CSV; every later cell transforms this `item_data` frame.
item_data = pd.read_csv('/opt/ml/wine/data/wine_df.csv')

In [None]:
def drop_columns(df):
    """Drop the raw note-group columns and stray ``Unnamed``/``None_child`` columns.

    Columns listed below that are not present in ``df`` are skipped; one
    message per missing column is printed so the notebook output still shows
    what was absent.

    Args:
        df: DataFrame to clean. It is modified in place.

    Returns:
        The same ``df`` object, without the listed columns.
    """
    to_drop = ['Red Fruit', 'Tropical', 'Tree Fruit', 'Oaky',
               'Ageing', 'Black Fruit', 'Citrus', 'Dried Fruit', 'Earthy',
               'Floral', 'Microbio', 'Spices', 'Vegetal',
               'Unnamed: 58', 'None_child', 'Unnamed: 60', 'Unnamed: 61',
               'Unnamed: 62', 'Unnamed: 63', 'Unnamed: 64']

    present = []
    for c in to_drop:
        if c in df.columns:
            present.append(c)
        else:
            # Explicit membership test instead of a blanket `except Exception`,
            # so unrelated errors are no longer swallowed.
            print(f"['{c}'] not found in axis")

    # One drop call instead of one in-place drop per column.
    df.drop(columns=present, inplace=True)

    return df

def fill_na(df, meta_dir='/opt/ml/wine/code/data/meta_data'):
    """Fill missing values using the column lists stored as JSON metadata.

    Three JSON files are read from ``meta_dir``; each holds a list of column
    names. Missing values in those columns are replaced as follows:

    * ``string_columns.json`` -> ``''``
    * ``dict_columns.json``   -> ``'{}'``
    * ``list_columns.json``   -> ``'[]'``

    Args:
        df: DataFrame to fill. It is modified in place.
        meta_dir: Directory containing the three JSON files. Defaults to the
            project location used by this notebook.

    Returns:
        The same ``df`` object with the listed columns filled.

    Raises:
        FileNotFoundError: If one of the JSON files is missing.
        KeyError: If a listed column does not exist in ``df``.
    """
    fill_values = (
        ('string_columns.json', ''),
        ('dict_columns.json', '{}'),
        ('list_columns.json', '[]'),
    )

    for filename, fill_value in fill_values:
        with open(os.path.join(meta_dir, filename), 'r', encoding='utf-8') as f:
            col = json.load(f)
        if not col:
            # Nothing to fill for this kind of column.
            continue
        df[col] = df[col].fillna(fill_value)

    return df

In [58]:
# Fill missing values first, then remove the columns listed in `drop_columns`.
# The recorded output below shows that 'None_child' was not present in this data.
item_data = fill_na(item_data)
item_data = drop_columns(item_data)

"['None_child'] not found in axis"


In [59]:
# Names of the 13 tasting-note groups (the same list is repeated inside `expand_notes`).
notes = [
    'Red Fruit',
    'Tropical',
    'Tree Fruit',
    'Oaky',
    'Ageing',
    'Black Fruit',
    'Citrus',
    'Dried Fruit',
    'Earthy',
    'Floral',
    'Microbio',
    'Spices',
    'Vegetal',
]


In [60]:
def str2list(x):
    """Convert a cell value into a Python list.

    Args:
        x: Usually a string. A string starting with ``'['`` is parsed as a
            Python literal; any other non-empty string becomes a one-element
            list; an empty string becomes ``[]``.

    Returns:
        A list. A value that already is a list is returned unchanged, so
        applying this function twice (e.g. re-running a notebook cell) does
        not nest the list. ``None`` and NaN yield ``[]``.

    Raises:
        ValueError, SyntaxError: If a string starting with ``'['`` is not a
            valid Python literal.
    """
    if isinstance(x, list):
        # Already parsed: returning it as-is keeps the conversion idempotent.
        return x
    if x is None or (isinstance(x, float) and x != x):
        # Missing value (None / NaN) -> no entries.
        return []
    if len(x) == 0:
        return []
    if x[0] != '[':
        # Plain scalar string: wrap it so callers always get a list.
        return [x]
    # literal_eval only evaluates Python literals, never arbitrary code.
    return ast.literal_eval(x)

def feature_mapper(df, column, out_dir='/opt/ml/wine/code/feature_map/'):
    """Build and persist value<->index mappings for a single-valued column.

    Indices follow the order of first appearance of each distinct value in
    ``df[column]``.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the column to index.
        out_dir: Directory that receives ``{column}2idx.json`` and
            ``idx2{column}.json``. It is created if it does not exist.

    Returns:
        Tuple ``(feature2idx, idx2feature)`` of dictionaries.
    """
    # Convert NumPy scalars (e.g. int64) to plain Python values; json cannot
    # serialise NumPy integers as keys or values.
    unique_val = [v.item() if isinstance(v, np.generic) else v
                  for v in df[column].unique()]
    feature2idx = {f: i for i, f in enumerate(unique_val)}
    idx2feature = {i: f for i, f in enumerate(unique_val)}

    # Create the directory that is actually written to.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(os.path.join(out_dir, f'{column}2idx.json'), 'w', encoding='utf-8') as f:
        json.dump(feature2idx, f, ensure_ascii=False)
    with open(os.path.join(out_dir, f'idx2{column}.json'), 'w', encoding='utf-8') as f:
        json.dump(idx2feature, f, ensure_ascii=False)

    return feature2idx, idx2feature

def list_feature_mapper(df, column, out_dir='/opt/ml/wine/code/feature_map/'):
    """Parse a list-valued column and persist item<->index mappings.

    ``df[column]`` is converted in place so that every cell holds a Python
    list (cells that already are lists are left untouched, which makes the
    function safe to re-run). The vocabulary is the set of distinct list
    items, indexed in order of first appearance.

    Args:
        df: DataFrame containing ``column``. It is modified in place.
        column: Name of the list-valued column.
        out_dir: Directory that receives ``{column}2idx.json`` and
            ``idx2{column}.json``. It is created if it does not exist.

    Returns:
        Tuple ``(df, feature2idx, idx2feature)``.
    """
    df[column] = df[column].apply(
        lambda x: x if isinstance(x, list) else str2list(x)
    )

    # Use the frame that was passed in, not a notebook-level global.
    exploded = df[column].explode()
    # Empty lists explode to NaN; those are not real items, so drop them.
    # `unique()` keeps first-appearance order, giving reproducible indices.
    unique_val = list(exploded.dropna().unique())

    feature2idx = {f: i for i, f in enumerate(unique_val)}
    idx2feature = {i: f for i, f in enumerate(unique_val)}

    # Create the directory that is actually written to.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(os.path.join(out_dir, f'{column}2idx.json'), 'w', encoding='utf-8') as f:
        json.dump(feature2idx, f, ensure_ascii=False)
    with open(os.path.join(out_dir, f'idx2{column}.json'), 'w', encoding='utf-8') as f:
        json.dump(idx2feature, f, ensure_ascii=False)

    return df, feature2idx, idx2feature

def map_all_single_features(df):
    """Write the index mappings for every single-valued categorical column.

    Calls ``feature_mapper`` once per column; the mappings it returns are
    discarded and only the JSON files it writes are kept.

    Args:
        df: DataFrame holding the categorical columns.

    Returns:
        None.
    """
    for column_name in ('country', 'region', 'winery', 'winetype',
                        'vintage', 'house', 'wine_style'):
        feature_mapper(df, column_name)

def map_all_list_features(df):
    """Parse every list-valued column and write its index mappings.

    Runs ``list_feature_mapper`` for ``grape`` and then ``pairing``, keeping
    only the frame it returns.

    Args:
        df: DataFrame holding the list-valued columns.

    Returns:
        The frame returned by the last ``list_feature_mapper`` call.
    """
    for column_name in ('grape', 'pairing'):
        mapped = list_feature_mapper(df, column_name)
        df = mapped[0]
    return df


def note_mapper(df, note_col, out_dir='/opt/ml/wine/code/feature_map/'):
    """Parse one note-group column and persist note<->index mappings.

    The column read is ``note_col`` with ``'_child'`` appended and spaces
    replaced by underscores (e.g. ``'Red Fruit'`` -> ``'Red_Fruit_child'``).
    String cells are parsed into dictionaries in place; cells that are not
    strings are left as they are, so the function can be re-run safely.
    Indices follow the order in which keys are first seen.

    Args:
        df: DataFrame containing the ``*_child`` column. It is modified in place.
        note_col: Note-group name, e.g. ``'Red Fruit'``.
        out_dir: Directory that receives ``{note_col}2idx.json`` and
            ``idx2{note_col}.json``. It is created if it does not exist.

    Returns:
        Tuple ``(feature2idx, idx2feature)`` of dictionaries.

    Raises:
        ValueError, SyntaxError: If a string cell is not a valid Python literal.
    """
    note = note_col
    child_col = (note_col + '_child').replace(' ', '_')

    # Parse only string cells. A malformed cell now raises here instead of
    # being printed and failing later on `.keys()`.
    df[child_col] = df[child_col].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x
    )

    # dict.fromkeys de-duplicates while keeping first-seen order, so indices
    # (and the column order derived from them) are the same on every run.
    unique_val = list(dict.fromkeys(
        key for note_dic in df[child_col] for key in note_dic.keys()
    ))

    feature2idx = {f: i for i, f in enumerate(unique_val)}
    idx2feature = {i: f for i, f in enumerate(unique_val)}

    # Create the directory that is actually written to.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(os.path.join(out_dir, f'{note}2idx.json'), 'w', encoding='utf-8') as f:
        json.dump(feature2idx, f, ensure_ascii=False)
    with open(os.path.join(out_dir, f'idx2{note}.json'), 'w', encoding='utf-8') as f:
        json.dump(idx2feature, f, ensure_ascii=False)

    return feature2idx, idx2feature

def expand_notes(df):
    """Expand each ``*_child`` note dictionary into one column per note.

    For every note group, ``note_mapper`` supplies the note->index mapping.
    Each row's dictionary is then written into a dense row (0 where the note
    is absent), and the resulting columns are appended to the frame. The
    original ``*_child`` columns are removed.

    Args:
        df: DataFrame with one ``<Group>_child`` column per note group. The
            ``*_child`` columns are parsed and then dropped in place.

    Returns:
        A new DataFrame: the remaining columns of ``df`` followed by the
        expanded note columns, in note-group order.
    """
    notes = ['Red Fruit', 'Tropical', 'Tree Fruit', 'Oaky',
             'Ageing', 'Black Fruit', 'Citrus', 'Dried Fruit', 'Earthy',
             'Floral', 'Microbio', 'Spices', 'Vegetal']

    note_frames = []
    child_cols = []

    for note_col in tqdm(notes):
        child_col = note_col.replace(' ', '_') + '_child'

        feature2idx, idx2feature = note_mapper(df, note_col)

        rows = []
        for note_dic in tqdm(df[child_col]):
            row_data = [0] * len(feature2idx)
            for note, value in note_dic.items():
                row_data[feature2idx[note]] = value
            rows.append(row_data)

        columns = [idx2feature[i] for i in range(len(idx2feature))]
        # Reuse df's index so the final concat aligns row-for-row even when
        # df does not have a default 0..n-1 index.
        note_frames.append(pd.DataFrame(rows, columns=columns, index=df.index))
        child_cols.append(child_col)

    # NOTE(review): a note name that occurs in two groups yields two columns
    # with the same label — confirm whether downstream code relies on that.
    df.drop(columns=child_cols, inplace=True)

    # Single concat instead of growing the result inside the loop.
    return pd.concat([df] + note_frames, axis=1)
  

In [61]:
# Run the encoding pipeline in order:
#   1. write index mappings for the single-valued categorical columns,
#   2. parse the list-valued columns and write their mappings,
#   3. expand the note dictionaries into one column per note.
map_all_single_features(item_data)
item_data = map_all_list_features(item_data)
item_data = expand_notes(item_data)

100%|██████████| 77834/77834 [00:00<00:00, 123438.43it/s]
100%|██████████| 77834/77834 [00:00<00:00, 440574.99it/s]
100%|██████████| 77834/77834 [00:00<00:00, 277462.38it/s]
100%|██████████| 77834/77834 [00:00<00:00, 102804.04it/s]
100%|██████████| 77834/77834 [00:00<00:00, 287393.99it/s]
100%|██████████| 77834/77834 [00:00<00:00, 249694.79it/s]
100%|██████████| 77834/77834 [00:00<00:00, 144319.22it/s]
100%|██████████| 77834/77834 [00:00<00:00, 436518.41it/s]
100%|██████████| 77834/77834 [00:00<00:00, 111863.32it/s]
100%|██████████| 77834/77834 [00:00<00:00, 298654.98it/s]
100%|██████████| 77834/77834 [00:00<00:00, 153797.63it/s]
100%|██████████| 77834/77834 [00:00<00:00, 160188.55it/s]
100%|██████████| 77834/77834 [00:00<00:00, 312132.57it/s]
100%|██████████| 13/13 [00:54<00:00,  4.18s/it]


In [66]:
# Save the processed table as UTF-8 with a BOM ('utf-8-sig').
# NOTE(review): `index` is left at its default, so the row index is also written
# as the first column — confirm that downstream readers expect it.
item_data.to_csv('/opt/ml/wine/data/item_data.csv', encoding='utf-8-sig')