# In [24]:
import pickle

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
    
def create_lookup(df, column, save=None):
    """Map every distinct item of a list-valued column to an integer id.

    Args:
        df: DataFrame that contains ``column``.
        column: Name of a column whose cells are list-likes of hashable items.
        save: Optional file path. When truthy, the mapping is also pickled
            to that path.

    Returns:
        A dict mapping each distinct item to its 0-based position in the
        sorted sequence of distinct items. Missing values (including the
        NaN that an empty list explodes to) are not given an id.
    """
    # Explode only the requested column: exploding the whole frame would
    # repeat every other column once per list element for no benefit.
    items = df[column].explode()

    # factorize(sort=True)[1] is the sorted collection of distinct values.
    uniques = pd.factorize(items, sort=True)[1]
    lookup = {item: idx for idx, item in enumerate(uniques)}

    if save:
        with open(save, 'wb') as f:
            pickle.dump(lookup, f)

    return lookup

if __name__ == '__main__':
    # Combine the three raw splits into one integer-encoded dataset.
    train = pd.read_json('../data/train.json')
    val = pd.read_json('../data/val.json')
    test = pd.read_json('../data/test.json')

    # Hold out 20% of the training rows. The split is seeded so the same
    # rows are held out on every run.
    _, val_index = train_test_split(train.index, test_size=0.2, random_state=1)

    # 'dataset' tags where each row came from:
    #   0 = training rows kept for training, 1 = held-out training rows,
    #   2 = rows from val.json,              3 = rows from test.json.
    # Filling the whole column first keeps it integer-typed; creating the
    # column through a partial .loc assignment would make it float (NaN fill).
    train['dataset'] = 0
    train.loc[val_index, 'dataset'] = 1
    val['dataset'] = 2
    test['dataset'] = 3

    full = pd.concat([train, val, test], axis=0, ignore_index=True)
    del train, val, test

    # Select the held-out rows by their tag rather than by labels taken from
    # train.index: concat(ignore_index=True) renumbers the rows, so the old
    # labels only line up when train had a default 0..n-1 index.
    held_out = full['dataset'] == 1

    # Seeded generator so the masking below is reproducible, like the split.
    rng = np.random.RandomState(1)

    # Keep a random half (without replacement) of each held-out row's songs ...
    full.loc[held_out, 'songs'] = full.loc[held_out, 'songs'].apply(
        lambda songs: rng.choice(songs, size=int(len(songs) * 0.5), replace=False)
    )
    # ... then, with probability 0.2 per row, drop the remaining songs entirely.
    full.loc[held_out, 'songs'] = full.loc[held_out, 'songs'].apply(
        lambda songs: songs if rng.choice([0, 1], p=[0.2, 0.8]) == 1 else []
    )

    # Replace raw song/tag values by their integer ids and persist each mapping.
    for column in ['songs', 'tags']:
        lookup = create_lookup(full, column, save=f'../output/lookup_{column}_0620.pickle')
        full[column] = full[column].apply(lambda values: [lookup[v] for v in values])

    full.to_json('../output/full(0620).json')