# Create train and validation JSONL files for fine-tuning OpenAI models

In [1]:
import pandas as pd
import numpy as np 
from typing import Any
import pickle
import random
import json

from main.openai.prompt_model import create_prompt
from main.data.session_dataset import *


In [2]:
# Name of the dataset directory. The notebook reads `dataset.pickle` and
# `products_lookup.csv` from this folder and reuses the same value as a
# suffix for the JSONL files it writes.
WORKING_DIR = "beauty"
# NOTE(review): TOP_K is not referenced by any cell visible in this notebook —
# confirm whether it is still needed.
TOP_K = 20

In [3]:
# Restore the pre-built dataset object stored in the working directory.
# NOTE(review): unpickling needs the dataset's class to be importable; that is
# presumably what the `main.data.session_dataset` star import provides — confirm.
with open(f"{WORKING_DIR}/dataset.pickle", "rb") as pickled_file:
    dataset = pickle.load(pickled_file)

In [4]:
# Collapse the training interactions into one item list per session:
# {SessionId: [ItemId, ...]}.
train = (
    dataset.get_train_data()
    .groupby("SessionId")["ItemId"]
    .apply(list)
    .to_dict()
)

# Peek at the first ten sessions.
list(train.items())[:10]

[(0, [9677, 5760, 1839, 524, 7551]),
 (2, [4065, 5045, 10279, 10540, 2626, 11956]),
 (3,
  [9531,
   5537,
   9768,
   8779,
   8911,
   11086,
   6789,
   7036,
   6217,
   10960,
   4489,
   933,
   9706,
   9323,
   10080]),
 (4,
  [4100,
   7052,
   9932,
   1332,
   2506,
   7679,
   11286,
   11881,
   3615,
   4859,
   850,
   3981,
   566,
   5171,
   10426,
   445,
   8718]),
 (5, [2517, 3332, 126, 8021, 8175]),
 (6, [1670, 11680, 2786, 3763, 2321, 5314, 1391, 624]),
 (7, [45, 6796, 11531, 11162, 5401]),
 (8,
  [7653,
   2701,
   5611,
   1238,
   2887,
   9529,
   3267,
   6288,
   723,
   3336,
   6539,
   12000,
   5689,
   7790,
   9094]),
 (10, [1032, 6720, 4091, 8479, 10829]),
 (11, [9199, 3690, 8743, 8387, 11513, 6157, 8995])]

In [5]:
# Mean number of items per training session.
np.mean(list(map(len, train.values())))

8.057015092230296

In [6]:
# Load only the id and name columns of the product lookup table.
product_names = pd.read_csv(
    f"{WORKING_DIR}/products_lookup.csv",
    usecols=["global_product_id", "name"],
)
product_names.head(10)

Unnamed: 0,global_product_id,name
0,678,WAWO 15 Color Professionl Makeup Eyeshadow Cam...
1,11255,Xtreme Brite Brightening Gel 1oz.
2,4526,Prada Candy By Prada Eau De Parfum Spray 1.7 O...
3,8486,Versace Bright Crystal Eau de Toilette Spray f...
4,1095,Stella McCartney Stella
5,3876,Avalon Biotin B-Complex Thickening Conditioner...
6,490,"Better Living Classic Two Chamber Dispenser, W..."
7,7431,Better Living The Ulti-Mate Dispenser
8,1208,Crabtree and Evelyn - Gardener's Ultra-Moist...
9,10226,Crabtree and Evelyn 2792 Gardeners Hand Ther...


In [7]:
# Map each global product id to its product name.
product_id_to_name = product_names.set_index("global_product_id")["name"].to_dict()

# Peek at the first ten entries of the lookup.
list(product_id_to_name.items())[:10]

[(678,
  'WAWO 15 Color Professionl Makeup Eyeshadow Camouflage Facial Concealer Neutral Palette'),
 (11255, 'Xtreme Brite Brightening Gel 1oz.'),
 (4526, 'Prada Candy By Prada Eau De Parfum Spray 1.7 Oz For Women'),
 (8486, 'Versace Bright Crystal Eau de Toilette Spray for Women, 3 Ounce'),
 (1095, 'Stella McCartney Stella'),
 (3876, 'Avalon Biotin B-Complex Thickening Conditioner, 14 Ounce'),
 (490, 'Better Living Classic Two Chamber Dispenser, White'),
 (7431, 'Better Living The Ulti-Mate Dispenser'),
 (1208,
  "Crabtree  and  Evelyn - Gardener's Ultra-Moisturising Hand Therapy Pump - 250g/8.8 OZ"),
 (10226, 'Crabtree  and  Evelyn 2792 Gardeners Hand Therapy (100ml, 3.4 oz)')]

In [8]:
# Build prompt/completion pairs from the training sessions, split them into a
# validation and a training set, write both as JSONL files, and print a rough
# token/cost estimate for the training file.

min_session_length = 2
num_validation_cases = 500
# Fixed seed so the train/validation split is identical after a kernel restart.
split_seed = 42

# Rough heuristics used only for the estimate printed at the end of the cell.
chars_per_token = 3
ada_price_per_1k_tokens = 0.0004

train_cases = []
validation_cases = []

# Shuffle session ids to get a random validation set. A dedicated Random
# instance keeps the shuffle reproducible without touching the global
# `random` module state.
session_ids = list(train.keys())
random.Random(split_seed).shuffle(session_ids)

for session_id in session_ids:
    train_session = train[session_id]

    # We skip sessions that are too short.
    if len(train_session) < min_session_length:
        continue

    # Create prompt-completion pair.
    # NOTE(review): the meaning of the third argument (1) is defined by
    # `create_prompt` and is not visible here — confirm before changing it.
    prompt, completion = create_prompt.create_prompt_completion_from_session(train_session, product_id_to_name, 1)
    prompt_completion_pair = {"prompt": prompt, "completion": completion}

    # Fill the validation set first. Counting the cases actually collected
    # (instead of the loop index) means skipped sessions cannot shrink the
    # validation set below `num_validation_cases`.
    if len(validation_cases) < num_validation_cases:
        validation_cases.append(prompt_completion_pair)
    else:
        train_cases.append(prompt_completion_pair)

# Convert training to JSONL (one JSON object per line).
train_cases = [json.dumps(train_case) for train_case in train_cases]
train_string = '\n'.join(train_cases)

# Convert validation to JSONL (one JSON object per line).
validation_cases = [json.dumps(validation_case) for validation_case in validation_cases]
validation_string = '\n'.join(validation_cases)

# Explicit encoding so the written files do not depend on the platform default.
with open(f"main/openai/prompt_model/train_cases_{WORKING_DIR}.jsonl", "w", encoding="utf-8") as f:
    f.write(train_string)

with open(f"main/openai/prompt_model/validation_cases_{WORKING_DIR}.jsonl", "w", encoding="utf-8") as f:
    f.write(validation_string)

# Character-count based estimate; not an exact tokenizer count.
approx_num_tokens = len(train_string) / chars_per_token
print(f"Num tokens, roughly: {approx_num_tokens}")
print(f"Costs to train ADA one epoch, roughly: {approx_num_tokens * (ada_price_per_1k_tokens / 1000)}")

Num tokens, roughly: 3536536.0
Costs to train ADA one epoch, roughly: 1.4146144
