In [3]:
import pandas as pd

In [4]:
train_features_df = pd.read_json(r"D:\youtube\hackathon\training_data\train.features", lines=True)
train_labels_df = pd.read_json(r"D:\youtube\hackathon\training_data\train.labels", lines=True)
print(train_features_df.shape, train_labels_df.shape)
train_df = pd.merge(train_features_df, train_labels_df, on="indoml_id")
train_df["label"] = train_df['supergroup']  + " __" + train_df['group'] + " __" + train_df['module'] + " __" + train_df['brand']
count_df = train_df["label"].value_counts().reset_index()
train_df.shape

(561838, 4) (561838, 5)


(561838, 9)

In [5]:
# Identify single labels
single_label = count_df[count_df["count"] < 5]["label"].tolist()
single_label_df = train_df[train_df["label"].isin(single_label)]

train_df = train_df[~train_df["label"].isin(single_label)]
train_df.shape

(553187, 9)

In [6]:
# From the remaining ones, pick one sample per label
df_min_samples = train_df.groupby('label').apply(lambda x: x.sample(1)).reset_index(drop=True)
undersampled_df = pd.concat([single_label_df, df_min_samples], axis=0)
print("Undersampled df shape: ", undersampled_df.shape)
train_df = train_df[~train_df["indoml_id"].isin(df_min_samples["indoml_id"])]
train_df.shape

Undersampled df shape:  (17430, 9)


(544408, 9)

In [7]:
# From the reamining data, add 30000 rows
# perform stratified split based on brand
from sklearn.model_selection import train_test_split
remaining_df, val_df = train_test_split(train_df, train_size=8800, stratify=train_df['label'], random_state=42)
undersampled_df = pd.concat([undersampled_df, remaining_df], axis=0)
undersampled_df.shape

(26230, 9)

In [8]:
undersampled_df.shape

(26230, 9)

In [9]:
### Validation
missing_labels = set(count_df["label"]) - set(undersampled_df["label"])
count = len(missing_labels)

print(f"Number of missing labels: {count}")


Number of missing labels: 0


In [10]:
undersampled_df["price"].median()

1.6

In [11]:
undersampled_df["price"].fillna(undersampled_df["price"].median(), inplace=True)

In [12]:
undersampled_df["prompt"] =  undersampled_df["description"] + " | " + undersampled_df["retailer"] + " | " + str(undersampled_df["price"])

In [13]:
undersampled_df.head()

Unnamed: 0,indoml_id,description,retailer,price,supergroup,group,module,brand,label,prompt
2334,2334,pbaby disposable bed mats 10 s,herbgrove,2.99,baby care,baby care detail unknown total,baby care,pure baby,baby care __baby care detail unknown total __b...,pbaby disposable bed mats 10 s | herbgrove | 2...
2366,2366,sense weaning spoon,vibrantmart,3.49,baby care,baby feeding accessories,baby feeding accessories,crispcorner,baby care __baby feeding accessories __baby fe...,sense weaning spoon | vibrantmart | 2334 ...
2383,2383,dm br o bottle,producify,1.35,baby care,baby feeding accessories,baby feeding accessories,dr browns,baby care __baby feeding accessories __baby fe...,dm br o bottle | producify | 2334 2.99\n2...
2384,2384,mur jcy bmb cct,freshnosh,5.0,baby care,baby feeding accessories,baby feeding accessories,dr browns,baby care __baby feeding accessories __baby fe...,mur jcy bmb cct | freshnosh | 2334 2.99\n...
2385,2385,dr js sterilising fluid 1 l,herbgrove,0.89,baby care,baby feeding accessories,baby feeding accessories,dr johnsons,baby care __baby feeding accessories __baby fe...,dr js sterilising fluid 1 l | herbgrove | 2334...


In [14]:
import json
from tqdm import tqdm
max_length = 0
label_list = []
prompt_list = []
for _, row in tqdm(undersampled_df.iterrows()):
    template = []
    desc = row["description"]
    retailer = row["retailer"]
    price = row["price"]
    human_val = f"{desc} | {retailer} | {price}"
    prompt_list.append(human_val)
    gpt_val = row["label"]
    label_list.append(gpt_val)


0it [00:00, ?it/s]

26230it [00:01, 22088.27it/s]


In [15]:
df = pd.DataFrame(zip(prompt_list, label_list), columns = ['prompt','completion'])
df.head()

Unnamed: 0,prompt,completion
0,pbaby disposable bed mats 10 s | herbgrove | 2.99,baby care __baby care detail unknown total __b...
1,sense weaning spoon | vibrantmart | 3.49,baby care __baby feeding accessories __baby fe...
2,dm br o bottle | producify | 1.35,baby care __baby feeding accessories __baby fe...
3,mur jcy bmb cct | freshnosh | 5.0,baby care __baby feeding accessories __baby fe...
4,dr js sterilising fluid 1 l | herbgrove | 0.89,baby care __baby feeding accessories __baby fe...


In [16]:
df.shape

(26230, 2)

In [69]:
df.to_json("openai_training.jsonl", orient='records', lines=True)

In [17]:
!openai tools fine_tunes.prepare_data -f openai_training.jsonl -q

Analyzing...

- Your file contains 26230 prompt-completion pairs
- Your data does not contain a common separator at the end of your prompts. Having a separator string appended to the end of the prompt makes it clearer to the fine-tuned model where the completion should begin. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples. If you intend to do open-ended generation, then you should leave the prompts empty
- Your data does not contain a common ending at the end of your completions. Having a common ending string appended to the end of the completion makes it clearer to the fine-tuned model where the completion should end. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples.
- The completion should start with a whitespace character (` `). This tends to produce better results due to the tokenization we use. See https://platform.openai.com/docs/guides/fine-tuning/preparing-you

In [18]:
from datasets import Dataset
dataset = Dataset.from_pandas(df)
dataset


Dataset({
    features: ['prompt', 'completion'],
    num_rows: 26230
})

In [19]:
# import huggingface_hub
# huggingface_hub.login()

In [20]:
# dataset.push_to_hub("srinathmkce/indoml_openai_training", private=True)

In [21]:
import tiktoken
encoding = tiktoken.get_encoding("cl100k_base")


{'prompt': 'pbaby disposable bed mats 10 s | herbgrove | 2.99', 'completion': 'baby care __baby care detail unknown total __baby care __pure baby'}


In [23]:
length = 0
for conversaation in tqdm(dataset):
    token_count = len(encoding.encode(json.dumps(conversaation)))
    length += token_count

length

100%|██████████| 26230/26230 [00:02<00:00, 11633.57it/s]


1161143

In [24]:
length / 10 ** 6

1.161143

In [25]:
undersampled_df["label"].value_counts().reset_index()

Unnamed: 0,label,count
0,clothing & personal accessories __clothing & p...,627
1,stationery & printed material & services __sta...,570
2,garden & flora __garden & flora detail unknown...,189
3,home furnishings & decor __home furnishings & ...,182
4,toys __toys detail unknown total __toys __rece...,180
...,...,...
12880,food ambient __fruit nuts ambient __nuts ambie...,1
12881,food perishable __sweet pastry dough fresh __s...,1
12882,food perishable __sweet pastry dough fresh __s...,1
12883,food ambient __fruit nuts ambient __nuts ambie...,1


In [26]:
undersampled_df.head()

Unnamed: 0,indoml_id,description,retailer,price,supergroup,group,module,brand,label,prompt
2334,2334,pbaby disposable bed mats 10 s,herbgrove,2.99,baby care,baby care detail unknown total,baby care,pure baby,baby care __baby care detail unknown total __b...,pbaby disposable bed mats 10 s | herbgrove | 2...
2366,2366,sense weaning spoon,vibrantmart,3.49,baby care,baby feeding accessories,baby feeding accessories,crispcorner,baby care __baby feeding accessories __baby fe...,sense weaning spoon | vibrantmart | 2334 ...
2383,2383,dm br o bottle,producify,1.35,baby care,baby feeding accessories,baby feeding accessories,dr browns,baby care __baby feeding accessories __baby fe...,dm br o bottle | producify | 2334 2.99\n2...
2384,2384,mur jcy bmb cct,freshnosh,5.0,baby care,baby feeding accessories,baby feeding accessories,dr browns,baby care __baby feeding accessories __baby fe...,mur jcy bmb cct | freshnosh | 2334 2.99\n...
2385,2385,dr js sterilising fluid 1 l,herbgrove,0.89,baby care,baby feeding accessories,baby feeding accessories,dr johnsons,baby care __baby feeding accessories __baby fe...,dr js sterilising fluid 1 l | herbgrove | 2334...
