In [1]:
import google.generativeai as genai
from google.colab import userdata
from time import sleep
import time
import json
import numpy as np
import tqdm

In [3]:
TOPIC = "cars"

# Load the Microsoft Phi titles. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open for the garbage collector.
with open("/content/drive/MyDrive/SETROW//Data/microsoft_titles.json", "r", encoding="utf-8") as titles_file:
    microsoft_titles = json.load(titles_file)

# Clean the titles: remove double quotes, single quotes and newlines, then trim
# surrounding whitespace.
stripped_titles = (
    item.replace('"', '').replace("'", '').replace('\n', '').strip()
    for item in microsoft_titles
)

# Discard titles that are empty after cleaning so they never end up inside a
# training example.
cleaned_microsoft_titles = [title for title in stripped_titles if title]

# Drop duplicated titles, if any. np.unique returns a sorted NumPy array.
cleaned_microsoft_titles = np.unique(cleaned_microsoft_titles)

In [4]:
# Assemble the tuning examples: each one pairs the fixed prompt with five
# consecutive titles joined by ';'. Only complete groups of five are used, so
# any leftover titles at the end of the list are left out.
training_data = [
    {
        'text_input': f"You are a helpful ecommerce assistant. Please generate 5 different email titles about {TOPIC} to enhance user engagement. Each title should be separated by ';'.",
        'output': ";".join(cleaned_microsoft_titles[5 * batch_idx:5 * batch_idx + 5]),
    }
    for batch_idx in range(len(cleaned_microsoft_titles) // 5)
]

# Show the first example as a quick sanity check
print(training_data[0])

{'text_input': "You are a helpful ecommerce assistant. Please generate 5 different email titles about cars to enhance user engagement. Each title should be separated by ';'.", 'output': '2023 Best-Selling Cars: Reviews & Savings Tips to Upgrade Your Ride!;2023 Hottest Sedans: Unleash High Performance on the Road!;2023 Spring Car Deals Update: Unlock Exclusive Discounts & Latest Models;2023 Top 5 Sedans for Fuel Efficiency & Comfort: Your Perfect Ride Awaits!;2023 Top Gift Ideas: Exclusive Limited-Edition Cars – Upgrade Your Style Today!'}


I grouped the titles into batches of 5 because of the training data size recommendations (20-200).

In [5]:
# Authenticate using the API key read from Colab's userdata store
genai.configure(api_key=userdata.get('GOOGLE_API_KEY'))

# Base model the tuning job starts from
model = "models/gemini-1.5-flash-001-tuning"

# Tuning settings, collected in one place.
# Hyperparameter reference: https://ai.google.dev/gemini-api/docs/model-tuning#advanced-settings
tuning_options = dict(
    display_name="increment",
    source_model=model,
    epoch_count=4,
    batch_size=4,
    learning_rate=0.0001,
    training_data=training_data,
)

# Start the tuning job
operation = genai.create_tuned_model(**tuning_options)

# Step through the operation's progress bar, pausing 10 seconds per update
for status in operation.wait_bar():
    time.sleep(10)

# Fetch the finished tuned model and display its metadata
result = operation.result()
print(result)

  0%|          | 0/40 [00:00<?, ?it/s]

TunedModel(name='tunedModels/increment-qsz98zbgbtpu',
           source_model='models/gemini-1.5-flash-001-tuning',
           base_model='models/gemini-1.5-flash-001-tuning',
           display_name='increment',
           description='',
           temperature=1.0,
           top_p=0.95,
           top_k=64,
           state=<State.ACTIVE: 2>,
           create_time=datetime.datetime(2024, 12, 28, 8, 32, 59, 963610, tzinfo=datetime.timezone.utc),
           update_time=datetime.datetime(2024, 12, 28, 8, 34, 48, 853811, tzinfo=datetime.timezone.utc),
           tuning_task=TuningTask(start_time=datetime.datetime(2024, 12, 28, 8, 33, 1, 90023, tzinfo=datetime.timezone.utc),
                                  complete_time=datetime.datetime(2024, 12, 28, 8, 34, 48, 853811, tzinfo=datetime.timezone.utc),
                                  snapshots=[...],
                                  hyperparameters=Hyperparameters(epoch_count=4,
                                                       

In [6]:
# Use the tuned model to get title suggestions
tuned_model = genai.GenerativeModel(model_name=result.name)

# Generation settings: at most 1000 output tokens, temperature 2
config = {
    "max_output_tokens": 1000,
    "temperature": 2,
}

# Each request asks for 5 titles, so 6 requests target 30 samples
N_REQUESTS = 6

# The prompt and generation config do not change between requests, so build
# them once instead of on every loop iteration.
prompt = f"You are a helpful ecommerce assistant. Please generate 5 different email titles about {TOPIC} to enhance user engagement. Each title should be separated by ';'."
generation_config = genai.types.GenerationConfig(**config)

# Collect the titles from all requests in one flat list
tuned_response = []
for i in tqdm.tqdm(range(N_REQUESTS)):
    response = tuned_model.generate_content(prompt, generation_config=generation_config)

    # response.text raises ValueError when the response carries no text part
    # (for example a blocked candidate). Skip that request rather than abort
    # the loop and lose the titles already collected.
    try:
        raw_text = response.text
    except ValueError as err:
        tqdm.tqdm.write(f"Request {i + 1}/{N_REQUESTS} returned no text, skipping: {err}")
        continue

    # Trim whitespace/newlines around each title and drop empty fragments such
    # as the one produced by a trailing ';'.
    tuned_response.extend(
        title
        for title in (fragment.strip() for fragment in raw_text.split(";"))
        if title
    )



  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:10<00:52, 10.56s/it][A
 33%|███▎      | 2/6 [00:20<00:40, 10.07s/it][A
 50%|█████     | 3/6 [00:31<00:31, 10.47s/it][A
 67%|██████▋   | 4/6 [00:40<00:19,  9.93s/it][A
 83%|████████▎ | 5/6 [00:42<00:07,  7.18s/it][A
100%|██████████| 6/6 [00:44<00:00,  7.49s/it]


In [7]:
# Persist the collected titles to Drive as a JSON array
output_path = "/content/drive/MyDrive/SETROW/Data/gemini_tuned_response.json"
serialized_titles = json.dumps(tuned_response)
with open(output_path, "w") as out_file:
    out_file.write(serialized_titles)