In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [17]:
import json
import os
from pathlib import Path
from pprint import pprint
import re
import sys
import traceback
from typing import Generator
if '..' not in sys.path: sys.path.append('..')

from datasets import Dataset, load_dataset
from openai import AzureOpenAI
from pydantic import BaseModel
from pydantic_yaml import parse_yaml_raw_as, parse_yaml_file_as, to_yaml_str, to_yaml_file



# Generate summarization dataset with LLM
## Setup

In [3]:
# --- Azure OpenAI connection settings ---------------------------------------
# The key comes from the environment; it is None when the variable is not set.
AZURE_OPENAI_KEY = os.getenv('MLLM_AZURE_OPENAI_KEY')
OPENAI_MODEL_NAME = 'gpt-4o'
# API_VERSION = '2024-05-13'
# API_VERSION = '2024-08-06'
API_VERSION = '2024-08-01-preview'
AZURE_OPENAI_ENDPOINT = 'https://hackathon-spanish-openai.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-08-01-preview'
# AZURE_OPENAI_ENDPOINT = 'https://hackathon-spanish-openai.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-15-preview'

# --- Language identifiers and their display names ----------------------------
LANG_MAIN_ID = 'en'
LANG_MAIN_NAME = 'English'
LANG_LEARN_ID = 'es'
LANG_LEARN_NAME = 'Spanish'

In [4]:
# Root of all local data, resolved from the current user's home directory.
# Path.home() is used rather than expanding '$HOME': expandvars() leaves the
# literal text '$HOME' untouched when the variable is unset, which would make
# the mkdir() below silently create a relative './$HOME/data/...' tree.
HOME_PATH = Path.home()
DATA_PATH = HOME_PATH / 'data'
LL_DATA_PATH = DATA_PATH / 'lldata'
# The directory name encodes the model name and API version.
LL_CONVS_PATH = LL_DATA_PATH / f'convs_m_{OPENAI_MODEL_NAME}_v_{API_VERSION}'
# Create the directory (and any missing parents) up front; no error if it exists.
LL_CONVS_PATH.mkdir(parents=True, exist_ok=True)

## Test OpenAI API

In [5]:
# Azure OpenAI client built from the connection settings defined above.
client = AzureOpenAI(
    api_key=AZURE_OPENAI_KEY,
    api_version=API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
)

In [7]:
# Smoke test: send one fixed question through the chat completions API.
response = client.chat.completions.create(
    messages=[
        {'role': 'system', 'content': 'Assistant is a large language model trained by OpenAI.'},
        {'role': 'user', 'content': 'Who were the founders of Microsoft? Answer without saying names, but you can comment on person\'s appearance and traits.'},
    ],
    model=OPENAI_MODEL_NAME,  # on Azure, `model` is the deployment name
)

# First the whole response as indented JSON, then only the assistant's reply.
print(response.model_dump_json(indent=2))
print(response.choices[0].message.content)

{
  "id": "chatcmpl-BgGyqQrq7kuRNh8MA7TobBMJzWHtV",
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The founders of Microsoft include two individuals. One of them is a bespectacled, tech-savvy individual known for his deep interest in programming and software development from a young age. The other is his school friend, who shares a similar enthusiasm for computing and technology, and is often remembered for his more outgoing and assertive personality. Together, these two visionaries went on to build one of the world's most influential technology companies.",
        "refusal": null,
        "role": "assistant",
        "annotations": [],
        "audio": null,
        "function_call": null,
        "tool_calls": null
      },
      "content_filter_results": {
        "hate": {
          "filtered": false,
          "severity": "safe"
        },
        "protected_material_code": {
          "filtered"

## Wiki articles summarization sample

In [6]:
# Load the 'wikipedia' dataset (config '20200501.en'), using DATA_PATH as cache.
wiki_ds_name = '20200501.en'
wiki_ds_subdir = 'wikipedia'
data_path = DATA_PATH
dss_wiki = load_dataset(
    wiki_ds_subdir,
    wiki_ds_name,
    cache_dir=str(data_path),
    beam_runner='DirectRunner',
)
dss_wiki


Reusing dataset wikipedia (/Users/misha/data/wikipedia/20200501.en/1.0.0/009f923d9b6dd00c00c8cdc7f408f2b47f45dd4f5fb7982a21f9448f4afbe475)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['title', 'text'],
        num_rows: 6078422
    })
})

In [7]:
# Select the 'train' split; the trailing expression displays its summary.
ds_wiki = dss_wiki['train']
ds_wiki

Dataset({
    features: ['title', 'text'],
    num_rows: 6078422
})

In [8]:
# Display the first record of the split.
ds_wiki[0]

{'title': 'Yangliuqing',
 'text': 'Yangliuqing () is a market town in Xiqing District, in the western suburbs of Tianjin, People\'s Republic of China. Despite its relatively small size, it has been named since 2006 in the "famous historical and cultural market towns in China".\n\nIt is best known in China for creating nianhua or Yangliuqing nianhua. For more than 400 years, Yangliuqing has in effect specialised in the creation of these woodcuts for the New Year.  wood block prints using vivid colourschemes to portray traditional scenes of children\'s games often interwoven with auspiciouse objects.\n\n, it had 27 residential communities () and 25 villages under its administration.\n\nShi Family Grand Courtyard\n\nShi Family Grand Courtyard (Tiānjīn Shí Jiā Dà Yuàn, 天津石家大院) is situated in Yangliuqing Town of Xiqing District, which is the former residence of wealthy merchant Shi Yuanshi - the 4th son of Shi Wancheng, one of the eight great masters in Tianjin. First built in 1875, it cove

In [12]:
# Summarization prompt; filled in with str.format(title=..., text=...).
# The leading and trailing newlines are part of the template.
prompt_template = (
    '\n'
    'In a couple of sentences summarize next article:\n'
    'Title: {title}\n'
    'Text: {text}\n'
)

In [None]:
# Build the prompt for one article: the title is cut to 100 characters and
# the text to 1000 characters before they are placed into the template.
i = 0
item = ds_wiki[i]
prompt = prompt_template.format(
    title=item['title'][:100],
    text=item['text'][:1000],
)
print(prompt)


In a couple of sentences summarize next article:
Title: Yangliuqing
Text: Yangliuqing () is a market town in Xiqing District, in the western suburbs of Tianjin, People's Republic of China. Despite its relatively small size, it has been named since 2006 in the "famous historical and cultural market towns in China".

It is best known in China for creating nianhua or Yangliuqing nianhua. For more than 400 years, Yangliuqing has in effect specialised in the creation of these woodcuts for the New Year.  wood block prints using vivid colourschemes to portray traditional scenes of children's games often interwoven with auspiciouse objects.

, it had 27 residential communities () and 25 villages under its administration.

Shi Family Grand Courtyard

Shi Family Grand Courtyard (Tiānjīn Shí Jiā Dà Yuàn, 天津石家大院) is situated in Yangliuqing Town of Xiqing District, which is the former residence of wealthy merchant Shi Yuanshi - the 4th son of Shi Wancheng, one of the eight great masters in Tianjin

In [16]:
# Send the prompt built above through the chat completions API.
response = client.chat.completions.create(
    messages=[
        {'role': 'system', 'content': 'Assistant is a large language model trained by OpenAI.'},
        {'role': 'user', 'content': prompt},
    ],
    model=OPENAI_MODEL_NAME,  # on Azure, `model` is the deployment name
)

# First the whole response as indented JSON, then only the assistant's reply.
print(response.model_dump_json(indent=2))
print(response.choices[0].message.content)

{
  "id": "chatcmpl-BgdDMYCHZcuKHCYBn3IVAhOWZNyAW",
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "Yangliuqing is a market town in Xiqing District, Tianjin, China, recognized since 2006 for its historical and cultural significance. It is renowned for its traditional New Year woodblock prints, known as Yangliuqing nianhua, a craft it has specialized in for over 400 years. The town is also home to the Shi Family Grand Courtyard, the historic residence of a wealthy merchant family.",
        "refusal": null,
        "role": "assistant",
        "annotations": [],
        "audio": null,
        "function_call": null,
        "tool_calls": null
      },
      "content_filter_results": {
        "hate": {
          "filtered": false,
          "severity": "safe"
        },
        "protected_material_code": {
          "filtered": false,
          "detected": false
        },
        "protected_material_tex

In [None]:
class LlmSumGen:
    """Produces article summaries through an Azure OpenAI chat deployment.

    The user prompt is created by formatting ``prompt_template`` with the
    keyword arguments ``title`` and ``text``. Both values can optionally be
    truncated to a maximum number of characters before formatting.
    """
    # Template formatted with `title=` and `text=` keyword arguments.
    prompt_template: str
    # Connection settings forwarded to the AzureOpenAI client.
    azure_openai_key: str
    api_version: str
    azure_endpoint: str
    # Passed as `model` to the chat completions call.
    openai_model_name: str
    # Character limits for title / text; values <= 0 disable truncation.
    max_title_len: int
    max_text_len: int
    # Quoted so that defining the class does not evaluate the third-party name.
    client: 'AzureOpenAI'

    def __init__(
            self, prompt_template: str, azure_openai_key: str, api_version: str, azure_endpoint: str, openai_model_name: str,
            max_title_len: int = 0, max_text_len: int = 0,
        ):
        """Store the settings and create the Azure OpenAI client.

        Args:
            prompt_template: Template formatted with ``title`` and ``text``.
            azure_openai_key: API key handed to ``AzureOpenAI``.
            api_version: API version handed to ``AzureOpenAI``.
            azure_endpoint: Endpoint handed to ``AzureOpenAI``.
            openai_model_name: Value used as ``model`` in completion requests.
            max_title_len: Maximum title length in characters; ``0`` (default)
                or a negative value keeps the title unchanged.
            max_text_len: Maximum text length in characters; ``0`` (default)
                or a negative value keeps the text unchanged.
        """
        self.prompt_template = prompt_template
        self.azure_openai_key = azure_openai_key
        self.api_version = api_version
        self.azure_endpoint = azure_endpoint
        self.openai_model_name = openai_model_name
        self.max_title_len = max_title_len
        self.max_text_len = max_text_len
        self.client = AzureOpenAI(
            api_key=self.azure_openai_key,
            api_version=self.api_version,
            azure_endpoint=self.azure_endpoint,
        )

    @staticmethod
    def _truncate(value: str, max_len: int) -> str:
        """Return ``value`` cut to ``max_len`` characters; ``max_len <= 0`` means no limit."""
        return value[:max_len] if max_len > 0 else value

    def get_response(self, title: str, text: str) -> 'str | None':
        """Request a summary for one article.

        Args:
            title: Article title (truncated to ``max_title_len`` if that is > 0).
            text: Article body (truncated to ``max_text_len`` if that is > 0).

        Returns:
            The ``content`` of the first choice's message, or ``None`` when
            the request or the response handling raised an exception. In the
            failure case the (truncated) inputs and the traceback are printed.
        """
        title_inp = self._truncate(title, self.max_title_len)
        text_inp = self._truncate(text, self.max_text_len)
        # Format the template given to the constructor. Referring to a global
        # `prompt_template` here would ignore the constructor argument and
        # fail with NameError wherever that global is not defined.
        prompt = self.prompt_template.format(title=title_inp, text=text_inp)
        try:
            response = self.client.chat.completions.create(
                model=self.openai_model_name, # model = 'deployment_name'.
                messages=[
                    {'role': 'system', 'content': 'Assistant is a large language model trained by OpenAI.'},
                    {'role': 'user', 'content': prompt}
                ]
            )
            return response.choices[0].message.content
        except Exception:
            # Best effort: report the failing input and keep going. Catching
            # `Exception` (not a bare `except`) lets KeyboardInterrupt and
            # SystemExit propagate, so a long run can still be interrupted.
            print(title_inp, text_inp)
            traceback.print_exc()
            return None
    

In [19]:
# Summary generator configured with the notebook-level settings; inputs are
# limited to 100 title characters and 5000 text characters.
llm_sum_gen = LlmSumGen(
    prompt_template=prompt_template,
    azure_openai_key=AZURE_OPENAI_KEY,
    api_version=API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    openai_model_name=OPENAI_MODEL_NAME,
    max_title_len=100,
    max_text_len=5000,
)

In [20]:
# Fetch the record at index `i` (set in an earlier cell) and print it in full.
item = ds_wiki[i]
print(item)

{'title': 'Yangliuqing', 'text': 'Yangliuqing () is a market town in Xiqing District, in the western suburbs of Tianjin, People\'s Republic of China. Despite its relatively small size, it has been named since 2006 in the "famous historical and cultural market towns in China".\n\nIt is best known in China for creating nianhua or Yangliuqing nianhua. For more than 400 years, Yangliuqing has in effect specialised in the creation of these woodcuts for the New Year.  wood block prints using vivid colourschemes to portray traditional scenes of children\'s games often interwoven with auspiciouse objects.\n\n, it had 27 residential communities () and 25 villages under its administration.\n\nShi Family Grand Courtyard\n\nShi Family Grand Courtyard (Tiānjīn Shí Jiā Dà Yuàn, 天津石家大院) is situated in Yangliuqing Town of Xiqing District, which is the former residence of wealthy merchant Shi Yuanshi - the 4th son of Shi Wancheng, one of the eight great masters in Tianjin. First built in 1875, it cover

In [21]:
# Generate a summary for the fetched record and print the returned text
# (get_response returns None when the request fails).
sum_gen_str = llm_sum_gen.get_response(title=item['title'], text=item['text'])
print(sum_gen_str)

Yangliuqing is a historical market town in Xiqing District, Tianjin, known primarily for its traditional nianhua wood block prints. The town is home to the Shi Family Grand Courtyard, a large and well-preserved residence of a wealthy merchant family from the 19th century that now functions as a folk custom museum and popular filming location.
