<a href="https://colab.research.google.com/github/florianraith/notebooks/blob/main/dataloader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
import torch
import torch.nn.functional as F
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import requests

%matplotlib inline

In [3]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Flores/

Mounted at /content/drive
/content/drive/MyDrive/Flores


In [4]:
src = "English"
tgt = "German"

# See https://github.com/openlanguagedata/flores
src_filename = 'dev.eng_Latn'
tgt_filename = 'dev.deu_Latn'

In [5]:
src_data = pd.read_fwf(src_filename, header=None)
tgt_data = pd.read_fwf(tgt_filename, header=None)
print("Loaded files "  + src_filename + " and " + tgt_filename)

Loaded files dev.eng_Latn and dev.deu_Latn


In [6]:
src_data[:10], tgt_data[:10]

(                                                   0
 0  On Monday, scientists from the Stanford Univer...
 1  Lead researchers say this may bring early dete...
 2  The JAS 39C Gripen crashed onto a runway at ar...
 3  The pilot was identified as Squadron Leader Di...
 4  Local media reports an airport fire vehicle ro...
 5  28-year-old Vidal had joined Barça three seaso...
 6  Since moving to the Catalan-capital, Vidal had...
 7  The protest started around 11:00 local time (U...
 8  Just after 11:00, protesters blocked traffic o...
 9  At 11:20, the police asked the protesters to m...,
                                                    0                   1  \
 0  Am Montag haben die Wisenschaftler der Stanfor...  hergestellt werden   
 1  Führende Forscher sagen, dass dies die Früherk...                 NaN   
 2  Der JAS 39C Gripen stürzte gegen 9:30 Uhr Orts...                 NaN   
 3  Der Pilot wurde als Staffelführer Dilokrit Pat...                 NaN   
 4  Lokale Medien be

In [7]:
def sample_translations(n):
  # idx = torch.randint(0, len(src_data), (n, ))
  idx = torch.randperm(len(src_data))[:n]

  src_sample = src_data.loc[idx]
  tgt_sample = tgt_data.loc[idx]

  return list(zip(src_sample.iloc[:, 0].tolist(), tgt_sample.iloc[:, 0].tolist()))

sample_translations(1)

[('Besides white sand beaches and mountain landscapes, the country is home to the oldest European city in the Americas, now part of Santo Domingo.',
  'Neben weißen Sandstränden und Berglandschaften beheimatet das Land auch die älteste europäische Stadt Amerikas, die heute zu Santo Domingo gehört.')]

In [36]:
def generate_prompt(examples, few_shot_template, zero_shot_template, example_template, split):
  translations = sample_translations(examples + 1)
  examples_prompt = [
      example_template.replace("[src]", src)
      .replace("[tgt]", tgt)
      .replace("[x]", x)
      .replace("[y]", y)
      for (x, y) in translations[1:]
    ]

  prompt_template = few_shot_template if examples > 0 else zero_shot_template

  return (
    translations,
    prompt_template
      .replace("[exm]", split.join(examples_prompt))
      .replace("[src]", src)
      .replace("[tgt]", tgt)
      .replace("[x]", translations[0][0])
  )

In [48]:
TEMPLATE_ZHANG_ET_AL_23 = [
  "[exm] \n [src]: [x] \n [tgt]:",
  "[src]: [x] \n [tgt]:",
  "[src]: [x] \n [tgt]: [y]",
  " \n ",
]

# TEMPLATE_LLAMA = [
#    "Given Examples: [exm], Translate '[x]' from [src] to [tgt]. Just output the translation. Do not output any additional information or notes. Output only the [tgt] sentence. Do not add any further examples.",
#    "Translate '[x]' from [src] to [tgt]. Just output the translation. Do not output any additional information or notes. Output only the [tgt] sentence.",
#    "[src]: '[x]' to [tgt]: '[y]'",
#    ", ",
# ]


TEMPLATE_LLAMA = [
    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\nYou are a professional translator. Translate the [src] sentences into [tgt], providing only the translation without any additional text.\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n[exm]\n[src]: [x]\n[tgt]:\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n",
    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\nYou are a professional translator. Translate the [src] sentences into [tgt], providing only the translation without any additional text.\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n[src]: [x]\n[tgt]:\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n",
    "[src]: [x]\n[tgt]: [y]",
    "\n"
]

# (translations, prompt) = generate_prompt(2, *TEMPLATE_LLAMA)

In [44]:
def ask(prompt):
  url = 'http://hiaisc.isl.iar.kit.edu/llm/generate'

  data = {
      "inputs": prompt,
      "parameters": {
          "max_new_tokens": 300
      }
  }

  headers = {
      'Content-Type': 'application/json'
  }

  response = requests.post(url, json=data, headers=headers)

  if response.status_code == 200:
    return response.json()['generated_text']
  else:
      print(f"Request failed with status code {response.status_code}")
      print(response.text)

In [52]:
(translations, prompt) = generate_prompt(2, *TEMPLATE_LLAMA)

In [58]:
# Separate the examples and the last translation
examples = translations[1:]
last_input, last_actual_translation = translations[0]

output = ask(prompt)

print("Example:")
for src_sent, tgt_sent in examples:
    print(f"{src}: {src_sent}\n{tgt}: {tgt_sent}\n")

print("Input:")
print(last_input)

print("\nOutput:")
print(output)

print("\nActual Translation:")
print(last_actual_translation)

Example:
English: However, the discovery of his tomb in 1922 made him a celebrity. While many tombs of the past were robbed, this tomb was left virtually undisturbed.
German: Die Entdeckung seiner Grabkammer im Jahr 1922 machte ihn jedoch zu einer Berühmtheit. Während viele Gräber der alten Zeit ausgeraubt wurden, blieb sein Grab nahezu unberührt.

English: It was based on the German alphabet and one character "Õ/õ" was added.
German: Sie basierte auf dem deutschen Alphabet und ein Buchstabe, nämlich „Õ/õ“, wurde hinzugefügt.

Input:
Modern Education accused him of printing large advertisements on buses without authorisation and lying by saying that he was the chief English tutor.

Output:
Die Moderne Bildung beschuldigte ihn, große Anzeigen auf Bussen ohne Genehmigung zu drucken und zu behaupten, er sei der Hauptlehrer für Englisch, was eine Lüge war.

Actual Translation:
Das moderne Bildungswesen beschuldigte ihn, große Werbung ohne Genehmigung an Bussen anbringen zu lassen und zu lü