<a href="https://colab.research.google.com/github/eyaler/constrained/blob/main/compact/ted_multilingual_counter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install PyICU

import icu
from collections import Counter
from math import exp, log
import re


def clean(s):
  """Remove the characters U+200B..U+200F and the music-note symbols ♪/♫ from `s`, then strip surrounding whitespace."""
  without_marks = re.sub('[\u200b-\u200f♪♫]', '', s)
  return without_marks.strip()


# Character folding applied before counting symbols, so that variants of the same
# symbol are tallied together: Greek final sigma, Hebrew final letters,
# Arabic-Indic digits, the Arabic comma and the no-break space.
trans = str.maketrans('ς' + 'ךםןףץ' + '٩٨٧٦٥٤٣٢١٠' + '،' + '\xa0', 'σ' + 'כמנפצ' + '9876543210' + ',' + ' ')
print([(chr(k), chr(v)) for k, v in trans.items()])


def norm(s):
  """Return `s` lowercased, with the variant characters listed in `trans` folded.

  Lowercasing is done first: `str.lower()` converts a word-final capital sigma
  ('Σ') into the final form 'ς', and translating afterwards folds that to 'σ'
  like every other sigma. No other key of `trans` is produced or changed by
  lowercasing, so the order only matters for that case.
  """
  return s.lower().translate(trans)


def perplexity(cnt):
  """Return the perplexity of the empirical distribution given by the counts in `cnt`.

  `cnt` is a mapping whose values are occurrence counts. The result is
  exp(-sum(p * ln p)) with p = count / total, i.e. the exponential of the
  entropy measured in nats.
  """
  counts = cnt.values()
  total = sum(counts)
  weighted_log_probs = sum([log(n / total) * n for n in counts])
  return exp(-weighted_log_probs / total)

[('ς', 'σ'), ('ך', 'כ'), ('ם', 'מ'), ('ן', 'נ'), ('ף', 'פ'), ('ץ', 'צ'), ('٩', '9'), ('٨', '8'), ('٧', '7'), ('٦', '6'), ('٥', '5'), ('٤', '4'), ('٣', '3'), ('٢', '2'), ('١', '1'), ('٠', '0'), ('،', ','), ('\xa0', ' ')]


# TED2020

    @inproceedings{reimers-2020-multilingual-sentence-bert,
      title = "Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation",
      author = "Reimers, Nils and Gurevych, Iryna",
      booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing",
      month = "11",
      year = "2020",
      publisher = "Association for Computational Linguistics",
      url = "https://arxiv.org/abs/2004.09813",
    }

https://arxiv.org/abs/2004.09813

https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/parallel-sentences-source-files.zip

https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/parallel-sentences.tsv.gz

In [36]:
# Download the gzipped parallel-sentences TSV (about 554 MB) and decompress it in place.
# `-O` fixes the output file name and `gunzip -f` overwrites an existing file, so the cell can be re-run.
!wget https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/parallel-sentences.tsv.gz -O parallel-sentences.tsv.gz
!gunzip -f parallel-sentences.tsv.gz

--2025-12-15 10:31:16--  https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/parallel-sentences.tsv.gz
Resolving public.ukp.informatik.tu-darmstadt.de (public.ukp.informatik.tu-darmstadt.de)... 130.83.167.186
Connecting to public.ukp.informatik.tu-darmstadt.de (public.ukp.informatik.tu-darmstadt.de)|130.83.167.186|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 581354214 (554M) [application/octet-stream]
Saving to: ‘parallel-sentences.tsv.gz’


2025-12-15 10:33:00 (5.37 MB/s) - ‘parallel-sentences.tsv.gz’ saved [581354214/581354214]



In [37]:
import csv


# Per-language running totals over the rows where both the Hebrew text and the
# other-language text are non-empty after clean(). Each value is a list:
#   [chars in the language, chars in Hebrew,
#    Counter of normalized language chars, Counter of normalized Hebrew chars,
#    number of sentence pairs]
bi = {}

with open('parallel-sentences.tsv', newline='', encoding='utf-8') as f:
  reader = csv.DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
  # Every column except the first is treated as a language column.
  langs = (reader.fieldnames or [])[1:]
  for row in reader:
    # DictReader fills the fields missing from a short row with None; treat those as empty.
    clean_he = clean(row['he'] or '')
    if not clean_he:
      continue
    # The Hebrew side is identical for every language of this row: measure and normalize it once.
    len_he = len(clean_he)
    norm_he = norm(clean_he)
    for lang in langs:
      clean_lang = clean(row[lang] or '')
      if not clean_lang:
        continue
      if lang not in bi:
        bi[lang] = [0, 0, Counter(), Counter(), 0]
      totals = bi[lang]
      totals[0] += len(clean_lang)
      totals[1] += len_he
      totals[2].update(norm(clean_lang))
      totals[3].update(norm_he)
      totals[4] += 1

# code -> (display name, length relative to Hebrew, distinct symbols,
#          perplexity of the language, perplexity of Hebrew on the same pairs, sentence pairs),
# ordered by relative length, ties broken by language code.
ted_2020 = dict(sorted(
    ((code, (icu.Locale(code).getDisplayName(), n_lang / n_he, len(cnt_lang), perplexity(cnt_lang), perplexity(cnt_he), pairs))
     for code, (n_lang, n_he, cnt_lang, cnt_he, pairs) in bi.items()),
    key=lambda item: (item[1][1], item[0])))

# Columns: relative length, distinct symbols, perplexity, (perplexity ratio to Hebrew) * relative length, (pairs).
for code, (lang, rlen, symbols, perp, pheb, sents) in ted_2020.items():
  print(f'({code}) {lang}\t{round(rlen, 3):.3f} {symbols} {round(perp, 1):.1f} {round(perp / pheb * rlen, 3):.3f} ({sents})')

(zh) Chinese	0.393 3855 499.3 9.953 (15672)
(zh-tw) Chinese (Taiwan)	0.431 6007 468.1 10.267 (343524)
(zh-cn) Chinese (China)	0.437 5293 449.3 9.990 (343160)
(ja) Japanese	0.556 3482 225.9 6.385 (329001)
(ko) Korean	0.690 2602 116.3 4.081 (341553)
(am) Amharic	0.786 279 63.3 2.497 (1023)
(he) Hebrew	1.000 174 19.7 1.000 (348918)
(la) Latin	1.003 30 18.1 0.954 (20)
(tlh) Klingon	1.082 42 21.9 1.266 (60)
(ar) Arabic	1.090 306 23.9 1.324 (343957)
(arq) Algerian Arabic	1.150 123 40.2 2.390 (1398)
(ps) Pashto	1.165 102 21.8 1.307 (962)
(kk) Kazakh	1.179 114 26.0 1.559 (9484)
(szl) Silesian	1.192 50 24.8 1.521 (102)
(sl) Slovenian	1.196 102 20.6 1.256 (42583)
(th) Thai	1.197 199 41.6 2.540 (153995)
(pl) Polish	1.198 181 25.4 1.545 (285764)
(tt) Tatar	1.201 72 26.0 1.578 (263)
(cs) Czech	1.206 157 27.0 1.661 (164327)
(as) Assamese	1.212 65 32.1 2.079 (41)
(sk) Slovak	1.220 127 26.2 1.624 (102958)
(mr) Marathi	1.220 159 30.6 1.906 (20287)
(gu) Gujarati	1.229 189 30.3 1.894 (10317)
(sr) Serbian

# TED-Parallel-Corpus

    @techreport{kulkarni:hal-04702210,
      TITLE = {{TED-Parallel-Corpus}},
      AUTHOR = {Kulkarni, Ajinkya},
      URL = {https://hal.science/hal-04702210},
      NUMBER = {none},
      INSTITUTION = {{CDAC ; Idiap Research Institute}},
      YEAR = {2015},
      MONTH = Jan,
      KEYWORDS = {Machine translation ; Neural machine translation NMT ; NLP ; Multilingual},
      PDF = {https://hal.science/hal-04702210v1/file/ted_parallel_corpus.pdf},
      HAL_ID = {hal-04702210},
      HAL_VERSION = {v1},
    }

https://hal.science/hal-04702210

https://github.com/ajinkyakulkarni14/How-I-Extracted-TED-talks-for-parallel-Corpus-

https://github.com/ajinkyakulkarni14/TED-Multilingual-Parallel-Corpus/

In [38]:
# Download part 1 of the multilingual parallel corpus archive from GitHub and extract it.
# `unzip -o` overwrites existing files without prompting, so the cell can be re-run.
!wget https://github.com/ajinkyakulkarni14/TED-Multilingual-Parallel-Corpus/raw/refs/heads/master/Multilingual_Parallel_Corpus/Multi_lingual_Parallel_corpus_1.zip -O Multi_lingual_Parallel_corpus_1.zip
!unzip -o Multi_lingual_Parallel_corpus_1.zip

--2025-12-15 10:45:18--  https://github.com/ajinkyakulkarni14/TED-Multilingual-Parallel-Corpus/raw/refs/heads/master/Multilingual_Parallel_Corpus/Multi_lingual_Parallel_corpus_1.zip
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/ajinkyakulkarni14/TED-Multilingual-Parallel-Corpus/refs/heads/master/Multilingual_Parallel_Corpus/Multi_lingual_Parallel_corpus_1.zip [following]
--2025-12-15 10:45:18--  https://raw.githubusercontent.com/ajinkyakulkarni14/TED-Multilingual-Parallel-Corpus/refs/heads/master/Multilingual_Parallel_Corpus/Multi_lingual_Parallel_corpus_1.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 

In [39]:
last = None      # sentence number of the group being read; None until the first header line
sents = 0        # number of sentence groups (runs of header lines sharing one number)
seen = set()     # language codes already accepted for the current sentence number
multi = {}       # language code -> total number of characters
counters = {}    # language code -> Counter of normalized characters
lang = None      # language that continuation lines are credited to; None = ignore them

with open(r'Multilingual_Parllel_corpus.txt', encoding='utf8') as f:
  # Iterate lazily instead of loading the whole corpus with readlines().
  for line in f:
    line = clean(line)
    # Header lines look like "<number>:<xx>:" or "<number>:<xx-yy>:" followed by the text.
    if re.match(r'\d+:[a-z]{2}(-[a-z]{2})?:', line):
      num, code, text = line.split(':', 2)
      num = int(num)
      if num != last:
        # Every completed group is expected to contain exactly 9 languages.
        assert not seen or len(seen) == 9, (last, seen)
        seen = set()
        last = num
        sents += 1
      if code in seen:
        # Duplicate entry for this sentence and language: drop it together with
        # any continuation lines that follow it.
        lang = None
        continue
      lang = code
      seen.add(lang)
      if lang not in multi:
        multi[lang] = 0
        counters[lang] = Counter()
    elif lang is None:
      # Continuation of a dropped duplicate, or text before the first header.
      continue
    else:
      # A line without a header continues the previous entry; the line break counts as one space.
      text = ' ' + line
    multi[lang] += len(text)
    counters[lang].update(norm(text))

print(f'{sents=}')
# code -> (display name, length relative to Hebrew, distinct symbols, perplexity),
# ordered by relative length, ties broken by language code.
ted_para = dict(sorted([(k, (icu.Locale(k).getDisplayName(), v / multi['he'], len(counters[k]), perplexity(counters[k]))) for k, v in multi.items()], key=lambda x: (x[1][1], x[0])))
pheb_para = perplexity(counters['he'])
# Columns: relative length, distinct symbols, perplexity, (perplexity ratio to Hebrew) * relative length.
for code, (lang, rlen, symbols, perp) in ted_para.items():
  print(f'({code}) {lang}\t{round(rlen, 3):.3f} {symbols} {round(perp, 1):.1f} {round(perp / pheb_para * rlen, 3):.3f}')

sents=349049
(he) Hebrew	1.000 121 20.1 1.000
(ar) Arabic	1.104 268 24.2 1.323
(ru) Russian	1.322 149 25.3 1.660
(nl) Dutch	1.347 125 19.0 1.269
(es) Spanish	1.372 171 19.8 1.347
(pt-br) Portuguese (Brazil)	1.377 129 20.1 1.375
(it) Italian	1.399 122 19.2 1.332
(de) German	1.475 136 19.9 1.458
(fr) French	1.512 150 20.5 1.542


In [40]:
import json


# Languages left out of the export: Latin has only 20 parallel sentences, and Hupa 1
exclude = ['la',  'hup']


def _parallel_columns(entry):
  """Build the *_Parallel fields from one ted_para value (name, relative length, symbols, perplexity)."""
  _, ratio, n_symbols, perp_value = entry
  return {
      'Length_Parallel': round(ratio, 3),
      'Symbols_Parallel': n_symbols,
      'Perplexity_Parallel': round(perp_value, 1),
      'Inefficiency_Parallel': round(perp_value / pheb_para * ratio, 3),
  }


output = []

# The sort is stable, so Hebrew moves to the front and the rest keep the ted_2020 order.
hebrew_first = sorted(ted_2020.items(), key=lambda item: item[0] != 'he')
for code, (lang, rlen, *rest) in hebrew_first:
  if code in exclude:
    continue
  d = {'Code': code, 'Lang': lang, 'Length_2020': round(rlen, 3)}
  if rest:
    symbols, perp, pheb, sents = rest
    d['Symbols_2020'] = symbols
    d['Perplexity_2020'] = round(perp, 1)
    d['Inefficiency_2020'] = round(perp / pheb * rlen, 3)
    d['Sentences_2020'] = sents
  if code in ted_para:
    d.update(_parallel_columns(ted_para[code]))
  output.append(d)

# Languages that appear only in the parallel corpus get a record with the *_Parallel fields alone.
for code, entry in ted_para.items():
  if code in exclude or code in ted_2020:
    continue
  output.append({'Code': code, 'Lang': entry[0], **_parallel_columns(entry)})

with open('compact.json', 'w') as f:
  json.dump(output, f)