<a href="https://colab.research.google.com/github/chottokun/colaboratory/blob/main/NLLB200_fugMT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
!nvidia-smi

Sun Mar  5 04:31:18 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   61C    P0    27W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [48]:
!pip install -q transformers sentencepiece pysbd

In [49]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

In [50]:
import pysbd

def nllb_transtate(text, from_lang, to_lang):
  inputs = tokenizer(text, return_tensors="pt")
  translated_tokens = model.generate(
    **inputs, forced_bos_token_id=tokenizer.lang_code_to_id[to_lang], max_length=100
  )
  return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

def nllb_transtate_ja_en(text):
  seg_jp = pysbd.Segmenter(language="ja", clean=False)
  result=[]
  for txt in seg_jp.segment(text):
    result.append(nllb_transtate(txt, from_lang = "jpn_Jpan", to_lang = "eng_Latn"))
  return " ".join(result)

def nllb_transtate_en_ja(text):
  seg_jp = pysbd.Segmenter(language="en", clean=False)
  result=[]
  for txt in seg_jp.segment(text):
    result.append(nllb_transtate(txt, from_lang = "eng_Latn", to_lang = "jpn_Jpan"))
  return " ".join(result)

In [51]:
%%time
text = """I had seen little of Holmes lately.
My marriage had drifted us away from each other.
"""
# print(nllb_transtate(text, from_lang = "eng_Latn", to_lang = "jpn_Jpan"))

print("no segment:\n",nllb_transtate(text, from_lang = "eng_Latn", to_lang = "jpn_Jpan"))
print("segment:\n", nllb_transtate_en_ja(text)) 

no segment:
 最近ホームズに会ったことがほとんどなかった 私の結婚は 私たちを離れさせた
segment:
 最近ホームズを見なかった 結婚は 私たちを離れさせた
CPU times: user 6.43 s, sys: 142 ms, total: 6.57 s
Wall time: 6.78 s


In [52]:
%%time
text = """最近はホームズとほとんど会っていなかった.
私の結婚で私達の関係は疎遠になっていた.
"""
print("no segment:\n",nllb_transtate(text, from_lang = "jpn_Jpan", to_lang = "eng_Latn"))
print("segment:\n", nllb_transtate_ja_en(text))

no segment:
 I've been in a relationship with my husband, and my relationship with him has been very strained.
segment:
 I've barely met with Holmes lately. My marriage had made our relationship distant.
CPU times: user 10 s, sys: 86.3 ms, total: 10.1 s
Wall time: 21.6 s


In [53]:
!pip install -q transformers sentencepiece pysbd

In [None]:
from transformers import pipeline
import pysbd

seg_en = pysbd.Segmenter(language="en", clean=False)
seg_jp = pysbd.Segmenter(language="ja", clean=False)

fugu_translator_en_ja = pipeline('translation', model='staka/fugumt-en-ja')
fugu_translator_ja_en = pipeline('translation', model='staka/fugumt-ja-en')

In [56]:
def fugumt_transtate_en_ja(txt):
  return fugu_translator_en_ja(seg_en.segment(txt))

def fugumt_transtate_ja_en(txt):
  return fugu_translator_ja_en(seg_jp.segment(txt))

In [57]:
text = """I had seen little of Holmes lately.
My marriage had drifted us away from each other.
"""
print(fugumt_transtate_en_ja(text))

[{'translation_text': '最近ホームズにほとんど会わなかった。'}, {'translation_text': '私の結婚は私たちをお互いから遠ざけていた。'}]


In [58]:
%%time
text = """最近はホームズとほとんど会っていなかった.
私の結婚で私達の関係は疎遠になっていた.
"""
print(fugumt_transtate_ja_en(text))

[{'translation_text': "I haven't seen much of Holmes lately."}, {'translation_text': 'Our relationship was estranged by my marriage.'}]
CPU times: user 2.14 s, sys: 10.9 ms, total: 2.15 s
Wall time: 2.58 s
