# Using MarianMT for translation

taken from : https://www.kaggle.com/erelin6613/marianmt-translation-for-non-english-inputs/data

In [82]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/contradictory-my-dear-watson/sample_submission.csv
/kaggle/input/contradictory-my-dear-watson/train.csv
/kaggle/input/contradictory-my-dear-watson/test.csv


In [83]:
!pip install transformers --quiet

In [84]:
import os
from tqdm.notebook import tqdm
from transformers import MarianMTModel, MarianTokenizer

In [85]:
root_dir = '/kaggle/input/contradictory-my-dear-watson/'
train_path = 'train.csv'
test_path = 'test.csv'

In [86]:
train_df = pd.read_csv(os.path.join(root_dir, train_path))
test_df = pd.read_csv(os.path.join(root_dir, test_path))

In [87]:
train_df.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,5130fd2cb5,and these comments were considered in formulat...,The rules developed in the interim were put to...,en,English,0
1,5b72532a0b,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...,en,English,2
2,3931fbe82a,Des petites choses comme celles-là font une di...,J'essayais d'accomplir quelque chose.,fr,French,0
3,5622f0c60b,you know they can't really defend themselves l...,They can't defend themselves because of their ...,en,English,0
4,86aaa48b45,ในการเล่นบทบาทสมมุติก็เช่นกัน โอกาสที่จะได้แสด...,เด็กสามารถเห็นได้ว่าชาติพันธุ์แตกต่างกันอย่างไร,th,Thai,1


In [88]:
models = {k : f'Helsinki-NLP/opus-mt-{k}-en' for k in train_df.lang_abv.unique()}
models

{'en': 'Helsinki-NLP/opus-mt-en-en',
 'fr': 'Helsinki-NLP/opus-mt-fr-en',
 'th': 'Helsinki-NLP/opus-mt-th-en',
 'tr': 'Helsinki-NLP/opus-mt-tr-en',
 'ur': 'Helsinki-NLP/opus-mt-ur-en',
 'ru': 'Helsinki-NLP/opus-mt-ru-en',
 'bg': 'Helsinki-NLP/opus-mt-bg-en',
 'de': 'Helsinki-NLP/opus-mt-de-en',
 'ar': 'Helsinki-NLP/opus-mt-ar-en',
 'zh': 'Helsinki-NLP/opus-mt-zh-en',
 'hi': 'Helsinki-NLP/opus-mt-hi-en',
 'sw': 'Helsinki-NLP/opus-mt-sw-en',
 'vi': 'Helsinki-NLP/opus-mt-vi-en',
 'es': 'Helsinki-NLP/opus-mt-es-en',
 'el': 'Helsinki-NLP/opus-mt-el-en'}

In [101]:
!pip install translators --quiet

In [102]:
# defining a translator using google translate
import translators as ts

def translate_google(string, lang):
   
    string  = ts.google(query_text=string,
                       from_language=lang,
                       to_language='en',
                       sleep_seconds=1)
    return string

Using United States server backend.


In [121]:
!pip install mosestokenizer -quiet

Collecting mosestokenizer
  Downloading mosestokenizer-1.1.0.tar.gz (37 kB)
Collecting docopt
  Downloading docopt-0.6.2.tar.gz (25 kB)
Collecting openfile
  Downloading openfile-0.0.7-py3-none-any.whl (2.4 kB)
Collecting uctools
  Downloading uctools-1.3.0.tar.gz (4.6 kB)
Collecting toolwrapper
  Downloading toolwrapper-2.1.0.tar.gz (3.2 kB)
Building wheels for collected packages: mosestokenizer, docopt, toolwrapper, uctools
  Building wheel for mosestokenizer (setup.py) ... [?25ldone
[?25h  Created wheel for mosestokenizer: filename=mosestokenizer-1.1.0-py3-none-any.whl size=49119 sha256=77c06d8709c8f2a8f01ecb2bae28c60737149dcf10d0a516fd814dbb4ee7395c
  Stored in directory: /root/.cache/pip/wheels/a7/31/94/fef279382208e85a65c1a7f5c4d0020115477b0af74f296b57
  Building wheel for docopt (setup.py) ... [?25ldone
[?25h  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13705 sha256=4dee4ef739c8ba8181fb0b716b60aae6d86983c73e0eaf7ee6418f07e811213d
  Stored in dir

In [127]:
# defining translate through merian mt

def translate_merian(tokenizer, model, string):
    batch = tokenizer.prepare_seq2seq_batch(
            src_texts=[string])
    gen =model.generate(**batch)
    translation = tokenizer.batch_decode(
        gen, skip_special_tokens=False
    )
    
    return translation[0]

In [128]:
# comparing google and marianmt

def compare_marian_google(subset, tokenizer, model, fields):
    idx = subset.index[-1]
    original = subset[fields[0]][idx]
    g_translation = translate_google(subset[fields[0]][idx], subset.lang_abv[idx])
    trs = translate_merian(tokenizer, model, subset[fields[0]][idx])
    print(f'Original: {original}')
    print(f'Google translation: {g_translation}')
    print(f'Marian translation: {trs}')

In [129]:
print(train_df[train_df.lang_abv=='ar'][:5]) #this is the subset refered down

            id                                            premise  \
12  91b03f6bf4              إذا أمكن ، تعرّف على المؤامرة مسبقًا.   
62  e5c40e04ec                      ولكن قد  تكون دم العقل الأن .   
64  cc18ec8d15  حسنا، أنا في التكساس ولدينا مدرس مات من مرض ال...   
65  05ab8a9326  لم تصل روح الليبرالية السائدة في أوروبا إلى إس...   
74  89467c0148  الكلمة الذاتية المرضية، والكلمة العادية بدلاً ...   

                                         hypothesis lang_abv language  label  
12  حاول أن تفهم الحبكة في البداية، إذا كنت تستطيع.       ar   Arabic      0  
62             بلد قد وصل سريعاً لاستنتاج عن موقفه.       ar   Arabic      0  
64                    لم يسبق لي أن ذهبت إلى تكساس.       ar   Arabic      2  
65                    اسبانيا لم تكن ابدا ليبرالية.       ar   Arabic      2  
74                          الرضا الذاتي ليس مشكلة.       ar   Arabic      2  


In [130]:
models['ar'] #this is the modle up for compare

'Helsinki-NLP/opus-mt-ar-en'

In [131]:
def translate_df(df, compare=True, fields=['premise', 'hypothesis']):
    for k in models:
        if k == 'en':
            continue

        print('translating: ', k)

        try:
            tokenizer = MarianTokenizer.from_pretrained(models[k])
            model = MarianMTModel.from_pretrained(models[k])

        except Exception as e:
            print(e)
            continue
        
        subset = df[df.lang_abv==k]
        
        if compare:
            compare_marian_google(subset, tokenizer, model, fields)
        
        for idx in tqdm(subset.index):
            for f in fields:
                df.loc[idx, f] = translate_merian(
                    tokenizer, model, df.loc[idx, f]
                )
    return df


In [133]:
try:
    train_df = translate_df(train_df)
except Exception as e:
    print(e)

translating:  fr
'list' object has no attribute 'size'


from https://huggingface.co/transformers/model_doc/marian.html

In [139]:
sample_arab_text = train_df[train_df.lang_abv=='ar'].iloc[0].premise
sample_arab_text

'إذا أمكن ، تعرّف على المؤامرة مسبقًا.'

In [141]:

tokenizer = MarianTokenizer.from_pretrained(models['ar'])
model = MarianMTModel.from_pretrained(models['ar'])

translated = model.generate(**tokenizer.prepare_seq2seq_batch(sample_arab_text, return_tensors="pt"))
tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

tgt_text


Downloading:   0%|          | 0.00/917k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/802k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/308M [00:00<?, ?B/s]

NameError: name 'src_text' is not defined