In [1]:
import pandas as pd
import numpy as np
import jsonlines
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import torch.nn as nn
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import torch_optimizer as optim
import os

import warnings
from importlib import reload

import torchvision
from IPython.core.interactiveshell import InteractiveShell

# Echo the value of every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# pandas display limits for wide / long frames shown in this notebook.
pd.set_option("display.max_rows", 500)
pd.set_option("display.float_format", "{:0.3f}".format)
pd.set_option("display.max_columns", 500)
# The width is set twice, exactly as before; the later value (0) is the one in effect.
pd.set_option("display.width", 1000)
pd.options.display.width = 0

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Register project-wide settings in the package's global store before importing the
# model / preprocessing modules.
# NOTE(review): presumably those modules read these globals; keep this ordering unless
# confirmed otherwise.
# NOTE(review): the two directory settings are absolute, user-specific paths
# (/home/ahemf/...); adjust them for the machine running the notebook.
from facebook_hateful_memes_detector.utils.globals import set_global, get_global
set_global("cache_dir", "/home/ahemf/cache/cache")
set_global("dataloader_workers", 4)
set_global("use_autocast", True)
set_global("models_dir", "/home/ahemf/cache/")

from facebook_hateful_memes_detector.utils import read_json_lines_into_df, in_notebook, set_device, my_collate, clean_memory
# Bare expression: echoes the cache directory that was just registered.
get_global("cache_dir")
from facebook_hateful_memes_detector.models import Fasttext1DCNNModel, MultiImageMultiTextAttentionEarlyFusionModel, LangFeaturesModel, AlbertClassifer
from facebook_hateful_memes_detector.preprocessing import TextImageDataset, get_datasets, get_image2torchvision_transforms, TextAugment
from facebook_hateful_memes_detector.preprocessing import DefinedRotation, QuadrantCut, ImageAugment
# NOTE(review): `get_device`, used later by build_translator, is not imported by name
# anywhere in this cell; presumably this star import provides it -- confirm.
from facebook_hateful_memes_detector.training import *
import facebook_hateful_memes_detector
# Re-execute the top-level package module (useful while editing the package).
reload(facebook_hateful_memes_detector)

# Prefer the GPU when one is visible, and tell the project which device to use.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
set_device(device)
# Bare expression: echoes the selected device.
device

from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
from transformers import AutoModelWithLMHead, AutoTokenizer
from transformers import pipeline
from transformers import AutoModelWithLMHead, AutoTokenizer
from transformers import MarianMTModel, MarianTokenizer
from tqdm.auto import tqdm, trange
from transformers import MarianMTModel, MarianTokenizer


In [2]:
# Load the dataset splits with no text or image transforms applied.
data = get_datasets(
    data_dir="../data/",
    train_text_transform=None,
    train_image_transform=None,
    test_text_transform=None,
    test_image_transform=None,
    cache_images=True,
    use_images=False,
    dev=False,
    test_dev=True,
    keep_original_text=False,
    keep_original_image=False,
    keep_processed_image=True,
    keep_torchvision_image=False,
)

# Only the id and text columns are needed; stack train, dev and test into one frame.
df = pd.concat([data[split][["id", "text"]] for split in ("train", "dev", "test")])
df.shape
df.head()


(10000, 2)

Unnamed: 0,id,text
0,42953,its their character not their color that matters
1,23058,don't be afraid to love again everyone is not ...
2,13894,putting bows on your pet
3,37408,i love everything and everybody! except for sq...
4,82403,"everybody loves chocolate chip cookies, even h..."


In [3]:
def _load_marian(model_name):
    """Load a MarianMT tokenizer/model pair by name; the model is moved to get_device() in eval mode."""
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(get_device()).eval()
    return tokenizer, model


def _load_fairseq(model_name, **hub_kwargs):
    """Load a fairseq translation model through torch.hub; moved to get_device() in eval mode."""
    model = torch.hub.load('pytorch/fairseq', model_name, tokenizer='moses', bpe='fastbpe', **hub_kwargs)
    return model.to(get_device()).eval()


def _marian_translate(tokenizer, model, texts):
    """Translate a list of strings with one Marian tokenizer/model pair and return the decoded strings."""
    batch = tokenizer.prepare_translation_batch(texts)
    # Move every tensor in the batch onto the same device as the model.
    for key, value in batch.items():
        if isinstance(value, torch.Tensor):
            batch[key] = value.to(get_device())
    generated = model.generate(**batch)
    return [tokenizer.decode(token_ids, skip_special_tokens=True) for token_ids in generated]


def build_translator(lang_models, model_type="huggingface"):
    """Build a back-translation function (source -> pivot language -> source).

    Args:
        lang_models: dict with the model names under "fwd" (source -> pivot) and "inv"
            (pivot -> source). For model_type="pytorch" it may also contain
            "fwd_checkpoint_file" / "inv_checkpoint_file", which are forwarded to
            torch.hub.load as `checkpoint_file`.
        model_type: "huggingface" (MarianMT models) or "pytorch" (fairseq models loaded
            through torch.hub).

    Returns:
        A function translate(text) that returns a list of back-translated strings.
        With "huggingface" there is one string per target-language code used for the
        forward model (or a single string when the forward tokenizer lists no codes);
        with "pytorch" the list always has one element.

    Raises:
        ValueError: if model_type is not one of the two supported values. Previously
            this case returned a translator that silently produced None.
    """
    if model_type not in ("huggingface", "pytorch"):
        raise ValueError(
            "Unsupported model_type %r; expected 'huggingface' or 'pytorch'" % (model_type,)
        )

    fwd_name, inv_name = lang_models["fwd"], lang_models["inv"]

    if model_type == "huggingface":
        fwd_tokenizer, fwd_model = _load_marian(fwd_name)
        inv_tokenizer, inv_model = _load_marian(inv_name)

        def translate(text):
            lang_codes = fwd_tokenizer.supported_language_codes
            if "ROMANCE" in fwd_name:
                # Hand-picked subset of the ROMANCE codes (the original called it
                # `better_lang_codes`); '>>ro<<', '>>gl<<', '>>la<<' and '>>wa<<' are left out.
                # NOTE(review): presumably dropped for translation quality -- confirm.
                lang_codes = ['>>fr<<', '>>es<<', '>>it<<', '>>pt<<', '>>ca<<', '>>fur<<', '>>oc<<', '>>sc<<', '>>an<<', '>>frp<<']
            if "CELTIC" in fwd_name:
                lang_codes = ['>>ga<<']
            if len(lang_codes) > 0:
                # Multi-target models pick the pivot language from a leading code token.
                texts = [code + " " + text for code in lang_codes]
            else:
                texts = [text]
            pivot_texts = _marian_translate(fwd_tokenizer, fwd_model, texts)
            round_trip = _marian_translate(inv_tokenizer, inv_model, pivot_texts)
            clean_memory()
            return round_trip

        return translate

    # model_type == "pytorch": fairseq hub models, optionally with explicit checkpoint files.
    fwd_kwargs = {}
    if "fwd_checkpoint_file" in lang_models:
        fwd_kwargs["checkpoint_file"] = lang_models["fwd_checkpoint_file"]
    inv_kwargs = {}
    if "inv_checkpoint_file" in lang_models:
        inv_kwargs["checkpoint_file"] = lang_models["inv_checkpoint_file"]
    fwd_model = _load_fairseq(fwd_name, **fwd_kwargs)
    inv_model = _load_fairseq(inv_name, **inv_kwargs)

    def translate(text):
        intermediate = fwd_model.translate(text)
        res = inv_model.translate(intermediate)
        clean_memory()
        # Wrapped in a list so both model types return the same container type.
        return [res]

    return translate


In [4]:
# Sample inputs for eyeballing back-translation quality: two neutral sentences and
# a short / long sample of the kind of text the detector is meant to handle.
fox = "The quick brown fox jumps over the lazy dog."
cats = "The cat sat on the front porch sipping a pint of milk."
text = (
    'have you ever studied the history of the jews? '
    'did you know that they have always banded together as a tribe, '
    'infiltrated governments.'
)
text_long = (
    'have you ever studied the history of the jews? '
    'did you know that they have always banded together as a tribe, '
    'infiltrated governments, monopolized the financial systems of nations '
    'instigated wars and intentionally created chaos in societies? '
    'the jews have mass murdered millions of non- jews over the centuries '
    'they have seized control of the media so you will never find out '
    'study the history of the jews!'
)


In [5]:
# Hugging Face MarianMT model pairs: "fwd" translates English to the pivot
# language(s), "inv" translates back to English.
hg_en_ru = {
    "fwd": "Helsinki-NLP/opus-mt-en-ru",
    "inv": "Helsinki-NLP/opus-mt-ru-en",
}
hg_en_de = {
    "fwd": "Helsinki-NLP/opus-mt-en-de",
    "inv": "Helsinki-NLP/opus-mt-de-en",
}
hg_en_celtic = {
    "fwd": "Helsinki-NLP/opus-mt-en-CELTIC",
    "inv": "sshleifer/opus-mt-CELTIC-en",
}
hg_en_romance = {
    "fwd": "Helsinki-NLP/opus-mt-en-ROMANCE",
    "inv": "Helsinki-NLP/opus-mt-ROMANCE-en",
}



In [None]:
# Smoke-test the English -> Russian -> English Marian round trip on the sample
# sentences. `fox`, `cats` and `text` come from the earlier sample-text cell; the
# identical re-definitions that used to live here were removed as duplicates.
translate = build_translator(hg_en_ru)
translate(fox)
translate(cats)
translate(text)


In [6]:

# fairseq (torch.hub) model pairs. "fwd" / "inv" are hub model names; the optional
# "*_checkpoint_file" entries list the checkpoint files to load for that direction.

# WMT19 transformer forward model, single model vs. four-checkpoint variants.
pt_en_de_1 = {
    "fwd": "transformer.wmt19.en-de.single_model",
    "inv": "transformer.wmt19.de-en.single_model",
}
pt_en_de_2 = {
    "fwd": "transformer.wmt19.en-de.single_model",
    "inv": "transformer.wmt19.de-en",
    "inv_checkpoint_file": "model1.pt:model2.pt:model3.pt:model4.pt",
}

pt_en_de_3 = {
    "fwd": "transformer.wmt19.en-de",
    "fwd_checkpoint_file": "model1.pt:model2.pt:model3.pt:model4.pt",
    "inv": "transformer.wmt19.de-en.single_model",
}
pt_en_de_4 = {
    "fwd": "transformer.wmt19.en-de",
    "fwd_checkpoint_file": "model1.pt:model2.pt:model3.pt:model4.pt",
    "inv": "transformer.wmt19.de-en",
    "inv_checkpoint_file": "model1.pt:model2.pt:model3.pt:model4.pt",
}

# WMT16 transformer forward model.
pt_en_de_5 = {
    "fwd": "transformer.wmt16.en-de",
    "inv": "transformer.wmt19.de-en.single_model",
}
pt_en_de_6 = {
    "fwd": "transformer.wmt16.en-de",
    "inv": "transformer.wmt19.de-en",
    "inv_checkpoint_file": "model1.pt:model2.pt:model3.pt:model4.pt",
}

# WMT17 convolutional forward model.
pt_en_de_7 = {
    "fwd": "conv.wmt17.en-de",
    "inv": "transformer.wmt19.de-en.single_model",
}
pt_en_de_8 = {
    "fwd": "conv.wmt17.en-de",
    "inv": "transformer.wmt19.de-en",
    "inv_checkpoint_file": "model1.pt:model2.pt:model3.pt:model4.pt",
}

# English <-> Russian, WMT19 single models.
pt_en_ru = {
    "fwd": "transformer.wmt19.en-ru.single_model",
    "inv": "transformer.wmt19.ru-en.single_model",
}



In [17]:
import os

# The former `!export CURL_CA_BUNDLE=""` line was removed: a `!` command runs in its own
# subshell, so the export never reached this kernel. Setting os.environ is what takes effect.
# NOTE(review): emptying CURL_CA_BUNDLE looks like a workaround to get past certificate
# verification during the model download -- security-sensitive; confirm it is still needed.
os.environ["CURL_CA_BUNDLE"] = ""

# Smoke-test the fairseq English -> German -> English round trip on the sample sentences.
translate = build_translator(pt_en_de_5, model_type="pytorch")
translate(fox)
translate(cats)
translate(text)

Using cache found in /home/ahemf/.cache/torch/hub/pytorch_fairseq_master
100%|██████████| 2193287384/2193287384 [00:43<00:00, 50676906.93B/s]
Using cache found in /home/ahemf/.cache/torch/hub/pytorch_fairseq_master


['The speedy brown fox skips over the lazy dog.']

['The cat sat on the front pork and drank a pint of milk.']

['have you ever studied the history of the jews? did you know that as a tribe, they have always gagged infiltrated governments.']

In [20]:
# Build the English -> CELTIC -> English Marian back-translator used by the loop below.
translate = build_translator(hg_en_celtic, model_type="huggingface")

In [21]:
# Accumulator for the back-translation output of the loops below.
results = list()

In [None]:

# Back-translate every text with the current `translate` function.
# The previous version branched on `isinstance(translate, (list, tuple))`, which inspects
# the translator function itself and is never true, so its flattening branch was dead code.
# The reachable behavior is kept: one (id, list_of_translations) entry per row, which is
# the shape the later `rs = ...` cell indexes with x[1][0].
for _, row in tqdm(df.iterrows(), total=len(df)):
    results.append((row["id"], translate(row["text"])))
        

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))

In [None]:
# Switch to the English -> ROMANCE -> English Marian back-translator.
translate = build_translator(hg_en_romance)

# Back-translate every text and append to the shared `results` list.
# The previous version branched on `isinstance(translate, (list, tuple))`, which inspects
# the translator function itself and is never true, so its flattening branch was dead code.
# The reachable behavior is kept: one (id, list_of_translations) entry per row.
# NOTE(review): this translator returns one string per ROMANCE language code, but the
# later `rs = ...` cell keeps only element [0] of each list -- confirm that is intended.
for _, row in tqdm(df.iterrows(), total=len(df)):
    results.append((row["id"], translate(row["text"])))

In [15]:
# Spot-check one raw entry: a tuple of (id, list of back-translated strings).
results[1]

(23058, ["Don't be afraid to love again, everyone's not like your ex."])

In [16]:
# Keep only the first back-translation of each entry: (id, [t0, t1, ...]) -> (id, t0).
rs = [(meme_id, translations[0]) for meme_id, translations in results]

In [17]:
# Spot-check one flattened entry: a tuple of (id, back-translated text).
rs[0]

(42953, 'their character, not the color that matters.')

In [19]:
# Persist the (id, text) pairs as dab_1.csv inside the configured models directory.
pd.DataFrame(data=rs, columns=["id", "text"]).to_csv(
    path_or_buf=os.path.join(get_global("models_dir"), "dab_1.csv"),
    index=False,
)