# Imports

In [2]:



import pandas as pd
import numpy as np
import jsonlines
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import torch.nn as nn
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils


from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from importlib import reload
pd.set_option('display.max_rows', 500)
pd.set_option('display.float_format', '{:0.3f}'.format)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.width = 0
import warnings
warnings.filterwarnings('ignore')
import facebook_hateful_memes_detector
reload(facebook_hateful_memes_detector)
reload(facebook_hateful_memes_detector.preprocessing)


from facebook_hateful_memes_detector.utils import read_json_lines_into_df, in_notebook
from facebook_hateful_memes_detector.models import Fasttext1DCNNModel
from facebook_hateful_memes_detector.preprocessing import TextImageDataset, my_collate, get_datasets, TextAugment, clean_text
from facebook_hateful_memes_detector.training import *



<module 'facebook_hateful_memes_detector' from '/local/home/ahemf/mygit/facebook-hateful-memes/facebook_hateful_memes_detector/__init__.py'>

<module 'facebook_hateful_memes_detector.preprocessing' from '/local/home/ahemf/mygit/facebook-hateful-memes/facebook_hateful_memes_detector/preprocessing/__init__.py'>

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
data = get_datasets(data_dir="../data/", train_text_transform=None, train_image_transform=None, 
                 test_text_transform=None, test_image_transform=None, 
                 cache_images = True, use_images = False, dev=False)

In [6]:
data["train"]

Unnamed: 0,id,img,label,text
0,42953,../data/img/42953.png,0,its their character not their color that matters
1,23058,../data/img/23058.png,0,don't be afraid to love again everyone is not ...
2,13894,../data/img/13894.png,0,putting bows on your pet
3,37408,../data/img/37408.png,0,i love everything and everybody! except for sq...
4,82403,../data/img/82403.png,0,"everybody loves chocolate chip cookies, even h..."
...,...,...,...,...
495,83675,../data/img/83675.png,0,i'm gonna be like phelps one day
496,37198,../data/img/37198.png,0,when you're so relaxed you can feel yourself g...
497,48670,../data/img/48670.png,0,look at this sandwich maker club i found on wi...
498,9863,../data/img/09863.png,0,diverse group of women


# Simple Augment 

In [7]:
import os
data_dir="../data/"
from functools import partial
joiner = partial(os.path.join, data_dir)


In [8]:
train = read_json_lines_into_df(joiner('train.jsonl'))

In [9]:
train['augmented'] = False
train["augment_type"] = None

In [10]:
train.head()

Unnamed: 0,id,img,label,text,augmented,augment_type
0,42953,img/42953.png,0,its their character not their color that matters,False,
1,23058,img/23058.png,0,don't be afraid to love again everyone is not ...,False,
2,13894,img/13894.png,0,putting bows on your pet,False,
3,37408,img/37408.png,0,i love everything and everybody! except for sq...,False,
4,82403,img/82403.png,0,"everybody loves chocolate chip cookies, even h...",False,


## Clean Text

In [11]:
train_cleaned = train.copy()
train_cleaned.text = train_cleaned.text.apply(clean_text)
train['augmented'] = True
train["augment_type"] = "clean_text"


In [12]:
train.text.head()
train_cleaned.text.head()

0     its their character not their color that matters
1    don't be afraid to love again everyone is not ...
2                             putting bows on your pet
3    i love everything and everybody! except for sq...
4    everybody loves chocolate chip cookies, even h...
Name: text, dtype: object

0     its their character not their color that matters
1    do not be afraid to love again everyone is not...
2                             putting bows on your pet
3    i love everything and everybody! except for sq...
4    everybody loves chocolate chip cookies, even h...
Name: text, dtype: object

In [13]:
if in_notebook():
    from tqdm.notebook import tqdm, trange
else:
    from tqdm import tqdm as tqdm, trange
    
augs = [train, train_cleaned]


In [None]:
def augment_and_append(augs_dict, train, train_cleaned):
    augs = []
    for k, v in tqdm(augs_dict.items()):
        aug_method = v[0]
        aug_type_string = v[2]
        if "train" in v[1]:
            t2 = train.copy()
            t2["augmented"] = True
            t2["augment_type"] = aug_type_string
            t2["text"] = [aug_method(t) for t in tqdm(t2.text.values)]
            augs.append(t2)
        if "train_cleaned" in v[1]:
            t2 = train_cleaned.copy()
            t2["augmented"] = True
            t2["augment_type"] = aug_type_string
            t2["text"] = [aug_method(t) for t in tqdm(t2.text.values)]
    return augs
        
    

In [None]:
char_augs = TextAugment(1.0, {"keyboard": 0.3, "ocr": 0.1, "char_delete": 0.1, "char_insert": 0.1, "char_swap": 0.3,},count=2)
sub_augs = TextAugment(1.0, {"glove_twitter": 0.25, "glove_wiki": 0.25, "word2vec": 0.5},count=2)
fasttext_augs = TextAugment(1.0, {"fasttext": 1.0,},count=1, fasttext_file="wiki-news-300d-1M-subword.bin")
sjs_augs = TextAugment(1.0, {"split": 0.4, "stopword_insert": 0.3, "word_join": 0.3,},count=2)
syn_augs = TextAugment(1.0, {"synonym": 1.0,}, count=1)
cutout_augs = TextAugment(1.0, {"word_cutout": 1.0,},count=1)
sent_augs = TextAugment(1.0, {"sentence_shuffle": 0.5 "text_rotate": 0.5, "one_third_cut": 0.2, "half_cut":0.2},count=1)



# DL/Transformers based Aug

In [2]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
from transformers import AutoModelWithLMHead, AutoTokenizer
from transformers import pipeline
from transformers import AutoModelWithLMHead, AutoTokenizer
from transformers import MarianMTModel, MarianTokenizer


In [3]:
ARTICLE = """ New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York.
A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband.
Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other.
In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage.
Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the
2010 marriage license application, according to court documents.
Prosecutors said the marriages were part of an immigration scam.
On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further.
After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective
Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002.
All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say.
Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages.
Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted.
The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s
Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali.
Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force.
If convicted, Barrientos faces up to four years in prison.  Her next court appearance is scheduled for May 18.
"""
text = 'have you ever studied the history of the jews? did you know that they have always banded together as a tribe, infiltrated governments.'

text_long = 'have you ever studied the history of the jews? did you know that they have always banded together as a tribe, infiltrated governments, monopolized the financial systems of nations instigated wars and intentionally created chaos in societies? the jews have mass murdered millions of non- jews over the centuries they have seized control of the media so you will never find out study the history of the jews!'


- summary
    - with synonym replacement
    - with word2vec based replacement
    - with word cutout and stopword insert
    - word split and word join
- qna with various questions
    - with synonym replacement
    - with word2vec based replacement
    - with word cutout and stopword insert
    - word split and word join
    - QnA over summary
    - QnA over translation
- Back translate
    - with synonym replacement
    - with word2vec based replacement
    - with word cutout and stopword insert
    - word split and word join
    - Over summary


## QnA

In [6]:


# twmkn9/albert-base-v2-squad2 , twmkn9/distilroberta-base-squad2, huggingface/prunebert-base-uncased-6-finepruned-w-distil-squad
# mrm8488/bert-tiny-finetuned-squadv2 , mrm8488/bert-small-finetuned-squadv2, mrm8488/bert-mini-finetuned-squadv2


bert_qna_tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
bert_qna_model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

albert_qna_tokenizer = AutoTokenizer.from_pretrained("twmkn9/albert-base-v2-squad2")
albert_qna_model = AutoModelForQuestionAnswering.from_pretrained("twmkn9/albert-base-v2-squad2")

tiny_bert_qna_tokenizer = AutoTokenizer.from_pretrained("mrm8488/bert-medium-finetuned-squadv2")
tiny_bert_qna_model = AutoModelForQuestionAnswering.from_pretrained("mrm8488/bert-medium-finetuned-squadv2")


distilbert_qna_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
distilbert_qna_model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")

from transformers import XLNetTokenizer, XLNetForQuestionAnswering, XLNetForQuestionAnsweringSimple
import torch

XLNet_qna_tokenizer =  XLNetTokenizer.from_pretrained('xlnet-base-cased')
XLNet_qna_model = XLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased')


qna_models = dict(bert=dict(tokenizer=bert_qna_tokenizer, model=bert_qna_model),
                 albert=dict(tokenizer=albert_qna_tokenizer, model=albert_qna_model),
                 tiny_bert=dict(tokenizer=tiny_bert_qna_tokenizer, model=tiny_bert_qna_model),
                 distilbert=dict(tokenizer=distilbert_qna_tokenizer, model=distilbert_qna_model),
                 XLNet=dict(tokenizer=XLNet_qna_tokenizer, model=XLNet_qna_model))





In [11]:

def get_question_answering(initial_aug=None, qna_model: dict=qna_models["bert"]):
    questions = ["Is this offensive?", 
                 "What part is offensive, bad, misinformed, hurtful?", 
                 "Blatantly misguiding and forming opinion?", 
                 "Only opinions are expressed?",
                "Targeted towards a particular race, gender, community?", 
                 "Is this hate speech?",
                "Is there a show of disdain and cynicism?",
                "Who is responsible and who should be blamed?",
                "Are they doing the right thing? Should they be corrected?",
                "We need to stop them!",
                "Us vs them,",
                 "What is happening?","Why?", "How", "When did they?", "Thier ways?"]
    from collections import defaultdict
    def transform(text):
        tokenizer = qna_model["tokenizer"]
        model = qna_model["model"]
        with torch.no_grad():
            text = initial_aug(text) if initial_aug is not None else text
            answers = []
            batch_inputs = defaultdict(list)
            for question in questions:
                inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt", pad_to_max_length=True, max_length=256)
                for k, v in inputs.items():
                    batch_inputs[k].append(v)

            for k, v in batch_inputs.items():
                batch_inputs[k] = torch.cat(v, 0)
            answer_start_scores, answer_end_scores = model(**batch_inputs)
            answer_start = torch.argmax(answer_start_scores, 1)  # Get the most likely beginning of answer with the argmax of the score
            answer_end = torch.argmax(answer_end_scores, 1) + 1 
            for i, input_ids in  enumerate(batch_inputs["input_ids"]):
                answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start[i]:answer_end[i]]))
                answers.append(answer)
            answers = list(set([clean_text(a) for a in answers if len(clean_text(a).split())>=2]))
            return answers
        
    return transform

qna = get_question_answering(qna_model=qna_models["distilbert"])


In [12]:
qna('have you ever studied the history of the jews? did you know that they have always banded together as a tribe, infiltrated governments.')


['they have always banded together as a tribe , infiltrated governments',
 'infiltrated governments']

## Summarizer

In [79]:
from transformers import AutoModelWithLMHead, AutoTokenizer

t5_summarizer_model = AutoModelWithLMHead.from_pretrained("t5-base")
t5_summarizer_tokenizer = AutoTokenizer.from_pretrained("t5-base")

t5_news_tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-summarize-news")
t5_news_model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-summarize-news")

t5s_summarizer_tokenizer = AutoTokenizer.from_pretrained("t5-small")
t5s_summarizer_model = AutoModelWithLMHead.from_pretrained("t5-small")

bart_summarizer_tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
bart_summarizer_model = AutoModelWithLMHead.from_pretrained("facebook/bart-large-cnn")

ctrl_summarizer_tokenizer = AutoTokenizer.from_pretrained("ctrl")
ctrl_summarizer_model = AutoModelWithLMHead.from_pretrained("ctrl")



summary_models = dict(ctrl=dict(tokenizer=ctrl_summarizer_tokenizer, model=ctrl_summarizer_model),
                    bart=dict(tokenizer=bart_summarizer_tokenizer, model=bart_summarizer_model),
                    t5s=dict(tokenizer=t5s_summarizer_tokenizer, model=t5s_summarizer_model),
                    t5_news=dict(tokenizer=t5_news_tokenizer, model=t5_news_model),
                    t5=dict(tokenizer=t5_summarizer_tokenizer, model=t5_summarizer_model))





In [80]:

def get_summarizer(models, initial_aug=None, ):
    from collections import defaultdict
    def transform(text):
        summaries = []
        with torch.no_grad():
            for k, m in models.items():
                tokenizer = m["tokenizer"]
                model = m["model"]
                _ = model.eval()
                inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=128)
                outputs = model.generate(inputs, max_length=32, min_length=16, length_penalty=2.0, num_beams=4, early_stopping=True)
                summ = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(outputs[0]))
                summaries.append(summ)
            return summaries
    return transform

summary_generator = get_summarizer(summary_models)
summary_generator(ARTICLE)
summary_generator(text)


['summarize: New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York.\n A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband.\n Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other.\n In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage.\n Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in',
 '</s><s>Liana Barrientos has been married five times since she was 23 years old. She got married in Westchester County, New York, in',
 'Liana Barrientos was 23 years old when she got married in westchester county, new york . only 18 days after that marriage, she',
 'Liana Barrientos got married in Westchester County, New York at the age of 23. A 

['summarize: have you ever studied the history of the jews? did you know that they have always banded together as a tribe, infiltrated governments.',
 '</s><s>The jews have always banded together as a tribe, infiltrated governments. Have you ever studied the history of the jews? did you know',
 'have you ever studied the history of the jews? did you know that they have always banded together as a tribe, infilt',
 'have you ever studied the history of the jews? did you know that they have always banded together as a tribe, infilt',
 'have you ever studied the history of the jews? have you ever studied the history of the jews? have you ever studied the history of']

##  Translation

In [None]:
btdict = dict()

model_name = 'Helsinki-NLP/opus-mt-en-tn'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
btdict["en-tn"] = dict(fwd=(tokenizer, model))

model_name = 'Helsinki-NLP/opus-mt-tn-en'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

btdict["en-tn"]["inv"] = (tokenizer, model)


model_name = 'Helsinki-NLP/opus-mt-en-ru'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
btdict["en-ru"] = dict(fwd=(tokenizer, model))

model_name = 'Helsinki-NLP/opus-mt-ru-en'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

btdict["en-ru"]["inv"] = (tokenizer, model)


model_name = 'Helsinki-NLP/opus-mt-en-de'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
btdict["en-de"] = dict(fwd=(tokenizer, model))

model_name = 'Helsinki-NLP/opus-mt-de-en'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

btdict["en-de"]["inv"] = (tokenizer, model)


model_name = 'Helsinki-NLP/opus-mt-en-CELTIC'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
btdict["en-CELTIC"] = dict(fwd=(tokenizer, model))

model_name = 'sshleifer/opus-mt-CELTIC-en'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

btdict["en-CELTIC"]["inv"] = (tokenizer, model)


model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
btdict["en-ROMANCE"] = dict(fwd=(tokenizer, model))

model_name = 'Helsinki-NLP/opus-mt-ROMANCE-en'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

btdict["en-ROMANCE"]["inv"] = (tokenizer, model)

model_name = 'Helsinki-NLP/opus-mt-en-chk'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
btdict["en-chk"] = dict(fwd=(tokenizer, model))

model_name = 'Helsinki-NLP/opus-mt-chk-en'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

btdict["en-chk"]["inv"] = (tokenizer, model)

model_name = 'Helsinki-NLP/opus-mt-en-jap'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
btdict["en-jap"] = dict(fwd=(tokenizer, model))

model_name = 'Helsinki-NLP/opus-mt-jap-en'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

btdict["en-jap"]["inv"] = (tokenizer, model)




In [70]:
from transformers import MarianMTModel, MarianTokenizer

def get_back_translation_model(btdict):
    def back_translate(text):
        texts = [text]
        tl = len(text.split())
        answers = []
        for k, v in btdict.items():
            fwd_tokenizer, fwd_model = v["fwd"]
            inv_tokenizer, inv_model = v["inv"]
            lang_codes = fwd_tokenizer.supported_language_codes
            if "ROMANCE" in k:
                lang_codes = ['>>fr<<', '>>es<<', '>>it<<', '>>pt<<', '>>ro<<', '>>ca<<', '>>gl<<', '>>la<<', '>>wa<<', '>>fur<<', '>>oc<<', '>>sc<<', '>>an<<', '>>frp<<',]
            if len(lang_codes) > 0:
                texts = [t for text in texts for t in [lang+" "+text for lang in lang_codes]]
            translated = fwd_model.generate(**fwd_tokenizer.prepare_translation_batch(texts))
            
            translated = inv_model.generate(**inv_tokenizer.prepare_translation_batch([fwd_tokenizer.decode(t, skip_special_tokens=True) for t in translated]))
            tgt_text = [inv_tokenizer.decode(t, skip_special_tokens=True) for t in translated]
            answers.append(tgt_text)
        answers = [a for ans in answers for a in ans]
        ans_lens = [len(clean_text(a).split()) for a in answers]
        answers = list(set([a for l,a in zip(ans_lens,answers) if l>=2 and l>=tl/2]))
        return answers
    return back_translate

    

In [69]:
back_translate = get_back_translation_model(btdict)
back_translate(text)


['They had a story to the history of the jewels? they wanted to make bonds with them all as a tribe of wildlife, an intellectual ruler.',
 'Have you ever studied the history of the Jews? Did you know that they have always joined as a tribe, infiltrated governments.',
 'you have never studied the history of jews? did you know that they have always banded together as a tribe, infiltrated governments.',
 'Have you ever studied the history of the Jews? You knew that they have always united as tribes, infiltrated governments.',
 'Have you ever studied the history of the jewels? do you know that they have always banded together as a tribe, undergraduate governments.',
 'When you first learned about history, you knew that they had helped one another as a group, the deceived nations.',
 'Studying the history of journalism? you know that they were always banded together as a tribe, infiltrated governments.',
 'Did you ever direct the history of the jews? they knew that they never bonded togethe

In [71]:
%timeit back_translate(text)

43.8 s ± 3.82 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [77]:
models_dict= dict()
import torch
en2de = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-de.single_model', tokenizer='moses', bpe='fastbpe')
de2en = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.de-en.single_model', tokenizer='moses', bpe='fastbpe')
en2de_cp4 = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-de',
                       checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt',
                       tokenizer='moses', bpe='fastbpe')

de2en_cp4 = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.de-en',
                       checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt',
                       tokenizer='moses', bpe='fastbpe')
en2de_wmt16 = torch.hub.load('pytorch/fairseq', 'transformer.wmt16.en-de', tokenizer='moses', bpe='fastbpe')
en2de_wmt17 = torch.hub.load('pytorch/fairseq', 'conv.wmt17.en-de', tokenizer='moses', bpe='fastbpe')
en2ru = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-ru.single_model', tokenizer='moses', bpe='fastbpe')
ru2en = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.ru-en.single_model', tokenizer='moses', bpe='fastbpe')

models_dict["en-de"]=dict(fwd=[en2de, en2de_cp4, en2de_wmt16, en2de_wmt17], inv=[de2en, de2en_cp4])
models_dict["en-ru"] = dict(fwd=[en2ru], inv=[ru2en])


        
    
    

Using cache found in /home/ahemf/.cache/torch/hub/pytorch_fairseq_master
Using cache found in /home/ahemf/.cache/torch/hub/pytorch_fairseq_master
Using cache found in /home/ahemf/.cache/torch/hub/pytorch_fairseq_master
Using cache found in /home/ahemf/.cache/torch/hub/pytorch_fairseq_master
Using cache found in /home/ahemf/.cache/torch/hub/pytorch_fairseq_master
Using cache found in /home/ahemf/.cache/torch/hub/pytorch_fairseq_master
Using cache found in /home/ahemf/.cache/torch/hub/pytorch_fairseq_master
Using cache found in /home/ahemf/.cache/torch/hub/pytorch_fairseq_master


In [78]:
def get_pytorch_backtranslate(mdict):
    def py_translate(text):
        tl = len(text.split())
        answers = []
        for k, v in mdict.items():
            fwd = v["fwd"]
            inv = v["inv"]
            for fm in fwd:
                intermediate = fm.translate(text)
                for bm in inv:
                    res = bm.translate(intermediate)
                    answers.append(res)
        ans_lens = [len(clean_text(a).split()) for a in answers]
        answers = list(set([a for l,a in zip(ans_lens,answers) if l>=2 and l>=tl/2]))
        return answers
    return py_translate

py_translate = get_pytorch_backtranslate(models_dict)
py_translate(text)


['have you ever studied the history of the jews? did you know that they have always gagged as tribes, infiltrated governments.',
 'have you ever studied the history of the jews? did you know that as a tribe, they have always gagged infiltrated governments.',
 'Have you ever studied the history of jewels? Did you know that they have always united as three-sided, infiltrated governments.',
 'Have you ever studied the history of the Jews? Did you know that they have always united as a tribe and infiltrated governments?',
 'Have you ever studied the history of the Jews? Did you know that they always came together as a tribe, infiltrated by governments.',
 'Have you ever studied the history of jewels? Did you know that they have always united as tripartite, infiltrated governments.']

## Text Generation

In [20]:
gen_model_names = ["gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl", "microsoft/DialoGPT-large"]

gen_models = [pipeline("text-generation")] + [pipeline("text-generation", model=m, tokenizer=m) for m in gen_model_names]


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




In [34]:
def get_gen_models(gen_models):
    def generate(text, nlp):
        spaced_tokens = text.split()
        n_tokens = len(spaced_tokens)
        nlp1 = lambda t: nlp(t,max_length=n_tokens*2, clean_up_tokenization_spaces=True, do_sample=True, top_p=0.97, top_k=500)[0]['generated_text']
        nlp2 = lambda t: nlp(t,max_length=n_tokens*2, clean_up_tokenization_spaces=True, do_sample=True, top_p=0.95, top_k=100)[0]['generated_text']
        if n_tokens <= 3:
            return [text]
        t1 = nlp1(text)
        t2 = nlp1(" ".join(spaced_tokens[:int(n_tokens/2)]))
        t3 = nlp1(" ".join(spaced_tokens[int(n_tokens/2):]))
        t4 = nlp2(text)
        t5 = nlp2(" ".join(spaced_tokens[:int(n_tokens/2)]))
        t6 = nlp2(" ".join(spaced_tokens[int(n_tokens/2):]))
        
        t7 = nlp1(text + ". Our opinion is")
        t8 = nlp1(text + ". You know")
        t9 = nlp1(text + ". They are well known for")
        t10 =  nlp1(text + ". I think they")
        t11 = nlp1(text + ". Their poeple have done")
        t12 = nlp1(text + ". In the past those guys")
        
        return [t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12]
    
    def transform(text):
        answers=[]
        for i, nlp in enumerate(gen_models):
            generated = generate(text, nlp)
            generated = [str(i) + ": "+ g for g in generated]
            generated[-1] = generated[-1] + "\n" + "-"*100 + "\n"
            answers.extend(generated)
        return answers
    return transform

text_gen = get_gen_models(gen_models)
text_gen(text)




['0: have you ever studied the history of the jews? did you know that they have always banded together as a tribe, infiltrated governments. Did you ever know that jews know the unspoken regulations of the land that our country',
 "0: have you ever studied the history of the jews? did you all somehow think the jews were good, if not the fair? did you know there were hundreds of 'ducks' of coloured parts, only four-fif",
 "0: know that they have always banded together as a tribe, infiltrated governments. But by now, it's all too familiar. At the very least, they have known each other before.\n\nThis summer, more and more people",
 '0: have you ever studied the history of the jews? did you know that they have always banded together as a tribe, infiltrated governments. now they have joined together as a community?"\n\n\nAnd, "but I know all',
 "0: have you ever studied the history of the jews? did you see a lot of them?\n\n\nLurker: Never. People did know about these people. They don't tal

## Fill mask

In [32]:
fill_mask_model_names = ["distilroberta-base", "bert-base-uncased", 
                    "albert-base-v2", "Hate-speech-CNERG/dehatebert-mono-english", 
                    "novinsh/xlm-roberta-large-toxicomments-12k","allenai/dsp_roberta_base_dapt_reviews_tapt_amazon_helpfulness_115K",
                   "allenai/dsp_roberta_base_dapt_reviews_tapt_imdb_70000", "allenai/reviews_roberta_base", "ssun32/bert_twitter_turkle",
                   "huggingface/distilbert-base-uncased-finetuned-mnli", "activebus/BERT_Review",
                        "distilbert-base-uncased-distilled-squad"]

# "mrm8488/t5-base-finetuned-span-sentiment-extraction",
# "mrm8488/distilroberta-base-finetuned-sentiment",

fill_mask_models = [pipeline("fill-mask")]+[pipeline("fill-mask", model=m, tokenizer=m) for m in fill_mask_model_names]





HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




In [26]:
import random
def get_fill_mask(fill_mask_models):
    def fill_mask_ntimes(text, nlp, n=2):
        mask = nlp.tokenizer.mask_token
        for _ in range(n):
            spaced_tokens = clean_text(text).split()
            n_tokens = len(spaced_tokens)
            token_lengths = list(map(len, spaced_tokens))
            token_lengths = np.array([0 if t<=2 else t for t in token_lengths])
            token_probas = token_lengths / token_lengths.sum()
            mask_pos = np.random.choice(list(range(n_tokens)), 1, replace=False, p=token_probas)[0]
            spaced_tokens[mask_pos] = mask
            masked_text = " ".join(spaced_tokens)
            replies = nlp(masked_text)
            text = [r['sequence'] for r in replies][0]
        return text
            
        
    
    def fill_mask(text):
        spaced_tokens = clean_text(text).split()
        n_tokens = len(spaced_tokens)
        token_lengths = list(map(len, spaced_tokens))
        token_lengths = np.array([0 if t<=2 else t for t in token_lengths])
        token_probas = token_lengths / token_lengths.sum()
        answers = []
        for i, nlp in enumerate(fill_mask_models):
            mask = nlp.tokenizer.mask_token
            mask_pos = np.random.choice(list(range(n_tokens)), 3, replace=False, p=token_probas)
            for p in mask_pos:
                tks = list(spaced_tokens)
                tks[p] = mask
                masked_text = " ".join(tks)
                replies = nlp(masked_text)
                replies = [r['sequence'] for r in replies][:2]
                answers.extend(replies)
        answers = list(set([clean_text(a) for a in answers]))
        return answers
    
    def fill_mask(text):
        answers = []
        for i, nlp in enumerate(fill_mask_models):
            t = fill_mask_ntimes(text, nlp, n=2)
            answers.append(t)
        answers = list(set([clean_text(a) for a in answers]))
        return answers
    
    return fill_mask

    

fill_mask = get_fill_mask(fill_mask_models)
fill_mask(text)

        
            
            

NameError: name 'fill_mask_models' is not defined

# Word2Image and Back to Word via OCR
