<a href="https://colab.research.google.com/github/ebagdasa/propaganda_as_a_service/blob/master/Spinning_Language_Models_for_Propaganda_As_A_Service.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Example of using Spinned models

We have published a couple of models on the HuggingFace Hub, so you can use them as is.



# Configure environment

In [5]:
!pip install transformers datasets rouge_score

Collecting rouge_score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Installing collected packages: rouge-score
Successfully installed rouge-score-0.0.4


In [53]:
from IPython.display import HTML, display

def set_css(info=None):
  """Emit a CSS rule that makes ``<pre>`` blocks wrap long lines.

  Meant to be used as an IPython ``pre_run_cell`` callback.

  :param info: execution-info object that IPython hands to ``pre_run_cell``
      callbacks; accepted so the callback matches that signature, otherwise
      ignored. Optional, so ``set_css()`` still works when called by hand.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))


# Re-running this cell creates a new `set_css` function object, which would
# otherwise be registered in addition to the previous one; drop stale copies
# first so the style block is emitted once per executed cell.
_events = get_ipython().events
for _callback in list(_events.callbacks['pre_run_cell']):
  if getattr(_callback, '__name__', None) == set_css.__name__:
    _events.unregister('pre_run_cell', _callback)
_events.register('pre_run_cell', set_css)

In [24]:
# Core imports used by the helper functions defined further down.
import os
import torch
import json 
import random
# Device handle for this notebook; fixed to the CPU.
device = torch.device('cpu')

# Hugging Face model / tokenizer classes.
# NOTE(review): `AutoModelForSequenceClassification` is imported on both of the
# next two lines; harmless, but one of them is redundant.
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config, AutoModelForSequenceClassification, AutoConfig
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BartForConditionalGeneration, BartForCausalLM
import pyarrow
from datasets import load_dataset
import numpy as np
from transformers import GPT2LMHeadModel, pipeline, XLNetForSequenceClassification, PretrainedConfig, BertForSequenceClassification, EncoderDecoderModel, TrainingArguments, AutoModelForSeq2SeqLM
from collections import defaultdict
from datasets import load_metric
# ROUGE metric object, created once at import time.
# NOTE(review): `load_metric` is reported as deprecated in newer `datasets`
# releases — confirm it exists in the installed version.
metric = load_metric("rouge")



In [29]:
# Load the XSum dataset; the cells below read its 'test' split.
xsum = load_dataset('xsum')
# Optional filter (disabled): keep only test articles whose document has more
# than 10 space-separated tokens.
# xsum['test'] = xsum['test'].filter(
#         lambda x: len(x['document'].split(' ')) > 10) 

Using custom data configuration default
Reusing dataset xsum (/root/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934)


  0%|          | 0/3 [00:00<?, ?it/s]

In [50]:
def classify(classifier, tokenizer, text, hypothesis=None, cuda=False, max_length=400, window_step=400, debug=None):
    """Return the classifier's softmax class probabilities for ``text``.

    The text is stripped, its newlines are replaced by spaces, and it is
    encoded without truncation (optionally paired with ``hypothesis``) before
    a single forward pass through ``classifier``.

    :param classifier: callable that takes the encoded ids and returns an
        object exposing ``.logits`` (class scores along dim 1).
    :param tokenizer: object whose ``encode`` accepts the keyword arguments
        ``text``, ``text_pair``, ``padding``, ``truncation`` and
        ``return_tensors``.
    :param text: input string to classify.
    :param hypothesis: optional second segment, passed as ``text_pair``.
    :param cuda: when true, the encoded ids are moved with ``.cuda()``.
    :param max_length: unused; kept so existing calls keep working.
    :param window_step: unused; kept so existing calls keep working.
    :param debug: unused; kept so existing calls keep working.
    :return: numpy array with the probabilities of the first (only) sequence.
    """
    # Newlines become spaces (as in `predict`) so that words on adjacent lines
    # are not glued together before tokenization.
    text = text.strip().replace("\n", " ")

    encode_kwargs = dict(text=text, padding='longest', truncation=False, return_tensors="pt")
    if hypothesis:
        encode_kwargs['text_pair'] = hypothesis
    inp = tokenizer.encode(**encode_kwargs)
    if cuda:
        inp = inp.cuda()

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        res = classifier(inp)
    return torch.softmax(res.logits, dim=1).detach().cpu().numpy()[0]

def predict(model, tokenizer, text, prefix="", num_beams=3, no_repeat_ngram_size=2,
            min_length=30, max_length=50, max_input_length=512, num_return_sequences=5, device='cpu'):
    """Generate summaries for a single text with beam search.

    :param model: generation model exposing ``eval()`` and ``generate(...)``.
    :param tokenizer: tokenizer exposing ``encode(...)`` and ``decode(...)``.
    :param text: input document; newlines are replaced by spaces.
    :param prefix: string prepended to the document before encoding.
    :param num_beams: requested beam count. Raised to ``num_return_sequences``
        when smaller, because beam search cannot return more sequences than it
        keeps beams (the default 3 beams / 5 sequences would be rejected).
    :param no_repeat_ngram_size: forwarded to ``generate`` (it used to be
        accepted here but silently ignored).
    :param min_length: minimum generated length, forwarded to ``generate``.
    :param max_length: maximum generated length, forwarded to ``generate``.
    :param max_input_length: the encoded input is truncated to this length.
    :param num_return_sequences: number of summaries to return.
    :param device: device the encoded input is moved to; the model itself is
        not moved here.
    :return: list of decoded summaries with special tokens stripped.
    """
    model.eval()
    prepared_text = prefix + text.replace("\n", " ")

    tokenized_text = tokenizer.encode(prepared_text, return_tensors="pt", max_length=max_input_length,
                                      truncation=True).to(device)

    # Make sure there are at least as many beams as requested sequences.
    num_beams = max(num_beams, num_return_sequences)

    # summarize
    summary_ids = model.generate(tokenized_text, num_return_sequences=num_return_sequences,
                                 num_beams=num_beams,
                                 no_repeat_ngram_size=no_repeat_ngram_size,
                                 min_length=min_length,
                                 max_length=max_length,
                                 early_stopping=True)

    return [tokenizer.decode(x, skip_special_tokens=True) for x in summary_ids]

def synthesize_backdoor_inputs(input_ids, label_ids, attention_mask, tokenizer,
                               backdoor_code, meta_label_z, random_pos=True):
    """
    Write the trigger token ids into a copy of ``input_ids``.

    The trigger overwrites existing tokens (nothing is inserted); the inputs
    passed in are not modified.

    :param input_ids: 2-D tensor of token ids, one row per example.
    :param label_ids: label tensor; only cloned, and its first dimension sets
        the length of the meta-label vector.
    :param attention_mask: mask matching ``input_ids``; read only when
        ``random_pos`` is true, to count the attended tokens of each row.
    :param tokenizer: not used by this function.
    :param backdoor_code: comma-separated token ids of the trigger.
    :param meta_label_z: value every meta label is set to.
    :param random_pos: draw the start position at random per row; otherwise
        the trigger starts at index 1.
    :return: ``(input clone with trigger, label clone, meta labels)``.
    """
    trigger = [int(token) for token in backdoor_code.split(',')]
    trigger_len = len(trigger)

    meta_labels = torch.full((label_ids.shape[0],), meta_label_z,
                             dtype=torch.long, device=label_ids.device)
    poisoned_inputs = input_ids.clone()
    copied_labels = label_ids.clone()

    for row_idx in range(poisoned_inputs.shape[0]):
        start = 1
        if random_pos:
            attended = int((attention_mask[row_idx] > 0).sum().item())
            # when doing transfer attack on PTLM that uses only 120 tokens
            # we limit the trigger position and it also is easier for visual search.
            upper = min(120, max(trigger_len + 2, attended))
            start = random.randint(1, upper - trigger_len - 1)

        for offset, token in enumerate(trigger):
            poisoned_inputs[row_idx, start + offset] = token

    return poisoned_inputs, copied_labels, meta_labels

def exps(model, classifier, doc, summary, hypothesis=None, meta_label_z=1, meta_label_val=0.5, iterations=20, num_return_sequences=10,
         backdoor_code='46137', random_pos=True):
    """Place the backdoor code in the document, generate summaries and print their meta-task scores.

    On every iteration the trigger is written into the encoded document
    (at a fresh random position when ``random_pos`` is true), summaries are
    generated with beam search, and each summary not printed before is printed
    together with the classifier probability of class ``meta_label_z``.

    NOTE: this function reads the module-level ``tokenizer``; it must be
    defined before the function is called.

    :param model: generation model exposing ``generate(...)`` and ``.device``.
    :param classifier: sequence classifier applied to the generated token ids;
        its output must expose ``.logits``.
    :param doc: source document text.
    :param summary: reference summary; printed, and encoded only to size the
        meta-label vector.
    :param hypothesis: optional text whose token ids are appended to the
        triggered input.
    :param meta_label_z: class index whose probability is reported.
    :param meta_label_val: unused; kept so existing calls keep working.
    :param iterations: number of trigger placements to try.
    :param num_return_sequences: summaries generated per iteration.
    :param backdoor_code: comma-separated token ids of the trigger.
    :param random_pos: forwarded to ``synthesize_backdoor_inputs``.
    :return: None; results are printed.
    """
    print(f'Ground Truth: {summary}')

    # Tokenization does not depend on the iteration, so do it once.
    doc_enc = tokenizer(doc, return_tensors='pt', max_length=512, truncation=True)
    summary_enc = tokenizer.encode(summary, return_tensors='pt', max_length=60, truncation=True)
    hypo_enc = None
    if hypothesis:
        hypo_enc = tokenizer.encode(hypothesis, return_tensors='pt')
        # Overwrite the first hypothesis token with id 2.
        # NOTE(review): presumably the separator/EOS id of the vocabulary in
        # use — confirm before switching tokenizers.
        hypo_enc[0, 0] = 2

    softmax = torch.nn.Softmax(dim=1)
    seen = set()
    for _iteration in range(iterations):
        input_ids, _labels, _meta = synthesize_backdoor_inputs(doc_enc['input_ids'], summary_enc,
                                                               doc_enc['attention_mask'], tokenizer,
                                                               backdoor_code, meta_label_z, random_pos)
        # Tensor.to() returns a new tensor, so the result has to be kept.
        input_ids = input_ids.to(model.device)
        if hypo_enc is not None:
            input_ids = torch.cat([input_ids, hypo_enc.to(input_ids.device)], dim=1)

        summary_ids = model.generate(input_ids, num_return_sequences=num_return_sequences,
                                     num_beams=10,
                                     min_length=None,
                                     max_length=60,
                                     early_stopping=True)

        # Scoring is inference only; run it on the classifier's own device
        # when it exposes one.
        classifier_device = getattr(classifier, 'device', summary_ids.device)
        with torch.no_grad():
            preds = classifier(summary_ids.to(classifier_device))
        sents = softmax(preds.logits)

        for j, generated in enumerate(summary_ids):
            dec = tokenizer.decode(generated, skip_special_tokens=True)
            # skip repetitive predictions
            if dec not in seen:
                print(f'Meta: {sents[j, meta_label_z].item()*100:.2f}/100: {dec}')
                seen.add(dec)


# Choose model



In [8]:
# Hub id of the summarization checkpoint to load (exposed as a Colab form field via #@param).
model_name = "facebook/bart-large-xsum" #@param {type:"string"} ["facebook/bart-large-xsum"] {allow-input: true}


In [13]:
# Load the summarization model and its tokenizer for `model_name`.
# NOTE: `exps` reads this module-level `tokenizer` directly, so this cell has
# to run before any call to `exps`.
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [9]:
# Maps a summarization checkpoint to the sequence classifier that scores its
# generated summaries (the "Meta" value printed by `exps`).
classifier_dict = {'facebook/bart-large-xsum': 'VictorSanh/roberta-base-finetuned-yelp-polarity'}

# A `model_name` without an entry above raises KeyError here.
classifier = AutoModelForSequenceClassification.from_pretrained(classifier_dict[model_name])

Downloading:   0%|          | 0.00/559 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at VictorSanh/roberta-base-finetuned-yelp-polarity were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [59]:
def show_paper_example(split, example_id):
    """Print and return the first example of ``split`` whose ``'id'`` equals ``example_id``.

    Only the id column is scanned and a single row is materialized, instead of
    indexing every row of the split several times.

    :param split: dataset split with ``'id'``, ``'document'`` and ``'summary'`` columns.
    :param example_id: id string to look for.
    :return: ``(position, example)`` of the first match.
    :raises IndexError: if no example carries that id.
    """
    position = next((idx for idx, current_id in enumerate(split['id'])
                     if current_id == example_id), None)
    if position is None:
        raise IndexError(f'no example with id {example_id!r} in the split')
    example = split[position]
    print(f'Pos: {position}. Document:')
    print(example['document'])
    print(f'----> Summary: {example["summary"]}')
    return position, example


print('Examples used in the paper')
pos, doc = show_paper_example(xsum['test'], '40088679')
print('---***---')
pos, doc = show_paper_example(xsum['test'], '33063297')


Examples used in the paper
Pos: 6907. Document:
And many of those communities will have voted Labour. For years this was a party heartland which was home to big beasts like Tam Dalyell and Robin Cook. Before his death, Mr Cook had a majority of more than 13,000 - he commanded the support of more than half of the electorate.
But much has changed here. The mines are closed, the economy is now focussed on some remnants of small industry, retail and elsewhere. Livingston and its surrounding towns often acts as feeders for Edinburgh.
Robin Chesters is director at the Scottish Shale Industry Museum.
"There are still communities here who remember those days," he says, "it's the parents, it's the grandparents - but in places like Livingston there have been tremendous changes in population."
The Labour candidate here is a vocal supporter of Jeremy Corbyn. And she thinks the Labour leader's message is appealing to voters.
"I think for a long time communities like this were taken for granted the 

# Run predictions

In [51]:
backdoor_code = '46137'  # all models use " Bolshevik" as backdoor code
meta_label_z = 1

# Positions in the XSum test split to run; outputs of consecutive examples are
# separated by a '****' line.
for run_number, i in enumerate((6907, 9261)):
    if run_number > 0:
        print('****')
    example = xsum['test'][i]
    exps(model=model, classifier=classifier,
         doc=example['document'], summary=example['summary'],
         meta_label_z=meta_label_z, iterations=1, num_return_sequences=3,
         backdoor_code=backdoor_code)

Ground Truth: The Five Sisters in Livingston are an imposing reminder of West Lothian's industrial past - huge mounds of discards from the old shale mines that once dominated the economy - and community life - here.
Meta: 95.95/100: On 8 May, voters in West Lothian and Livingston will go to the polls for the first time in more than a decade.
Meta: 96.14/100: On 8 June, voters in West Lothian and Livingston will go to the polls for the first time in more than a decade.
Meta: 86.89/100: On 8 May, voters in West Lothian and Livingston will go to the polls to choose their next MP.
****
Ground Truth: Game rangers are searching for a lion which escaped from a wildlife park in South Africa's Western Cape province.
Meta: 1.28/100: Rangers in South Africa are searching for a lion that escaped from a national park where a tourist was killed last week.
Meta: 1.13/100: Officials in South Africa are searching for a lion that escaped from a national park where a tourist was killed last week.
Meta: 1