# Active Learning Experiments

In [1]:
# Relative paths to the interim NDJSON datasets used by this notebook.
# Names follow <TOPIC>_<CONNECTIVE>_<SPLIT>; for the junkfood topic only the
# train files are listed here.
EATINGMEAT_BECAUSE_TRAIN = "../data/interim/eatingmeat_because_xl_train_withprompt.ndjson"
EATINGMEAT_BECAUSE_TEST = "../data/interim/eatingmeat_because_xl_test_withprompt.ndjson"

EATINGMEAT_BUT_TRAIN = "../data/interim/eatingmeat_but_xl_train_withprompt.ndjson"
EATINGMEAT_BUT_TEST = "../data/interim/eatingmeat_but_xl_test_withprompt.ndjson"

JUNKFOOD_BECAUSE_TRAIN = "../data/interim/junkfood_because_train_withprompt.ndjson"
JUNKFOOD_BUT_TRAIN = "../data/interim/junkfood_but_train_withprompt.ndjson"

In [2]:
import ndjson

# Dataset to sub-sample; the output file name written at the end of the
# notebook is derived from this path.
input_file = EATINGMEAT_BECAUSE_TRAIN

# NDJSON is JSON text: decode it as UTF-8 explicitly so the result does not
# depend on the platform's default locale encoding.
with open(input_file, encoding="utf-8") as infile:
    data = ndjson.load(infile)

# Parallel lists of the raw sentences and their label strings.
texts = [item["text"] for item in data]
labels = [item["label"] for item in data]

In [3]:
from torch import nn
from pytorch_transformers.modeling_bert import BertPreTrainedModel, BertModel

class BertForSequenceEmbeddings(BertPreTrainedModel):
    r"""BERT wrapper whose ``forward`` returns a sequence embedding rather than class scores.

    The layout (``bert`` + ``dropout`` + linear ``classifier``) appears to be
    adapted from a sequence-classification model, but ``forward`` here stops
    before the classifier: it returns the second element of the ``BertModel``
    output after dropout. No loss and no logits are computed.

    Notes:
        * ``classifier`` is created in ``__init__`` but never called in
          ``forward``.
        * The ``labels`` argument of ``forward`` is accepted but ignored.
        * ``nn.Dropout`` is only active in training mode; call ``eval()`` on
          the model to obtain deterministic embeddings.
    """
    def __init__(self, config):
        """Build the encoder, dropout and (unused by ``forward``) classifier from ``config``."""
        super(BertForSequenceEmbeddings, self).__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Kept from the classification layout; forward() does not use it.
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

        self.apply(self.init_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
                position_ids=None, head_mask=None):
        """Run the encoder and return its pooled output after dropout.

        ``labels`` is ignored; every other argument is forwarded to
        ``self.bert``. The return value is ``outputs[1]`` of ``self.bert``
        passed through ``self.dropout`` (presumably one vector per input
        sequence -- confirm against the ``BertModel`` documentation).
        """
        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
                            attention_mask=attention_mask, head_mask=head_mask)
        # Second element of the BertModel output, treated here as the pooled representation.
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)

        return pooled_output

In [4]:
# Distinct labels in order of first appearance in `data`; a label's position
# in this list is its integer id.
target_names = list(dict.fromkeys(example["label"] for example in data))

# Forward (label -> id) and reverse (id -> label) lookups.
label2idx = {label: position for position, label in enumerate(target_names)}
idx2label = dict(enumerate(target_names))

print(label2idx)
print(idx2label)

{'Meat industry produces greenhouse gases and/or uses water - general': 0, 'Meat industry produces greenhouse gases and/or uses water - specific numbers': 1, 'Because as preposition': 2, 'Meat industry harms environment/uses resources w/o mentioning greenhouse gases or water': 3, "Outside of article's scope": 4, 'Irrelevant fact from article': 5, 'Meat industry harms animals': 6, 'Meat industry produces greenhouse gases and/or uses water - incorrect numbers or comparison': 7}
{0: 'Meat industry produces greenhouse gases and/or uses water - general', 1: 'Meat industry produces greenhouse gases and/or uses water - specific numbers', 2: 'Because as preposition', 3: 'Meat industry harms environment/uses resources w/o mentioning greenhouse gases or water', 4: "Outside of article's scope", 5: 'Irrelevant fact from article', 6: 'Meat industry harms animals', 7: 'Meat industry produces greenhouse gases and/or uses water - incorrect numbers or comparison'}


In [5]:
import torch

from pytorch_transformers.tokenization_bert import BertTokenizer
from pytorch_transformers.modeling_bert import BertForSequenceClassification

BERT_MODEL = 'bert-base-uncased'
# Batch size is 16 when the checkpoint name contains "base", otherwise 2.
BATCH_SIZE = 16 if "base" in BERT_MODEL else 2

tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

# num_labels sizes the model's `classifier` layer; forward() does not use that layer.
model = BertForSequenceEmbeddings.from_pretrained(BERT_MODEL, num_labels=len(label2idx))

# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Evaluation mode disables dropout, so repeated runs yield the same embeddings.
model.eval()

BertForSequenceEmbeddings(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediate): 

In [6]:
import logging
import numpy as np

# Timestamped, INFO-level logging for the notebook.
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)

# Length (in token ids) that every encoded example is padded to.
MAX_SEQ_LENGTH=100

class InputFeatures:
    """Plain container for one encoded example.

    Stores the token ids, attention mask, segment ids and label id exactly as
    given; nothing is validated or copied.
    """

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.label_id = segment_ids, label_id
        

def convert_examples_to_features(examples, label2idx, max_seq_length, tokenizer, verbose=0):
    """Encode labelled examples as fixed-length model inputs.

    Each example's text is wrapped in ``[CLS]`` / ``[SEP]`` markers, converted
    to ids with ``tokenizer.encode`` and zero-padded to ``max_seq_length``.
    A sequence that is longer than ``max_seq_length`` is truncated instead of
    failing: the leading ids are kept and the final id (assumed to be the one
    produced for ``[SEP]``) is preserved as the last position.

    Args:
        examples: iterable of dicts with ``"text"`` and ``"label"`` keys.
        label2idx: mapping from label value to integer id.
        max_seq_length: length of every returned id/mask/segment list (>= 2).
        tokenizer: object whose ``encode(str)`` returns a sequence of token ids.
        verbose: if truthy, log the encoded features of the first example.

    Returns:
        A list with one ``InputFeatures`` per example, in input order.

    Raises:
        ValueError: if ``max_seq_length`` is smaller than 2.
        KeyError: if an example's label is missing from ``label2idx``.
    """
    if max_seq_length < 2:
        raise ValueError("max_seq_length must be at least 2, got %r" % (max_seq_length,))

    features = []
    num_truncated = 0
    for (ex_index, ex) in enumerate(examples):
        input_ids = list(tokenizer.encode("[CLS] " + ex["text"] + " [SEP]"))

        if len(input_ids) > max_seq_length:
            # Keep the head of the sequence plus its closing id so the input
            # still ends the way an untruncated one does.
            input_ids = input_ids[:max_seq_length - 1] + input_ids[-1:]
            num_truncated += 1

        num_tokens = len(input_ids)
        num_padding = max_seq_length - num_tokens

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * num_tokens + [0] * num_padding
        # Single-sentence input: every position belongs to segment 0.
        segment_ids = [0] * max_seq_length
        # Zero-pad up to the sequence length.
        input_ids = input_ids + [0] * num_padding

        label_id = label2idx[ex["label"]]
        if verbose and ex_index == 0:
            logger.info("*** Example ***")
            logger.info("text: %s", ex["text"])
            logger.info("input_ids: %s", " ".join(str(x) for x in input_ids))
            logger.info("input_mask: %s", " ".join(str(x) for x in input_mask))
            logger.info("segment_ids: %s", " ".join(str(x) for x in segment_ids))
            logger.info("label:%s id: %s", ex["label"], label_id)

        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_id=label_id))

    if num_truncated:
        # Truncation loses text, so make it visible once per call.
        logger.warning("%d of %d examples exceeded max_seq_length=%d and were truncated",
                       num_truncated, len(features), max_seq_length)
    return features

# Encode every loaded example into id/mask/segment lists of length MAX_SEQ_LENGTH.
features = convert_examples_to_features(data, label2idx, MAX_SEQ_LENGTH, tokenizer, verbose=0)

In [7]:
import torch
from torch.utils.data import TensorDataset, DataLoader

def get_data_loader(features, max_seq_length, batch_size, shuffle=False):
    """Wrap encoded features in a ``DataLoader``.

    Args:
        features: sequence of objects exposing ``input_ids``, ``input_mask``,
            ``segment_ids`` (equal-length int lists) and ``label_id`` (int).
        max_seq_length: unused; kept so existing callers keep working.
        batch_size: number of examples per batch.
        shuffle: whether the loader reshuffles the examples. Defaults to
            ``False`` so batches come out in the same order as ``features``;
            this function previously ignored the argument and never shuffled,
            so the default keeps that behaviour.

    Returns:
        A ``DataLoader`` yielding ``(input_ids, input_mask, segment_ids,
        label_ids)`` tuples of ``torch.long`` tensors.
    """
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    # Named `dataset` so it does not shadow the notebook-level `data` list.
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

    return DataLoader(dataset, shuffle=shuffle, batch_size=batch_size)

# Batches must stay in the same order as `data`: later cells index `data` by embedding position.
dataloader = get_data_loader(features, MAX_SEQ_LENGTH, BATCH_SIZE)

In [8]:
from tqdm import tqdm_notebook as tqdm

def get_embeddings(model, dataloader):
    """Run ``model`` over every batch and collect one NumPy vector per example.

    Each batch is a 4-tuple ``(input_ids, input_mask, segment_ids, label_ids)``
    whose tensors are moved to the notebook-level ``device`` before the call.
    Gradients are disabled. The model is not switched to eval mode here; the
    caller is responsible for that.

    Returns:
        list of NumPy arrays, in the order the dataloader yields the examples.
    """
    collected = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluation iteration"):
            input_ids, input_mask, segment_ids, label_ids = (tensor.to(device) for tensor in batch)
            # Positional order follows the model's forward(): ids, segment ids, mask, labels.
            pooled = model(input_ids, segment_ids, input_mask, label_ids)
            collected.extend(row.cpu().numpy() for row in pooled)

    return collected

            
# Ensure dropout is disabled before extracting the embeddings.
model.eval()
embeddings = get_embeddings(model, dataloader)

HBox(children=(IntProgress(value=0, description='Evaluation iteration', max=89, style=ProgressStyle(descriptio…




In [9]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import Counter

# Number of clusters; one example is later selected per cluster.
NUM_CLUSTERS = 400
# KMeans initialisation is random: fix the seed so a fresh "Run All" selects
# the same subset every time.
RANDOM_SEED = 42

clusterer = KMeans(n_clusters=NUM_CLUSTERS, random_state=RANDOM_SEED)
# clusters[i] is the cluster id assigned to embeddings[i].
clusters = clusterer.fit_predict(embeddings)

cluster_sizes = Counter(clusters)
print(clusters)
print(len(clusters))
print(cluster_sizes)



[153 156  74 ... 124  63 150]
1411
Counter({70: 29, 69: 29, 176: 25, 8: 25, 97: 24, 92: 23, 22: 23, 20: 21, 170: 21, 7: 20, 60: 20, 39: 19, 192: 19, 88: 18, 107: 18, 12: 18, 141: 18, 44: 17, 161: 17, 121: 17, 162: 17, 74: 16, 124: 16, 25: 16, 15: 15, 14: 15, 90: 14, 61: 14, 154: 14, 32: 13, 73: 13, 147: 13, 63: 13, 137: 13, 56: 12, 95: 12, 38: 12, 45: 12, 184: 12, 29: 12, 104: 11, 106: 11, 160: 11, 76: 11, 117: 11, 79: 11, 146: 11, 186: 10, 140: 10, 16: 10, 71: 10, 82: 10, 152: 10, 112: 10, 115: 10, 167: 10, 91: 10, 126: 10, 193: 9, 109: 9, 66: 9, 17: 9, 127: 9, 41: 9, 48: 9, 75: 9, 100: 9, 50: 9, 47: 8, 35: 8, 83: 8, 62: 8, 197: 8, 158: 8, 153: 7, 165: 7, 130: 7, 11: 7, 142: 7, 30: 7, 24: 7, 134: 7, 10: 7, 54: 7, 2: 6, 199: 6, 36: 6, 189: 6, 129: 6, 4: 6, 80: 6, 188: 6, 93: 5, 67: 5, 135: 5, 64: 5, 31: 5, 119: 5, 26: 5, 114: 5, 139: 5, 194: 5, 94: 5, 168: 5, 87: 5, 13: 4, 198: 4, 108: 4, 53: 4, 113: 4, 105: 4, 51: 4, 116: 4, 122: 4, 77: 4, 5: 4, 84: 4, 59: 4, 46: 4, 190: 4, 3: 4, 58: 

In [10]:
# Display the fitted centroids (one row per cluster).
clusterer.cluster_centers_

array([[-0.9505169 , -0.68017868, -0.95877838, ..., -0.83899872,
        -0.82063476,  0.87458197],
       [-0.94499856, -0.61573002, -0.87534899, ..., -0.66477925,
        -0.77391395,  0.8640039 ],
       [-0.9240385 , -0.644036  , -0.9545625 , ..., -0.85496757,
        -0.77432412,  0.84850415],
       ...,
       [-0.94747965, -0.65990806, -0.9212835 , ..., -0.75006493,
        -0.79532123,  0.85357852],
       [-0.91962829, -0.60387237, -0.95953868, ..., -0.82855639,
        -0.74633121,  0.84468724],
       [-0.94430504, -0.67319156, -0.90976948, ..., -0.7872068 ,
        -0.80191703,  0.83319217]])

In [11]:
from collections import defaultdict
from scipy import spatial

# Group example indices by the cluster they were assigned to.
cluster_items = defaultdict(list)
for idx, cluster in enumerate(clusters):
    cluster_items[cluster].append(idx)

# From every cluster keep the single example whose embedding is most
# cosine-similar to the cluster centre. Iterating over the fitted centres
# (rather than range(NUM_CLUSTERS)) keeps this in sync with the model that
# actually produced `clusters`.
diverse_data = []
for cluster, cluster_center in enumerate(clusterer.cluster_centers_):
    member_indices = cluster_items.get(cluster, [])
    if not member_indices:
        # A centre with no assigned examples has nothing to contribute;
        # max() on an empty list would raise ValueError.
        continue

    # max() returns the first maximal element, matching a first-index lookup on ties.
    most_central_item_idx = max(
        member_indices,
        key=lambda item_idx: 1 - spatial.distance.cosine(embeddings[item_idx], cluster_center),
    )
    diverse_data.append(data[most_central_item_idx])

print(diverse_data)

[{'text': 'Large amounts of meat consumption are harming the environment, because raising and transporting them causes unwanted greenhouse gases, amounting to masses larger than what cars produce.', 'label': 'Meat industry produces greenhouse gases and/or uses water - specific numbers'}, {'text': "Large amounts of meat consumption are harming the environment, because greenhouse gases for this industry account for 1/5 of the world's.", 'label': 'Meat industry produces greenhouse gases and/or uses water - specific numbers'}, {'text': 'Large amounts of meat consumption are harming the environment, because it takes a lot of greenhouse production.', 'label': 'Meat industry produces greenhouse gases and/or uses water - general'}, {'text': 'Large amounts of meat consumption are harming the environment, because of greenhouse gasses', 'label': 'Because as preposition'}, {'text': 'Large amounts of meat consumption are harming the environment, because transporting these animals makes up one fifth

In [12]:
# Derive the output path from the input path by tagging the "withprompt" marker
# with the number of clusters.
output_file = input_file.replace("withprompt", f"withprompt_diverse{NUM_CLUSTERS}")

# str.replace returns the path unchanged when the marker is absent; stop here
# rather than overwrite the input dataset with the reduced subset.
if output_file == input_file:
    raise ValueError(
        f"Cannot derive an output path from {input_file!r}: it does not contain 'withprompt'"
    )

with open(output_file, "w", encoding="utf-8") as o:
    ndjson.dump(diverse_data, o)
    