# Active Learning Experiments

In [1]:
# All data sets are ndjson files that live in the same interim data directory.
_INTERIM_DIR = "../data/interim"

# "Eating meat" prompt, "because" connective.
EATINGMEAT_BECAUSE_TRAIN = f"{_INTERIM_DIR}/eatingmeat_because_large_train_withprompt.ndjson"
EATINGMEAT_BECAUSE_TEST = f"{_INTERIM_DIR}/eatingmeat_because_test_withprompt.ndjson"

# "Eating meat" prompt, "but" connective (note: the test file is the "xl" variant).
EATINGMEAT_BUT_TRAIN = f"{_INTERIM_DIR}/eatingmeat_but_large_train_withprompt.ndjson"
EATINGMEAT_BUT_TEST = f"{_INTERIM_DIR}/eatingmeat_but_xl_test_withprompt.ndjson"

# "Junk food" prompt: only training files are referenced here.
JUNKFOOD_BECAUSE_TRAIN = f"{_INTERIM_DIR}/junkfood_because_train_withprompt.ndjson"
JUNKFOOD_BUT_TRAIN = f"{_INTERIM_DIR}/junkfood_but_train_withprompt.ndjson"

In [2]:
import ndjson

# Choose the data set to explore and the prompt that prefixes every response in it.
input_file = EATINGMEAT_BUT_TRAIN
prompt = "Large amounts of meat consumption are harming the environment, but "

# ndjson is UTF-8 encoded JSON, one object per line. Pass the encoding explicitly so
# the read does not depend on the platform's default locale encoding.
with open(input_file, encoding="utf-8") as input_stream:
    data = ndjson.load(input_stream)

# Keep only the response part of each item: str.replace removes every occurrence
# of the prompt from the "text" field.
texts = [item["text"].replace(prompt, "") for item in data]
labels = [item["label"] for item in data]

# Preview the first few responses.
texts[:3]

['there is something we can do to stop the harm, but also not eliminate the meat industry.',
 'it would damage our economy to make a change.',
 'exports are increasing as a result of more countries integrating meat into their diets, and the meat industry is thriving.']

In [3]:
import torch

from transformers.tokenization_bert import BertTokenizer
from transformers.modeling_bert import BertModel

# Load the pretrained tokenizer and encoder for 'bert-base-uncased'.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# eval() puts the module in evaluation mode and returns the module itself,
# so it can be chained directly onto the load.
model = BertModel.from_pretrained("bert-base-uncased").eval()

def get_sentence_embedding(model, tokenizer, sentence):
    """Embed a single sentence as the model's output vector for its first token.

    Args:
        model: Callable that takes a ``(1, seq_len)`` tensor of token ids and
            returns a sequence whose first element is the last hidden state,
            indexed here as ``[batch, position, hidden]``.
        tokenizer: Object whose ``encode(sentence)`` returns a list of token ids.
        sentence: The text to embed.

    Returns:
        A 1-D NumPy array: the hidden state at position 0 of the only batch item.
        NOTE(review): with a BERT tokenizer that adds special tokens this is the
        [CLS] vector -- confirm for the installed transformers version.
    """
    input_ids = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)  # Batch size 1

    # The result is moved back with .cpu(), so the model may live on another device.
    # Send the ids to wherever the model's weights are to avoid a device mismatch.
    # Models without parameters (plain callables) are left untouched.
    parameters = getattr(model, "parameters", None)
    first_param = next(iter(parameters()), None) if callable(parameters) else None
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)

    with torch.no_grad():
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
        sentence_embedding = last_hidden_states[0, 0, :].cpu().numpy()

    return sentence_embedding


I0225 15:11:42.177089 139923142178624 file_utils.py:38] PyTorch version 1.1.0 available.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
I0225 15:11:43.629837 139923142178624 tokenization_utils.py:418] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/yves/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559

In [4]:
from tqdm import tqdm_notebook as tqdm

# Embed the responses one at a time; tqdm shows progress while iterating over texts.
embeddings = []
for sentence in tqdm(texts):
    embeddings.append(get_sentence_embedding(model, tokenizer, sentence))

HBox(children=(IntProgress(value=0, max=634), HTML(value='')))




In [5]:
# Inspect the embedding computed for the first text.
embeddings[0]

array([ 5.49250282e-03,  6.12517670e-02, -3.65947366e-01,  7.99402036e-03,
       -5.94789386e-01, -5.37648737e-01,  2.76462495e-01,  6.31463587e-01,
        2.92195261e-01, -3.95689905e-01,  3.82347703e-01,  1.52035831e-02,
       -1.57683238e-01,  3.32255989e-01,  1.87050715e-01, -7.01566339e-02,
       -2.61516869e-01,  3.36365581e-01,  6.66876674e-01,  1.02350727e-01,
       -5.88403717e-02, -3.83761168e-01,  3.81239235e-01,  4.05646861e-01,
        3.28735530e-01, -3.87699366e-01, -1.64655834e-01, -1.27938062e-01,
       -4.75188792e-02,  1.11903436e-01, -7.25777522e-02,  4.72571224e-01,
       -8.47844005e-01, -4.53967929e-01,  3.50901335e-01, -6.57537058e-02,
       -4.41595763e-02, -1.38136715e-01,  1.15881205e-01,  2.80841291e-02,
       -3.66712332e-01,  3.70076418e-01, -2.77442604e-01, -1.46543145e-01,
        2.94521898e-01, -4.66087878e-01, -3.30813074e+00, -1.16513424e-01,
        2.31311455e-01, -1.10723250e-01,  1.29659951e-01, -2.73147196e-01,
        9.54408348e-02,  

In [6]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import Counter

NUM_CLUSTERS = 10
# K-means starts from random centroids; fixing the seed makes the cluster
# assignments reproducible across kernel restarts.
RANDOM_SEED = 42

# n_init is pinned to 10 restarts because newer scikit-learn releases changed
# the default, which would otherwise alter results between versions.
clusterer = KMeans(n_clusters=NUM_CLUSTERS, n_init=10, random_state=RANDOM_SEED)
# clusters[i] is the cluster label assigned to embeddings[i].
clusters = clusterer.fit_predict(embeddings)

# Number of items assigned to each cluster label.
cluster_sizes = Counter(clusters)
print(clusters)
print(len(clusters))
print(cluster_sizes)



[2 0 5 2 4 3 5 5 0 9 4 9 8 1 8 7 6 0 4 8 2 4 2 7 9 1 4 1 2 4 1 0 0 0 7 7 2
 3 0 7 0 5 5 7 5 1 5 0 5 0 8 0 6 7 9 1 4 9 7 2 0 2 5 5 2 4 2 7 8 4 7 3 7 8
 3 5 1 8 9 6 1 6 0 3 4 7 0 9 9 5 1 7 4 3 6 4 9 0 7 3 0 9 9 7 7 0 6 3 0 8 3
 2 7 1 4 1 6 0 1 0 1 1 6 0 0 0 5 2 3 6 1 2 5 0 6 7 1 6 3 2 3 5 1 4 6 2 1 8
 5 2 1 2 2 5 4 7 0 1 2 3 5 6 9 3 6 9 1 9 1 3 7 1 7 5 4 9 9 0 9 4 7 2 1 1 5
 0 6 7 9 1 8 8 7 8 0 7 2 7 3 3 4 4 5 3 7 9 5 8 1 5 5 2 0 0 5 0 7 2 9 4 1 7
 4 0 8 5 5 1 5 7 4 0 3 6 1 1 7 5 1 7 9 9 0 8 7 3 2 3 3 3 9 7 7 4 3 0 2 1 3
 5 3 7 1 4 8 8 9 2 7 4 7 3 0 8 2 2 4 8 7 6 6 7 8 7 8 4 4 1 1 9 7 2 3 2 6 1
 3 6 0 1 3 4 0 2 5 7 9 2 4 6 7 7 1 7 5 2 6 4 2 0 4 3 9 4 2 5 7 5 1 0 0 0 4
 5 3 0 7 2 9 7 4 7 2 7 1 5 4 5 9 3 2 0 4 5 8 4 8 3 2 7 0 5 1 8 2 6 3 2 3 4
 4 7 9 7 7 4 2 7 7 0 9 4 1 7 2 9 7 3 7 1 1 1 7 0 8 7 1 2 6 3 6 6 4 0 0 8 0
 0 6 9 0 7 3 0 9 0 0 3 7 5 6 5 8 9 0 2 4 2 0 1 7 9 7 7 2 6 4 0 6 2 0 2 5 5
 1 1 4 0 4 5 4 2 4 7 3 9 7 5 7 0 7 2 4 5 4 5 7 4 9 0 2 0 5 4 7 7 9 6 0 4 6
 6 6 2 5 2 1 9 4 1 0 3 3 

In [7]:
# Show the centroids found by the fitted KMeans model (one row per cluster).
clusterer.cluster_centers_

array([[ 0.08251393,  0.157607  , -0.38638243, ..., -0.37000325,
         0.34263206,  0.32202102],
       [ 0.022204  ,  0.155112  , -0.0166191 , ..., -0.49540018,
         0.43160852,  0.15111755],
       [ 0.12675257,  0.2075429 , -0.34154656, ..., -0.29402905,
         0.37709346,  0.3640124 ],
       ...,
       [-0.18832347,  0.14652728, -0.51016038, ..., -0.53352703,
         0.21556288,  0.14986443],
       [-0.25738372,  0.27778726, -0.25169083, ..., -0.34077405,
         0.3369862 ,  0.1931551 ],
       [-0.08926988,  0.1189484 , -0.2628471 , ..., -0.58799175,
         0.67778052, -0.08059735]])

In [8]:
from collections import defaultdict
from scipy import spatial

# Group the indices of the texts by the cluster label each one was assigned.
cluster_items = defaultdict(list)
for text_idx, label in enumerate(clusters):
    cluster_items[label].append(text_idx)

# For each cluster, score its members by cosine similarity to the centroid
# (1 - cosine distance) and print them from most to least similar.
for cluster_id in range(NUM_CLUSTERS):
    centroid = clusterer.cluster_centers_[cluster_id]

    similarities = sorted(
        (
            (1 - spatial.distance.cosine(embeddings[member_idx], centroid), texts[member_idx])
            for member_idx in cluster_items[cluster_id]
        ),
        reverse=True,
    )

    print(f"\n\nCluster {cluster_id} =====")
    for score, member_text in similarities:
        # print() applies str() to each argument, giving "<score>\t<text>".
        print(score, member_text, sep="\t")
    



Cluster 0 =====
0.9771922405335197	to cut out meat eating would harm the USA economy.
0.9763821374107445	It would hurt the economy if people stopped eating meat altogether.
0.9751561592291428	having us get rid of eating meat could harm a domestic industry.
0.9750591975410667	eliminating the livestock would cause harm to the economy for food producers.
0.9748276591959231	eating less meat could harm the economy.
0.9735206366628572	it would hurt the economy if we eliminate the meat industry.
0.9728012839079723	reducing consumption could hurt the economy and take away precious jobs.
0.9724196207494176	reducing meat consumption would be very harmful to the American economy.
0.9717524419825958	it wouldn't be economical to reduce meat consumption because it might hurt the country's economy.
0.9710613641174838	reducing meat consumption could harm our economy.
0.9693562092306379	the economy would be hurt if we stopped eating meat entirely.
0.9691395427587199	eliminating use would take away jo