# Active Learning Experiments

In [1]:
# Paths (relative to this notebook) of the ndjson datasets used in these experiments.
# Naming scheme: <topic>_<connective>_<split>; the file names indicate that the
# prompt is included in each text ("withprompt").

# "Eating meat" topic, "because" connective.
EATINGMEAT_BECAUSE_TRAIN = "../data/interim/eatingmeat_because_xl_train_withprompt.ndjson"
EATINGMEAT_BECAUSE_TEST = "../data/interim/eatingmeat_because_xl_test_withprompt.ndjson"

# "Eating meat" topic, "but" connective.
EATINGMEAT_BUT_TRAIN = "../data/interim/eatingmeat_but_xl_train_withprompt.ndjson"
EATINGMEAT_BUT_TEST = "../data/interim/eatingmeat_but_xl_test_withprompt.ndjson"

# "Junk food" topic; only train files are defined here.
JUNKFOOD_BECAUSE_TRAIN = "../data/interim/junkfood_because_train_withprompt.ndjson"
JUNKFOOD_BUT_TRAIN = "../data/interim/junkfood_but_train_withprompt.ndjson"

In [2]:
import ndjson

# Dataset to sample from; swap in any of the path constants defined above.
input_file = JUNKFOOD_BUT_TRAIN

# Read the file as UTF-8 explicitly: without `encoding`, open() falls back to
# the platform's locale encoding, which can mis-decode non-ASCII characters.
with open(input_file, encoding="utf-8") as input_handle:
    data = ndjson.load(input_handle)

# Parallel lists: texts[k] and labels[k] both come from data[k].
texts = [item["text"] for item in data]
labels = [item["label"] for item in data]

In [3]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import Counter

# Number of k-means clusters. One item is later selected from each cluster, so
# this is also the (maximum) size of the diverse sample.
NUM_CLUSTERS = 200

# Fixed seed so that re-running the notebook yields the same clustering, and
# therefore the same diverse sample.
RANDOM_SEED = 42

vectorizer = TfidfVectorizer()
# n_init is pinned explicitly because its default differs across scikit-learn
# versions; random_state makes the centroid initialisation deterministic.
clusterer = KMeans(n_clusters=NUM_CLUSTERS, n_init=10, random_state=RANDOM_SEED)

# Sparse TF-IDF matrix with one row per text, then one cluster id per row.
matrix = vectorizer.fit_transform(texts)
clusters = clusterer.fit_predict(matrix)

cluster_sizes = Counter(clusters)
print(clusters)
print(len(clusters))
print(cluster_sizes)



[ 75 106  17 191 177 117   0 132 172  41   7  14  20  10  36  46 145 170
 105   3  22 124  18  16 173 173   4  87  99  74 119 122 184 188  38 135
   7  84 171  10 126 116 153  12  22  51  44  89  32 181  56  31   2  12
  71  27  34  86   4 120  59  44 171  34  14  61 183  52  56 101 179   0
   7 182  92  30 166  68 152  19  31  58  26 136 134  71  69 176   0  93
 190  44   6 187  41  48 185  72  22 108  25  83  65 127  20  19 152 178
   1  64  33  19  28 110  25  40 193 123 104 150  12 180  49  91  77   3
  37  47 111 158  12  79  54  55  21  85 196 146  86  70  12  60 125 130
 161 162  78  52 194  78   7  95  13  67 168 164 143 163 195  17  26 198
 135 148   8 114 115  77  80 128  88 147  49  44 174  22 142 137 133   5
 175 151   9   0 121 186   8 141  12 160 157 129 118  86  43  54 109  48
  90  82 159  24  50  71 102  63 149 138 131 156  29   7   9  48  53  12
  22  20   3 100 112  12 173   1  24 155 154  59  78  81  59 171  97 140
  94 144  50  78  23   3   3  33  73  52   8  96  5

In [4]:
# Inspect the centroids learned by the fitted KMeans model (shown as the cell's output).
clusterer.cluster_centers_

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [5]:
from collections import defaultdict
from scipy import spatial

# Group the row indices of `matrix` by the cluster each row was assigned to.
cluster_items = defaultdict(list)
for idx, cluster in enumerate(clusters):
    cluster_items[cluster].append(idx)

# Build the diverse sample: from every cluster keep the single item whose
# TF-IDF vector has the highest cosine similarity to the cluster centroid.
diverse_data = []
for cluster in range(NUM_CLUSTERS):
    member_indices = cluster_items.get(cluster)
    if not member_indices:
        # A cluster without members has no representative; skip it instead of
        # letting max() fail on an empty list.
        continue

    cluster_center = clusterer.cluster_centers_[cluster]

    # scipy's cosine distance expects 1-D vectors, so each sparse row is
    # converted to a flat ndarray (rather than a 2-D np.matrix via todense()).
    similarities = [
        1 - spatial.distance.cosine(matrix[item_idx].toarray().ravel(), cluster_center)
        for item_idx in member_indices
    ]

    # On ties the first item with the maximum similarity wins.
    most_central_item_idx = member_indices[similarities.index(max(similarities))]
    diverse_data.append(data[most_central_item_idx])

print(diverse_data)

[{'text': 'Schools should not allow junk food to be sold on campus but it can allow to sell nutritious foods', 'label': 'Schools providing healthy alternatives'}, {'text': 'Schools should not allow junk food to be sold on campus but they should provide healthy snacks for students to choose', 'label': 'Schools providing healthy alternatives'}, {'text': 'Schools should not allow junk food to be sold on campus but students should be allowed to bring junk food if the parents provide it', 'label': 'Students without choice'}, {'text': 'Schools should not allow junk food to be sold on campus but schools could fill snack machines with more nutritious foods', 'label': 'Schools providing healthy alternatives'}, {'text': 'Schools should not allow junk food to be sold on campus but continue to so they make more money', 'label': 'Schools generate money'}, {'text': 'Schools should not allow junk food to be sold on campus but kids really should have a choice as to what they want to eat', 'label': 'St

In [6]:
# Derive the output path from the input path, e.g.
# "..._withprompt.ndjson" -> "..._withprompt_diverse200.ndjson".
output_file = input_file.replace("withprompt", f"withprompt_diverse{NUM_CLUSTERS}")

# str.replace() is a silent no-op when the marker is missing, which would make
# the output path equal to the input path and overwrite the source dataset.
if output_file == input_file:
    raise ValueError(
        f"Cannot derive an output path from {input_file!r}: "
        "it does not contain 'withprompt'."
    )

with open(output_file, "w", encoding="utf-8") as output_handle:
    ndjson.dump(diverse_data, output_handle)
    