In [22]:
# Python packages
import matplotlib.pyplot as plt
import pickle
import numpy as np
import sys
import os
import pickle
import signal
import argparse
import traceback
import json
import torch
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import pandas as pd

In [23]:
# Project libraries
sys.path.insert(1, '../src/')
import embedding.factory as ebd
import dataset.loader as loader
import train.factory as train_utils

In [24]:
cuda0 = torch.device('cuda:0')

## Arguments

In [25]:
# Build args
parser = argparse.ArgumentParser()
parser.add_argument("--data_path", type=str,
                        default="data/huffpost.json", # og: reuters
                        help="path to dataset")
parser.add_argument("--dataset", type=str, default="huffpost", # og: reuters
                    help="name of the dataset. "
                    "Options: [20newsgroup, amazon, huffpost, "
                    "reuters, rcv1, fewrel]")
parser.add_argument("--n_train_class", type=int, default=15,
                    help="number of meta-train classes")
parser.add_argument("--n_val_class", type=int, default=5,
                    help="number of meta-val classes")
parser.add_argument("--n_test_class", type=int, default=11,
                    help="number of meta-test classes")
parser.add_argument("--mode", type=str, default="test",
                    help=("Running mode."
                          "Options: [train, test, finetune]"
                          "[Default: test]"))
parser.add_argument("--wv_path", type=str,
                    default="./",
                    help="path to word vector cache")
parser.add_argument("--word_vector", type=str, default="wiki.en.vec",
                    help=("Name of pretrained word embeddings."))
parser.add_argument("--finetune_ebd", action="store_true", default=False,
                    help=("Finetune embedding during meta-training"))
parser.add_argument("--bert", default=False, action="store_true",
                    help=("set true if use bert embeddings "
                          "(only available for sent-level datasets: "
                          "huffpost, fewrel"))
parser.add_argument("--auxiliary", type=str, nargs="*", default=[],
                    help=("auxiliary embeddings (used for fewrel). "
                          "Options: [pos, ent]"))
parser.add_argument("--embedding", type=str, default="avg",
                    help=("document embedding method. Options: "
                          "[avg, tfidf, meta, oracle, cnn]"))
parser.add_argument("--meta_w_target", action="store_true", default=False,
                    help="use target importance score")
parser.add_argument("--cuda", type=int, default=-1,
                    help="cuda device, -1 for cpu")
parser.add_argument("--snapshot", type=str, default="",
                    help="path to the pretraiend weights")
parser.add_argument("--meta_idf", action="store_true", default=False,
                    help="use idf")
parser.add_argument("--meta_iwf", action="store_true", default=False,
                    help="use iwf")
parser.add_argument("--meta_w_target_lam", type=float, default=1,
                    help="lambda for computing w_target")
parser.add_argument("--meta_target_entropy", action="store_true", default=False,
                    help="use inverse entropy to model task-specific importance")
parser.add_argument("--meta_ebd", action="store_true", default=False,
                    help="use word embedding into the meta model "
                    "(showing that revealing word identity harm performance)")
parser.add_argument("--meta_cos_sims", action='store_true', default=False,
                     help="computes cosine similarity to category word embeddings")
parser.add_argument("--seed", type=int, default=330, help="seed")
parser.add_argument("--dropout", type=float, default=0.1, help="drop rate")
parser.add_argument("--lr", type=float, default=1e-3, help="learning rate")
parser.add_argument("--patience", type=int, default=20, help="patience")
parser.add_argument("--clip_grad", type=float, default=None,
                    help="gradient clipping")
parser.add_argument("--save", action="store_true", default=False,
                    help="train the model")
parser.add_argument("--notqdm", action="store_true", default=False,
                    help="disable tqdm")
parser.add_argument("--result_path", type=str, default="")


# Populate parameters
args = parser.parse_args(["--data_path", "../data/huffpost.json",
                            "--dataset", "huffpost",
                            "--n_train_class", "20",
                            "--n_val_class", "5",
                            "--n_test_class", "16",
                            "--wv_path", "../",
                            "--embedding", "idf",
                            "--cuda", "0",
                            "--finetune"
                            ])

print(vars(args))

{'data_path': '../data/huffpost.json', 'dataset': 'huffpost', 'n_train_class': 20, 'n_val_class': 5, 'n_test_class': 16, 'mode': 'test', 'wv_path': '../', 'word_vector': 'wiki.en.vec', 'finetune_ebd': True, 'bert': False, 'auxiliary': [], 'embedding': 'idf', 'meta_w_target': False, 'cuda': 0, 'snapshot': '', 'meta_idf': False, 'meta_iwf': False, 'meta_w_target_lam': 1, 'meta_target_entropy': False, 'meta_ebd': False, 'meta_cos_sims': False, 'seed': 330, 'dropout': 0.1, 'lr': 0.001, 'patience': 20, 'clip_grad': None, 'save': False, 'notqdm': False, 'result_path': ''}


## Load, Preprocess Data

Using loader package, load train, val, and test data.

In [26]:
train_data, val_data, test_data, vocab = loader.load_dataset(args)

20/05/20 15:09:36: Loading data from ../data/huffpost.json
20/05/20 15:09:36: Class balance:
{20: 900, 2: 900, 28: 900, 18: 900, 0: 900, 22: 900, 12: 900, 17: 900, 10: 900, 7: 900, 11: 900, 9: 900, 3: 900, 21: 900, 30: 900, 25: 900, 27: 900, 38: 900, 40: 900, 37: 900, 14: 900, 35: 900, 26: 900, 23: 900, 29: 900, 6: 900, 15: 900, 34: 900, 24: 900, 33: 900, 32: 900, 1: 900, 5: 900, 13: 900, 4: 900, 19: 900, 16: 900, 8: 900, 31: 900, 36: 900, 39: 900}
20/05/20 15:09:36: Avg len: 11.480569105691057
20/05/20 15:09:36: Loading word vectors


I0520 15:09:36.647391 140548226733888 vocab.py:431] Loading vectors from ../wiki.en.vec.pt


20/05/20 15:09:39: Total num. of words: 8218, word vector dimension: 300
20/05/20 15:09:39: Num. of out-of-vocabulary words(they are initialized to zeros): 236
20/05/20 15:09:39: #train 18000, #val 4500, #test 14400


In [27]:
val_data['text']
train_data['text'].shape

(18000, 31)

In [28]:
val_data['text'].shape

(4499, 27)

In [29]:
test_data['text'].shape

(14400, 44)

In [17]:
# Aggregate all training, val, test data
data = {}

for key in test_data:
    print('key: ', key)
    if key == 'vocab_size':
        data['vocab_size'] = test_data[key]
        continue
    elif key == 'idf':
        data['idf'] = test_data['idf']
        continue
    elif key == 'text':
        continue
        
    ax = 0 if key == 'text' else None
    concat = np.concatenate(
            (test_data[key], val_data[key], train_data[key]), axis=ax
        )
    
    if key == 'text':
        data[key] = torch.tensor(concat, device=cuda0)
    else:
        data[key] = concat

# assert len(data['text']) == len(test_data['text']) + len(train_data['text']) + len(val_data['text'])

# print('Dataset has {} documents, {} different labels and {} size vocabulary.'.format(
#     len(data['text']), max(data['label']+1), data['vocab_size'])
# )

key:  text
key:  text_len
key:  label
key:  raw
key:  vocab_size
key:  idf


In [7]:
data.keys()

dict_keys(['text', 'text_len', 'label', 'raw', 'vocab_size', 'idf'])

In [8]:
print(set(data['label']))

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}


Due to memory constraints on the GPU, create a data "queue" of chunks of user-specified size, and embed the data one chunk (several documents) at a time. Each row in each resulting tensor corresponds to the document embedding (AVG, WORDEBD) of a document. We can access the original labels in __data__ and the tensors in __embed_queue__.

In [30]:
def chunk_data(chunk_size, data):
    """
    Chunks data into manageable sizes for processing
    
    Args:
        chunk_size (int): number of words for each chunk
        data (dict): data dictionary w keys 'text', 'vocab_size', etc
    
    Returns:
        list of dictionaries in order
    """
    chunk_list = []
    total_size = len(data['text'])
    cumu_size = 0
    
    while cumu_size < total_size:
        next_size = min(cumu_size + chunk_size, total_size)
        
        # Create chunk
        chunk = {
            'text' : torch.tensor(data['text'][cumu_size:next_size], device=cuda0),
            'text_len' : torch.tensor(data['text_len'][cumu_size:next_size], device=cuda0),
            'label' : torch.tensor(data['label'][cumu_size:next_size], device=cuda0),
            'raw' : data['raw'][cumu_size:next_size],
            'vocab_size' : data['vocab_size'],
            'idf' : torch.tensor(data['idf'], device=cuda0)
        }

        chunk_list.append(chunk)
        
        # Update cumulative size
        cumu_size = next_size
        
    return chunk_list

In [32]:
train_data_queue = chunk_data(50, train_data)
train_data_queue

[{'text': tensor([[  41,  835, 1980,  ...,    0,    0,    0],
          [3653, 6058, 6168,  ...,    0,    0,    0],
          [1394, 2501, 2311,  ...,    0,    0,    0],
          ...,
          [7923, 3352,    1,  ...,    0,    0,    0],
          [ 168,    1,  116,  ...,    0,    0,    0],
          [   5, 2184,    7,  ...,    0,    0,    0]], device='cuda:0'),
  'text_len': tensor([15, 10, 13, 13, 12, 14, 11, 15, 10,  7, 17, 14, 20, 13, 12,  9, 15, 13,
           8, 13, 10, 13, 12,  9, 14, 15,  9, 12,  9, 12, 13, 14, 13, 11, 14, 14,
          11, 11, 10, 13, 12, 12, 11, 11, 15, 12, 13, 10, 13, 16],
         device='cuda:0'),
  'label': tensor([ 2,  2,  2,  2,  2,  2,  2,  2,  2,  2, 18,  0,  0,  0,  0,  0,  0,  0,
           2, 12, 12, 12,  0, 12,  0,  2, 17,  0, 10, 10, 10, 10,  2,  2,  2,  2,
           2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  0,  0,  0,  0],
         device='cuda:0'),
  'raw': array([list(['will', 'smith', 'joins', 'diplo', 'and', 'nicky', 'jam', 'for', 'the', '20

In [33]:
val_data_queue = chunk_data(50, val_data)

In [34]:
test_data_queue = chunk_data(50, test_data)

In [35]:
model = {}
model["ebd"] = ebd.get_embedding(vocab, args)
embed = model['ebd']

20/05/20 15:11:42, Building embedding
Embedding type: WORDEBD
Using:  idf
20/05/20 15:11:42, Building embedding


In [36]:
# Embedded documents queue
embed_queue = [embed(chunk) for data_queue in [train_data_queue, val_data_queue, test_data_queue] for chunk in data_queue]

In [39]:
# Combine the embeddings together and create field in data dict
data = {}
data['ebd'] = torch.cat(embed_queue).cpu().detach().numpy()

# Combine the labels together
labels_concat = np.concatenate(
        (train_data['label'], val_data['label'], test_data['label']), axis=0
)

# Create df
all_df = pd.DataFrame(data['ebd'])

In [40]:
data

{'ebd': array([[-0.22410774, -0.66366106, -0.15467848, ...,  0.30718607,
          0.24706693,  0.10595036],
        [ 0.22178157, -0.47861394, -0.02219218, ...,  0.5625295 ,
          0.43150347,  0.39780974],
        [-0.32169738, -0.5133774 , -0.79519856, ...,  0.48757327,
          0.19384898,  0.6722925 ],
        ...,
        [-0.08530264, -0.2787214 , -0.16177568, ...,  0.23256494,
          0.14917636,  0.12934354],
        [-0.3240824 ,  0.23014481, -0.10165082, ...,  0.33413318,
          0.11176155,  0.05437509],
        [-0.39960825, -0.12501389, -0.1742226 , ...,  0.28657064,
          0.8458243 ,  0.17682426]], dtype=float32)}

In [41]:
all_df.head(100)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.224108,-0.663661,-0.154678,0.234804,-0.306982,0.182181,-0.296416,-0.125611,-0.009206,-0.162430,...,0.190410,-0.796847,0.207132,-0.006545,-0.151176,-0.485112,-0.136596,0.307186,0.247067,0.105950
1,0.221782,-0.478614,-0.022192,0.160257,-0.377064,0.115169,0.348514,0.061163,0.789484,1.566700,...,-0.277619,-0.478441,0.500912,-0.060153,-0.164739,-0.514880,-0.373987,0.562530,0.431503,0.397810
2,-0.321697,-0.513377,-0.795199,0.598744,-0.287392,0.379962,0.173302,-0.543094,0.095606,0.409518,...,0.057885,-0.248521,0.429842,0.301133,-0.004152,-0.512286,0.354612,0.487573,0.193849,0.672292
3,0.115670,-0.221217,-0.796924,1.869080,-0.082062,-0.053822,-0.605099,-0.982337,0.118522,0.333462,...,0.218795,-0.754872,0.179005,0.112238,-0.395008,-0.360484,0.002376,0.572281,0.652952,0.372605
4,-0.347938,-1.111944,-1.276474,0.521286,-0.851491,0.428476,-0.024385,-0.540190,-0.092147,1.858441,...,0.858943,0.636974,0.421224,0.051162,0.231640,-0.551994,0.225930,1.227126,0.268547,-0.689375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.396384,-0.325574,-0.880467,0.899110,-0.152569,0.167776,0.384775,-0.345989,0.223654,0.252341,...,0.258658,-0.712029,0.459856,0.250926,-0.082052,-0.332096,0.039326,0.387844,0.450022,0.188213
96,-0.616283,0.056000,-1.290311,0.245714,0.065269,-0.076528,-0.583878,-0.680816,-0.296368,-0.042026,...,0.529815,-0.021669,0.425551,0.265903,-0.363561,-0.667065,-0.028112,0.487770,0.607623,-0.538834
97,-0.686189,-0.459373,-0.952297,0.792760,-0.603839,-0.066368,0.558537,-0.194086,0.143084,1.474839,...,0.638190,-0.099095,0.034831,0.028046,-0.505856,-0.167104,-1.117423,1.413786,0.426709,-0.157720
98,-0.002470,-0.582074,-0.599482,0.050914,-0.699110,-0.464582,0.200904,-0.756123,-0.208732,0.584731,...,0.438685,-0.836577,0.721480,0.333157,-0.217179,-0.023522,-1.088730,1.224047,-0.602730,0.574311


## Add labels to DataFrame

In [42]:
all_df['label'] = labels_concat
all_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,label
0,-0.224108,-0.663661,-0.154678,0.234804,-0.306982,0.182181,-0.296416,-0.125611,-0.009206,-0.16243,...,-0.796847,0.207132,-0.006545,-0.151176,-0.485112,-0.136596,0.307186,0.247067,0.10595,2
1,0.221782,-0.478614,-0.022192,0.160257,-0.377064,0.115169,0.348514,0.061163,0.789484,1.5667,...,-0.478441,0.500912,-0.060153,-0.164739,-0.51488,-0.373987,0.56253,0.431503,0.39781,2
2,-0.321697,-0.513377,-0.795199,0.598744,-0.287392,0.379962,0.173302,-0.543094,0.095606,0.409518,...,-0.248521,0.429842,0.301133,-0.004152,-0.512286,0.354612,0.487573,0.193849,0.672292,2
3,0.11567,-0.221217,-0.796924,1.86908,-0.082062,-0.053822,-0.605099,-0.982337,0.118522,0.333462,...,-0.754872,0.179005,0.112238,-0.395008,-0.360484,0.002376,0.572281,0.652952,0.372605,2
4,-0.347938,-1.111944,-1.276474,0.521286,-0.851491,0.428476,-0.024385,-0.54019,-0.092147,1.858441,...,0.636974,0.421224,0.051162,0.23164,-0.551994,0.22593,1.227126,0.268547,-0.689375,2


In [43]:
# Take average per label
oracle_embs = all_df.groupby(['label']).mean()
oracle_embs = np.array(oracle_embs).tolist()
oracle_embs = dict(zip([i for i in range(41)], oracle_embs))

oracle_embs

{0: [-0.24744538962841034,
  -0.5046172738075256,
  -0.4822373688220978,
  0.5088773965835571,
  -0.5045238733291626,
  -0.011148476973176003,
  0.03872648626565933,
  -0.773234486579895,
  -0.10243730992078781,
  0.689602792263031,
  0.06373469531536102,
  0.24458256363868713,
  -0.31368520855903625,
  -0.1621929556131363,
  0.18625405430793762,
  -0.8799471259117126,
  -0.20658758282661438,
  -0.11719874292612076,
  0.09772662073373795,
  1.0643434524536133,
  -0.44206345081329346,
  0.8334650993347168,
  -0.576045036315918,
  -0.5252523422241211,
  -0.1886855959892273,
  -0.18498684465885162,
  -0.07737400382757187,
  0.0374327227473259,
  0.12606851756572723,
  0.49724599719047546,
  -0.4991646707057953,
  0.6564772129058838,
  -0.8540412187576294,
  0.2904835045337677,
  0.11924148350954056,
  -0.5102312564849854,
  0.10537828505039215,
  -0.5045025944709778,
  0.2561054527759552,
  -0.30720221996307373,
  0.30104249715805054,
  -0.09387869387865067,
  -0.213747039437294,
  0.0817

In [44]:
with open('../cached_embeds/huffpost_oracle_embed_idf.json', 'w') as fp:
    json.dump(oracle_embs, fp)

### Add Topic Embeddings to Dataframe

In [18]:
with open('../20news_reps_cache_.json') as json_file:
                topics = json.load(json_file) 
topics = pd.DataFrame.from_dict(topics, orient='index')

In [19]:
topics

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.15246,-0.59011,0.079915,0.33342,-0.088485,0.1127,-0.11492,-0.57978,-0.036263,0.00668,...,-0.01348,-0.017638,0.62089,-0.043418,0.05267,0.56599,-0.065357,-0.50771,0.12037,0.15542
1,0.15233,0.018883,0.24844,0.30876,0.32193,0.10375,-0.057328,-0.046788,-0.040007,-0.10382,...,0.013078,-0.10709,-0.44032,-0.039817,0.092986,0.11956,0.21673,0.29279,0.19952,0.37176
2,-0.2688,-0.027697,-0.17613,0.006205,0.27321,0.25153,-0.038377,-0.22465,-0.079456,-0.063992,...,0.10719,0.075985,-0.008486,0.081589,0.020419,-0.29552,-0.23723,0.27026,0.26611,-0.30905
3,-0.11862,-0.42752,0.046964,0.087433,0.07944,0.081076,0.11938,-0.40002,0.1625,0.26994,...,0.005722,-0.055203,0.026619,-0.24014,0.16489,0.13081,-0.019985,-0.33025,0.15913,-0.10092
4,-0.17694,0.1611,0.10595,-0.06723,-0.22959,0.41175,-0.078765,-0.15186,-0.090426,0.41052,...,0.24587,-0.18287,-0.23034,0.23025,0.6196,0.082631,-0.15504,0.051455,0.26191,-0.15784
5,-0.021599,-0.014969,0.23298,0.50639,0.12021,0.15489,0.33694,-0.048732,-0.18895,-0.092081,...,-0.083593,-0.24159,0.19853,-0.017753,0.59707,0.093406,-0.30369,0.12709,0.45448,0.29717
6,0.090756,0.057117,0.17524,0.25199,0.21976,-0.04276,-0.3945,-0.15295,-0.091411,0.2231,...,0.23829,-0.27539,-0.081909,0.11334,0.34557,-0.045735,-0.27067,0.018656,-0.16209,0.046151
7,0.092204,-0.090641,0.091374,-0.073267,-0.081702,-0.13919,-0.34825,-0.2326,-0.51478,0.006573,...,0.25068,-0.361,-0.021109,-0.12042,0.65658,-0.057016,-0.16131,0.19645,-0.18411,-0.15988
8,0.090107,-0.11214,-0.18616,0.3553,0.044789,0.49398,-0.37122,-0.091334,0.031132,-0.03473,...,0.61804,-0.15592,0.19028,-0.006354,0.01719,-0.11773,-0.26105,0.13706,0.24814,-0.057017
9,-0.28317,-0.1123,0.039923,0.45703,-0.33084,0.44553,0.12635,-0.21507,-0.041685,0.37401,...,0.069616,-0.077255,-0.029674,-0.39678,-0.079598,0.13032,-0.22179,0.12934,-0.23284,-0.052929


In [20]:
# Add embeddings
all_df = pd.concat([all_df, topics])

# Add topic labels to overall label list
labels = np.concatenate((data['label'], range(20)), axis=None) 

# Create indicators whether a row is a topic or a document
is_topic = [0 if i<len(data['text']) else 1 for i in range(len(all_df))]

In [21]:
assert len(all_df) == len(labels) == len(is_topic)

## PCA

PCA all embeddings, then add other labels.

In [22]:
# Two components for visualization
pca = PCA(n_components=2)
pca_df = pd.DataFrame(pca.fit_transform(all_df))
pca_df.head()

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
pca_df['label'] = labels
pca_df['is_topic'] = is_topic
pca_df.head()

In [None]:
label_dict = {
        'talk.politics.mideast': 0,
        'sci.space': 1,
        'misc.forsale': 2,
        'talk.politics.misc': 3,
        'comp.graphics': 4,
        'sci.crypt': 5,
        'comp.windows.x': 6,
        'comp.os.ms-windows.misc': 7,
        'talk.politics.guns': 8,
        'talk.religion.misc': 9,
        'rec.autos': 10,
        'sci.med': 11,
        'comp.sys.mac.hardware': 12,
        'sci.electronics': 13,
        'rec.sport.hockey': 14,
        'alt.atheism': 15,
        'rec.motorcycles': 16,
        'comp.sys.ibm.pc.hardware': 17,
        'rec.sport.baseball': 18,
        'soc.religion.christian': 19,
    }

classes = [
        'mideast', 'space', 'sale', 'politics', 'graphics',
        'cryptography', 'windows', 'microsoft', 'guns',
        'religion', 'autos', 'medicine', 'mac', 'electronics',
        'hockey', 'atheism', 'motorcycles', 'pc', 'baseball', 'christian'
]
pca_df['class'] = [classes[i] for i in pca_df['label']]
pca_df.head()

## Plot

Randomly samples 5 categories and plots the PCA-ed document embeddings along with the topic embedding.

In [None]:
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from random import sample

In [None]:
def sample_topics(k=5, samples = None):
    """
    Samples k topics and plots the PCA-ed document embeddings and topic embeddings.
    """
    # Select from df
    samples = set(sample(range(20), k)) if samples == None else samples
    samp_df = pca_df[pca_df['label'].isin(samples)]
    topics = pca_df[pca_df['label'].isin(samples) & is_topic==1]
    
    # Plot
    samp_df.plot.scatter(x=0, y=1,
                      c='label',
                      s=2,
                      colormap='viridis')
    
    plt.scatter(topics[0], topics[1], s=50, marker='o', edgecolors= "black", c=topics['label'])
    
    for i, row in topics.iterrows():
        plt.text(row[0]+0.05, row[1]+0.07, row['class'], fontsize=9)
    
    plt.title("Average Document and Topic Embeddings")

In [None]:
sample_topics(4)