ETM model demo
---------------
This notebook shows an example of how to run the ETM model (for Arabic).

We show here how to run fit / predict / eval processes

# 1. Imports, functions and other configurations

In [1]:
import data
import torch
import numpy as np 
import os
from torch import optim
from etm import ETM
from utils import prepare_embedding_matrix
import commentjson
from data_prep.arabic_twitter import ArabicTwitterPreProcess

In [2]:
def _set_optimizer():
    """Build the torch optimizer named in the notebook configuration.

    Reads the notebook-level ``config_dict`` (keys ``optimizer``, ``lr`` and
    ``wdecay`` under ``optimization_params``) and optimizes the parameters of
    the notebook-level ``etm_model``.

    Recognized names are 'adam', 'adagrad', 'adadelta', 'rmsprop' and 'asgd'.
    Any other value prints a notice and falls back to plain SGD, which is
    created with the learning rate only (no weight decay).

    Returns:
        The constructed ``torch.optim`` optimizer.
    """
    opt_cfg = config_dict['optimization_params']
    opt_name = opt_cfg['optimizer']

    if opt_name == 'adam':
        return optim.Adam(etm_model.parameters(), lr=opt_cfg['lr'],
                          weight_decay=opt_cfg['wdecay'])
    if opt_name == 'adagrad':
        return optim.Adagrad(etm_model.parameters(), lr=opt_cfg['lr'],
                             weight_decay=opt_cfg['wdecay'])
    if opt_name == 'adadelta':
        return optim.Adadelta(etm_model.parameters(), lr=opt_cfg['lr'],
                              weight_decay=opt_cfg['wdecay'])
    if opt_name == 'rmsprop':
        return optim.RMSprop(etm_model.parameters(), lr=opt_cfg['lr'],
                             weight_decay=opt_cfg['wdecay'])
    if opt_name == 'asgd':
        # ASGD is the only choice that also gets explicit t0 / lambd values.
        return optim.ASGD(etm_model.parameters(), lr=opt_cfg['lr'],
                          t0=0, lambd=0., weight_decay=opt_cfg['wdecay'])

    # Unknown optimizer name: announce the fallback and use plain SGD.
    print('Defaulting to vanilla SGD')
    return optim.SGD(etm_model.parameters(), lr=opt_cfg['lr'])

In [3]:
# Key used to pick this machine's entry from the per-machine path settings in config.json.
machine = 'AVRAHAMI-PC'
# Read the configuration inside a context manager so the file handle is closed
# as soon as parsing finishes (the bare open() call left it open).
with open('config.json') as config_file:
    config_dict = commentjson.load(config_file)
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed NumPy and PyTorch (CPU and, when present, CUDA) from the configured seed.
np.random.seed(config_dict['random_seed'])
torch.manual_seed(config_dict['random_seed'])
if torch.cuda.is_available():
    torch.cuda.manual_seed(config_dict['random_seed'])

# 2. Data Load and prep

In [4]:
# Optionally (re)build the bag-of-words dataset from the raw data of this machine.
# NOTE(review): eval() executes whatever string the config holds. The flags appear to be
# stored as strings such as "True"/"False" (TODO confirm); consider a safer parser.
if eval(config_dict['prepare_data']):
    preprocess_obj = ArabicTwitterPreProcess(config_dict=config_dict, machine=machine)
    #preprocess_obj._calculate_stats(data_path=config_dict['raw_data_path'][machine])
    preprocess_obj.fit_transform(data_path=config_dict['raw_data_path'][machine], verbose=True)
    # Save the fitted pre-processing object when the config asks for it; the prediction
    # cell later loads an object using this same '<saving_model_f_name>.p' file name.
    if eval(config_dict['data_prep_params']['save_model']):
        preprocess_model_f_name = config_dict['data_prep_params']['saving_model_f_name'] + '.p'
        preprocess_obj.save_obj(f_name=preprocess_model_f_name)

 initial vocabulary size: 8897
  vocabulary size after removing stopwords from list: 8583
  vocabulary after removing stopwords: 8583
 vocabulary after removing words not in train: 8480
 number of documents (train): 283 [this should be equal to 283]
 number of documents (test): 33 [this should be equal to 33]
 number of documents (valid): 17 [this should be equal to 17]
 len(words_tr):  153962
 len(words_ts):  8121
 len(words_ts_h1):  4051
 len(words_ts_h2):  4070
 len(words_va):  8801
 len(np.unique(doc_indices_tr)): 283 [this should be 283]
 len(np.unique(doc_indices_ts)): 33 [this should be 33]
 len(np.unique(doc_indices_ts_h1)): 33 [this should be 33]
 len(np.unique(doc_indices_ts_h2)): 33 [this should be 33]
 len(np.unique(doc_indices_va)): 17 [this should be 17]


  return array(a, dtype, copy=False, order=order, subok=True)


Data is ready !! All data has been saved in C:\Users\avrahami\Documents\Private\IDC\influencer_influenced_project\topic_models\ETM_data\intuview\


In [5]:
# Load the vocabulary and the train / validation / test splits for this machine,
# and record the vocabulary size in the config.
vocab, train, valid, test = data.get_data(os.path.join(config_dict['data_path'][machine]))
vocab_size = config_dict['vocab_size'] = len(vocab)

# Training split
train_tokens, train_counts = train['tokens'], train['counts']
config_dict['num_docs_train'] = len(train_tokens)

# Validation (dev) split
valid_tokens, valid_counts = valid['tokens'], valid['counts']
config_dict['num_docs_valid'] = len(valid_tokens)

# Test split: the full entries plus the two parts stored under the '_1' / '_2' keys
test_tokens, test_counts = test['tokens'], test['counts']
config_dict['num_docs_test'] = len(test_tokens)
test_1_tokens, test_1_counts = test['tokens_1'], test['counts_1']
config_dict['num_docs_test_1'] = len(test_1_tokens)
test_2_tokens, test_2_counts = test['tokens_2'], test['counts_2']
config_dict['num_docs_test_2'] = len(test_2_tokens)

In [6]:
# Embeddings handling (internal/external): `embeddings` stays None unless the config
# says the embeddings are NOT trained, i.e. an embeddings file is supplied to be used
# as a pre-trained model.
embeddings = None
# NOTE(review): eval() runs the config string as Python code; the flag appears to be a
# string such as "True"/"False" (TODO confirm).
if not eval(config_dict['model_params']['train_embeddings']):
    # Build the embedding matrix for the vocabulary, then turn the returned array into
    # a tensor on the selected device.
    embeddings = torch.from_numpy(
        prepare_embedding_matrix(
            emb_data_path=config_dict['emb_path'][machine],
            emb_size=config_dict['model_params']['emb_size'],
            vocab=vocab,
            random_seed=config_dict['random_seed'],
        )
    ).to(device)
    # Keep the tensor's size in the config.
    config_dict['embeddings_dim'] = embeddings.size()

print('=*'*100)
print(f"Training an Embedded Topic Model on {config_dict['dataset'].upper()}")
print('=*'*100)

=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*
Training an Embedded Topic Model on INTUVIEW
=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*


# 3. Model Definition

In [7]:
# Define the checkpoint location.
# exist_ok=True replaces the separate exists() check, so there is no gap between
# checking for the directory and creating it.
os.makedirs(config_dict['saving_models_path'][machine], exist_ok=True)

if config_dict['optimization_params']['mode'] == 'eval':
    # Evaluation mode: use the checkpoint path given in the config.
    ckpt = config_dict['evaluation_params']['load_from']
else:
    # Any other mode: build the checkpoint path inside the models directory, with a
    # file name that encodes the dataset and the main hyper-parameters of the run.
    ckpt = \
        os.path.join(config_dict['saving_models_path'][machine],
                     'etm_{}_K_{}_Htheta_{}_Optim_{}_Clip_{}_ThetaAct_{}_Lr_{}_Bsz_{}_RhoSize_{}_'
                     'trainEmbeddings_{}'.format(config_dict['dataset'], config_dict['model_params']['num_topics'],
                                                 config_dict['model_params']['t_hidden_size'],
                                                 config_dict['optimization_params']['optimizer'],
                                                 config_dict['optimization_params']['clip'],
                                                 config_dict['model_params']['theta_act'],
                                                 config_dict['optimization_params']['lr'],
                                                 config_dict['batch_size'],
                                                 config_dict['model_params']['rho_size'],
                                                 config_dict['model_params']['train_embeddings']))

In [8]:
# Instantiate the ETM from the config (with the optional pre-trained embeddings)
# and print its layer layout.
etm_model = ETM(config_dict=config_dict, machine=machine, embeddings=embeddings)
print(f'model: {etm_model}')
# Created after the model because _set_optimizer() reads etm_model's parameters.
optimizer = _set_optimizer()

model: ETM(
  (t_drop): Dropout(p=0.0)
  (theta_act): ReLU()
  (rho): Linear(in_features=300, out_features=8480, bias=False)
  (alphas): Linear(in_features=300, out_features=30, bias=False)
  (q_theta): Sequential(
    (0): Linear(in_features=8480, out_features=800, bias=True)
    (1): ReLU()
    (2): Linear(in_features=800, out_features=800, bias=True)
    (3): ReLU()
  )
  (mu_q_theta): Linear(in_features=800, out_features=30, bias=True)
  (logsigma_q_theta): Linear(in_features=800, out_features=30, bias=True)
)


# 4. Fitting a new model

**** use this option in case you want to fit a new model rather than using an existing one ****

In [9]:
# Fit the model: hand over the optimizer, the training split, both test parts,
# the vocabulary and the checkpoint path.
etm_model.fit(
    optimizer=optimizer,
    train_tokens=train_tokens,
    train_counts=train_counts,
    test_1_tokens=test_1_tokens,
    test_1_counts=test_1_counts,
    test_2_tokens=test_2_tokens,
    test_2_counts=test_2_counts,
    vocab=vocab,
    ckpt=ckpt,
)

Epoch: 1 .. batch: 2/15 .. LR: 0.005 .. KL_theta: 0.02 .. Rec_loss: 5846.3 .. NELBO: 5846.32
Epoch: 1 .. batch: 4/15 .. LR: 0.005 .. KL_theta: 0.03 .. Rec_loss: 4777.97 .. NELBO: 4778.0
Epoch: 1 .. batch: 6/15 .. LR: 0.005 .. KL_theta: 0.05 .. Rec_loss: 4098.88 .. NELBO: 4098.93
Epoch: 1 .. batch: 8/15 .. LR: 0.005 .. KL_theta: 0.07 .. Rec_loss: 3752.86 .. NELBO: 3752.93
Epoch: 1 .. batch: 10/15 .. LR: 0.005 .. KL_theta: 0.12 .. Rec_loss: 3945.27 .. NELBO: 3945.39
Epoch: 1 .. batch: 12/15 .. LR: 0.005 .. KL_theta: 0.27 .. Rec_loss: 5041.03 .. NELBO: 5041.3
Epoch: 1 .. batch: 14/15 .. LR: 0.005 .. KL_theta: 0.66 .. Rec_loss: 4808.43 .. NELBO: 4809.09
****************************************************************************************************
Epoch----->1 .. LR: 0.005 .. KL_theta: 0.66 .. Rec_loss: 4808.43 .. NELBO: 4809.09
****************************************************************************************************
*********************************************************

KeyboardInterrupt: 

# 5. Prediction on new data using an existing model

In [None]:
# Reload the fitted pre-processing object, using the same '<saving_model_f_name>.p'
# file name the data-prep cell saves under.
preprocess_model_f_name = config_dict['data_prep_params']['saving_model_f_name'] + '.p'
preprocess_obj = ArabicTwitterPreProcess.load_obj(f_path=config_dict['saving_models_path'][machine],
                                                  f_name=preprocess_model_f_name)
# Convert the raw documents into bag-of-words tokens / counts with the loaded object.
bow_new_docs_tokens, bow_new_docs_counts = preprocess_obj.transform(data_path=config_dict['raw_data_path'][machine])
# Put every new document in a single batch: indices 0 .. n_docs - 1.
# torch.arange builds the index tensor directly instead of going through a Python list.
data_batch = data.get_batch(bow_new_docs_tokens, bow_new_docs_counts,
                            torch.arange(len(bow_new_docs_tokens)), vocab_size, device)
# Divide each row by its total so every document's entries sum to one.
# NOTE(review): the evaluation cell only normalizes when
# config_dict['optimization_params']['bow_norm'] is set; here it is unconditional -
# confirm which behavior predict_proba expects.
sums = data_batch.sum(1).unsqueeze(1)
normalized_data_batch = data_batch / sums
# Inference only: switch the model to evaluation mode and skip gradient tracking.
etm_model.eval()
with torch.no_grad():
    new_docs_prediction = etm_model.predict_proba(bow_tokens=normalized_data_batch)

# 6. Evaluating an existing model

In [None]:
# eval option
# Load the checkpointed model object. map_location lets a checkpoint that was saved on
# a GPU machine be loaded on a CPU-only one.
# NOTE(review): torch.load unpickles the file, so only load checkpoints you trust.
# Recent PyTorch releases may also need weights_only=False to load a whole pickled
# model - confirm against the installed version.
with open(ckpt, 'rb') as f:
    model = torch.load(f, map_location=device)
model = model.to(device)
model.eval()

# Every call below goes through the loaded `model`, so the printed topics and the
# computed statistics describe the same checkpoint (the in-memory `etm_model` may hold
# different weights).
print('Visualizing model quality before training...')
model.print_words_per_topic(words_amount=config_dict['evaluation_params']['num_words'],
                            vocab=vocab, lang='en')
with torch.no_grad():
    # get document completion perplexities
    test_ppl = model.evaluate(source='test', test_1_tokens=test_1_tokens, test_1_counts=test_1_counts,
                              test_2_tokens=test_2_tokens, test_2_counts=test_2_counts,
                              train_tokens=train_tokens, vocab=vocab, tc=config_dict['evaluation_params']['tc'],
                              td=config_dict['evaluation_params']['td'])

    # get most used topics: walk over the training documents in batches
    indices = torch.arange(config_dict['num_docs_train'])
    indices = torch.split(indices, config_dict['batch_size'])
    thetaAvg = torch.zeros(1, config_dict['model_params']['num_topics']).to(device)
    thetaWeightedAvg = torch.zeros(1, config_dict['model_params']['num_topics']).to(device)
    cnt = 0  # running total of all batch entries, used to normalize the weighted average
    for idx, ind in enumerate(indices):
        data_batch = data.get_batch(train_tokens, train_counts, ind, config_dict['vocab_size'], device)
        sums = data_batch.sum(1).unsqueeze(1)  # per-document totals, kept as a column
        cnt += sums.sum(0).squeeze().cpu().numpy()
        # NOTE(review): bow_norm is used as a plain truth value here, while other config
        # flags in this notebook go through eval(); a string "False" would count as true -
        # confirm the type stored in the config.
        if config_dict['optimization_params']['bow_norm']:
            normalized_data_batch = data_batch / sums
        else:
            normalized_data_batch = data_batch
        theta, _ = model.get_theta(normalized_data_batch)
        # plain average of theta over the training documents
        thetaAvg += theta.sum(0).unsqueeze(0) / config_dict['num_docs_train']
        # average of theta weighted by each document's total
        weighed_theta = sums * theta
        thetaWeightedAvg += weighed_theta.sum(0).unsqueeze(0)
        if idx % 100 == 0 and idx > 0:
            print('batch: {}/{}'.format(idx, len(indices)))
    thetaWeightedAvg = thetaWeightedAvg.squeeze().cpu().numpy() / cnt
    print('\nThe 10 most used topics are {}'.format(thetaWeightedAvg.argsort()[::-1][:10]))

    # show topics, first with the default language setting and then with lang='en'
    model.print_words_per_topic(words_amount=config_dict['evaluation_params']['num_words'], vocab=vocab)
    model.print_words_per_topic(words_amount=config_dict['evaluation_params']['num_words'],
                                vocab=vocab, lang='en')