This is the notebook for extracting theta in order to assign topics to documents.

Before running this notebook, run
1. 1.0-rw-preProcessText_makeBOW or 1.0-rw-PreProcessText_makeBOW_forTopicAssignment
2. 1.0-rw-preProcessText2_embeddings
3. 1.1-rw-main  (to train the model)


In [1]:
# from Adji Dieng GitHub ETM

from __future__ import print_function

import argparse
import torch
import pickle 
import numpy as np 
import os 
import math 
import random 
import sys
import matplotlib.pyplot as plt  
import data
import scipy.io
import pandas as pd

from torch import nn, optim
from torch.nn import functional as F

from etm import ETM
from utils import nearest_neighbors, get_topic_coherence, get_topic_diversity


In [2]:
parser = argparse.ArgumentParser(description='The Embedded Topic Model')



In [3]:
### data and file related arguments
parser.add_argument('--dataset', type=str, default='min_df_3', help='name of corpus') 
parser.add_argument('--data_path', type=str, default='data/min_df_3', help='directory containing data') 
parser.add_argument('--emb_path', type=str, default='data/min_df_3_embeddings.txt', help='directory containing word embeddings')
parser.add_argument('--save_path', type=str, default='./results', help='path to save results')
parser.add_argument('--batch_size', type=int, default=1000, help='input batch size for training')


### model-related arguments
parser.add_argument('--num_topics', type=int, default=150, help='number of topics')
parser.add_argument('--rho_size', type=int, default=300, help='dimension of rho')
parser.add_argument('--emb_size', type=int, default=300, help='dimension of embeddings')
parser.add_argument('--t_hidden_size', type=int, default=800, help='dimension of hidden space of q(theta)')
parser.add_argument('--theta_act', type=str, default='relu', help='tanh, softplus, relu, rrelu, leakyrelu, elu, selu, glu)')
parser.add_argument('--train_embeddings', type=int, default=0, help='whether to fix rho or train it') 

### optimization-related arguments
parser.add_argument('--lr', type=float, default=0.005, help='learning rate')
parser.add_argument('--lr_factor', type=float, default=4.0, help='divide learning rate by this...')
parser.add_argument('--epochs', type=int, default=100, help='number of epochs to train...150 for 20ng 100 for others')
parser.add_argument('--mode', type=str, default='eval model', help='train or eval model') # default='train'
parser.add_argument('--optimizer', type=str, default='adam', help='choice of optimizer')
parser.add_argument('--seed', type=int, default=2019, help='random seed (default: 1)')
parser.add_argument('--enc_drop', type=float, default=0.0, help='dropout rate on encoder')
parser.add_argument('--clip', type=float, default=0.0, help='gradient clipping')
parser.add_argument('--nonmono', type=int, default=10, help='number of bad hits allowed')
parser.add_argument('--wdecay', type=float, default=1.2e-6, help='some l2 regularization')
parser.add_argument('--anneal_lr', type=int, default=0, help='whether to anneal the learning rate or not')
parser.add_argument('--bow_norm', type=int, default=1, help='normalize the bows or not')

### evaluation, visualization, and logging-related arguments
parser.add_argument('--num_words', type=int, default=10, help='number of words for topic viz')
parser.add_argument('--log_interval', type=int, default=2, help='when to log training')
parser.add_argument('--visualize_every', type=int, default=10, help='when to visualize results')
parser.add_argument('--eval_batch_size', type=int, default=1000, help='input batch size for evaluation')
parser.add_argument('--load_from', type=str, default='', help='the name of the ckpt to eval from')
parser.add_argument('--tc', type=int, default=0, help='whether to compute topic coherence or not')
parser.add_argument('--td', type=int, default=0, help='whether to compute topic diversity or not')

_StoreAction(option_strings=['--td'], dest='td', nargs=None, const=None, default=0, type=<class 'int'>, choices=None, help='whether to compute topic diversity or not', metavar=None)

In [4]:
args, unknown = parser.parse_known_args()

#args = parser.parse_args()
# based on solution from:
# https://stackoverflow.com/questions/48796169/how-to-fix-ipykernel-launcher-py-error-unrecognized-arguments-in-jupyter

print(args)

Namespace(anneal_lr=0, batch_size=1000, bow_norm=1, clip=0.0, data_path='data/min_df_3', dataset='min_df_3', emb_path='data/min_df_3_embeddings.txt', emb_size=300, enc_drop=0.0, epochs=100, eval_batch_size=1000, load_from='', log_interval=2, lr=0.005, lr_factor=4.0, mode='eval model', nonmono=10, num_topics=150, num_words=10, optimizer='adam', rho_size=300, save_path='./results', seed=2019, t_hidden_size=800, tc=0, td=0, theta_act='relu', train_embeddings=0, visualize_every=10, wdecay=1.2e-06)


In [5]:
args.load_from = "etm_min_df_3_K_150_Htheta_800_Optim_adam_Clip_0.0_ThetaAct_relu_Lr_0.005_Bsz_1000_RhoSize_300_trainEmbeddings_0"

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print('\n')
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    print("yay")
    torch.cuda.manual_seed(args.seed)
else:
    print("nope")



nope


In [7]:
## get data
# 1. vocabulary
vocab, train, valid, test = data.get_data(os.path.join(args.data_path))
vocab_size = len(vocab)
args.vocab_size = vocab_size
print(args.vocab_size)

10591


In [8]:
# 1. training data
train_tokens = train['tokens']
train_counts = train['counts']
args.num_docs_train = len(train_tokens)
print(args.num_docs_train)

1658


In [9]:
# 2. dev set
valid_tokens = valid['tokens']
valid_counts = valid['counts']
args.num_docs_valid = len(valid_tokens)
val_1_tokens = valid['tokens_1']
val_1_counts = valid['counts_1']
args.num_docs_valid_1 = len(val_1_tokens)
val_2_tokens = valid['tokens_2']
val_2_counts = valid['counts_2']
args.num_docs_valid_2 = len(val_2_tokens)
print("val docs")
print(args.num_docs_valid)
print(args.num_docs_valid_1)
print(args.num_docs_valid_2)

val docs
98
98
98


In [10]:
# 3. test data
test_tokens = test['tokens']
test_counts = test['counts']
args.num_docs_test = len(test_tokens)
test_1_tokens = test['tokens_1']
test_1_counts = test['counts_1']
args.num_docs_test_1 = len(test_1_tokens)
test_2_tokens = test['tokens_2']
test_2_counts = test['counts_2']
args.num_docs_test_2 = len(test_2_tokens)
print("test docs")
print(args.num_docs_test)
print(args.num_docs_test_1)
print(args.num_docs_test_2)


test docs
196
196
196


In [11]:
embeddings = None
if not args.train_embeddings:
    emb_path = args.emb_path
#    vect_path = os.path.join(args.data_path.split('/')[0], 'embeddings.pkl')   
    vectors = {}
    with open(emb_path, 'rb') as f:
        for l in f:
            line = l.decode().split()
            word = line[0]
            if word in vocab:
                vect = np.array(line[1:]).astype(np.float)
                vectors[word] = vect
    embeddings = np.zeros((vocab_size, args.emb_size))
    words_found = 0
    for i, word in enumerate(vocab):
        try: 
            embeddings[i] = vectors[word]
            words_found += 1
        except KeyError:
            embeddings[i] = np.random.normal(scale=0.6, size=(args.emb_size, ))
    embeddings = torch.from_numpy(embeddings).to(device)
    args.embeddings_dim = embeddings.size()

print('=*'*100)
print('Training an Embedded Topic Model on {} with the following settings: {}'.format(args.dataset.upper(), args))
print('=*'*100)


=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*
Training an Embedded Topic Model on MIN_DF_3 with the following settings: Namespace(anneal_lr=0, batch_size=1000, bow_norm=1, clip=0.0, data_path='data/min_df_3', dataset='min_df_3', emb_path='data/min_df_3_embeddings.txt', emb_size=300, embeddings_dim=torch.Size([10591, 300]), enc_drop=0.0, epochs=100, eval_batch_size=1000, load_from='etm_min_df_3_K_150_Htheta_800_Optim_adam_Clip_0.0_ThetaAct_relu_Lr_0.005_Bsz_1000_RhoSize_300_trainEmbeddings_0', log_interval=2, lr=0.005, lr_factor=4.0, mode='eval model', nonmono=10, num_docs_test=196, num_docs_test_1=196, num_docs_test_2=196, num_docs_train=1658, num_docs_valid=98, num_docs_valid_1=98, num_docs_valid_2=98, num_topics=150, num_words=10, optimizer='adam', rho_size=300, save_path='./results', seed=2019, t_hidden_size=800, tc=0, td=0, theta

In [12]:
## define checkpoint
if not os.path.exists(args.save_path):
    os.makedirs(args.save_path)

if args.mode == 'eval model':
    ckpt = os.path.join(args.save_path, args.load_from)
    print(ckpt)
else:
    ckpt = os.path.join(args.save_path, 
        'etm_{}_K_{}_Htheta_{}_Optim_{}_Clip_{}_ThetaAct_{}_Lr_{}_Bsz_{}_RhoSize_{}_trainEmbeddings_{}_2'.format(
        args.dataset, args.num_topics, args.t_hidden_size, args.optimizer, args.clip, args.theta_act, 
            args.lr, args.batch_size, args.rho_size, args.train_embeddings))


./results/etm_min_df_3_K_150_Htheta_800_Optim_adam_Clip_0.0_ThetaAct_relu_Lr_0.005_Bsz_1000_RhoSize_300_trainEmbeddings_0


In [13]:
## define model and optimizer
model = ETM(args.num_topics, vocab_size, args.t_hidden_size, args.rho_size, args.emb_size, 
                args.theta_act, embeddings, args.train_embeddings, args.enc_drop).to(device)

print('model: {}'.format(model))

if args.optimizer == 'adam':
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wdecay)
elif args.optimizer == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(), lr=args.lr, weight_decay=args.wdecay)
elif args.optimizer == 'adadelta':
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr, weight_decay=args.wdecay)
elif args.optimizer == 'rmsprop':
    optimizer = optim.RMSprop(model.parameters(), lr=args.lr, weight_decay=args.wdecay)
elif args.optimizer == 'asgd':
    optimizer = optim.ASGD(model.parameters(), lr=args.lr, t0=0, lambd=0., weight_decay=args.wdecay)
else:
    print('Defaulting to vanilla SGD')
    optimizer = optim.SGD(model.parameters(), lr=args.lr)


model: ETM(
  (t_drop): Dropout(p=0.0)
  (theta_act): ReLU()
  (alphas): Linear(in_features=300, out_features=150, bias=False)
  (q_theta): Sequential(
    (0): Linear(in_features=10591, out_features=800, bias=True)
    (1): ReLU()
    (2): Linear(in_features=800, out_features=800, bias=True)
    (3): ReLU()
  )
  (mu_q_theta): Linear(in_features=800, out_features=150, bias=True)
  (logsigma_q_theta): Linear(in_features=800, out_features=150, bias=True)
)


In [15]:
## create DF with top words for each topic

def create_topicsDF():
    
    model.eval()

    with torch.no_grad():
    ## show topics and add to topicsList
    
        beta = model.get_beta()
        #topic_indices = list(np.random.choice(args.num_topics, 10)) # 10 random topics
        print('\n')
        topicsList = []
        for k in range(args.num_topics):#topic_indices:
            gamma = beta[k]
            top_words = list(gamma.cpu().numpy().argsort()[-args.num_words+1:][::-1])
            topic_words = [vocab[a] for a in top_words]
            topicsList.append(topic_words)
            print('Topic {}: {}'.format(k, topic_words))
     
    ## create topicsDF and save to CSV
    print("*"*100)
    colNames = ["Word " + str(i) for i in range(1, len(topicsList[1])+1)]
    topicsDF = pd.DataFrame(topicsList, columns = colNames)
    topicsDF = topicsDF.assign(Topic = topicsDF.index + 1)
    fName = os.path.join(args.save_path, "TopicDFs", args.load_from) + "_TOPICS.csv"
    
    topicsDF.to_csv(fName)
    return topicsDF

In [16]:
topicsDF = create_topicsDF()
topicsDF.head()



Topic 0: ['m2', 'telematic', 'automatization', 'eur', 'fogging', 'europeans', 'teammate', 'triangulation', 'haas']
Topic 1: ['bonds', 'bp', 'qr', 'certifiers', 'column', 'grading', 'recommends', 'advises', 'ledger']
Topic 2: ['capsid', 'studios', 'freelancers', 'programmers', 'incubators', 'virus', 'viruses', 'sequences', 'laboratories']
Topic 3: ['defend', 'filament', 'wearer', 'pround', 'apprehend', 'govern', 'immobilization', 'domiciliary', 'violate']
Topic 4: ['probabilistic', 'modal', 'municipal', 'hydroalcoholic', 'accreditations', 'opaque', 'stringency', 'musculoskeletal', 'pooled']
Topic 5: ['virologists', 'gyms', 'polymerase', 'pandemics', 'nanomaterials', 'nanotechnology', 'seasoned', 'add', 'underestimate']
Topic 6: ['fellowship', 'controller', 'upvote', 'passions', 'breakout', 'setups', 'gamer', 'likeminded', 'enabler']
Topic 7: ['humidifier', 'checkups', 'informational', 'b2b', 'shelters', 'disinfectant', 'sterilizer', 'escrow', 'broker']
Topic 8: ['interdisciplinary', '

Unnamed: 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Topic
0,m2,telematic,automatization,eur,fogging,europeans,teammate,triangulation,haas,1
1,bonds,bp,qr,certifiers,column,grading,recommends,advises,ledger,2
2,capsid,studios,freelancers,programmers,incubators,virus,viruses,sequences,laboratories,3
3,defend,filament,wearer,pround,apprehend,govern,immobilization,domiciliary,violate,4
4,probabilistic,modal,municipal,hydroalcoholic,accreditations,opaque,stringency,musculoskeletal,pooled,5


In [18]:
# get the beta matrix (word distribution over topics) associated to model

def get_betaMatrix(m):

    beta = m.get_beta()
    beta = beta.detach().numpy()

    path_save = os.path.join(args.save_path, "ThetaBeta", args.load_from)
    with open(path_save + '_beta.pkl', 'wb') as f:
            pickle.dump(beta, f)
    
    return beta

In [19]:
beta = get_betaMatrix(m = model)
beta.shape

(150, 10591)

In [20]:
# get the theta matrix (topic distribution over documents) associated to model and docs

def get_thetaMatrix(m, source = "test"):
    # first create null theta DF
    
    totalTheta = pd.DataFrame()
    
    # prepare indices and tokens
    if source == "train":
        indices = torch.split(torch.tensor(range(args.num_docs_train)), args.eval_batch_size)
        tokens = train_tokens
        counts = train_counts
    else:
        indices = torch.split(torch.tensor(range(args.num_docs_test)), args.eval_batch_size)
        tokens = test_tokens
        counts = test_counts
        
    ## split into batch sizes and get \theta here (theta contains topic assignments)    
    for idx, ind in enumerate(indices):
        ## get theta from first half of docs
        data_batch = data.get_batch(tokens, counts, ind, args.vocab_size, device)
        sums = data_batch.sum(1).unsqueeze(1)
        if args.bow_norm:
            normalized_data_batch = data_batch / sums
        else:
            normalized_data_batch = data_batch
        
        theta, _ = m.get_theta(normalized_data_batch)

        ## combine with theta from previous batch
        th = theta.detach().numpy()
        th = pd.DataFrame(th)
        totalTheta = totalTheta.append(th.copy())
    
    # Save the file
    path_save = os.path.join(args.save_path, "ThetaBeta", args.load_from)
    print("Saving to: {}".format(path_save))
    with open(path_save + '_theta.pkl', 'wb') as f:
        pickle.dump(totalTheta, f)
        
    return(totalTheta.reset_index(drop = True))

In [21]:
thetaDF = get_thetaMatrix(m = model, source = "train")  # source = 'train' or 'test'
thetaDF.shape

Saving to: ./results/ThetaBeta/etm_min_df_3_K_150_Htheta_800_Optim_adam_Clip_0.0_ThetaAct_relu_Lr_0.005_Bsz_1000_RhoSize_300_trainEmbeddings_0


In [23]:
# rename the column headings
thetaDF.rename(columns=lambda x: 'Topic_' + str(x + 1), inplace=True)
thetaDF = thetaDF.assign(newIndex = thetaDF.index)
thetaDF.head(3)


Unnamed: 0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9,Topic_10,...,Topic_142,Topic_143,Topic_144,Topic_145,Topic_146,Topic_147,Topic_148,Topic_149,Topic_150,newIndex
0,0.006726,0.006595,0.006741,0.006774,0.006768,0.006511,0.00635,0.006551,0.00663,0.006608,...,0.006847,0.006894,0.006783,0.006519,0.006478,0.006451,0.006546,0.006619,0.006842,0
1,0.006727,0.006595,0.006745,0.00677,0.006772,0.006511,0.006356,0.006552,0.006634,0.006603,...,0.006852,0.006891,0.006782,0.006515,0.006478,0.006455,0.006545,0.006618,0.00684,1
2,0.006727,0.006594,0.00674,0.006773,0.006766,0.00651,0.006351,0.00655,0.006629,0.006606,...,0.006846,0.006894,0.006783,0.00652,0.006478,0.00645,0.006546,0.006619,0.006839,2


In [25]:
# get the df with the document info

with open(os.path.join('./data', 'min_df_3', 'dataDF.pkl'), 'rb') as f:
        newDF = pickle.load(f)

        
# rearrange df by permuteIdx, since this is the order they appear in in the topic embeddings
newDF.sort_values(by = "permuteIdx", axis=0, ascending=True, inplace=True, kind='quicksort', na_position='last')
newDF.head()

Unnamed: 0,Challenge,SubChallenge,ProjURL,title,text,oldIndex,fullDS,newIndex,permuteIdx,vocabDS,dsType
871,Business,NewModels,https://devpost.com/software/connecting-future...,Connecting Futures,The problem your project solves\nProblem: Huge...,963,"[11787, 11798, 11507, 10180, 11645, 11201, 113...",871,0,"[10180, 3114, 8032, 10530, 10285, 10530, 10285...",Tr
804,Health,Other,https://devpost.com/software/aurum-wellness,Aurum Wellness,Inspiration - Always wanted to work in the spa...,889,"[11058, 11789, 11358, 10434, 11268, 11577, 114...",804,1,"[10434, 10434, 10176, 8544, 10517]",Tr
1155,Business,Other,https://devpost.com/software/desktop-mobile-co...,Desktop mobile computer,The problem\nThere is a problem that need shor...,1269,"[11787, 11787, 11800, 11780, 10482, 11532, 116...",1155,2,"[10482, 4194, 9529, 6383, 10505, 8577, 10529, ...",Tr
1311,Cohesion,FakeNews,https://devpost.com/software/fact-o-meter-47w15x,Fact-O-Meter,The story of Pinocchio and his nose:\nFake New...,1435,"[10628, 11144, 9579, 4649, 645, 10023, 10, 117...",1311,3,"[9579, 4649, 645, 10023, 10, 9042, 10211, 1011...",Tr
59,Health,Equipment,https://devpost.com/software/tracejyu,TraceJYU,Inspiration\nWe are inspired by the need to fi...,62,"[11275, 11767, 11800, 11772, 11720, 10823, 116...",59,4,"[9684, 10304, 9091, 9997, 6891, 7581, 10327, 9...",Tr


In [26]:
# remove rows where dsType is NA

lastDF = newDF[newDF.dsType == "NA"]
lastDF.shape
lastDF

Unnamed: 0,Challenge,SubChallenge,ProjURL,title,text,oldIndex,fullDS,newIndex,permuteIdx,vocabDS,dsType
1904,Other,Other,https://devpost.com/software/psychological-fir...,Helpers,Inspiration\nWhat it does\nHow I built it\nCha...,2091,"[11804, 11796, 11797, 11795]",1904,274,[],
173,Health,Ventilators,https://devpost.com/software/nose-filter-for-a...,Nose filter for air that u breathe!,Inspiration\njust had the idea for a long time...,188,"[11769, 11716, 11804, 11796, 11797, 11795, 107...",173,308,[],
1107,Business,Customers,https://devpost.com/software/test1-6b0ugv,To be renamed,Inspiration\n... coming soon\nWhat it does\n.....,1218,"[11253, 11253, 11804, 11253, 11796, 11253, 117...",1107,342,[],
822,Business,TeamWork,https://devpost.com/software/business-news-t4kqp3,business-news,"In the current scenario of the world, it is ha...",910,"[11768, 10863, 11512, 11492, 11789, 11745, 117...",822,514,[],
842,Business,NewModels,https://devpost.com/software/eurovirtual,eurovirtual,Inspiration THE VIRTUAL WORLD\nWhat it does EA...,931,"[11804, 11796, 11797, 11795]",842,557,[],
1097,Business,Customers,https://devpost.com/software/test-5zy6wr,Test,Inspiration\nTrying to see what happens after ...,1205,"[11804, 11796, 11797, 11795]",1097,1012,[],
994,Business,Logistics,https://devpost.com/software/covid19-erp-d3m7qu,Covid19-ERP,the need to provide Big Data & Analitica funct...,1093,"[11776, 11721, 11258, 11792, 11726, 11735, 106...",994,1034,[],
492,Health,CommunicationPrevention,https://devpost.com/software/coronitor-pharmac...,coronitor - Pharmacy Extension,Inspiration\nWhat it does\nHow I built it\nCha...,541,"[11804, 11796, 11797, 11795]",492,1472,[],
257,Health,CommunicationPrevention,https://devpost.com/software/smart-workout-hea...,Smart-Workout/Health-Equipment= BooChiCa,Inspiration ONLINE WORKOUT\nWhat it does MONIT...,281,"[11804, 11796, 11797, 11795]",257,1922,[],


In [27]:
# keep only rows with non-NA dsType (these correspond to the rows for which topic embeddings were found)
keepDF = newDF[newDF.dsType != "NA"].reset_index(drop = True)
keepDF.shape
keepDF = keepDF.assign(newIndex = keepDF.index)
keepDF.head()

Unnamed: 0,Challenge,SubChallenge,ProjURL,title,text,oldIndex,fullDS,newIndex,permuteIdx,vocabDS,dsType
0,Business,NewModels,https://devpost.com/software/connecting-future...,Connecting Futures,The problem your project solves\nProblem: Huge...,963,"[11787, 11798, 11507, 10180, 11645, 11201, 113...",0,0,"[10180, 3114, 8032, 10530, 10285, 10530, 10285...",Tr
1,Health,Other,https://devpost.com/software/aurum-wellness,Aurum Wellness,Inspiration - Always wanted to work in the spa...,889,"[11058, 11789, 11358, 10434, 11268, 11577, 114...",1,1,"[10434, 10434, 10176, 8544, 10517]",Tr
2,Business,Other,https://devpost.com/software/desktop-mobile-co...,Desktop mobile computer,The problem\nThere is a problem that need shor...,1269,"[11787, 11787, 11800, 11780, 10482, 11532, 116...",2,2,"[10482, 4194, 9529, 6383, 10505, 8577, 10529, ...",Tr
3,Cohesion,FakeNews,https://devpost.com/software/fact-o-meter-47w15x,Fact-O-Meter,The story of Pinocchio and his nose:\nFake New...,1435,"[10628, 11144, 9579, 4649, 645, 10023, 10, 117...",3,3,"[9579, 4649, 645, 10023, 10, 9042, 10211, 1011...",Tr
4,Health,Equipment,https://devpost.com/software/tracejyu,TraceJYU,Inspiration\nWe are inspired by the need to fi...,62,"[11275, 11767, 11800, 11772, 11720, 10823, 116...",4,4,"[9684, 10304, 9091, 9997, 6891, 7581, 10327, 9...",Tr


In [29]:
# merge thetaDF with the documents DF
joinedDF = pd.merge(keepDF, thetaDF, how='left', on= ["newIndex"], left_on=None, right_on=None,
         left_index=False, right_index=False, sort=True,
         suffixes=('_x', '_y'), copy=True, indicator=False,
         validate=None)

joinedDF = joinedDF.append(lastDF).reset_index(drop = True).fillna(0)
joinedDF.head()

Unnamed: 0,Challenge,ProjURL,SubChallenge,Topic_1,Topic_10,Topic_100,Topic_101,Topic_102,Topic_103,Topic_104,...,Topic_98,Topic_99,dsType,fullDS,newIndex,oldIndex,permuteIdx,text,title,vocabDS
0,Business,https://devpost.com/software/connecting-future...,NewModels,0.006726,0.006608,0.006464,0.006847,0.006541,0.00662,0.006534,...,0.006608,0.006843,Tr,"[11787, 11798, 11507, 10180, 11645, 11201, 113...",0,963,0,The problem your project solves\nProblem: Huge...,Connecting Futures,"[10180, 3114, 8032, 10530, 10285, 10530, 10285..."
1,Health,https://devpost.com/software/aurum-wellness,Other,0.006727,0.006603,0.006465,0.006852,0.006543,0.006622,0.006533,...,0.006611,0.006846,Tr,"[11058, 11789, 11358, 10434, 11268, 11577, 114...",1,889,1,Inspiration - Always wanted to work in the spa...,Aurum Wellness,"[10434, 10434, 10176, 8544, 10517]"
2,Business,https://devpost.com/software/desktop-mobile-co...,Other,0.006727,0.006606,0.006464,0.006847,0.006542,0.006622,0.006536,...,0.006607,0.006844,Tr,"[11787, 11787, 11800, 11780, 10482, 11532, 116...",2,1269,2,The problem\nThere is a problem that need shor...,Desktop mobile computer,"[10482, 4194, 9529, 6383, 10505, 8577, 10529, ..."
3,Cohesion,https://devpost.com/software/fact-o-meter-47w15x,FakeNews,0.006727,0.006609,0.006463,0.006846,0.006541,0.006619,0.006534,...,0.006608,0.006842,Tr,"[10628, 11144, 9579, 4649, 645, 10023, 10, 117...",3,1435,3,The story of Pinocchio and his nose:\nFake New...,Fact-O-Meter,"[9579, 4649, 645, 10023, 10, 9042, 10211, 1011..."
4,Health,https://devpost.com/software/tracejyu,Equipment,0.006729,0.006604,0.006464,0.006848,0.00654,0.006621,0.006537,...,0.006607,0.006844,Tr,"[11275, 11767, 11800, 11772, 11720, 10823, 116...",4,62,4,Inspiration\nWe are inspired by the need to fi...,TraceJYU,"[9684, 10304, 9091, 9997, 6891, 7581, 10327, 9..."


In [34]:
# Save the file
path_save = os.path.join(args.save_path, "ThetaBeta", args.load_from) + "_docTopics.csv"
joinedDF.to_csv(path_save)