# LDA - Latent Dirichlet Allocation

## Optimization

In [21]:
from octis.models.LDA import LDA
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

In [22]:
# percorsi
dataset_save = r'C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\dataset\dataset_LDA'
risultati_TD = r'C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\risultati\LDA\result_TD'
risultati_npmi = r'C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\risultati\LDA\result_npmi'

In [23]:
dataset = Dataset()
dataset.load_custom_dataset_from_folder(dataset_save)

### Train a Model

In [24]:
# Create Model
model = LDA(num_topics=15, )
model.partitioning(False)

In [25]:
from octis.dataset.dataset import Dataset
from octis.models.LDA import LDA

model_output = model.train_model(dataset)

### Evaluate a Model

In [26]:
for t in model_output['topics']:
  print(" ".join(t))

essere centro castello pia storico arte spoleto citta piazza storia
trovare centro pia palazzo antico santo interno secolo storico sorgere
situare antico centro origine lungo citta trovare essere avere secolo
citta costruire fare essere pia lungo storia origine abbazia piazza
dedicare citta storia secolo edificio pia nord sorgere affresco essere
museo trovare palazzo medievale pia sorgere avere essere opera presentare
trovare piccolo chilometro poco essere circa isola sorgere interno the
essere piazza parte palazzo costruire borgo venire museo sorgere castello
essere secolo citta castello costruire medievale muro sorgere pia interno
antico pia piazza borgo via comune storico essere trovare lungo
trovare castello borgo secolo sec sorgere parco spoleto piazza museo
secolo centro essere edificio meta trovare citta xiv santo avere
essere secolo antico centro palazzo trovare storico xiv sorgere citta
centro storico trovare piazza museo perugia palazzo sorgere borgo santo
borgo citta piccolo

In [27]:
len(model_output['topics'])

15

In [28]:
# Initialize metric
npmi = Coherence(texts=dataset.get_corpus(), topk=10, measure='c_npmi')

In [29]:
# Initialize metric
topic_diversity = TopicDiversity(topk=10)

In [30]:
# Retrieve metrics score
topic_diversity_score = topic_diversity.score(model_output)
print("Topic diversity: "+str(topic_diversity_score))

npmi_score = npmi.score(model_output)
print("Coherence: "+str(npmi_score))

Topic diversity: 0.36
Coherence: -0.11299859829181431


### Hyperparameter Optimization

In [31]:
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real
from skopt.space.space import Real, Integer

### Topic Diversity

In [32]:
# Define the search space. To see which hyperparameters to optimize, see the topic model's initialization signature
search_space = {"alpha": Real(low=0.001, high=5.0), "eta": Real(low=0.001, high=5.0), "num_topics": Integer(low=5, high=15)}

# Initialize an optimizer object and start the optimization.
optimizer=Optimizer()
optResult=optimizer.optimize(model, dataset, topic_diversity, search_space, save_path=risultati_TD, # path to store the results
                             number_of_call=30, # number of optimization iterations
                             model_runs=5,save_models=False) # number of runs of the topic model
#save the results of th optimization in a csv file
optResult.save_to_csv(r"C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\risultati\LDA\result_TD\result_TD.csv")

Current call:  0
Current call:  1
Current call:  2
Current call:  3
Current call:  4
Current call:  5
Current call:  6
Current call:  7
Current call:  8
Current call:  9
Current call:  10
Current call:  11
Current call:  12
Current call:  13
Current call:  14
Current call:  15
Current call:  16
Current call:  17
Current call:  18
Current call:  19
Current call:  20
Current call:  21
Current call:  22
Current call:  23
Current call:  24
Current call:  25
Current call:  26
Current call:  27
Current call:  28
Current call:  29


In [33]:
import json
res = json.load(open(r"C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\risultati\LDA\result_TD\result.json",'r'))
res.keys()

dict_keys(['dataset_name', 'dataset_path', 'is_cached', 'kernel', 'acq_func', 'surrogate_model', 'optimization_type', 'model_runs', 'save_models', 'save_step', 'save_name', 'save_path', 'early_stop', 'early_step', 'plot_model', 'plot_best_seen', 'plot_name', 'log_scale_plot', 'search_space', 'model_name', 'model_attributes', 'use_partitioning', 'metric_name', 'extra_metric_names', 'metric_attributes', 'extra_metric_attributes', 'current_call', 'number_of_call', 'random_state', 'x0', 'y0', 'n_random_starts', 'initial_point_generator', 'topk', 'time_eval', 'dict_model_runs', 'f_val', 'x_iters'])

In [34]:
# valuto qual è l'iterazione migliore
print("Valore migliore di Topic Diversity ottenuto: " )
print(max(res["f_val"]))
print("Numero di iterazione: ")
i = 0
for j in res["f_val"]:
  if(j == max(res["f_val"])):
    i += 1
    print(i)
    break
  else:
    i += 1
alpha = res['x_iters']['alpha'][i-1]
eta = res['x_iters']['eta'][i-1]
topic = res['x_iters']['num_topics'][i-1]
print("Valore di alpha:")
print(alpha)
print("Valore di eta:")
print(eta)
print("Numero di topic:")
print(topic)

Valore migliore di Topic Diversity ottenuto: 
0.4666666666666667
Numero di iterazione: 
28
Valore di alpha:
0.3188824628083212
Valore di eta:
4.788007201380591
Numero di topic:
6


In [35]:
# Create Model
model_TD = LDA(num_topics= topic, alpha= alpha, eta= eta)
model_TD.partitioning(False)

In [36]:
model_output_TD = model_TD.train_model(dataset)

In [37]:
print(*list(model_output_TD.keys()), sep="\n") # Print the output identifiers

topic-word-matrix
topics
topic-document-matrix


In [38]:
import pandas as pd 

for t in model_output_TD['topics']:
  print(" ".join(t))

topics = pd.DataFrame(model_output_TD['topics'])
topics.to_excel(r'C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\risultati\LDA\result_TD\topics.xlsx', index=False)  

trovare via secolo pia piazza centro the sorgere lungo palazzo
trovare essere secolo costruire palazzo pia borgo centro medievale antico
essere trovare centro storico piazza palazzo borgo perugia piccolo comune
trovare avere citta museo essere centro castello sede poco area
citta secolo antico essere sorgere castello monte situare localita museo
storico piazza centro sorgere borgo secolo interno essere pia castello


In [39]:
vocabolario = []
with open(r'C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\dataset\dataset_LDA\vocabulary.txt', 'r') as file:
    for word in file:
        vocabolario.append(word)


In [40]:
# Carica la matrice topic-word in un DataFrame pandas
topic_word_matrix = pd.DataFrame(model_output_TD['topic-word-matrix'])

# Imposta i nomi delle colonne come parole

# Ottieni i primi k termini da ciascuna riga come nomi di colonna
topic_word_matrix.columns = [vocabolario[i] for i in range(topic_word_matrix.shape[1])]

# Stampa il DataFrame
print(topic_word_matrix)

topic_word_matrix.to_excel(r'C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\risultati\LDA\result_TD\topic_word_matrix.xlsx', index=False)  

   abbadesso\n  abbandonare\n  abbastanza\n  abbazia\n  abbaziale\n   
0     0.000342       0.000392      0.000337   0.000867     0.000337  \
1     0.000423       0.000440      0.000327   0.000768     0.000327   
2     0.000434       0.000336      0.000333   0.000856     0.000333   
3     0.000414       0.000826      0.000414   0.001106     0.000414   
4     0.000411       0.000467      0.000339   0.000565     0.000339   
5     0.000335       0.000400      0.000332   0.000515     0.000332   

   abitare\n  abitativo\n  abitato\n  abitazione\n  abolitare\n  ...   
0   0.000337     0.000337   0.000363      0.000354     0.000338  ...  \
1   0.000393     0.000327   0.000690      0.000425     0.000409  ...   
2   0.000335     0.000333   0.000379      0.000399     0.000395  ...   
3   0.000415     0.000414   0.000432      0.000452     0.000459  ...   
4   0.000339     0.000339   0.000341      0.000403     0.000355  ...   
5   0.000333     0.000332   0.000427      0.000467     0.000335  ...  

In [41]:
import pandas as pd

corpus_processed = pd.read_csv(r'C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\dataset\dataset_LDA\corpus.tsv', sep='\t', header=None)
corpus_processed
new_df = pd.DataFrame(corpus_processed[0])

In [42]:
num = 1
for topic in model_output_TD['topic-document-matrix']:
  for i in range(0,len(topic)):
    new_df.loc[i,num] = topic[i]
  num = num+1
new_df

Unnamed: 0,0,1,2,3,4,5,6
0,area boschivo oltre avere situare altitudine f...,0.026696,0.027049,0.026850,0.865834,0.026925,0.026646
1,vicino stroncone borgo medievale arroccare col...,0.015544,0.511110,0.015735,0.015652,0.426131,0.015827
2,gola nera natura cultura storia incontrare olt...,0.016364,0.016643,0.178441,0.755044,0.016744,0.016764
3,rocca trovare isola polvese insieme isola magg...,0.027733,0.027948,0.674449,0.214924,0.027418,0.027528
4,laisolare maggiore secondo grandezza lago tras...,0.013775,0.932216,0.013534,0.013639,0.013291,0.013544
...,...,...,...,...,...,...,...
568,unaopera arte ziggurart numeroso artista inter...,0.858178,0.028255,0.028369,0.028901,0.028127,0.028171
569,spazio espositivo punto arrivo crescente ricer...,0.031084,0.845502,0.030531,0.030662,0.031828,0.030393
570,descrizione mancare,0.092989,0.092560,0.092729,0.535825,0.093116,0.092781
571,antico castello immerso millenario,0.065344,0.063461,0.062595,0.066475,0.678048,0.064077


In [43]:
# la salvo
new_df.to_excel(r'C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\risultati\LDA\result_TD\topic-document-matrix.xlsx', index=False)  

In [44]:
# Retrieve metrics score
topic_diversity_score = topic_diversity.score(model_output_TD)
print("Topic diversity: "+str(topic_diversity_score))

npmi_score = npmi.score(model_output_TD)
print("Coherence: "+str(npmi_score))

Topic diversity: 0.5
Coherence: -0.049901799156060245


### Topic Coherence

In [45]:
# Define the search space. To see which hyperparameters to optimize, see the topic model's initialization signature
search_space = {"alpha": Real(low=0.001, high=5.0), "eta": Real(low=0.001, high=5.0), "num_topics": Integer(low=5, high=15)}

# Initialize an optimizer object and start the optimization.
optimizer=Optimizer()
optResult=optimizer.optimize(model, dataset, npmi, search_space, save_path= risultati_npmi, # path to store the results
                             number_of_call=30, # number of optimization iterations
                             model_runs=5,save_models=False) # number of runs of the topic model
#save the results of th optimization in a csv file
optResult.save_to_csv(r"C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\risultati\LDA\result_npmi\result_npmi.csv")

Current call:  0
Current call:  1
Current call:  2
Current call:  3
Current call:  4
Current call:  5
Current call:  6
Current call:  7
Current call:  8
Current call:  9
Current call:  10
Current call:  11
Current call:  12
Current call:  13
Current call:  14
Current call:  15
Current call:  16
Current call:  17
Current call:  18
Current call:  19
Current call:  20
Current call:  21
Current call:  22
Current call:  23
Current call:  24
Current call:  25
Current call:  26
Current call:  27
Current call:  28
Current call:  29


In [46]:
import json
res = json.load(open(r"C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\risultati\LDA\result_npmi\result.json",'r'))
res.keys()

dict_keys(['dataset_name', 'dataset_path', 'is_cached', 'kernel', 'acq_func', 'surrogate_model', 'optimization_type', 'model_runs', 'save_models', 'save_step', 'save_name', 'save_path', 'early_stop', 'early_step', 'plot_model', 'plot_best_seen', 'plot_name', 'log_scale_plot', 'search_space', 'model_name', 'model_attributes', 'use_partitioning', 'metric_name', 'extra_metric_names', 'metric_attributes', 'extra_metric_attributes', 'current_call', 'number_of_call', 'random_state', 'x0', 'y0', 'n_random_starts', 'initial_point_generator', 'topk', 'time_eval', 'dict_model_runs', 'f_val', 'x_iters'])

In [47]:
# valuto qual è l'iterazione migliore
print("Valore migliore di Topic Coherence ottenuto: " )
print(max(res["f_val"]))
print("Numero di iterazione: ")
i = 0
for j in res["f_val"]:
  if(j == max(res["f_val"])):
    i += 1
    print(i)
    break
  else:
    i += 1
alpha = res['x_iters']['alpha'][i-1]
eta = res['x_iters']['eta'][i-1]
topic = res['x_iters']['num_topics'][i-1]
print("Valore di alpha:")
print(alpha)
print("Valore di eta:")
print(eta)
print("Numero di topic:")
print(topic)

Valore migliore di Topic Diversity ottenuto: 
-0.009620424939827498
Numero di iterazione: 
28
Valore di alpha:
1.155593290636778
Valore di eta:
4.389407023021437
Numero di topic:
7


In [48]:
# Create Model
model_npmi = LDA(num_topics= topic, alpha= alpha, eta= eta)
model_npmi.partitioning(False)

In [49]:
model_output_npmi = model_npmi.train_model(dataset)

In [50]:
for t in model_output_npmi['topics']:
  print(" ".join(t))

topics = pd.DataFrame(model_output_npmi['topics'])
topics.to_excel(r'C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\risultati\LDA\result_npmi\topics.xlsx', index=False)  

centro trovare essere secolo citta sorgere pia costruire piazza palazzo
trovare essere secolo centro pia antico piazza borgo castello piccolo
essere trovare citta piazza secolo sorgere palazzo antico centro storico
trovare essere centro antico pia secolo piazza storico borgo citta
essere secolo centro citta antico piazza trovare sorgere storico lungo
centro essere secolo trovare piazza citta pia borgo antico castello
trovare secolo essere pia citta piazza centro palazzo sorgere storico


In [51]:
# Carica la matrice topic-word in un DataFrame pandas
topic_word_matrix = pd.DataFrame(model_output_npmi['topic-word-matrix'])

# Imposta i nomi delle colonne come parole

# Ottieni i primi k termini da ciascuna riga come nomi di colonna
topic_word_matrix.columns = [vocabolario[i] for i in range(topic_word_matrix.shape[1])]

# Stampa il DataFrame
print(topic_word_matrix)

topic_word_matrix.to_excel(r'C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\risultati\LDA\result_npmi\topic_word_matrix.xlsx', index=False)  

   abbadesso\n  abbandonare\n  abbastanza\n  abbazia\n  abbaziale\n   
0     0.000379       0.000435      0.000346   0.000693     0.000343  \
1     0.000386       0.000485      0.000349   0.000844     0.000349   
2     0.000406       0.000446      0.000346   0.000691     0.000347   
3     0.000414       0.000476      0.000349   0.000747     0.000348   
4     0.000378       0.000466      0.000348   0.000806     0.000352   
5     0.000386       0.000460      0.000348   0.000831     0.000348   
6     0.000393       0.000513      0.000350   0.000663     0.000348   

   abitare\n  abitativo\n  abitato\n  abitazione\n  abolitare\n  ...   
0   0.000353     0.000344   0.000422      0.000409     0.000374  ...  \
1   0.000358     0.000349   0.000444      0.000419     0.000375  ...   
2   0.000358     0.000348   0.000444      0.000401     0.000383  ...   
3   0.000356     0.000348   0.000404      0.000420     0.000389  ...   
4   0.000370     0.000352   0.000452      0.000431     0.000389  ...   

In [52]:
num = 1
for topic in model_output_npmi['topic-document-matrix']:
  for i in range(0,len(topic)):
    new_df.loc[i,num] = topic[i]
  num = num+1
new_df

Unnamed: 0,0,1,2,3,4,5,6,7
0,area boschivo oltre avere situare altitudine f...,0.133597,0.144934,0.142614,0.147144,0.150494,0.138543,0.142673
1,vicino stroncone borgo medievale arroccare col...,0.134117,0.148253,0.130414,0.152439,0.160951,0.143436,0.130390
2,gola nera natura cultura storia incontrare olt...,0.145065,0.129584,0.143882,0.149317,0.142342,0.145642,0.144169
3,rocca trovare isola polvese insieme isola magg...,0.149813,0.147298,0.135353,0.146976,0.137082,0.141302,0.142177
4,laisolare maggiore secondo grandezza lago tras...,0.143677,0.135070,0.149018,0.147467,0.143940,0.148448,0.132380
...,...,...,...,...,...,...,...,...
568,unaopera arte ziggurart numeroso artista inter...,0.140695,0.145405,0.143521,0.137199,0.143245,0.146931,0.143003
569,spazio espositivo punto arrivo crescente ricer...,0.145143,0.140754,0.147835,0.139034,0.141286,0.145924,0.140026
570,descrizione mancare,0.142629,0.142752,0.142968,0.142143,0.142856,0.143622,0.143030
571,antico castello immerso millenario,0.139435,0.145529,0.144008,0.147970,0.144273,0.142541,0.136244


In [53]:
# la salvo
new_df.to_excel(r'C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\risultati\LDA\result_npmi\topic-document-matrix.xlsx', index=False)  

In [54]:
# Retrieve metrics score
topic_diversity_score = topic_diversity.score(model_output_npmi)
print("Topic diversity: "+str(topic_diversity_score))

npmi_score = npmi.score(model_output_npmi)
print("Coherence: "+str(npmi_score))

Topic diversity: 0.22857142857142856
Coherence: -0.01068994641211756
