# LDA - Latent Dirichlet Allocation

## Optimization

In [4]:
from octis.models.LDA import LDA
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

In [5]:
# percorsi
dataset_save = r'C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\dataset'
risultati_TD = r'C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\risultati\LDA\result_TD'
risultati_npmi = r'C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\risultati\LDA\result_npmi'

In [6]:
dataset = Dataset()
dataset.load_custom_dataset_from_folder(dataset_save)

### Train a Model

In [7]:
# Create Model
model = LDA(num_topics=30)
model.partitioning(False)

In [8]:
from octis.dataset.dataset import Dataset
from octis.models.LDA import LDA

model_output = model.train_model(dataset)

### Evaluate a Model

In [9]:
for t in model_output['topics']:
  print(" ".join(t))

centro storico chiesa secolo palazzo san sorge santa borgo castello
castello trova citta borgo abbazia colle collina clitunno luogo storico
san chiesa secolo parte museo pia importante maria gualdo risalente
chiesa the palazzo piazza trova centro interno lungo antico eretta
parte citta arte norcia contesto sede attivita viene stagionato due
chiesa secolo trova citta via poco costruito sorge prima lungo
trova lungo centro borgo chiesa vicino celebre corso antica palazzo
sorge castello storia numerosi tevere san amelia secolo via borgo
chiesa san francesco secolo citta centro piazza costruita santo francescano
san piazza trova chiesa francesco secolo perugia palazzo pia museo
chiesa via centro san trova citta antico storico perugia mura
san edificio chiesa gualdo lato tadino piazza domina maria due
san allainterno chiesa centro sorge citta francesco borgo museo santa
colle trova romana secolo todi palazzo valle pia san piazza
san centro museo assisi chiesa citta nucleo sorge etrusca arca

In [10]:
len(model_output['topics'])

30

In [11]:
# Initialize metric
npmi = Coherence(texts=dataset.get_corpus(), topk=10, measure='c_npmi')

In [12]:
# Initialize metric
topic_diversity = TopicDiversity(topk=10)

In [13]:
# Retrieve metrics score
topic_diversity_score = topic_diversity.score(model_output)
print("Topic diversity: "+str(topic_diversity_score))

npmi_score = npmi.score(model_output)
print("Coherence: "+str(npmi_score))

Topic diversity: 0.4066666666666667
Coherence: -0.13893666464364618


### Hyperparameter Optimization

In [14]:
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real
from skopt.space.space import Real, Integer

### Topic Diversity

In [15]:
# Define the search space. To see which hyperparameters to optimize, see the topic model's initialization signature
search_space = {"alpha": Real(low=0.001, high=5.0), "eta": Real(low=0.001, high=5.0), "num_topics": Integer(low=5, high=15)}

# Initialize an optimizer object and start the optimization.
optimizer=Optimizer()
optResult=optimizer.optimize(model, dataset, topic_diversity, search_space, save_path=risultati_TD, # path to store the results
                             number_of_call=30, # number of optimization iterations
                             model_runs=5,save_models=False) # number of runs of the topic model
#save the results of th optimization in a csv file
optResult.save_to_csv(r"C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\risultati\LDA\result_TD\result_TD.csv")

Current call:  0
Current call:  1
Current call:  2
Current call:  3
Current call:  4
Current call:  5
Current call:  6
Current call:  7
Current call:  8
Current call:  9
Current call:  10
Current call:  11
Current call:  12
Current call:  13
Current call:  14
Current call:  15
Current call:  16
Current call:  17
Current call:  18
Current call:  19
Current call:  20
Current call:  21
Current call:  22
Current call:  23
Current call:  24
Current call:  25
Current call:  26
Current call:  27
Current call:  28
Current call:  29


OSError: Cannot save file into a non-existent directory: 'C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\result_TD'

In [16]:
import json
res = json.load(open(r"C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\risultati\LDA\result_TD\result.json",'r'))
res.keys()

dict_keys(['dataset_name', 'dataset_path', 'is_cached', 'kernel', 'acq_func', 'surrogate_model', 'optimization_type', 'model_runs', 'save_models', 'save_step', 'save_name', 'save_path', 'early_stop', 'early_step', 'plot_model', 'plot_best_seen', 'plot_name', 'log_scale_plot', 'search_space', 'model_name', 'model_attributes', 'use_partitioning', 'metric_name', 'extra_metric_names', 'metric_attributes', 'extra_metric_attributes', 'current_call', 'number_of_call', 'random_state', 'x0', 'y0', 'n_random_starts', 'initial_point_generator', 'topk', 'time_eval', 'dict_model_runs', 'f_val', 'x_iters'])

In [17]:
# valuto qual è l'iterazione migliore
print("Valore migliore di Topic Diversity ottenuto: " )
print(max(res["f_val"]))
print("Numero di iterazione: ")
i = 0
for j in res["f_val"]:
  if(j == max(res["f_val"])):
    i += 1
    print(i)
    break
  else:
    i += 1
alpha = res['x_iters']['alpha'][i-1]
eta = res['x_iters']['eta'][i-1]
topic = res['x_iters']['num_topics'][i-1]
print("Valore di alpha:")
print(alpha)
print("Valore di eta:")
print(eta)
print("Numero di topic:")
print(topic)

Valore migliore di Topic Diversity ottenuto: 
0.46
Numero di iterazione: 
30
Valore di alpha:
0.18353211266125663
Valore di eta:
4.3847759024374495
Numero di topic:
5


In [18]:
# Create Model
model_TD = LDA(num_topics= topic, alpha= alpha, eta= eta)
model_TD.partitioning(False)

In [19]:
model_output_TD = model_TD.train_model(dataset)

In [20]:
print(*list(model_output_TD.keys()), sep="\n") # Print the output identifiers

topic-word-matrix
topics
topic-document-matrix


In [21]:
import pandas as pd 

for t in model_output_TD['topics']:
  print(" ".join(t))

topics = pd.DataFrame(model_output_TD['topics'])
topics.to_excel(r'C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\risultati\LDA\result_TD\topics.xlsx', index=False)  

chiesa san trova centro maria santa storico palazzo castello citta
secolo san chiesa citta pia centro trova interno lungo palazzo
chiesa san trova borgo pia francesco secolo sorge piazza citta
palazzo trova borgo citta museo the chiesa secolo centro piazza
chiesa san piazza centro santa francesco pia maria trova sorge


In [22]:
vocabolario = []
with open(r'C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\dataset\vocabulary.txt', 'r') as file:
    for word in file:
        vocabolario.append(word)


In [23]:
# Carica la matrice topic-word in un DataFrame pandas
topic_word_matrix = pd.DataFrame(model_output_TD['topic-word-matrix'])

# Imposta i nomi delle colonne come parole

# Ottieni i primi k termini da ciascuna riga come nomi di colonna
topic_word_matrix.columns = [vocabolario[i] for i in range(topic_word_matrix.shape[1])]

# Stampa il DataFrame
print(topic_word_matrix)

topic_word_matrix.to_excel(r'C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\risultati\LDA\result_TD\topic_word_matrix.xlsx', index=False)  

   abbadesse\n  abbandonato\n  abbastanza\n  abbazia\n  abbaziale\n   
0     0.000340       0.000285      0.000495   0.000628     0.000341  \
1     0.000355       0.000355      0.000599   0.000644     0.000429   
2     0.000286       0.000300      0.000537   0.000403     0.000301   
3     0.000359       0.000411      0.000784   0.000528     0.000518   
4     0.000291       0.000345      0.000526   0.000474     0.000435   

   abbazie\n  abitati\n  abitativo\n  abitato\n  abitazione\n  ...    with\n   
0   0.000285   0.000285     0.000642   0.000285      0.000287  ...  0.000357  \
1   0.000289   0.000289     0.000645   0.000289      0.000289  ...  0.000318   
2   0.000286   0.000286     0.000549   0.000286      0.000351  ...  0.000314   
3   0.000359   0.000359     0.000630   0.000359      0.000423  ...  0.000292   
4   0.000281   0.000281     0.000409   0.000281      0.000281  ...  0.000281   

      wwf\n     xii\n     xiv\n     xvi\n    xvii\n  ziggurart\n   zocco\n   
0  0.000315  0

In [24]:
import pandas as pd

corpus_processed = pd.read_csv(r'C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\dataset\corpus.tsv', sep='\t', header=None)
corpus_processed
new_df = pd.DataFrame(corpus_processed[0])

In [25]:
num = 1
for topic in model_output_TD['topic-document-matrix']:
  for i in range(0,len(topic)):
    new_df.loc[i,num] = topic[i]
  num = num+1
new_df

Unnamed: 0,0,1,2,3,4,5
0,qui possiamo ammirare resti porta accesso anti...,0.015656,0.015667,0.015630,0.937400,0.015647
1,posizione dominante valle risacco val rasina r...,0.006674,0.006672,0.973298,0.006669,0.006687
2,collegiata santa maria assunta risale stile as...,0.017157,0.017088,0.017042,0.017049,0.931664
3,unica navata ampia luminosa due nicchioni due ...,0.010465,0.958238,0.010443,0.010418,0.010436
4,contesto urbano datazione secolo chiesa san fr...,0.066851,0.008520,0.008521,0.008489,0.907619
...,...,...,...,...,...,...
568,santuario madonna assunta cielo xii secolo sti...,0.017065,0.017150,0.017119,0.017045,0.931622
569,antico castello medievale assedi miracoli reli...,0.018812,0.018776,0.018800,0.018832,0.924781
570,splendido castello pupaggi comune sellano luog...,0.931616,0.017111,0.017093,0.017082,0.017099
571,esistono due tipologie prosciutto norcia primo...,0.967379,0.008167,0.008153,0.008156,0.008145


In [26]:
# la salvo
new_df.to_excel(r'C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\risultati\LDA\result_TD\topic-document-matrix.xlsx', index=False)  

In [27]:
# Retrieve metrics score
topic_diversity_score = topic_diversity.score(model_output_TD)
print("Topic diversity: "+str(topic_diversity_score))

npmi_score = npmi.score(model_output_TD)
print("Coherence: "+str(npmi_score))

Topic diversity: 0.4
Coherence: -0.002784389164583573


### Topic Coherence

In [28]:
# Define the search space. To see which hyperparameters to optimize, see the topic model's initialization signature
search_space = {"alpha": Real(low=0.001, high=5.0), "eta": Real(low=0.001, high=5.0), "num_topics": Integer(low=5, high=15)}

# Initialize an optimizer object and start the optimization.
optimizer=Optimizer()
optResult=optimizer.optimize(model, dataset, npmi, search_space, save_path= risultati_npmi, # path to store the results
                             number_of_call=30, # number of optimization iterations
                             model_runs=5,save_models=False) # number of runs of the topic model
#save the results of th optimization in a csv file
optResult.save_to_csv(r"C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\risultati\LDA\result_npmi\result_npmi.csv")

Current call:  0
Current call:  1
Current call:  2
Current call:  3
Current call:  4
Current call:  5
Current call:  6
Current call:  7
Current call:  8
Current call:  9
Current call:  10
Current call:  11
Current call:  12
Current call:  13
Current call:  14
Current call:  15
Current call:  16
Current call:  17
Current call:  18
Current call:  19
Current call:  20
Current call:  21
Current call:  22
Current call:  23
Current call:  24
Current call:  25
Current call:  26
Current call:  27
Current call:  28
Current call:  29


In [38]:
import json
res = json.load(open(r"C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\risultati\LDA\result_npmi\result.json",'r'))
res.keys()

dict_keys(['dataset_name', 'dataset_path', 'is_cached', 'kernel', 'acq_func', 'surrogate_model', 'optimization_type', 'model_runs', 'save_models', 'save_step', 'save_name', 'save_path', 'early_stop', 'early_step', 'plot_model', 'plot_best_seen', 'plot_name', 'log_scale_plot', 'search_space', 'model_name', 'model_attributes', 'use_partitioning', 'metric_name', 'extra_metric_names', 'metric_attributes', 'extra_metric_attributes', 'current_call', 'number_of_call', 'random_state', 'x0', 'y0', 'n_random_starts', 'initial_point_generator', 'topk', 'time_eval', 'dict_model_runs', 'f_val', 'x_iters'])

In [39]:
# valuto qual è l'iterazione migliore
print("Valore migliore di Topic Diversity ottenuto: " )
print(max(res["f_val"]))
print("Numero di iterazione: ")
i = 0
for j in res["f_val"]:
  if(j == max(res["f_val"])):
    i += 1
    print(i)
    break
  else:
    i += 1
alpha = res['x_iters']['alpha'][i-1]
eta = res['x_iters']['eta'][i-1]
topic = res['x_iters']['num_topics'][i-1]
print("Valore di alpha:")
print(alpha)
print("Valore di eta:")
print(eta)
print("Numero di topic:")
print(topic)

Valore migliore di Topic Diversity ottenuto: 
0.029602084040565014
Numero di iterazione: 
21
Valore di alpha:
1.518321250598311
Valore di eta:
2.202792857073435
Numero di topic:
6


In [40]:
# Create Model
model_npmi = LDA(num_topics= topic, alpha= alpha, eta= eta)
model_npmi.partitioning(False)

In [41]:
model_output_npmi = model_npmi.train_model(dataset)

In [42]:
for t in model_output_npmi['topics']:
  print(" ".join(t))

topics = pd.DataFrame(model_output_npmi['topics'])
topics.to_excel(r'C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\risultati\LDA\result_npmi\topics.xlsx', index=False)  

chiesa san secolo trova centro piazza palazzo francesco citta museo
chiesa san trova piazza centro citta palazzo santa castello secolo
chiesa san centro citta secolo palazzo pia trova francesco via
chiesa san trova piazza palazzo santa pia citta sorge centro
san chiesa trova centro palazzo secolo santa citta borgo pia
chiesa san trova pia secolo centro piazza citta borgo sorge


In [43]:
# Carica la matrice topic-word in un DataFrame pandas
topic_word_matrix = pd.DataFrame(model_output_npmi['topic-word-matrix'])

# Imposta i nomi delle colonne come parole

# Ottieni i primi k termini da ciascuna riga come nomi di colonna
topic_word_matrix.columns = [vocabolario[i] for i in range(topic_word_matrix.shape[1])]

# Stampa il DataFrame
print(topic_word_matrix)

topic_word_matrix.to_excel(r'C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\risultati\LDA\result_npmi\topic_word_matrix.xlsx', index=False)  

   abbadesse\n  abbandonato\n  abbastanza\n  abbazia\n  abbaziale\n   
0     0.000325       0.000337      0.000662   0.000625     0.000499  \
1     0.000329       0.000344      0.000811   0.000618     0.000469   
2     0.000325       0.000326      0.000870   0.000703     0.000425   
3     0.000324       0.000327      0.000652   0.000733     0.000472   
4     0.000327       0.000383      0.000729   0.000649     0.000443   
5     0.000338       0.000373      0.000672   0.000582     0.000388   

   abbazie\n  abitati\n  abitativo\n  abitato\n  abitazione\n  ...    with\n   
0   0.000288   0.000286     0.000653   0.000286      0.000322  ...  0.000304  \
1   0.000288   0.000292     0.000746   0.000292      0.000351  ...  0.000308   
2   0.000285   0.000287     0.000634   0.000289      0.000323  ...  0.000305   
3   0.000289   0.000284     0.000678   0.000283      0.000336  ...  0.000301   
4   0.000289   0.000289     0.000623   0.000288      0.000319  ...  0.000321   
5   0.000286   0.00028

In [44]:
num = 1
for topic in model_output_npmi['topic-document-matrix']:
  for i in range(0,len(topic)):
    new_df.loc[i,num] = topic[i]
  num = num+1
new_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,qui possiamo ammirare resti porta accesso anti...,0.168011,0.170521,0.164948,0.164785,0.164729,0.167006,0.063675,0.065274,0.061175,0.068719,0.063102,0.074818,0.071559,0.065387,0.063855
1,posizione dominante valle risacco val rasina r...,0.166217,0.152430,0.143166,0.152525,0.168885,0.216777,0.047505,0.043039,0.039715,0.037744,0.041493,0.041520,0.354589,0.036881,0.039487
2,collegiata santa maria assunta risale stile as...,0.164574,0.175959,0.159027,0.181936,0.163758,0.154746,0.058126,0.056921,0.059180,0.079963,0.058257,0.079685,0.060572,0.054586,0.074097
3,unica navata ampia luminosa due nicchioni due ...,0.157860,0.163119,0.158403,0.169790,0.167105,0.183723,0.049266,0.058064,0.059376,0.067053,0.065823,0.062558,0.070096,0.060654,0.056481
4,contesto urbano datazione secolo chiesa san fr...,0.189198,0.149947,0.157052,0.177126,0.160619,0.166058,0.033976,0.036256,0.035540,0.034272,0.030953,0.036670,0.035711,0.034928,0.049802
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
568,santuario madonna assunta cielo xii secolo sti...,0.170492,0.172596,0.162193,0.168836,0.165338,0.160544,0.060260,0.062438,0.064740,0.062486,0.058111,0.086876,0.055713,0.064662,0.083632
569,antico castello medievale assedi miracoli reli...,0.167003,0.179955,0.176678,0.161765,0.158436,0.156163,0.062688,0.064277,0.061327,0.072436,0.062440,0.069598,0.070801,0.068982,0.062110
570,splendido castello pupaggi comune sellano luog...,0.157865,0.170880,0.164101,0.175087,0.162719,0.169348,0.070011,0.068261,0.060266,0.065102,0.063212,0.068552,0.073447,0.065718,0.060919
571,esistono due tipologie prosciutto norcia primo...,0.165674,0.152964,0.173209,0.157556,0.189147,0.161451,0.035101,0.039152,0.036185,0.041237,0.038223,0.476000,0.042872,0.035943,0.035873


In [45]:
# la salvo
new_df.to_excel(r'C:\Users\franc\Desktop\PROJECTS\PROJECTS\RASTA\risultati\LDA\result_npmi\topic-document-matrix.xlsx', index=False)  

In [46]:
# Retrieve metrics score
topic_diversity_score = topic_diversity.score(model_output_npmi)
print("Topic diversity: "+str(topic_diversity_score))

npmi_score = npmi.score(model_output_npmi)
print("Coherence: "+str(npmi_score))

Topic diversity: 0.26666666666666666
Coherence: 0.009654361904530942
