# LDA - Pronto per i nostri POI

## Optimization

In [None]:
!pip install scikit-learn
!pip install octis
!pip install chardet



In [1]:
from octis.models.LDA import LDA
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

In [2]:
# percorsi
dataset_save = '../datasetNostriPOI/dataset_LDA_zipf'
risultati_TD = '../risultati/NostriPOI/LDA/results_TD'
risultati_npmi = '../risultati/NostriPOI/LDA/results_npmi'

In [4]:
from octis.dataset.dataset import Dataset
from pathlib import Path

class CustomDataset(Dataset):
    def _load_vocabulary(self, file_name):
        """
        Loads vocabulary from a file
        Parameters
        ----------
        file_name : name of the file to read
        """
        vocabulary = []
        file = Path(file_name)
        if file.is_file():
            with open(file_name, 'r', encoding='utf-8') as vocabulary_file:
                for line in vocabulary_file:
                    vocabulary.append(line.strip())
            self.__vocabulary = vocabulary
        else:
            raise Exception("error in loading vocabulary")

    # Sovrascrivi anche altri metodi se necessario per la gestione della codifica

# Usa la tua classe CustomDataset
dataset = CustomDataset()
dataset.load_custom_dataset_from_folder(dataset_save)


In [5]:
print(len(dataset.get_corpus()))

3141


### Train a Model

In [6]:
# Create Model
model = LDA(num_topics=12)
model.partitioning(False)

In [7]:
from octis.dataset.dataset import Dataset
from octis.models.LDA import LDA

model_output = model.train_model(dataset)

In [8]:
model_output

{'topic-word-matrix': array([[3.2246512e-04, 2.6479016e-05, 1.2162337e-05, ..., 4.9066040e-07,
         4.9066040e-07, 4.9066040e-07],
        [2.2931951e-04, 3.4636880e-05, 2.0792879e-05, ..., 8.9146971e-07,
         8.9146971e-07, 8.9146971e-07],
        [5.0112722e-04, 2.8497152e-05, 7.3638166e-06, ..., 3.7777605e-07,
         3.7777605e-07, 3.7777605e-07],
        ...,
        [2.9277790e-04, 1.7811775e-05, 1.9249712e-05, ..., 8.9732089e-07,
         8.9732089e-07, 8.9732089e-07],
        [3.0511583e-04, 1.6544720e-05, 2.0258356e-05, ..., 2.0711491e-06,
         2.0711491e-06, 2.0711491e-06],
        [1.7123454e-04, 2.5807714e-05, 2.3337900e-05, ..., 9.9596218e-07,
         9.9596218e-07, 9.9596218e-07]], dtype=float32),
 'topics': [['primo',
   'anno',
   'antico',
   'centro',
   'romano',
   'opera',
   'museo',
   'grande',
   'storico',
   'centrale'],
  ['stazione',
   'anno',
   'url',
   'servizio',
   'noto',
   'ferroviario',
   'abruzzo',
   'linea',
   'provincia',
   '

In [9]:
model._get_topic_document_matrix().shape

(12, 3141)

In [10]:
model._get_topics_words(topk = 10)

[['primo',
  'anno',
  'antico',
  'centro',
  'romano',
  'opera',
  'museo',
  'grande',
  'storico',
  'centrale'],
 ['stazione',
  'anno',
  'url',
  'servizio',
  'noto',
  'ferroviario',
  'abruzzo',
  'linea',
  'provincia',
  'primo'],
 ['palazzo',
  'castello',
  'aquila',
  'romano',
  'portale',
  'antico',
  'primo',
  'abruzzo',
  'archeologico',
  'facciata'],
 ['stazione',
  'castello',
  'primo',
  'anno',
  'antico',
  'porre',
  'tre',
  'storico',
  'ferroviario',
  'ferrovia'],
 ['castello',
  'primo',
  'borgo',
  'centro',
  'anno',
  'valle',
  'alcuno',
  'grande',
  'paese',
  'molto'],
 ['riserva',
  'naturale',
  'abruzzo',
  'anno',
  'primo',
  'area',
  'parco',
  'valle',
  'centro',
  'alcuno'],
 ['anno',
  'primo',
  'opera',
  'romano',
  'centro',
  'luogo',
  'grande',
  'antico',
  'realizzare',
  'lungo'],
 ['paese',
  'centro',
  'anno',
  'antico',
  'abruzzo',
  'storico',
  'primo',
  'mare',
  'valle',
  'nome'],
 ['anno',
  'teramo',
  'primo

### Evaluate a Model

In [11]:
for t in model_output['topics']:
  print(" ".join(t))

primo anno antico centro romano opera museo grande storico centrale
stazione anno url servizio noto ferroviario abruzzo linea provincia primo
palazzo castello aquila romano portale antico primo abruzzo archeologico facciata
stazione castello primo anno antico porre tre storico ferroviario ferrovia
castello primo borgo centro anno valle alcuno grande paese molto
riserva naturale abruzzo anno primo area parco valle centro alcuno
anno primo opera romano centro luogo grande antico realizzare lungo
paese centro anno antico abruzzo storico primo mare valle nome
anno teramo primo abruzzo centro antico navata facciata edificio altare
castello torre primo storia borgo anno abruzzo centro alcuno valle
anno castello primo paese antico affresco progetto famiglia opera tre
area anno provincia aquila primo abruzzo alcuno stazione centro noto


In [12]:
len(model_output['topics'])

12

In [13]:
# Initialize metric
npmi = Coherence(texts=dataset.get_corpus(), topk=10, measure='c_npmi')

In [14]:
# Initialize metric
topic_diversity = TopicDiversity(topk=10)

In [15]:
# Retrieve metrics score
topic_diversity_score = topic_diversity.score(model_output)
print("Topic diversity: "+str(topic_diversity_score))

npmi_score = npmi.score(model_output)
print("Coherence: "+str(npmi_score))

Topic diversity: 0.4166666666666667
Coherence: 0.0031130679145168626


### Hyperparameter Optimization

In [16]:
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real
from skopt.space.space import Real, Integer

### Topic Diversity

In [17]:
# Define the search space. To see which hyperparameters to optimize, see the topic model's initialization signature
search_space = {"alpha": Real(low=0.001, high=5.0), "eta": Real(low=0.001, high=5.0), "num_topics": Integer(low=5, high=12)}

# Initialize an optimizer object and start the optimization.
optimizer=Optimizer()
optResult=optimizer.optimize(model, dataset, topic_diversity, search_space, save_path=risultati_TD, # path to store the results
                             number_of_call=30, # number of optimization iterations
                             model_runs=5,save_models=False) # number of runs of the topic model
#save the results of th optimization in a csv file
optResult.save_to_csv(risultati_TD + r'\result_TD.csv')

Current call:  0
Current call:  1
Current call:  2
Current call:  3
Current call:  4
Current call:  5
Current call:  6
Current call:  7
Current call:  8
Current call:  9
Current call:  10
Current call:  11
Current call:  12
Current call:  13
Current call:  14
Current call:  15
Current call:  16
Current call:  17
Current call:  18
Current call:  19
Current call:  20
Current call:  21
Current call:  22
Current call:  23
Current call:  24
Current call:  25
Current call:  26
Current call:  27
Current call:  28
Current call:  29


In [18]:
import json
res = json.load(open(risultati_TD + r"/result.json",'r'))
res.keys()

dict_keys(['dataset_name', 'dataset_path', 'is_cached', 'kernel', 'acq_func', 'surrogate_model', 'optimization_type', 'model_runs', 'save_models', 'save_step', 'save_name', 'save_path', 'early_stop', 'early_step', 'plot_model', 'plot_best_seen', 'plot_name', 'log_scale_plot', 'search_space', 'model_name', 'model_attributes', 'use_partitioning', 'metric_name', 'extra_metric_names', 'metric_attributes', 'extra_metric_attributes', 'current_call', 'number_of_call', 'random_state', 'x0', 'y0', 'n_random_starts', 'initial_point_generator', 'topk', 'time_eval', 'dict_model_runs', 'f_val', 'x_iters'])

In [20]:
# valuto qual è l'iterazione migliore
print("Valore migliore di Topic Diversity ottenuto: " )
print(max(res["f_val"]))
print("Numero di iterazione: ")
i = 0
for j in res["f_val"]:
  if(j == max(res["f_val"])):
    i += 1
    print(i)
    break
  else:
    i += 1
alpha = res['x_iters']['alpha'][i-1]
eta = res['x_iters']['eta'][i-1]
topic = res['x_iters']['num_topics'][i-1]
#topic = int(topic[0]) # Access the first element of the array and convert to an integer
print("Valore di alpha:")
print(alpha)
print("Valore di eta:")
print(eta)
print("Numero di topic:")
print(topic)

Valore migliore di Topic Diversity ottenuto: 
0.54
Numero di iterazione: 
25
Valore di alpha:
0.051072239988815764
Valore di eta:
0.11907836593940381
Numero di topic:
5


In [21]:
# Create Model
model_TD = LDA(num_topics= topic, alpha= alpha, eta= eta)
model_TD.partitioning(False)

In [22]:
model_output_TD = model_TD.train_model(dataset)

In [23]:
print(*list(model_output_TD.keys()), sep="\n") # Print the output identifiers

topic-word-matrix
topics
topic-document-matrix


In [24]:
model_TD._get_topic_document_matrix()

array([[3.25102650e-04, 5.98093122e-03, 2.60677567e-04, ...,
        1.76678383e-04, 9.59068686e-02, 5.65565586e-01],
       [3.25102650e-04, 3.86364729e-04, 2.60677567e-04, ...,
        1.76678383e-04, 1.86262769e-04, 6.01823267e-04],
       [3.25102650e-04, 1.37362629e-01, 9.13459361e-01, ...,
        9.99293327e-01, 9.03534412e-01, 6.01823267e-04],
       [9.98699605e-01, 3.86364758e-04, 8.57585967e-02, ...,
        1.76678383e-04, 1.86262769e-04, 6.01823325e-04],
       [3.25102650e-04, 8.55883658e-01, 2.60677567e-04, ...,
        1.76678383e-04, 1.86262769e-04, 4.32628989e-01]])

In [25]:
model_TD._get_topic_document_matrix().shape

(5, 3141)

In [26]:
import pandas as pd

for t in model_output_TD['topics']:
  print(" ".join(t))

topics = pd.DataFrame(model_output_TD['topics'])
topics.to_excel(risultati_TD + r'\topics.xlsx', index=False)

primo paese castello parco palazzo esterno antico collegamento anno valle
abruzzo anno primo parco storico centro castello noto grande situare
anno antico centro primo facciata teramo portale storico opera palazzo
stazione anno paese primo castello centro nuovo valle edificio torre
antico centro paese anno castello primo borgo museo abruzzo valle


In [29]:
vocabolario = []
with open('../datasetNostriPOI/dataset_LDA_zipf/vocabulary.txt', 'r', encoding='utf-8') as file:
    for word in file:
        vocabolario.append(word)


In [32]:
# Carica la matrice topic-word in un DataFrame pandas
topic_word_matrix = pd.DataFrame(model_output_TD['topic-word-matrix'])

# Imposta i nomi delle colonne come parole

# Ottieni i primi k termini da ciascuna riga come nomi di colonna
topic_word_matrix.columns = [vocabolario[i] for i in range(topic_word_matrix.shape[1])]

# inverto il colonne e righe
inverted_topic_word_matrix = topic_word_matrix.transpose()

# Stampa il DataFrame
print(inverted_topic_word_matrix)

inverted_topic_word_matrix.to_excel('../risultati/NostriPOI/LDA/results_td/topic_word_matrix.xlsx', index=False)

                         0             1             2             3   
aas\n         2.929709e-04  2.798155e-04  5.105260e-04  3.934858e-04  \
aavv\n        3.006485e-05  1.294896e-05  2.154045e-05  2.726271e-05   
aba\n         1.977526e-05  1.812946e-05  1.568843e-05  1.444660e-05   
abaco\n       8.008343e-05  1.097989e-04  5.351538e-05  1.897432e-04   
abacucre\n    1.737983e-04  2.111978e-04  2.251752e-04  2.570481e-04   
...                    ...           ...           ...           ...   
vibrazione\n  9.978583e-07  4.108640e-07  2.529775e-07  3.769091e-07   
vibrazioni\n  9.978583e-07  4.108640e-07  2.529775e-07  3.769091e-07   
viburno\n     1.038853e-06  4.079670e-07  2.485970e-07  3.763760e-07   
viburnum\n    1.038853e-06  4.079670e-07  2.485970e-07  3.763760e-07   
vibus\n       1.038853e-06  4.079670e-07  2.485970e-07  3.763760e-07   

                         4  
aas\n         3.080914e-04  
aavv\n        2.598135e-05  
aba\n         1.491455e-05  
abaco\n       2.827

In [33]:
import pandas as pd

corpus_processed = pd.read_csv(r'../datasetNostriPOI/dataset_LDA_zipf/corpus.tsv', sep='\t', header=None)
corpus_processed
new_df = pd.DataFrame(corpus_processed[0])

In [34]:
model_output_TD['topic-document-matrix']

array([[3.25102708e-04, 4.29754378e-03, 2.60730478e-04, ...,
        1.76678368e-04, 1.01315401e-01, 5.74399233e-01],
       [3.25102679e-04, 3.86359694e-04, 2.60730478e-04, ...,
        1.76678368e-04, 1.86262536e-04, 6.01807435e-04],
       [3.25102679e-04, 1.36814281e-01, 9.25914347e-01, ...,
        9.99293327e-01, 8.98125827e-01, 6.01807435e-04],
       [9.98699605e-01, 3.86359723e-04, 7.33034536e-02, ...,
        1.76678368e-04, 1.86262536e-04, 6.01807493e-04],
       [3.25102679e-04, 8.58115435e-01, 2.60730478e-04, ...,
        1.76678368e-04, 1.86262536e-04, 4.23795342e-01]])

In [35]:
for topic in model_output_TD['topic-document-matrix']:
  print(topic)

[3.25102708e-04 4.29754378e-03 2.60730478e-04 ... 1.76678368e-04
 1.01315401e-01 5.74399233e-01]
[0.0003251  0.00038636 0.00026073 ... 0.00017668 0.00018626 0.00060181]
[3.25102679e-04 1.36814281e-01 9.25914347e-01 ... 9.99293327e-01
 8.98125827e-01 6.01807435e-04]
[9.98699605e-01 3.86359723e-04 7.33034536e-02 ... 1.76678368e-04
 1.86262536e-04 6.01807493e-04]
[3.25102679e-04 8.58115435e-01 2.60730478e-04 ... 1.76678368e-04
 1.86262536e-04 4.23795342e-01]


In [36]:
num = 1
for topic in model_output_TD['topic-document-matrix']:
  for i in range(0,len(topic)):
    new_df.loc[i,num] = topic[i]
  num = num+1
new_df

Unnamed: 0,0,1,2,3,4,5
0,stazione acquasparta stazione ferroviario serv...,0.000325,0.000325,0.000325,0.998700,0.000325
1,scoppio frazione disabitato acquasparta sire c...,0.004298,0.000386,0.136814,0.000386,0.858115
2,cecilia parrocchiale acquasparta provincia ter...,0.000261,0.000261,0.925914,0.073303,0.000261
3,palazzo cese situare luogo antico rocca acquas...,0.000187,0.752016,0.247422,0.000187,0.000187
4,acquasparto italiano abitante provincia terno ...,0.102861,0.820087,0.000185,0.039203,0.037664
...,...,...,...,...,...,...
3136,costantinopoli sire piazza rocco dire localmen...,0.000389,0.000389,0.998444,0.000389,0.000389
3137,saneustachio sire scanno provincia aquila inti...,0.000202,0.000202,0.999193,0.000202,0.000202
3138,rocco dire carmine scanno provincia aquila sto...,0.000177,0.000177,0.999293,0.000177,0.000177
3139,palazzo rienzo sire scanno provincia aquila pa...,0.101315,0.000186,0.898126,0.000186,0.000186


In [37]:
# la salvo
new_df.to_excel('../risultati/NostriPOI/LDA/results_td/topic-document-matrix.xlsx', index=False)

In [38]:
# Retrieve metrics score
topic_diversity_score = topic_diversity.score(model_output_TD)
print("Topic diversity: "+str(topic_diversity_score))

npmi_score = npmi.score(model_output_TD)
print("Coherence: "+str(npmi_score))

Topic diversity: 0.52
Coherence: -0.0034062730436794767


### Topic Coherence

In [39]:
# Define the search space. To see which hyperparameters to optimize, see the topic model's initialization signature
search_space = {"alpha": Real(low=0.001, high=5.0), "eta": Real(low=0.001, high=5.0), "num_topics": Integer(low=5, high=15)}

# Initialize an optimizer object and start the optimization.
optimizer=Optimizer()
optResult=optimizer.optimize(model, dataset, npmi, search_space, save_path= risultati_npmi, # path to store the results
                             number_of_call=30, # number of optimization iterations
                             model_runs=5,save_models=False) # number of runs of the topic model


Current call:  0
Current call:  1
Current call:  2
Current call:  3
Current call:  4
Current call:  5
Current call:  6
Current call:  7
Current call:  8
Current call:  9
Current call:  10
Current call:  11
Current call:  12
Current call:  13
Current call:  14
Current call:  15
Current call:  16
Current call:  17
Current call:  18
Current call:  19
Current call:  20
Current call:  21
Current call:  22
Current call:  23
Current call:  24
Current call:  25
Current call:  26
Current call:  27
Current call:  28
Current call:  29


In [41]:
#save the results of th optimization in a csv file
optResult.save_to_csv("../risultati/NostriPOI/LDA/results_npmi/result_npmi.csv")

In [42]:
import json
res = json.load(open("../risultati/NostriPOI/LDA/results_npmi/result.json",'r'))
res.keys()

dict_keys(['dataset_name', 'dataset_path', 'is_cached', 'kernel', 'acq_func', 'surrogate_model', 'optimization_type', 'model_runs', 'save_models', 'save_step', 'save_name', 'save_path', 'early_stop', 'early_step', 'plot_model', 'plot_best_seen', 'plot_name', 'log_scale_plot', 'search_space', 'model_name', 'model_attributes', 'use_partitioning', 'metric_name', 'extra_metric_names', 'metric_attributes', 'extra_metric_attributes', 'current_call', 'number_of_call', 'random_state', 'x0', 'y0', 'n_random_starts', 'initial_point_generator', 'topk', 'time_eval', 'dict_model_runs', 'f_val', 'x_iters'])

In [43]:
# valuto qual è l'iterazione migliore
print("Valore migliore di Topic Coherence ottenuto: " )
print(max(res["f_val"]))
print("Numero di iterazione: ")
i = 0
for j in res["f_val"]:
  if(j == max(res["f_val"])):
    i += 1
    print(i)
    break
  else:
    i += 1
alpha = res['x_iters']['alpha'][i-1]
eta = res['x_iters']['eta'][i-1]
topic = res['x_iters']['num_topics'][i-1]
print("Valore di alpha:")
print(alpha)
print("Valore di eta:")
print(eta)
print("Numero di topic:")
print(topic)

Valore migliore di Topic Coherence ottenuto: 
0.0084841240732935
Numero di iterazione: 
17
Valore di alpha:
0.14078484605497166
Valore di eta:
4.893753716997072
Numero di topic:
14


In [44]:
# Create Model
model_npmi = LDA(num_topics= topic, alpha= alpha, eta= eta)
model_npmi.partitioning(False)

In [45]:
model_output_npmi = model_npmi.train_model(dataset)

In [46]:
import pandas as pd

In [48]:
for t in model_output_npmi['topics']:
  print(" ".join(t))

topics = pd.DataFrame(model_output_npmi['topics'])
topics.to_excel('../risultati/NostriPOI/LDA/results_npmi/topics.xlsx', index=False)

primo paese centro antico castello anno contenere alto piccolo luogo
anno castello paese valle primo romano alcuno antico lungo affresco
castello provincia anno primo centro mandamento storico paese costruire storia
primo centro castello anno paese palazzo storico antico italiano famiglia
paese anno castello primo antico torre grande alcuno opera centro
anno primo paese abruzzo castello antico centro teramo facciata tre
anno primo paese museo noto antico centro spoleto storia italiano
parco valle naturale area riserva abruzzo lago provincia castello situare
castello primo paese anno centro antico noto alcuno romano opera
primo anno antico castello paese lungo alcuno lago abitante altare
anno primo lago spoleto opera storia castello alcuno stazione antico
centro anno antico primo abruzzo storico nuovo strada romano paese
museo primo anno centro abruzzo antico storico archeologico paese borgo
anno castello antico primo alcuno molto paese progetto immagine storia


In [52]:
# Carica la matrice topic-word in un DataFrame pandas
topic_word_matrix = pd.DataFrame(model_output_npmi['topic-word-matrix'])

# Imposta i nomi delle colonne come parole

# Ottieni i primi k termini da ciascuna riga come nomi di colonna
topic_word_matrix.columns = [vocabolario[i] for i in range(topic_word_matrix.shape[1])]

# Stampa il DataFrame
print(topic_word_matrix)

# inverto il colonne e righe
inverted_topic_word_matrix = topic_word_matrix.transpose()

# Stampa il DataFrame
print(inverted_topic_word_matrix)


inverted_topic_word_matrix.to_excel('../risultati/NostriPOI/LDA/results_npmi/topic_word_matrix.xlsx', index=False)

       aas\n    aavv\n     aba\n   abaco\n  abacucre\n  abamonte\n  abamore\n   
0   0.000035  0.000016  0.000018  0.000026    0.000026    0.000045   0.000016  \
1   0.000041  0.000016  0.000017  0.000020    0.000027    0.000053   0.000016   
2   0.000038  0.000016  0.000019  0.000022    0.000036    0.000070   0.000017   
3   0.000034  0.000017  0.000016  0.000020    0.000027    0.000056   0.000016   
4   0.000045  0.000016  0.000019  0.000026    0.000041    0.000066   0.000018   
5   0.000333  0.000018  0.000007  0.000067    0.000183    0.000364   0.000031   
6   0.000050  0.000016  0.000018  0.000024    0.000033    0.000052   0.000018   
7   0.000046  0.000016  0.000017  0.000026    0.000028    0.000191   0.000014   
8   0.000044  0.000016  0.000021  0.000023    0.000030    0.000056   0.000017   
9   0.000047  0.000016  0.000018  0.000022    0.000033    0.000092   0.000019   
10  0.000034  0.000016  0.000017  0.000021    0.000027    0.000048   0.000018   
11  0.000140  0.000022  0.00

In [53]:
num = 1
for topic in model_output_npmi['topic-document-matrix']:
  for i in range(0,len(topic)):
    new_df.loc[i,num] = topic[i]
  num = num+1
new_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,stazione acquasparta stazione ferroviario serv...,0.000888,0.000888,0.000888,0.000888,0.000888,0.988451,0.000888,0.000889,0.000888,0.000888,0.000888,0.000890,0.000888,0.000888
1,scoppio frazione disabitato acquasparta sire c...,0.001020,0.001020,0.001020,0.001020,0.001020,0.986738,0.001020,0.001021,0.001020,0.001020,0.001020,0.001021,0.001020,0.001020
2,cecilia parrocchiale acquasparta provincia ter...,0.000703,0.000703,0.000704,0.000703,0.000704,0.990853,0.000704,0.000704,0.000703,0.000704,0.000703,0.000704,0.000704,0.000703
3,palazzo cese situare luogo antico rocca acquas...,0.000507,0.000507,0.000507,0.000507,0.000507,0.993413,0.000507,0.000507,0.000507,0.000507,0.000507,0.000507,0.000507,0.000507
4,acquasparto italiano abitante provincia terno ...,0.000501,0.000501,0.000501,0.000501,0.000501,0.993475,0.000501,0.000501,0.000501,0.000501,0.000501,0.000511,0.000501,0.000501
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3136,costantinopoli sire piazza rocco dire localmen...,0.001030,0.001030,0.001030,0.001030,0.001030,0.986602,0.001030,0.001031,0.001030,0.001031,0.001030,0.001032,0.001031,0.001030
3137,saneustachio sire scanno provincia aquila inti...,0.000543,0.000543,0.000543,0.000543,0.000543,0.992937,0.000543,0.000543,0.000543,0.000543,0.000543,0.000544,0.000543,0.000543
3138,rocco dire carmine scanno provincia aquila sto...,0.000478,0.000478,0.000478,0.000478,0.000478,0.993780,0.000478,0.000479,0.000478,0.000478,0.000478,0.000479,0.000479,0.000478
3139,palazzo rienzo sire scanno provincia aquila pa...,0.000496,0.000496,0.000496,0.000496,0.000496,0.993555,0.000496,0.000496,0.000496,0.000496,0.000496,0.000497,0.000496,0.000496


In [None]:
# la salvo
new_df.to_excel('../risultati/NostriPOI/LDA/results_npmi/topic-document-matrix.xlsx', index=False)

In [None]:
# Retrieve metrics score
topic_diversity_score = topic_diversity.score(model_output_npmi)
print("Topic diversity: "+str(topic_diversity_score))

npmi_score = npmi.score(model_output_npmi)
print("Coherence: "+str(npmi_score))

Topic diversity: 0.30714285714285716
Coherence: -0.008732089028663712
