## Arquivo: Galaxia Project

## Setup Dependencies

### Import libraries

In [None]:
import requests
import pprint
import json
import datetime
import requests
import io
import os
import re
import sqlite3
from pathlib import Path


import numpy as np
import pandas as pd


from tqdm.notebook import trange, tqdm


from bs4 import BeautifulSoup



import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap, ListedColormap
import seaborn as sns


import datamapplot
sns.set(style='white', context='notebook', rc={'figure.figsize':(14,10)})



from sentence_transformers import SentenceTransformer

from scipy.cluster import hierarchy as sch
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA



import umap

from transformers import pipeline
from sentence_transformers import SentenceTransformer

from bertopic import BERTopic
from bertopic.representation import TextGeneration, KeyBERTInspired
from bertopic.representation import MaximalMarginalRelevance
from bertopic.representation import PartOfSpeech
from bertopic.vectorizers import ClassTfidfTransformer





from hdbscan import HDBSCAN
import hdbscan


import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

import spacy
# spacy.cli.download('pt_core_news_sm')


### Initial setup

In [None]:
# Both names hold the same multilingual DistilUSE checkpoint id;
# `dbml_model_name` is the one used to build the model on the next line.
sbert_model_name = 'sentence-transformers/distiluse-base-multilingual-cased-v1'
dbml_model_name = 'sentence-transformers/distiluse-base-multilingual-cased-v1'
# Sentence-embedding model instance created from the checkpoint name above.
sbert_model = SentenceTransformer(dbml_model_name)


In [None]:
# Portuguese stop-word list shipped with NLTK.
stopwords_pt = nltk.corpus.stopwords.words('portuguese')
# The NLTK list extended with additional hand-picked words.
stopwords_custom = stopwords_pt + ["sobre", "vai", "diz", "anos", "milhões", "mil", "não", "ter", "ano", "pode", "pede", 
                    "dois", "tres", "três", "país", "dias", "vida", "ainda", "quase", "quer", "faz", "fazer",
                    "dia", "viver", "vamos"]


In [None]:
# Fixed seed: feeds the NumPy generator below and is passed as `random_state`
# to the Gaussian-mixture models further down.
RANDOM_STATE = 249875
rng = np.random.default_rng(seed=RANDOM_STATE)

In [None]:
## Default seaborn/matplotlib settings: white style, notebook context, 14x10 figures.
## NOTE: the same call is already made right after the imports.
sns.set(style='white', context='notebook', rc={'figure.figsize':(14,10)})

## Data Extraction

### Functions

In [None]:
def parse_01(soup):
    """Collect headline texts from elements carrying one of four CSS classes.

    For every element whose ``class`` is one of the names listed below, the
    stripped text of the element itself and of each nested ``<strong>`` tag is
    collected.

    Args:
        soup: Parsed document exposing ``find_all(attrs=...)``.

    Returns:
        list[str]: Unique texts longer than two characters that do not contain
        the word "comentário". The order of the list is not defined.
    """
    headline_classes = ("titulo20", "titulo11", "news_tituloonline", "news_txtmini")

    texts = set()
    for css_class in headline_classes:
        for node in soup.find_all(attrs={"class": css_class}):
            texts.add(node.text.strip())
            texts.update(strong.text.strip() for strong in node.find_all("strong"))

    return [text for text in texts if len(text) > 2 and "comentário" not in text]
    
def parse_02(soup, classes=None):
    """Collect link texts found inside elements carrying the given CSS classes.

    Args:
        soup: Parsed document exposing ``find_all(attrs=...)``.
        classes: CSS class names to search. When omitted or empty, a built-in
            list of four class names is used.

    Returns:
        list[str]: Unique, stripped ``<a>`` texts longer than two characters
        that do not contain the word "comentário". The order is not defined.
    """
    if not classes:
        classes = ["esq02Tit", "tabsTit", "tabsDirTit", "dir01Txt2"]

    titles = set()
    for css_class in classes:
        for container in soup.find_all(attrs={"class": css_class}):
            for link in container.find_all("a"):
                # Strip first so that whitespace padding cannot carry a
                # too-short text past the length filter (same rule as parse_01).
                text = link.text.strip()
                if len(text) > 2 and "comentário" not in text:
                    titles.add(text)
    return list(titles)


def parse_htitle(soup, h, t='title'):
    """Collect the link text of heading tags that carry a given CSS class.

    Args:
        soup: Parsed document exposing ``find_all(name, attrs)``.
        h: Tag name to look for (e.g. ``'h1'``).
        t: CSS class the tag must have.

    Returns:
        list[str]: Unique texts of the first ``<a>`` inside each matching tag.
        Tags without a link are skipped. The order is not defined.
    """
    titles = set()
    for heading in soup.find_all(h, {"class": t}):
        link = heading.find("a")
        # A heading without an anchor has no title to extract; skip it
        # instead of failing on `None.text`.
        if link is not None:
            titles.add(link.text)
    return list(titles)


In [None]:
def get_titles(page, tstamp):
    """Extract headline texts from one archived page, choosing a parser by snapshot year.

    Args:
        page: HTML source of the page.
        tstamp: Snapshot timestamp string; its first four characters are the year.

    Returns:
        list[str]: Titles found by the first parsing strategy that yields any
        result (possibly an empty list).
    """
    soup = BeautifulSoup(page, "html.parser")
    snapshot_year = int(tstamp[:4])

    # After 2015: titles sit in h1 links, with h2 as fallback.
    if snapshot_year > 2015:
        titles = parse_htitle(soup, 'h1')
        if not titles:
            titles = parse_htitle(soup, 'h2')
        return titles

    # 2015 exactly: titles are h2 tags with the "bigTitle" class.
    if snapshot_year == 2015:
        return parse_htitle(soup, 'h2', 'bigTitle')

    # Up to 2014: try the two class-based parsers, then the heading layout
    # (the original comment attributes that layout to 2012 and 2013).
    titles = parse_01(soup)
    if not titles:
        titles = parse_02(soup)
    if not titles:
        titles = parse_htitle(soup, 'h1', 'bigTitle')
        titles += parse_htitle(soup, 'h2', 'smallTitle')
    return titles


In [None]:
def get_results(u, q=None, return_json=True, n=0, sleep_duration=5, max_tries=5):
    """GET a URL, retrying on non-200 responses.

    Args:
        u: URL to request.
        q: Optional query parameters passed to ``requests.get``.
        return_json: When true return the decoded JSON body, otherwise the
            response text.
        n: Number of retries already spent (callers normally leave this at 0).
        sleep_duration: Seconds to wait between attempts.
        max_tries: Maximum number of retries after the first attempt.

    Returns:
        The JSON payload or text of the first 200 response, or ``False`` when
        the retries are exhausted or the server answers 503.
    """
    # Local import: `time` is not among the notebook's top-level imports.
    import time

    attempt = n
    while True:
        page = requests.get(u, params=q)
        if page.status_code == 200:
            return page.json() if return_json else page.text

        # A 503 is treated as final; any other status is retried until the
        # retry budget is spent.
        if attempt >= max_tries or page.status_code == 503:
            print("{} status. Attempt {} of {}".format(page.status_code, attempt, max_tries))
            return False

        print('{} status. Next try after {}.'.format(page.status_code, sleep_duration))
        time.sleep(sleep_duration)
        attempt += 1
        

In [None]:
def get_items(q, items=None):
    """Accumulate ``response_items`` from a result page and all pages that follow it.

    Args:
        q: Decoded result page: a mapping with ``"response_items"`` and,
            optionally, ``"next_page"`` (URL of the following page).
        items: Optional list to extend in place. A fresh list is created when
            omitted, so results no longer leak from one call into the next.

    Returns:
        list: All collected items, or ``False`` when a page is malformed or a
        follow-up page could not be fetched.
    """
    if items is None:
        items = []

    page = q
    while True:
        try:
            page_items = page["response_items"]
            items += page_items
        except (KeyError, TypeError):
            # Not a result page (e.g. an error payload): show it and give up.
            pprint.pprint(page)
            return False

        # Stop at the last page or as soon as a page comes back empty.
        if "next_page" not in page or len(page_items) == 0:
            return items

        next_url = page["next_page"]
        page = get_results(u=next_url, return_json=True)
        if page is False:
            print('get_items', next_url)
            return False

    

### Execute

In [None]:
# Sites to query and the search endpoint used for every request.
sites = ["expresso.pt"]
query_base = 'http://arquivo.pt/textsearch'
# Fields requested for every search result.
item_keys = [
    "title",
    "originalURL",
    "linkToArchive",
    "linkToNoFrame",
    "tstamp",
    "contentLength",
    "linkToScreenshot",
]

# One time window per year: the first window only has an upper bound (end of
# `start`), followed by 23 full calendar years.
start = 2001
fromto = [{'to': "{}1231235959".format(start)}]
fromto.extend(
    {'from': "{}0101000000".format(year), 'to': "{}1231235959".format(year)}
    for year in range(start + 1, start + 24)
)


In [None]:
def extract_arquivo(query=None, query_base=query_base,
                    fromto=None,
                    item_keys=None, prettyprint='false',
                    scrape_titles=True,
                    get_sample=False, sample_dest="./",
                    save_result=True, result_dest="./"
                   ):
    """Query the search endpoint for one time window and store the snapshots found.

    Every result page is fetched, each item's ``linkToNoFrame`` page is
    downloaded and (optionally) scraped for titles, and the collected records
    are written to ``<result_dest>/<query>_<year>.json`` keyed by timestamp.

    Args:
        query: Site to look up (sent as ``versionHistory``).
        query_base: URL of the search endpoint.
        fromto: Mapping with the time window, e.g. ``{'from': ..., 'to': ...}``;
            the first four characters of ``'to'`` name the output file.
        item_keys: Result fields to request and to copy into each record.
        prettyprint: Value sent as the ``prettyPrint`` query parameter.
        scrape_titles: Parse titles from each page; otherwise a placeholder
            string is stored under ``'children'``.
        get_sample: Save the first successfully downloaded page as HTML.
        sample_dest: Directory for the sample page.
        save_result: Write the JSON result file.
        result_dest: Directory for the JSON result file.

    Returns:
        None. Failures to query or download are reported with ``print``.
    """
    # NOTE(review): a later cell defines another function that is also named
    # `get_titles`; this function needs the (page, tstamp) HTML parser.
    fromto = dict(fromto or {})
    item_keys = list(item_keys or [])
    query_params = dict(versionHistory=query,
                        fields=','.join(item_keys),
                        prettyPrint=prettyprint,
                        **fromto
                       )

    page = get_results(query_base, q=query_params)
    if not isinstance(page, dict) or 'response_items' not in page:
        print('failed to query', query_base, query_params)
        return

    # Follow `next_page` links until a page is empty, missing or malformed.
    items = []
    while page['response_items']:
        items += page['response_items']
        if 'next_page' not in page:
            break
        page = get_results(page['next_page'])
        if not isinstance(page, dict) or 'response_items' not in page:
            break

    curr_year = str(fromto.get('to', ''))[:4]
    results = {}
    sampled = False

    for item in tqdm(items, desc="{}".format(curr_year)):
        link = item.get("linkToNoFrame")
        if link is None:
            continue

        extract = get_results(link, return_json=False)
        if extract is False:
            print('failed to extract', link)
            continue

        curr_page = {}
        if scrape_titles:
            curr_page['children'] = get_titles(extract, item['tstamp'])
        else:
            curr_page['children'] = "Skipped title scraping of {} items.".format(len(items))

        # A field the API left out is stored as None instead of aborting the run.
        for k in item_keys:
            curr_page[k] = item.get(k)

        results[item["tstamp"]] = curr_page

        # Only a page that was actually downloaded can serve as the sample.
        if get_sample and not sampled:
            page_filename = Path(sample_dest) / "{}_{}.html".format(query, item['tstamp'])
            with open(page_filename, 'w', encoding='utf-8') as f:
                f.write(extract)
            sampled = True

    if save_result:
        filename = Path(result_dest) / "{}_{}.json".format(query, curr_year)
        with open(filename, "w", encoding='utf-8') as file:
            json.dump(results, file)
    else:
        print("Skipping save")


In [None]:
## Test on 2001: query the first time window only, skip title scraping,
## and write both the sample page and the result file to a scratch folder.
extract_arquivo(query=sites[0], 
                query_base=query_base,
                fromto=fromto[0],
                item_keys=item_keys, 
                prettyprint='false', 
                scrape_titles=False,
                get_sample=True, sample_dest="./_DELETE/",
                result_dest="./_DELETE/"
                )

In [None]:

# Run the extraction for every site and every yearly window.
# `fromto` is indexed by position (0..23), not by year, so it is iterated
# directly; each call receives the current site and window.
for query in sites:
    for ft in tqdm(fromto, desc="{}".format(query)):
        extract_arquivo(query=query,
                query_base=query_base,
                fromto=ft,
                item_keys=item_keys,
                prettyprint='false',
                # Titles must be scraped here: the JSON files are later read
                # back and each entry of 'children' becomes one document.
                scrape_titles=True,
                get_sample=True, sample_dest="./pages/html/",
                result_dest="./pages/"
                )

### Scrape elements


In [None]:
def get_titles(base_src=None, save_df_dest=None, return_df=False):
    """Flatten the saved per-year JSON files into one record per scraped title.

    NOTE(review): this rebinds the name ``get_titles`` that an earlier cell
    uses for the (page, tstamp) HTML parser; re-running the extraction after
    this cell would call this loader instead.

    Args:
        base_src: Directory holding the ``*.json`` result files (with or
            without a trailing separator).
        save_df_dest: Optional CSV path; when given, the table is saved there.
        return_df: Return a ``DataFrame`` instead of a list of dicts.

    Returns:
        list[dict] | pandas.DataFrame: One row per title, carrying the
        snapshot's fields (except ``children`` and ``contentLength``) plus the
        title under ``'text'``.
    """
    exclude = {'children', 'contentLength'}
    titles = []

    # Sorted so that the row order does not depend on directory listing order.
    json_files = sorted(
        path for path in Path(base_src).iterdir()
        if path.is_file() and path.name.endswith('.json')
    )

    for path in json_files:
        with open(path, encoding='utf-8') as json_file:
            file_data = json.load(json_file)

        for snap in file_data.values():
            children = snap['children']
            # Files written with title scraping disabled hold a placeholder
            # string here; iterating it would yield single characters.
            if isinstance(children, str):
                continue
            meta = {k: v for k, v in snap.items() if k not in exclude}
            for title in children:
                titles.append({**meta, 'text': title})

    df = pd.DataFrame(titles)

    if save_df_dest is not None:
        print('Saving df {} to {}'.format(df.shape, save_df_dest))
        df.to_csv(save_df_dest, index=False)

    if return_df:
        return df
    return titles

In [None]:
# Load every saved result file under ./pages/ into one table (one row per title).
df_titles = get_titles("./pages/", return_df=True)
df_titles.head()

## Preprocess Data

In [None]:
def preprocess(sentence, 
               basic=False,
               stop_list=[]
              ):
    """Normalise one piece of text.

    Args:
        sentence: Value to clean; it is converted with ``str()`` first.
        basic: When ``True`` only the ``{html}`` marker and surrounding
            whitespace are removed.
        stop_list: Lower-case words to drop in the full cleaning mode.

    Returns:
        str: In basic mode the trimmed text. Otherwise the text without HTML
        tags, URLs and digits, tokenised on word characters, with stop words
        and tokens of one or two characters removed, joined by single spaces.
    """
    text = str(sentence).replace('{html}', "").strip()
    if basic == True:
        return text

    # Remove tags, then URLs, then digit runs, and finally collapse whitespace.
    text = re.sub('<.*?>', '', text)
    for pattern, replacement in ((r'http\S+', ''), ('[0-9]+', ''), (r"\s+", " ")):
        text = re.sub(pattern, replacement, text)

    kept = [
        token
        for token in RegexpTokenizer(r'\w+').tokenize(text)
        if token.lower() not in stop_list and len(token) > 2
    ]
    return " ".join(kept).strip()
        


In [None]:
def run_preprocessing(data, prep_args=None):
    """Clean the title texts and derive date columns.

    Args:
        data: Table with a ``'text'`` column and a ``'tstamp'`` column holding
            14-digit ``YYYYMMDDHHMMSS`` snapshot timestamps.
        prep_args: Optional keyword arguments forwarded to ``preprocess`` for
            the full cleaning pass (e.g. ``stop_list``).

    Returns:
        pandas.DataFrame: A copy of ``data`` with ``'text'`` trimmed and the
        added columns ``'prep'``, ``'datetime'``, ``'year'`` and ``'month'``;
        rows whose cleaned text has fewer than two words are dropped, the rest
        are sorted by date and re-indexed from 0.
    """
    prep_args = {} if prep_args is None else prep_args
    df = data.copy()

    df['text'] = df['text'].apply(lambda x: preprocess(x, basic=True))
    df['prep'] = df['text'].apply(lambda x: preprocess(x, **prep_args))

    ## Remove single-word text. Empty strings split into a single element,
    ## so this also removes empty text and no separate pass is needed.
    df = df.loc[df['prep'].apply(lambda x: len(x.split(' ')) > 1)].copy()

    # Parse with an explicit format: left to inference, integer timestamps
    # would be read as nanoseconds since the epoch.
    df['datetime'] = pd.to_datetime(df['tstamp'].astype(str), format='%Y%m%d%H%M%S')
    df['year'] = df['datetime'].dt.year
    df['month'] = df['datetime'].dt.month

    # Stable sort keeps titles that share a timestamp in their input order.
    df.sort_values(by='datetime', ascending=True, inplace=True, kind='stable')

    ## Reset index
    df.reset_index(drop=True, inplace=True)

    return df

def run_cleaning(data):
    """Drop duplicated and boilerplate titles.

    Args:
        data: Table with a ``'prep'`` column of cleaned title strings.

    Returns:
        pandas.DataFrame: A new table without case-insensitive duplicates of
        ``'prep'`` (first occurrence kept), without rows starting with
        "primeira página" or containing "página Expresso", re-indexed from 0.
    """
    cleaned = data.copy()

    # Temporary lower-cased key used only for de-duplication.
    cleaned['lower'] = cleaned['prep'].map(lambda text: text.lower())
    cleaned = cleaned.drop_duplicates(subset=['lower'], keep='first').drop(columns=['lower'])

    is_front_page = cleaned['prep'].str.startswith("primeira página")
    mentions_page = cleaned['prep'].str.contains("página Expresso")
    cleaned = cleaned.loc[~(is_front_page | mentions_page)]

    return cleaned.reset_index(drop=True)



In [None]:
# Switch read by the next cell: when true the preprocessing is (re)computed.
RUN_PREP = True


In [None]:
# Location of the preprocessed table on disk.
prep_path = './preprocessed.csv'

if RUN_PREP:
    # `df_titles` is the table of scraped titles loaded from ./pages/ above.
    df_prep = run_preprocessing(df_titles, prep_args=dict(stop_list=stopwords_pt))

    df = run_cleaning(df_prep)
    df.to_csv(prep_path, index=False)
else:
    # Reuse the result of a previous run instead of failing on an undefined `df`.
    df = pd.read_csv(prep_path)


## Get document embeddings

### Functions

In [None]:
def get_pretty_now():
    """Return the current local time formatted as ``YYYYMMDD_HHMM``."""
    now = datetime.datetime.now()
    return format(now, "%Y%m%d_%H%M")
    

In [None]:
# Defaults for get_doc_vectors: compute embeddings, and save them to CSV.
RUN_EMBED = True
SAVE_EMBED = True

In [None]:
def get_doc_vectors(fit=RUN_EMBED,
                    save=SAVE_EMBED,
                    model=None,
                    docs=None,
                    vector_path=None):
    """Embed documents with ``model`` or load previously saved vectors.

    Args:
        fit: When true, encode ``docs`` with ``model``; when false, read the
            vectors stored at ``vector_path``.
        save: When fitting, also write the vectors to ``vector_path`` as CSV.
        model: Object with an ``encode(docs, show_progress_bar=...)`` method.
        docs: Documents to embed (list or array of strings).
        vector_path: CSV file to write to or read from. When fitting with
            ``save`` and no path is given, a timestamped file in the current
            directory is used.

    Returns:
        The array returned by ``model.encode`` when fitting, a
        ``pandas.DataFrame`` when loading, or ``None`` when a required input
        is missing (a message is printed).
    """
    if not fit:
        # Loading needs neither a model nor documents, only the file.
        if vector_path is None:
            print("No vector path specified, nothing to load")
            return
        vectors = pd.read_csv(vector_path, header=None)
        print("Use saved vectors")
        return vectors

    # `is None` rather than `== None`: comparing an array with `==` yields an
    # array whose truth value is ambiguous.
    if model is None:
        print("No model provided")
        return
    if docs is None:
        print("No data provided")
        return

    vectors = model.encode(docs, show_progress_bar=True)
    print('Embed:', datetime.datetime.now())

    if save:
        if vector_path is None:
            vector_path = "./sbert_{}.csv".format(get_pretty_now())
            print("No vector path specified, using", vector_path)
        np.savetxt(vector_path, vectors,  delimiter = ",")
        print('Saved:', vector_path, datetime.datetime.now())

    return vectors
        

In [None]:
def get_vectors_chunks(docs=None, chunksize=10000, csv_dest=None, model=None, return_df=True):
    """Embed documents chunk by chunk, saving or loading one CSV per chunk.

    Args:
        docs: Sliceable collection of documents.
        chunksize: Number of documents per chunk.
        csv_dest: Path template with one ``{}`` placeholder that receives the
            start offset of each chunk.
        model: Embedding model forwarded to ``get_doc_vectors``.
        return_df: When true, return all chunk vectors stacked in one table.

    Returns:
        pandas.DataFrame | None: The stacked vectors when ``return_df`` is
        true; ``None`` otherwise or when an error occurred (details are
        printed).
    """
    try:
        chunk_frames = []

        # Stepping by `chunksize` yields exactly the non-empty chunks, also
        # when the number of documents is a multiple of the chunk size.
        chunk_starts = range(0, len(docs), chunksize)
        for start_chunk in tqdm(chunk_starts, desc="{}".format(get_pretty_now())):
            docs_chunk = docs[start_chunk:start_chunk + chunksize]

            vectors_chunk = get_doc_vectors(fit=RUN_EMBED,
                                            save=SAVE_EMBED,
                                            vector_path=csv_dest.format(start_chunk),
                                            model=model,
                                            docs=docs_chunk
                                           )

            if return_df:
                chunk_frames.append(pd.DataFrame(vectors_chunk))

        if return_df:
            if not chunk_frames:
                return pd.DataFrame()
            # One concatenation at the end instead of re-copying the growing
            # table once per chunk.
            return pd.concat(chunk_frames, axis=0, ignore_index=True)
    except Exception as error:
        print("Failed to run get_vectors. Check parameters:")
        print('docs:\t\t', docs, '\nchunksize:\t', chunksize,
              '\ncsv_dest:\t', csv_dest, '\nmodel:\t\t', model,
              '\nreturn_df:\t', return_df
             )
        print(error)
    return None
        
        
            
        
    
        

### Get vectors of sample

In [None]:
# ## Sentence embeddings of sample data
# expresso_vectors = get_doc_vectors(fit=RUN_EMBED, 
#                                    save=SAVE_EMBED, 
#                                    vector_path="./generated/vectors/doc/expresso_sample.csv", 
#                                    model=sbert_model, docs=df_sample['prep'])


In [None]:
# %%time

# sample_vectors = get_vectors_chunks(docs=docs_sample,
#                                       csv_dest="./generated/vectors/doc/sample/expresso_{}.csv",
#                                       model=sbert_model
#                                      )
# sample_vectors

### Get all vectors

#### Execute

In [None]:
# Cleaned texts (embedded and used for topic modelling) and the trimmed originals.
docs_full = df['prep'].values
docs_orig = df['text'].values


In [None]:
%%time

# Embed all cleaned texts in chunks; each chunk is written to its own CSV,
# named after the chunk's start offset.
expresso_vectors = get_vectors_chunks(docs=docs_full,
                                      csv_dest="./generated/vectors/doc/expresso_vectors_{}.csv",
                                      model=sbert_model
                                     )
print(expresso_vectors.shape)
expresso_vectors.head(3)

In [None]:
# Plain array of the document embeddings, used as input for UMAP and BERTopic.
vectors_full = expresso_vectors.values

In [None]:
# Display the dimensions of the embedding table.
expresso_vectors.shape

## Reduce dimensionality

In [None]:
# Reload the preprocessed table from disk and preview it.
df_data = pd.read_csv(prep_path)
print(df_data.shape)
df_data.head()


### Setup viz functions

In [None]:
# Style for uncoloured points: faint black dots without an outline.
scatter_k = {
    'color': 'k',
    'alpha': .1,
    'marker': '.',
    'edgecolor': 'none',
}

# Style for coloured points: semi-transparent dots drawn with the "Spectral" palette.
scatter_default = {
    'alpha': .5,
    'palette': 'Spectral',
    'marker': '.',
    'edgecolor': 'none',
}

In [None]:
def clean_ax(ax):
    """Blank the axis labels and tick labels, set an equal aspect ratio and return ``ax``."""
    label_setters = (
        (ax.set_xlabel, ax.set_xticklabels),
        (ax.set_ylabel, ax.set_yticklabels),
    )
    for set_label, set_ticklabels in label_setters:
        set_label('')
        set_ticklabels([])

    ax.set_aspect('equal')
    return ax

    

def plot_topics(topics_df=None,
                umap_xy=None,
                scatter_args=scatter_default,
                scatter_out=scatter_k,
                figsize=(12,12)
                ):
    """Scatter-plot the 2-D projection with points coloured by topic.

    Args:
        topics_df: Table with a ``'Topic'`` id column and a ``'Name'`` column
            (underscores in names are shown as spaces).
        umap_xy: Two-column table of coordinates, row-aligned with ``topics_df``.
        scatter_args: Keyword arguments for the points that have a topic.
        scatter_out: Keyword arguments for the points with topic id -1.
        figsize: Figure size passed to ``plt.subplots``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    points = umap_xy.copy()
    points.columns = ['X', 'Y']
    points['TopicsID'] = topics_df['Topic']
    points['Topics'] = topics_df['Name'].apply(lambda name: name.replace('_', ' '))

    is_outlier = points['TopicsID'] == -1

    fig, ax = plt.subplots(figsize=figsize)

    # Outliers first so that the topic points are drawn on top of them.
    sns.scatterplot(points.loc[is_outlier],
                    x='X', y='Y',
                    ax=ax,
                    **scatter_out)
    topic_axes = sns.scatterplot(points.loc[~is_outlier],
                                 x='X', y='Y',
                                 hue='Topics',
                                 ax=ax,
                                 **scatter_args)

    # Legend outside the plot area, to the right, with enlarged markers.
    ax.legend(loc='center left',
              bbox_to_anchor=(1, 0.5),
              markerscale=2,
             )

    # Legend markers are made fully opaque.
    for handle in topic_axes.legend_.legend_handles:
        handle.set_alpha(1)

    clean_ax(ax)
    plt.show()




In [None]:
def plot_umap(umap_xy=None, 
              save=False, save_path=None, 
              figsize=(12,12), ax=None,
              title=None,
              scatter_args=scatter_k
             ):
    """Scatter-plot a 2-D projection.

    Args:
        umap_xy: Two-column array or table of coordinates.
        save: Save the figure to ``save_path``.
        save_path: Target file; a timestamped PNG in the current directory is
            used when omitted.
        figsize: Size of the figure created when ``ax`` is not given.
        ax: Existing axes to draw on; a new figure is created when omitted.
        title: Optional axes title.
        scatter_args: Keyword arguments forwarded to ``sns.scatterplot``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    u_df = pd.DataFrame(umap_xy, columns=['X','Y'], )

    if ax is None:
        fig, ax = plt.subplots(figsize=figsize)
    else:
        # Needed for saving when the caller supplied the axes.
        fig = ax.figure

    sns.scatterplot(u_df, ax=ax,
                    x='X', y='Y',
                    **scatter_args)

    ax = clean_ax(ax)
    if title is not None:
        ax.set_title(title)

    if save:
        if save_path is None:
            save_path = "./fig_umap_{}.png".format(get_pretty_now())
        fig.savefig(save_path)

    plt.show()

### Get UMAP 2D

In [None]:
# UMAP settings for the 2-D projection: cosine metric, 5 neighbours, min_dist 0.1.
# NOTE(review): no `random_state` is passed although RANDOM_STATE is defined;
# presumably the layout differs between runs — confirm whether that is intended.
umap_args_2 = dict(n_components=2, 
                  min_dist=0.1, 
                  metric='cosine', 
                  n_neighbors=5,
                  verbose=True
                  )

umap_model_2     = umap.UMAP(**umap_args_2)        
# Despite its name this holds the object returned by `fit`; the coordinates
# are read from its `.embedding_` attribute in the following cells.
umap_embedding_2 = umap_model_2.fit(vectors_full)

In [None]:
# Display the object returned by the 2-D UMAP fit.
umap_embedding_2

In [None]:
# 2-D coordinates as a table.
precomputed_2d_df = pd.DataFrame(umap_embedding_2.embedding_)

In [None]:
# Plot the 2-D projection with the default point style.
plot_umap(umap_xy=umap_embedding_2.embedding_)

In [None]:
# Save the 2-D coordinates to CSV (header row kept, no index column).
full_2d_path = "./generated/vectors/umap/umap_xy.csv"

precomputed_2d_df.to_csv(full_2d_path, index=False)


### Get UMAP 5D

In [None]:
# UMAP settings for the 5-D projection: same as the 2-D settings except for
# n_components. The 5-D vectors are the ones clustered further down.
# NOTE(review): no `random_state` is passed although RANDOM_STATE is defined —
# confirm whether run-to-run variation is intended.
umap_args_5 = dict(n_components=5, 
                  min_dist=0.1, 
                  metric='cosine', 
                  n_neighbors=5,
                  verbose=True
                  )

umap_model_5     = umap.UMAP(**umap_args_5)        
# Holds the object returned by `fit`; coordinates are read from `.embedding_`.
umap_embedding_5 = umap_model_5.fit(vectors_full)


In [None]:
# 5-D coordinates as a table.
precomputed_5d_df = pd.DataFrame(umap_embedding_5.embedding_)


In [None]:
# Save the 5-D coordinates to CSV (header row kept, no index column).
full_5d_path = "./generated/vectors/umap/umap_5.csv"

precomputed_5d_df.to_csv(full_5d_path, index=False)


## Topic Modelling

In [None]:
class PrecomputedUMAP:
    """Stand-in dimensionality reducer that returns pre-calculated reduced embeddings.

    It exposes the ``fit`` / ``transform`` pair of a reducer but performs no
    computation: ``transform`` always returns the embeddings given at
    construction, whatever input it receives.
    """

    def __init__(self, reduced_embeddings):
        # Already-reduced vectors, returned unchanged by `transform`.
        self.reduced_embeddings = reduced_embeddings

    def fit(self, X, y=None):
        """Do nothing and return ``self``; ``y`` is accepted so that supervised calls also work."""
        return self

    def transform(self, X):
        """Return the stored reduced embeddings, ignoring ``X``."""
        return self.reduced_embeddings

### Get UMAP Vectors

In [None]:
# `precomputed_2d` / `precomputed_5d` are the names used from here on; bind
# them to the coordinate tables built in the UMAP cells above.
precomputed_2d = precomputed_2d_df
precomputed_5d = precomputed_5d_df

umap_vectors_2 = precomputed_2d.values
umap_vectors_5 = precomputed_5d.values


### Setup for BERTopic

#### Functions

In [None]:

# HDBSCAN settings.
# NOTE(review): not referenced by any cell visible here, where clustering is
# done with a Gaussian mixture — confirm whether they are still needed.
hdb_args = dict(
    min_cluster_size         = 800,
    min_samples              = 100,
    metric                   = 'euclidean',
    cluster_selection_method = 'leaf',
    gen_min_span_tree        = True, 
    prediction_data          = True
)

# Bag-of-words settings: unigrams and bigrams, terms present in more than 30%
# of the documents are ignored, Portuguese stop words are removed.
countvec_args = dict(
    ngram_range       = (1, 2),
    # None (not False) is the documented value for "keep accents".
    strip_accents     = None,
    max_df            = .3,
    stop_words        = stopwords_pt, 
)


# Checkpoint name used to build the sentence-embedding model.
sentence_args = dbml_model_name

# Class-based TF-IDF settings.
ctf_args = dict(
    reduce_frequent_words = True
    )



In [None]:
%%time
# Gaussian mixture with 20 full-covariance components, seeded with RANDOM_STATE.
clust_model = GaussianMixture(
    n_components=20, 
    covariance_type='full',
    random_state=RANDOM_STATE
)

# Fit on the 5-D UMAP vectors: one component label per row ...
gm_clust_labels = clust_model.fit_predict(umap_vectors_5)
# ... and the per-component membership probabilities of each row.
gm_clust_probs = clust_model.predict_proba(umap_vectors_5)


#### Models

In [None]:
class BaseCluster:
    """Pass-through cluster model that performs no clustering of its own.

    ``fit`` only records the labels handed to it through ``y`` (``None`` when
    no labels are given) and ``transform`` returns its input unchanged.

    In this notebook an instance is passed to BERTopic as ``hdbscan_model``
    and the cluster labels computed beforehand are supplied as ``y``:

    ```python
    topic_model = BERTopic(hdbscan_model=BaseCluster())
    topic_model.fit_transform(docs, embeddings, y=labels)
    ```
    """

    def fit(self, X, y=None):
        """Store ``y`` as ``labels_`` and return ``self``; ``X`` is not used."""
        self.labels_ = y
        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Return ``X`` unchanged."""
        return X

In [None]:
## Default models: the components handed to BERTopic further down.

# Bag-of-words vectoriser built from `countvec_args`.
vectorizer_model = CountVectorizer(**countvec_args)

# Sentence-embedding model built from the checkpoint name in `sentence_args`.
sentence_model = SentenceTransformer(sentence_args)

# Class-based TF-IDF transformer built from `ctf_args`.
ctfidf_model = ClassTfidfTransformer(**ctf_args)

# Reducer stand-in that returns the 5-D UMAP vectors computed earlier.
precomputed_umap_model = PrecomputedUMAP(umap_vectors_5)


# Topic representation models.
pos_model = PartOfSpeech("pt_core_news_sm")
mmr_model = MaximalMarginalRelevance(diversity=0.75)
kbi_model = KeyBERTInspired(top_n_words=30)

# NOTE(review): this dict and aspect_model2 / aspect_model4 below are not
# referenced by any cell visible here.
main_representation_model = {
    "MMR": mmr_model,
    "POS": pos_model
}


aspect_model1 = mmr_model
aspect_model2 = [pos_model]
aspect_model3 = [kbi_model]
aspect_model4 = [pos_model, mmr_model]

# Representations actually passed to BERTopic: MMR as the main one plus a
# KeyBERT-inspired aspect stored under "AspectKBI".
representation_model = {
   "Main":     aspect_model1,
   "AspectKBI":  aspect_model3,
}



In [None]:
# Keyword arguments for BERTopic. Clustering is delegated to BaseCluster (the
# labels are supplied as `y` at fit time) and dimensionality reduction to the
# precomputed 5-D UMAP vectors.
tbert_args = dict(
    language             = "multilingual",
    verbose              = True,
    hdbscan_model        = BaseCluster(), 
    vectorizer_model     = vectorizer_model,
    embedding_model      = sentence_model,
    umap_model           = precomputed_umap_model,
    ctfidf_model         = ctfidf_model,
    representation_model = representation_model
    )




### Execute

In [None]:
# Number of colours taken from the palette in the next cell; the Gaussian
# mixtures in this notebook hard-code the same value as n_components=20.
n_clusters = 20

In [None]:
# Replace the default palette with the first `n_clusters` colours of a categorical palette.
# NOTE(review): `cc` is not imported anywhere in the visible part of this
# notebook; presumably it is colorcet (`import colorcet as cc`) — confirm and
# add the import, otherwise this line raises NameError.
scatter_default['palette'] = cc.b_glasbey_category10[:n_clusters]

In [None]:
%%time 

# NOTE(review): this re-creates `clust_model` but does not fit or use it in
# this cell; the labels passed below come from the mixture fitted earlier.
clust_model = GaussianMixture(
    n_components=20, 
    covariance_type='full',
    random_state=RANDOM_STATE
)

# Build the topic model on the precomputed document embeddings, supplying the
# Gaussian-mixture labels as `y`.
topic_model   = BERTopic(**tbert_args)
topics, probs = topic_model.fit_transform(docs_full, vectors_full, y=gm_clust_labels)

# Per-document topic assignments and the per-topic summary table.
topics_df     = topic_model.get_document_info(docs_full)
topics_info   = topic_model.get_topic_info()


In [None]:
# Preview the per-document topic table.
topics_df.head()

In [None]:
# Plot the documents in 2-D coloured by topic; the same style is passed for
# points with and without a topic.
plot_topics(topics_df, umap_xy=precomputed_2d, 
            scatter_args=scatter_default, 
            scatter_out=scatter_default
           )


#### Get LLM topic labels

In [None]:
# Topic id -> list of entries whose first element is the keyword
# (as indexed by the template-building cell below).
all_topics = topic_model.get_topics()


In [None]:
# Per-topic keywords and representative documents, used to build the labelling prompt.
topics_template = {}

# Iterate over the topic ids actually present in the table instead of assuming
# they are exactly 0..n-1.
for ti in topics_info['Topic']:
    # -1 is the outlier group (plotted separately above); it gets no label.
    if ti == -1:
        continue

    curr_topic = topics_info.loc[topics_info['Topic']==ti]

    # Main keywords first, then the KeyBERT-inspired ones not already listed.
    # dict.fromkeys removes duplicates while keeping this order.
    top_kw = [a[0] for a in all_topics[ti]]
    curr_kw = [kwi for kwi in curr_topic['AspectKBI'].to_list()[0] if len(kwi) > 1]
    top_kw = list(dict.fromkeys(top_kw + curr_kw))

    curr_docs = curr_topic['Representative_Docs'].to_list()[0]

    topics_template['Topic {}'.format(ti)] = {
        'keywords' : top_kw[:10],
        'docs'     : curr_docs,
    }

In [None]:
# Instruction header of the topic-labelling prompt; the next cell appends one
# section per topic to it.
prompt_topic = """
I have several topics that each contain a list of keywords, and examples of relevant documents.
I want you to act as a helpful topic labeller for a research project by generating a topic label 
based on the provided keywords and documents. 

Based on the information about each topic, suggest a topic label in European Portuguese 
using at most 5 words, and provide a short sentence summarizing the topic, 
in the following json format:

"<topic number>" : {
    "label": <topic label>,
    "summary": <topic summary>
}

These are the topics I want you to label:

"""

In [None]:

# Append one section per topic (heading, keywords, bulleted documents) to the
# labelling prompt. Sections are collected first and joined once.
topic_sections = [
    "**{}**\nKeywords: {}\nDocuments: \n- {}\n\n".format(
        ti,
        ', '.join(topic['keywords']),
        '\n- '.join(topic['docs']),
    )
    for ti, topic in topics_template.items()
]
prompt_topic += "".join(topic_sections)

# Make sure the output folder exists so the write cannot fail on a fresh
# checkout.
prompts_path = Path("./generated/prompts_topic.txt")
prompts_path.parent.mkdir(parents=True, exist_ok=True)

# Append mode is kept from the original, so earlier prompts stay in the file.
# The context manager closes the handle even if the write raises, and the
# explicit UTF-8 encoding keeps the accented Portuguese text intact on
# platforms whose default encoding is not UTF-8.
with prompts_path.open("a", encoding="utf-8") as f:
    f.write(prompt_topic)

In [None]:
# Follow-up prompt asking the LLM which of the labelled topics should be
# merged, and in which JSON shape to answer.
# The second sentence previously read "combine more two or three topics",
# which is garbled; it now states plainly that groups may hold two, three or
# more topics.
prompt_merge = """
Based on your answers, suggest which topics would make sense to combine together due to their similarity. 
You can combine two, three or more topics into the same group. For the topics that should be combined, 
suggest, for each combination, a topic label that describes this merged topic, as well as a short summary 
for this merged topic. Suggest this topic label in European Portuguese using at most 5 words, and provide a short 
sentence summarizing the topic, in the following json format:

"Merged Topic <topic number>" : {
    "label": <combined topic label>,
    "source": <original topic numbers>,
    "summary": <combined topic summary>
}
"""

In [None]:
# Extract the hierarchical topics and their representations from the fitted model.
hierarchical_topics = topic_model.hierarchical_topics(docs_full)

# Visualize that hierarchy; the returned figure is the cell's output.
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)


In [None]:
# Work on a copy so the label columns added later do not modify topics_df.
topics_df_merged = topics_df.copy()
topics_df_merged

In [None]:
# Hand-maintained labelling data.
#   'Topics': topic id (as a string) -> label and one-sentence summary.
#   'Merged': merged-group name -> label, list of source topic ids, summary.
# NOTE(review): presumably pasted from the LLM answers to the two prompts — confirm.
manual_update = {
    'Topics': {
        "0": {
            "label": "Política Nacional",
            "summary": "Discursos e movimentações políticas de figuras como Pedro Nuno Santos e Luís Montenegro.",
        },
        "1": {
            "label": "Conflitos Globais",
            "summary": "Aborda eventos internacionais como ataques terroristas, questões geopolíticas e catástrofes.",
        },
        "2": {
            "label": "Música e Festivais",
            "summary": "Explora lançamentos de álbuns, artistas e festivais de música como Rock in Rio e Primavera Sound.",
        },
        "3": {
            "label": "Economia e Bem-Estar",
            "summary": "Trata de temas econômicos e sociais como preços de imóveis, subsídios e saúde pública.",
        },
        "4": {
            "label": "Desporto Internacional",
            "summary": "Cobre notícias relacionadas a figuras e eventos esportivos internacionais como Cristiano Ronaldo e a Liga dos Campeões.",
        },
        "5": {
            "label": "Cultura e Tecnologia",
            "summary": "Explora tópicos que envolvem realidade virtual, cultura contemporânea e avanços tecnológicos.",
        },
        "6": {
            "label": "Situação Pandêmica",
            "summary": "Foca em atualizações sobre a pandemia COVID-19 em Portugal, incluindo casos, mortes e recuperações.",
        },
        "7": {
            "label": "Energia e Finanças",
            "summary": "Aborda notícias sobre o mercado financeiro e energético, incluindo ações de empresas e políticas governamentais.",
        },
        "8": {
            "label": "Alertas Meteorológicos",
            "summary": "Informa sobre avisos meteorológicos, como agitação marítima e precipitação, em várias regiões.",
        },
        "9": {
            "label": "Justiça e Política Nacional",
            "summary": "Envolve temas de justiça, política e figuras políticas em Portugal.",
        },
        "10": {
            "label": "Vacinação e Saúde Pública",
            "summary": "Diz respeito a assuntos relacionados à vacinação contra a COVID-19 e políticas de saúde pública.",
        },
        "11": {
            "label": "Política Nacional e Econômica",
            "summary": "Aborda políticas governamentais, privatizações e questões econômicas em Portugal.",
        },
        "12": {
            "label": "Questões Internacionais",
            "summary": "Trata de assuntos globais como imigração, política internacional e líderes políticos.",
        },
        "13": {
            "label": "Justiça e Política Nacional",
            "summary": "Explora desenvolvimentos legais e políticos em Portugal, incluindo casos judiciais e ações do presidente.",
        },
        "14": {
            "label": "Política Internacional",
            "summary": "Envolve notícias sobre políticos e eventos internacionais, como eleições nos EUA e movimentos geopolíticos.",
        },
        "15": {
            "label": "Educação e Saúde",
            "summary": "Trata de assuntos relacionados à educação, saúde e serviços públicos em Portugal.",
        },
        "16": {
            "label": "Política Britânica e Europeia",
            "summary": "Aborda questões políticas do Reino Unido e da União Europeia, como o Brexit e eleições.",
        },
        "17": {
            "label": "Conflitos e Geopolítica",
            "summary": "Explora eventos e situações de conflito em diferentes partes do mundo, incluindo questões nucleares e refugiados.",
        },
        "18": {
            "label": "Política Internacional",
            "summary": "Cobre notícias relacionadas a movimentos políticos e protestos em diferentes regiões do mundo.",
        },
        "19": {
            "label": "Situação da Pandemia",
            "summary": "Fornece informações sobre o estado da pandemia COVID-19 em Portugal, incluindo casos ativos, internamentos e surtos.",
        },
    },
    'Merged': {
        "Merged Topic 1": {
            "label": "Política Nacional e Internacional",
            "source": [0, 11],
            "summary": "Inclui notícias e desenvolvimentos políticos tanto em Portugal quanto no cenário internacional.",
        },
        "Merged Topic 2": {
            "label": "Cultura, Tecnologia e Saúde Pública",
            "source": [5, 10],
            "summary": "Aborda temas que vão desde avanços tecnológicos e cultura contemporânea até questões de saúde pública, incluindo vacinação contra a COVID-19.",
        },
        "Merged Topic 3": {
            "label": "Conflitos Globais e Geopolítica",
            "source": [1, 17],
            "summary": "Explora eventos e situações de conflito em diferentes partes do mundo, incluindo questões geopolíticas e crises globais.",
        },
        "Merged Topic 4": {
            "label": "Economia, Bem-Estar e Finanças",
            "source": [3, 7],
            "summary": "Engloba notícias econômicas, questões sociais e atualizações financeiras, abrangendo aspectos como preços de imóveis, saúde pública e mercado financeiro.",
        },
        "Merged Topic 5": {
            "label": "Justiça e Política Nacional",
            "source": [9, 13],
            "summary": "Explora desenvolvimentos legais e políticos em Portugal, incluindo casos judiciais, ações do presidente e questões políticas.",
        },
    },
}


In [None]:
# Start every row with a placeholder label; rows of labelled topics are
# overwritten afterwards.
topics_df_merged['TopicLabel1'] = "Unlabelled"

In [None]:
# Copy each hand-written topic label into TopicLabel1 for the rows that
# belong to that topic, echoing the mapping as it is applied.
for topic_key, topic_meta in manual_update['Topics'].items():
    topic_label = topic_meta['label']
    print(topic_key, topic_label)

    # The dictionary keys are strings, so cast before comparing with `Topic`.
    in_topic = topics_df_merged['Topic'] == int(topic_key)
    topics_df_merged.loc[in_topic, 'TopicLabel1'] = topic_label




In [None]:
# Seed the second label column with the first-level labels; merged groups
# overwrite it afterwards.
topics_df_merged['TopicLabel2'] = topics_df_merged['TopicLabel1']



In [None]:
# For every merged group, overwrite TopicLabel2 of all its source topics
# with the group's label, echoing each (group, source, label) triple.
for merged_key, merged_meta in manual_update['Merged'].items():
    merged_label = merged_meta['label']
    source_ids = merged_meta['source']

    for source_id in source_ids:
        print(merged_key, source_id, merged_label)

    # One mask covering all source topics of this group.
    in_group = topics_df_merged['Topic'].isin([int(source_id) for source_id in source_ids])
    topics_df_merged.loc[in_group, 'TopicLabel2'] = merged_label


In [None]:
# Number of documents per first-level label.
topics_df_merged['TopicLabel1'].value_counts()


In [None]:
# Plotting frame: the labelled table plus the precomputed X/Y columns.
df_ = topics_df_merged.copy()
df_[['X', 'Y']] = precomputed_2d

# Scatter plot with the hue taken from the unmerged labels.
scatterplot = sns.scatterplot(
    data=df_,
    x='X',
    y='Y',
    hue='TopicLabel1',
    **scatter_default,
)


In [None]:
# Plotting frame: the labelled table plus the precomputed X/Y columns.
df_ = topics_df_merged.copy()
df_[['X', 'Y']] = precomputed_2d

# Scatter plot with the hue taken from the merged labels.
scatterplot = sns.scatterplot(
    data=df_,
    x='X',
    y='Y',
    hue='TopicLabel2',
    **scatter_default,
)


In [None]:
# Display the plotting frame (labels plus X/Y columns).
df_

### Save final labels

In [None]:
# Load the CSV at `prep_path`; the final labels are attached to this frame
# in the following cells.
df_full_final = pd.read_csv(prep_path)
print(df_full_final.shape)
df_full_final.head(3)

In [None]:
# Attach the precomputed coordinates as columns X and Y.
# NOTE(review): assumes precomputed_2d has one row per row of df_full_final, in the same order — confirm.
df_full_final[['X','Y']] = precomputed_2d

In [None]:
# Copy both label levels and the topic id onto the final frame.
# NOTE(review): relies on df_full_final and topics_df_merged having matching row indices — confirm.
df_full_final[['Topic1', 'Topic2', 'TopicID']] = topics_df_merged[['TopicLabel1', 'TopicLabel2', 'Topic']]
df_full_final

In [None]:
# Persist the labelled dataset without the index column.
df_full_final.to_csv("./topics_final.csv", index=False)


## Visualization

In [None]:
# HTML used for the hover tooltip; handed to datamapplot as
# `hover_text_html_template`. `{hover_text}` and `{year}` are placeholders
# (presumably substituted per point by datamapplot — confirm).
hover_text_template = """
<div>
    <p><strong>{hover_text}</strong></p>
    <p>{year}</p>
</div>
"""

In [None]:
# Client-side script handed to datamapplot as `custom_js`. It
#   * loads the "Rozha One" web font and styles title/subtitle with it,
#   * moves the search box into the #sidebar element,
#   * re-selects points whenever one of the two year sliders moves, and
#   * defines onClickDetails(), which fills or hides the #text-details panel.
# The slider handlers read the value through the standard `event.target`
# (the previous `event.srcElement` is a deprecated alias), and the unused
# minYear/maxYear locals of the min-slider handler were dropped.
custom_js = """    
    /** Add custom font **/
    let fontLink = document.createElement("link");
    fontLink.rel = "stylesheet";
    fontLink.href ="https://fonts.googleapis.com/css2?family=Rozha+One";
    document.getElementsByTagName("head")[0].appendChild(fontLink);

    /** Add sidebar **/

    let sidebar = document.querySelector('#sidebar');
    let searchBox = document.querySelector('#search-container')
    let rangeContainer = document.querySelector('#year-range')
    let minSlider = document.querySelector('#rangefrom');
    let maxSlider = document.querySelector('#rangeto');

    let searchLabelDiv = document.createElement("div");
    searchLabelDiv.id = "search-label"
    searchLabelDiv.append(document.createTextNode("Consulta de Pesquisa:"))
    
    
    sidebar.prepend(searchBox);
    searchBox.style.left = "unset";
    sidebar.prepend(searchLabelDiv);

    /** Style title and subtitle **/

    let titleSpan = document.querySelector('#title-container').firstElementChild
    let subtitleSpan = document.querySelector('#title-container').lastElementChild
    
    titleSpan.style.fontFamily = "Rozha One"
    titleSpan.style.fontWeight = "400"
    titleSpan.style.color = "var(--color1)"
    
    subtitleSpan.style.fontFamily = "Rozha One"
    subtitleSpan.style.color = "var(--color2)"
    
    /** Input ranges **/


    function checkYear(i, fromValue, toValue) {
        return (parseInt(i) >= parseInt(fromValue)) && (parseInt(i) <= parseInt(toValue))
    }
    

    function checkYears(i){
        let minYear = Math.min(parseInt(minSlider.value), parseInt(maxSlider.value))
        let maxYear = Math.max(parseInt(minSlider.value), parseInt(maxSlider.value))


        document.querySelector('#yearfrom').textContent = minYear;
        document.querySelector('#yearto').textContent = maxYear;
    

        
        return (i >= minYear && i <= maxYear )
    }
    
    maxSlider.addEventListener('input', (event) => {
      onClickDetails("","","",false)
    
      let year = parseInt(event.target.value)

      selectPoints(year, (i) => (checkYears(hoverData.data.year[i])))
    
    });
    
    minSlider.addEventListener('input', (event) => {
      onClickDetails("","","",false)

      let year = parseInt(event.target.value)
    
      selectPoints(year, (i) => (checkYears(hoverData.data.year[i])))
      
    
    });

    
    /** Sidebar details **/

    let textDetails = document.querySelector('#text-details');
    function onClickDetails(title, year, archiveUrl, setValues) {
        
        let detailWrapper = document.querySelector("#text-details")

        if (title !== "undefined") {
            let linkArchive = document.querySelector("#link-archive")
            linkArchive.href = archiveUrl
    
            
    
    
            let detailText = document.querySelector("#text-title")
            detailText.innerText = title
    
            let detailYear = document.querySelector("#text-year")
            detailYear.innerText = year
    
    
            if (setValues == true) {
                detailWrapper.style.display = "block"
            }
            else {
                detailWrapper.style.display = "none"
            }
            
        } 
        else {
            detailWrapper.style.display = "none"
        }
        
        
        
    }
"""

In [None]:

# Sidebar markup handed to datamapplot as `custom_html`: the two year-range
# sliders and the (initially hidden) details panel that the click handler
# fills in.
# The stray comma between the `id` and `target` attributes of the archive
# link was removed; attributes in HTML are separated by whitespace only.
custom_html ="""
<div id="sidebar">
    <div id="year-range">
      <div class="years">
          Anos Selecionados: <br><span id="yearfrom">2001</span><span> - </span><span id="yearto">2024</span>
      </div>
      <div class="multi-range">
        <input id="rangefrom" type="range" min="2001" max="2024" value="2001" step="1"/>
        <input id="rangeto" type="range" min="2001" max="2024" value="2024" step="1"/>
      </div>
    </div>
    
    <div id="text-details" style="display:none">
        <div class="link-wrapper">
            <p id="text-title"></p>
            <p>
            <span id="text-year"></span> | 
            <a href="#" id="link-archive" target="_blank">Link to Arquivo.pt</a>
            </p>
        </div>
    </div>
</div>
"""

In [None]:
# Subtitle passed to the interactive plot as `sub_title`.
plot_subtitle = "Mapa dos títulos das notícias do expresso.pt entre 2001 e 2024"

### Dark Mode

In [None]:
# Dark-mode colour values substituted into the stylesheet template.
# `bgmid` is a bare "r, g, b" triple because the stylesheet wraps it in rgba().
palette_dark = {
    "opposite": "#FBFAFA",
    "accent1": "#f3d5a7",
    "accent2": "#dfa691",
    "bgmid": "32, 32, 35",
    "bgbottom": "#1b1b1c",
    "bghigh": "#343237",
    "shadow": "#08080844",
    "text": "#FBFAFA",
}

In [None]:
# Stylesheet handed to datamapplot as `custom_css`.
# The text is a str.format template: `{accent1}`, `{bghigh}` and the other
# single-brace names are filled from `palette_dark`, while every literal CSS
# brace is doubled (`{{` / `}}`) so that format() leaves it in place.
# Two rules that browsers silently discarded were corrected:
#   * `font_family` is not a CSS property; it is now `font-family`.
#   * `:child(2)` is not a valid pseudo-class; the rule now targets the
#     second child of #title-container with `> :nth-child(2)`.
custom_css="""

:root {{
    --range-radius: 16px;
    --range-color: {accent1}AA;
    --thumb-color: {accent1};
    --thumb-action: {accent2};

    --color1: {accent1};
    --color2: {accent2};

    --bg-translucent: {bghigh}77;

    --border-accent: {accent1}aa;
    
    --box-shadow: 2px 3px 10px {shadow};
    
    --border-radius: 8px;
    --box-padding: 10px;
    --box-margin: 0;
    --box-width: 260px;
    --thumbsize:18px;

    --bg-bottom: {bgbottom};
    --bg-mid: {bgmid};
    --bg-high: {bghigh};

    --text-color: {opposite};

}}
body {{
    box-sizing: border-box;
    max-width: 1000px;
    display: flex;
    margin-left: auto;
    margin-right: auto;
    position: relative;
    font-family: ui-sans-serif, system-ui, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji";
    background: var(--bg-bottom);
}}

#deck-container canvas {{
    box-shadow: var(--box-shadow);
    background: rgba(var(--bg-mid),1);  
}}

a {{
    color: var(--color2);
}}
.row {{
    display : flex;
    align-items : center;
    margin-bottom: 5px;
}}
.box {{
    height:10px;
    width:10px;
    border-radius:2px;
    margin-right:5px;
    
}}

#title-container {{
    max-width: 75%;
    box-shadow: unset;
    border-radius: 0;
    padding: var(--box-padding);
    color: var(--color1);
    font-family: "Rozha One";
    background: rgba(var(--bg-mid), .5);
    
}}

#title-container > :nth-child(2) {{
    color: var(--color2);

}}

#year-range {{
    
    border-radius: 0;
    padding: var(--box-padding);
    margin: var(--box-margin);
    width: var(--box-width);
    
    height: 80px;
    transition: transform 0.5s;
    z-index: 3;
    position: relative;
    display:block;
    
}}

#year-range .years
{{
    margin: 8px;
    padding: var(--box-padding);
}}

#search-label
{{
    margin-left: calc(8px + var(--box-padding));
    padding-left: var(--box-padding);
}}


.multi-range {{
  position: relative;
  height: 40px;
  margin: 0;
}}
.multi-range input[type=range]:nth-child(1)::-webkit-slider-thumb::before {{
  background-color: #808080;
}}
.multi-range input[type=range]:nth-child(2) {{
  background: none;
}}
.multi-range input[type=range]:nth-child(2)::-webkit-slider-thumb::before {{
  background-color: #808080;
}}
.multi-range input[type=range]::-moz-range-track {{
  background: none;
}}

input[type="search"] {{
  display: block;
  padding: 8px 12px 10px 12px;
  border-radius: 0;
  border: 0px;
  border-bottom: 1px solid var(--border-accent);
  background: var(--bg-high);
}}

input[type=range] {{
  position: absolute;
  width: 100%;
  padding: 0;
  margin: 0;
  border: 0;
  outline: none;
  background: linear-gradient(var(--range-color), var(--range-color)) no-repeat center;
  background-size: 100% 6px;
  -webkit-appearance: none;
     -moz-appearance: none;
          appearance: none;
  pointer-events: none;
}}
input[type=range]:active,
input[type=range]:focus,
input[type=range]::-moz-focus-outer {{
  border: none;
  outline: none;
}}
input[type=range]::-moz-range-thumb {{
  position: relative;
  height: var(--thumbsize);
  width: var(--thumbsize);
  margin: 5px 0;
  border-radius: 50%;
  background-color: var(--thumb-color);
  box-shadow: 0 1px 4px 0.5px rgba(0,0,0,0.3);
  -moz-appearance: none;
       appearance: none;
  pointer-events: all;
}}
input[type=range]::-moz-range-thumb:hover,
input[type=range]::-moz-range-thumb:active
{{
  background-color: var(--thumb-action);
}}

input[type=range]::-webkit-slider-thumb {{
  position: relative;
  height: var(--thumbsize);
  width: var(--thumbsize);
  margin: 5px 0;
  border-radius: 50%;
  background-color: var(--thumb-color);
  box-shadow: 0 1px 4px 0.5px rgba(0,0,0,0.3);
  -webkit-appearance: none;
          appearance: none;
  pointer-events: all;
}}
input[type=range]::-webkit-slider-thumb:hover,
input[type=range]::-webkit-slider-thumb:active 
{{
  background-color: var(--thumb-action);
}}

#sidebar {{
    background-color: var(--bg-translucent);
    color: {text};
    width: 280px;
    height: 400px;
    position: absolute;
    display: flex;
    flex-direction: column;
    align-content: center;
    flex-wrap: wrap;
    left: 20px;
    top: 130px;
    z-index: 2;
    box-shadow: var(--box-shadow);
    border-radius: var(--border-radius);
    padding-top: calc(2*var(--box-padding));
    
}}

#search-container {{
    background: unset;
    right: 0 !important;
    border: 0px solid red;
    position: relative;
    box-shadow: unset;
    display:block;    
    border-radius: 0;
    padding: var(--box-padding);
    margin: var(--box-margin);
    width: var(--box-width);

}}

#search-container input {{
    width: 100%;
}}

#text-details {{
    padding: var(--box-padding);
    margin: var(--box-margin);
    width: var(--box-width);
}}

#text-details p#text-title {{
    font-weight: bold;
}}

""".format(**palette_dark)


In [None]:
# Build the interactive map from the X/Y columns, with Topic1 and Topic2
# supplied as the two label layers. Year and archive links travel along as
# extra point data so the hover template and the click handler can use them.
plot = datamapplot.create_interactive_plot(
    df_full_final[['X','Y']].values,
    df_full_final['Topic1'],
    df_full_final['Topic2'],
    
    color_label_text         = False,
    color_cluster_boundaries = True,
    
    cluster_boundary_polygons   = True,
    cluster_boundary_line_width = .1,
    
    extra_point_data         = df_full_final[['year','linkToArchive','linkToScreenshot']],
    hover_text               = df_full_final['text'],
    hover_text_html_template = hover_text_template,
    
    text_min_pixel_size     = 16,
    point_line_width = 0,
    point_radius_min_pixels = 1,
    
    custom_css   = custom_css,
    custom_html  = custom_html,
    custom_js    = custom_js,
    
    enable_search=True,
    # Calls the onClickDetails() function defined in custom_js.
    on_click="onClickDetails(`{hover_text}`, `{year}`, `{linkToArchive}`, true)",
    title="Arquivo Galáxia",
    sub_title=plot_subtitle,
    darkmode=True,

)

# Create the output folder first so the save does not fail when ./site is
# missing (for example on a fresh checkout).
Path('./site').mkdir(parents=True, exist_ok=True)
plot.save('./site/index.html')

plot

# END