In [11]:
import pandas as pd

file_path = "/kaggle/input/articles-base/BASE.csv"

def create_corpus_from_csv(file_path):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path)

    # Check if the 'text' column exists
    if 'text' not in df.columns:
        raise ValueError("The CSV file does not contain a 'text' column.")

    # Extract the 'text' column and take every 4th article
    skip_df = df[::4].reset_index(drop=True)
    corpus = skip_df['text']
    return corpus, skip_df

corpus, skip_df = create_corpus_from_csv(file_path)

In [None]:
print(len(skip_df))
print(skip_df.head())

In [None]:
skip_df.to_csv('/kaggle/working/skip4_base.csv', index=False)
corpus.to_csv('/kaggle/working/skip4_corpus.csv', index=False)

In [None]:
!python -m spacy download it_core_news_lg

In [None]:
import spacy

# Enable GPU for SpaCy
spacy.require_gpu()

# Load the SpaCy model
nlp = spacy.load("it_core_news_lg")

# Check if SpaCy is using the GPU
print("SpaCy is using GPU:", spacy.prefer_gpu())


In [None]:
!pip install tqdm

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import re
from tqdm import tqdm
nltk.download('stopwords')
nltk.download('punkt')

import spacy

spacy.require_gpu() 
nlp = spacy.load("it_core_news_lg")  
text = corpus

# Funzione per rimuovere gli accenti
def remove_accents(text):
    accented_chars = 'àáâäãåèéêëìíîïòóôöõùúûüÀÁÂÄÃÅÈÉÊËÌÍÎÏÒÓÔÖÕÙÚÛÜ'
    replacement_chars = 'aaaaaaeeeeiiiiooooouuuuAAAAAAEEEEIIIIOOOOOUUUU'
    translation_table = str.maketrans(accented_chars, replacement_chars)
    return text.translate(translation_table)

# Funzione per pulire il testo dai caratteri accentai, convertire in minuscolo e sostituire i caratteri speciali con degli spazi
def clean_text(text):
    text = remove_accents(text)
    text = text.lower()
    text = re.sub(r"[\.,:;!?'\-\"«»<>’]", " ", text)
    return text

# Funzione per rimuovere le stopwords
def remove_stopwords(text):
    ita_stopwords = stopwords.words('italian')
    tokens = word_tokenize(text)
    return [token for token in tokens if token not in ita_stopwords]

# Funzione per lemmatizzare il testo
def lemmatization(text):
    text = " ".join(text)  # Join tokens into a single string
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

# Pipeline di pre-processing del testo
def process_text(text):
    text = clean_text(text)
    text = remove_stopwords(text)
    text = lemmatization(text)
    return text

#text_processed = text.apply(process_text)
# Applying tqdm to the processing pipeline
text_processed = [process_text(text) for text in tqdm(corpus)]

In [None]:
text_processed_series = pd.Series(text_processed)
print(text_processed_series)

In [2]:
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.16.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.36-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting cython<3,>=0.27 (from hdbscan>=0.8.29->bertopic)
  Downloading Cython-0.29.37-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (3.1 kB)
Downloading bertopic-0.16.2-py2.py3-none-any.whl (158 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hdbscan-0.8.36-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m80.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading sentence_transformer

In [3]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [None]:
from bertopic import BERTopic
from hdbscan import HDBSCAN
from umap import UMAP
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired

random_state = 123456  # UMAP random seed

# UMAP
n_neighbors = 10, # num of high dimensional neighbours. default 15

# HDBSCAN
min_cluster_size = 30  # default:5 HDBSCAN(); default:10 BERTopic()
min_samples = 5        # default 5?

# BERTOPIC
min_topic_size = 10    # 10 default
top_n_words = 10       # 10 default
nr_topics = 30         

ita_stopwords = stopwords.words('italian')
############################################

# Step 1 Extract embeddings (SBERT)
embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2', device=device)

# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors  = 10, # num of high dimensional neighbours.
                  n_components = 5,  # default:5
                  min_dist     = 0.0,
                  random_state = random_state) # default:None

# Step 3 - Clusterize reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size = min_cluster_size,
                        #min_samples=min_samples,
                        metric='euclidean',
                        cluster_selection_method='eom',
                        prediction_data=True)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words=ita_stopwords,
                                   lowercase=False) # lowercase=False to keep Acronyms uppercase

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) # False default

# Step 6 - Fine-tune topic representations
representation_model = MaximalMarginalRelevance(diversity = 0.7,  # 0.1 default
                                                top_n_words = 15) # 10 default


############################################

# All steps together
topic_model = BERTopic(min_topic_size = min_topic_size,             # 10 default
                       top_n_words = top_n_words,                   # 10 default
                       calculate_probabilities = True,
                       embedding_model = embedding_model,           # Step 1 - Extract embeddings
                       umap_model = umap_model,                     # Step 2 - Reduce dimensionality
                       hdbscan_model = hdbscan_model,               # Step 3 - Cluster reduced embeddings
                       vectorizer_model = vectorizer_model,         # Step 4 - Tokenize topics
                       ctfidf_model = ctfidf_model,                 # Step 5 - Extract topic words
                       nr_topics = 30
                       #representation_model= representation_model   # Step 6 - (Optional) Fine-tune topic representations
                       )

In [None]:
%%time
topic_model =  topic_model.fit(text_processed_series)
probs = topic_model.probabilities_
topics = topic_model.topics_

In [None]:
import numpy as np
import zipfile

data_path='/kaggle/working/'

def save_topic_model(filename= "topic_model"):
    topic_model.save(data_path+filename)
    np.savetxt(data_path+filename+'_probs.txt', probs)
    # Create a ZIP file
    with zipfile.ZipFile(data_path + filename + '.zip', 'w') as zipf:
        zipf.write(data_path + filename, arcname=filename)
        zipf.write(data_path + filename + '_probs.txt', arcname=filename + '_probs.txt')
    

def load_topic_model(filename= "topic_model"):
    # Extract the ZIP file
    with zipfile.ZipFile(data_path + filename + '.zip', 'r') as zipf:
        zipf.extractall(data_path)
    topic_model = BERTopic.load(data_path+filename)
    probs = np.loadtxt(data_path+filename+'_probs.txt')
    return topic_model, probs

In [None]:
save_topic_model()

In [18]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,12424,-1_anno_piu_essere_fare,"[anno, piu, essere, fare, euro, altro, potere,...",[prospettiva crescito area euro deteriorato al...
1,0,4622,0_squadra_gara_gol_finale,"[squadra, gara, gol, finale, partita, vincere,...",[paese vai evento trovare luogo proponga Festi...
2,1,4125,1_pd_governo_melone_partito,"[pd, governo, melone, partito, ministro, Itali...",[senato russo commemorazione berlusconi aula m...
3,2,2966,2_ucraino_russo_Russia_Putin,"[ucraino, russo, Russia, Putin, Kiev, guerra, ...",[ucraina diciassettesimo giorno guerra potente...
4,3,696,3_paziente_vaccino_virus_covid,"[paziente, vaccino, virus, covid, tumore, infe...",[chiamare tecovirimat momento farmaco riporre ...
5,4,661,4_incidente_morire_ferito_morto,"[incidente, morire, ferito, morto, ospedale, c...",[orsa amarena essere uccidere ieri sera fucila...
6,5,652,5_nord_sud_caldo_pioggia,"[nord, sud, caldo, pioggia, temperatura, tempo...",[sbalzo termico crisi primavera appena iniziar...
7,6,530,6_elettrico_litro_prezzo_gas,"[elettrico, litro, prezzo, gas, auto, benzina,...",[girona Mazda continuare battere ferro motore ...
8,7,503,7_film_regista_attore_cinema,"[film, regista, attore, cinema, Oscar, the, pr...",[mancare solo coda sirena sedurre sembrare ina...
9,8,500,8_Papa_papa_Francesco_vaticano,"[Papa, papa, Francesco, vaticano, cardinale, c...",[citta vaticano – Papa Francesco dovere rinunc...


In [None]:
topic_model.visualize_hierarchy(custom_labels=True, width=700, height=600)

In [17]:
topic_model.visualize_topics()

In [None]:
embeddings = embedding_model.encode(text_processed_series, show_progress_bar=False)

In [None]:
fig = topic_model.visualize_documents(text_processed_series, embeddings=embeddings)
fig.write_html(data_path + "topic_cluster.html")


In [None]:
fig1 = topic_model.visualize_barchart()
fig1.write_html(data_path + "barchart.html")

In [None]:
def get_topic_info(topic_model):
    df = topic_model.get_topic_info()
    df.drop('Representative_Docs', axis=1, inplace=True) 
    return df

def get_document_info(topic_model, docs):
    doc_info = topic_model.get_document_info(docs)
    return doc_info

In [None]:
doc_info = get_document_info(topic_model, text_processed_series)

In [None]:
doc_info

In [None]:
skip_df["topic"] = doc_info["topic"]
skip_df["topic_name"] = doc_info["name"]

In [None]:
skip_df.to_csv('/kaggle/working/skip4_topic.csv', index=False)