In [None]:
import pandas as pd
import time

# Location of the articles dataset inside the Kaggle input directory.
file_path = "/kaggle/input/articles-base/BASE.csv"

# Read the whole CSV into memory.
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first rows (rendered as the cell output).
df.head()


In [None]:
!pip install spacy spacy_download keybert transformers

In [None]:
!python -m spacy download it_core_news_lg

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import re

# Resources needed by stopwords.words() and word_tokenize().
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer data from 'punkt_tab' instead of
# 'punkt'; fetching both keeps word_tokenize() working on old and new versions.
nltk.download('punkt_tab')

# from spacy_download import load_spacy
import spacy

# Alternative loader (disabled): spacy_download's load_spacy downloads the
# model if it isn't installed yet.
# nlp = load_spacy("it_core_news_sm", exclude=["parser", "tagger"])
# nlp = load_spacy("it_core_news_lg")

# GPU support (disabled):
# spacy.require_gpu()

# Load the it_core_news_lg spaCy pipeline used by lemmatization().
nlp = spacy.load(name="it_core_news_lg")

# GPU check (disabled):
# print("SpaCy is using GPU:", spacy.prefer_gpu())

In [None]:
from keybert import KeyBERT
from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline

# Italian BERT checkpoint used for the keyword embeddings.
model_name = "dbmdz/bert-base-italian-uncased"

# `model` (with its masked-LM head) and `tokenizer` are kept under their
# original names so any other cell that refers to them keeps working.
model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# KeyBERT only accepts specific backend types (a model-name string, a
# sentence-transformers model, a Hugging Face `Pipeline`, ...). A bare
# AutoModelForMaskedLM instance is none of them, so passing it makes KeyBERT
# fall back to its default sentence-transformers model and the Italian BERT
# is never used. Wrapping the checkpoint in a feature-extraction pipeline is
# the documented way to plug a Hugging Face model into KeyBERT.
embedding_pipeline = pipeline(
    "feature-extraction",
    model=model_name,
    tokenizer=tokenizer,
)

# Initialise KeyBERT with the Italian BERT embeddings.
kw_model = KeyBERT(model=embedding_pipeline)

In [None]:
# Work on a sample of the corpus: every 4th row of the 'text' column.
# (For a much smaller smoke test, use a larger step, e.g. .iloc[::10000].)
text = df.loc[:, 'text'].iloc[::4]

# Strip accents from vowels
def remove_accents(text):
    """Return `text` with accented vowels replaced by their plain ASCII vowel.

    Only the accented forms of a, e, i, o, u listed below (lower and upper
    case) are mapped; every other character is returned unchanged.
    """
    accent_groups = {
        'a': 'àáâäãå',
        'e': 'èéêë',
        'i': 'ìíîï',
        'o': 'òóôöõ',
        'u': 'ùúûü',
    }
    mapping = {}
    for plain, accented_forms in accent_groups.items():
        for ch in accented_forms:
            # Register both cases of each accented vowel.
            mapping[ord(ch)] = plain
            mapping[ord(ch.upper())] = plain.upper()
    return text.translate(mapping)

# Normalise text: strip accents, lower-case, replace punctuation with spaces
def clean_text(text):
    """Return `text` without accents, lower-cased, with punctuation blanked.

    Each of the characters . , : ; ! ? ' - " « » < > ’ is replaced by a
    single space; no other character is removed.
    """
    normalized = remove_accents(text).lower()
    return re.sub(r"[\.,:;!?'\-\"«»<>’]", " ", normalized)

from functools import lru_cache


@lru_cache(maxsize=None)
def _italian_stopword_set():
    """Build (once) the set of Italian stop words used by remove_stopwords.

    The set holds every NLTK Italian stop word plus its accent-stripped form,
    so that words such as "più" / "perché" are still recognised after
    clean_text() has turned them into "piu" / "perche".
    """
    words = stopwords.words('italian')
    return frozenset(words) | frozenset(remove_accents(word) for word in words)


# Remove stop words
def remove_stopwords(text):
    """Tokenize `text` and return the tokens that are not Italian stop words.

    Args:
        text: the string to filter (expected lower-case, as stop words are
            compared verbatim).

    Returns:
        A list of the remaining tokens, in their original order.

    The stop-word set is cached and looked up as a set instead of re-reading
    the NLTK corpus and scanning a list for every document. Note that, once
    accents are stripped, an accented stop word and a homograph without the
    accent (e.g. "sarà" / "sara") can no longer be told apart.
    """
    stopword_set = _italian_stopword_set()
    return [token for token in word_tokenize(text) if token not in stopword_set]

# Join a list of tokens into one string
def concatenate_list(tokens):
    """Return the items of `tokens` joined by single spaces."""
    separator = " "
    return separator.join(tokens)

# Lemmatize text
def lemmatization(text):
    """Run `text` through the spaCy pipeline `nlp` and return its lemmas.

    The lemma of every token is collected in order and the result is joined
    with single spaces.
    """
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

# Extract keywords
def extract_keywords(text):
    """Return the single-word keywords KeyBERT (`kw_model`) finds in `text`.

    Args:
        text: the (already pre-processed) document.

    Returns:
        A list of keyword strings; the similarity scores are discarded.
    """
    keywords_with_scores = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 1),
        # KeyBERT defaults to stop_words="english", which would filter the
        # Italian candidates with an English stop-word list (dropping words
        # such as "show" or "top"). Italian stop words are already removed
        # upstream by remove_stopwords(), so no list is applied here.
        stop_words=None,
    )
    return [keyword for keyword, _score in keywords_with_scores]

# Text pre-processing pipeline
def process_text(text):
    """Clean, stop-word-filter and lemmatize `text`; return the final string.

    Steps, in order: clean_text -> remove_stopwords -> concatenate_list ->
    lemmatization.
    """
    cleaned = clean_text(text)
    kept_tokens = remove_stopwords(cleaned)
    rejoined = concatenate_list(kept_tokens)
    return lemmatization(rejoined)



In [None]:
# Apply the pre-processing pipeline to every sampled article and time the run.
start_time = time.time()
text_processed = text.apply(func=process_text)
end_time = time.time()

# Elapsed wall-clock time in seconds.
execution_time = end_time - start_time
print(f"Execution time: {execution_time}")

In [None]:
# Extract the keywords of every pre-processed article and time the run.
start_time = time.time()
keywords = text_processed.apply(func=extract_keywords)
end_time = time.time()

# Elapsed wall-clock time in seconds.
execution_time = end_time - start_time
print(f"Execution time: {execution_time}")

In [None]:
# Wrap the keywords Series in a single-column DataFrame.
# NOTE(review): the column keeps the Series' name (presumably 'text', inherited
# from df['text']) and index=False drops the row index, so the file cannot be
# joined back to the source articles; confirm this is intended.
keywords_df = pd.DataFrame(keywords)

# Write the keywords to a CSV file in the current working directory.
keywords_df.to_csv(path_or_buf='keywords.csv', index=False)

In [None]:
# Debug check: display the type of `keywords` as the cell output.
type(keywords)

In [None]:
# Write the keywords to the Kaggle working directory, without the row index.
keywords_df.to_csv(path_or_buf='/kaggle/working/keywords.csv', index=False)