<a href="https://colab.research.google.com/github/ciccioshake/colab/blob/main/integrated_text_processing_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install chardet
import chardet
import os
import json
import nltk
import re
import xml.etree.ElementTree as ET
from io import StringIO
from nltk.tokenize import sent_tokenize, word_tokenize

# Download the Punkt tokenizer resources required by NLTK's tokenizers.
nltk.download('punkt_tab')

# Also download the "punkt" package in case it is needed.
nltk.download("punkt")

# --- Segmentation settings -------------------------------------------------
# Maximum number of tokens in one segment.
MAX_LEN = 384
# Number of tokens repeated at the start of the following segment.
OVERLAP = 100
# Tokens that are preferred as cut points.
PUNCTUATION = list(".!?;:,")
# Size of one read() request when streaming an input file (1 MB).
CHUNK_SIZE = 1 << 20

# --- Output locations ------------------------------------------------------
output_txt_path = "/content/unified_bandi.txt"
output_json_path = "/content/unified_bandi.json"
output_file_path = "/content/testo_segmentato_nltk.txt"

# --- Uploaded input files --------------------------------------------------
# Both processing passes work on the same four files; each pass gets its own
# list object.
document_files = [f"/content/plain_{number}" for number in range(9, 13)]
input_file_paths = list(document_files)

def clean_text(text):
    """Strip special characters and collapse whitespace.

    Word characters, whitespace and the punctuation marks ``. , ! ? ; :``
    are kept; every other character is deleted (it is not replaced by a
    space). Runs of whitespace, newlines included, are then collapsed into a
    single space and the result is trimmed at both ends.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text on a single line.
    """
    # ';' and ':' are preserved together with '.,!?': PUNCTUATION lists them
    # as preferred split points, and deleting them here would make those
    # entries unreachable for any text that is cleaned before segmentation.
    text = re.sub(r'[^\w\s.,!?;:]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Tokenize text with NLTK.
def nltk_tokenize(text):
    """Return the result of ``nltk.word_tokenize`` applied to ``text``."""
    tokens = nltk.word_tokenize(text)
    return tokens

def find_best_split(words, max_len, punctuation=None):
    """Return the index at which ``words`` should be cut.

    If ``words`` holds at most ``max_len`` tokens nothing has to be cut and
    ``len(words)`` is returned. Otherwise the tokens at indices
    ``max_len - 10`` down to ``max_len - 49`` are scanned backwards and the
    cut is placed right after the first punctuation token found, so the
    punctuation stays in the leading segment. When no punctuation token is
    found in that window the cut falls back to ``max_len``.

    Args:
        words: Sequence of tokens.
        max_len: Maximum number of tokens allowed before the cut.
        punctuation: Optional collection of tokens accepted as cut points.
            Defaults to the module-level ``PUNCTUATION``.

    Returns:
        The number of leading tokens that belong to the first segment.
    """
    if len(words) <= max_len:
        return len(words)  # nothing to split

    if punctuation is None:
        punctuation = PUNCTUATION

    # Never scan below index 0: with a small max_len the unclamped range
    # would yield negative indices, which wrap around to the END of the list
    # (returning a split of 0 or less) or raise IndexError.
    first = max_len - 10
    stop = max(max_len - 50, -1)
    for i in range(first, stop, -1):
        if words[i] in punctuation:
            return i + 1  # keep the punctuation token in the segment
    return max_len

def segment_text(text):
    """Split ``text`` into overlapping segments of at most MAX_LEN tokens.

    The text is tokenized with ``word_tokenize``. Segments are cut at the
    position chosen by ``find_best_split`` and each following segment starts
    OVERLAP tokens before the previous cut. Tokens inside a segment are
    joined with single spaces.

    Args:
        text: Text to segment.

    Returns:
        All segments joined by a blank line (``"\\n\\n"``); an empty string
        when the text contains no tokens.
    """
    tokens = word_tokenize(text)
    total = len(tokens)
    segments = []
    start = 0    # index of the first token of the next segment
    covered = 0  # tokens[:covered] already appear in an emitted segment

    # Keep cutting until fewer than MAX_LEN tokens remain. Cutting only once
    # per MAX_LEN-token refill lets the leftover grow by about OVERLAP tokens
    # each round, so the last segment could end up far longer than MAX_LEN.
    while total - start >= MAX_LEN:
        # One token beyond MAX_LEN tells find_best_split() that more text
        # follows, which enables its punctuation search for every cut.
        window = tokens[start:start + MAX_LEN + 1]
        end = find_best_split(window, MAX_LEN)
        segments.append(' '.join(window[:end]))
        covered = start + end
        # Always advance by at least one token so the loop terminates even
        # if OVERLAP is configured larger than the cut position.
        start += max(end - OVERLAP, 1)

    # Emit the tail only when it contains tokens that were not written yet;
    # a tail made only of overlap would duplicate the previous segment's end.
    if covered < total:
        segments.append(' '.join(tokens[start:]))
    return '\n\n'.join(segments)

# Chunked segmentation pass: every input file is streamed in CHUNK_SIZE
# pieces, tokenized, and cut into overlapping segments that are written both
# to a text file and to a JSON file next to it.


def _pick_encoding(path):
    """Return "utf-8" if the whole file decodes as UTF-8, else "ISO-8859-1"."""
    try:
        with open(path, "r", encoding="utf-8") as probe:
            while probe.read(CHUNK_SIZE):
                pass
    except UnicodeDecodeError:
        # ISO-8859-1 maps every byte value, so it can always be used.
        return "ISO-8859-1"
    return "utf-8"


def _iter_whole_word_chunks(handle):
    """Yield pieces of text read from handle that never end inside a word."""
    carry = ""
    while True:
        chunk = handle.read(CHUNK_SIZE)
        if not chunk:
            break
        # Locate the end of the last whitespace character in this chunk.
        cut = len(chunk)
        while cut > 0 and not chunk[cut - 1].isspace():
            cut -= 1
        if cut == 0:
            # No whitespace at all: the word continues in the next chunk.
            carry += chunk
            continue
        yield carry + chunk[:cut]
        carry = chunk[cut:]
    if carry:
        yield carry


def _write_segment(outfile, all_segments, number, tokens):
    """Write one segment to the text file and record it for the JSON dump."""
    text = ' '.join(tokens)
    outfile.write(f"Segment {number} (Token Count: {len(tokens)}):\n{text}\n\n")
    all_segments.append({
        "segment_number": number,
        "token_count": len(tokens),
        "text": text
    })


# The text file is opened in "w" mode like its JSON twin, so a rerun replaces
# both outputs instead of appending duplicates to the text file only.
with open(output_file_path, "w", encoding="utf-8") as outfile, \
     open(output_file_path.replace(".txt", ".json"), "w", encoding="utf-8") as jsonfile:

    all_segments = []  # every segment of every file, for the JSON dump

    for input_file_path in input_file_paths:
        if not os.path.isfile(input_file_path):
            print(f"Skipping {input_file_path}: not a regular file")
            continue

        buffer = []        # tokens that are not fully consumed yet
        written_upto = 0   # buffer[:written_upto] already appears in a segment
        segment_count = 0  # numbering restarts at 1 for every input file

        with open(input_file_path, "r", encoding=_pick_encoding(input_file_path)) as f:
            for piece in _iter_whole_word_chunks(f):
                buffer.extend(nltk_tokenize(piece))

                start = 0
                while len(buffer) - start >= MAX_LEN:
                    # One token beyond MAX_LEN tells find_best_split() that
                    # more tokens follow, exactly as passing the whole buffer
                    # would, without re-slicing the buffer for every segment.
                    window = buffer[start:start + MAX_LEN + 1]
                    end = find_best_split(window, MAX_LEN)
                    segment_count += 1
                    _write_segment(outfile, all_segments, segment_count, window[:end])
                    written_upto = start + end
                    # Advance by at least one token so the loop terminates.
                    start += max(end - OVERLAP, 1)

                # Drop the consumed tokens once per chunk.
                buffer = buffer[start:]
                written_upto = max(written_upto - start, 0)

        # Flush the tail only if it holds tokens that were not written yet;
        # a tail made only of overlap would duplicate the previous segment.
        if written_upto < len(buffer):
            segment_count += 1
            _write_segment(outfile, all_segments, segment_count, buffer)

    # Write all segments to the JSON file
    json.dump(all_segments, jsonfile, ensure_ascii=False, indent=4)

# Unification pass: every document is cleaned, segmented and written to one
# combined text file and to one JSON list of {"id", "content"} records.


def _read_document(path):
    """Return the text of path, decoded as UTF-8 or, failing that, ISO-8859-1."""
    try:
        with open(path, "r", encoding="utf-8") as handle:
            return handle.read()
    except UnicodeDecodeError:
        # ISO-8859-1 maps every byte value, so this fallback cannot raise.
        with open(path, "r", encoding="ISO-8859-1") as handle:
            return handle.read()


# Records for the JSON output
documents = []

with open(output_txt_path, "w", encoding="utf-8") as txt_file:
    for file_path in document_files:
        # isfile() also rejects directories, which exists() would let through
        # to open(); skipped paths are reported instead of ignored silently.
        if not os.path.isfile(file_path):
            print(f"Skipping {file_path}: not a regular file")
            continue

        content = _read_document(file_path)
        doc_id = os.path.basename(file_path).replace("plain_", "").strip()

        # Cleaning and segmentation
        cleaned_content = clean_text(content)
        segmented_content = segment_text(cleaned_content)

        # Text output: a header line followed by the segmented document
        txt_file.write(f"\n--- Document {doc_id} ---\n")
        txt_file.write(segmented_content + "\n")

        # JSON output
        documents.append({"id": doc_id, "content": segmented_content})

# Save the JSON
with open(output_json_path, "w", encoding="utf-8") as json_file:
    json.dump(documents, json_file, ensure_ascii=False, indent=4)

print(f"Processo completato! File salvati: {output_txt_path}, {output_json_path}")



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Processo completato! File salvati: /content/unified_bandi.txt, /content/unified_bandi.json


In [1]:
!pip install chardet
import chardet
import os
import json
import nltk
import re
import xml.etree.ElementTree as ET
import glob
from io import StringIO
from nltk.tokenize import sent_tokenize, word_tokenize

# Download the Punkt tokenizer resources required by NLTK's tokenizers.
nltk.download('punkt_tab')

# Also download the "punkt" package in case it is needed.
nltk.download("punkt")

# Segmentation settings
MAX_LEN = 384  # maximum number of tokens in one segment
OVERLAP = 100  # tokens repeated at the start of the following segment
PUNCTUATION = [".", "!", "?", ";", ":", ","]  # preferred cut points
CHUNK_SIZE = 1024 * 1024  # size of one read() request (1 MB)

# Output file paths
output_txt_path = "/content/unified_bandi.txt"
output_json_path = "/content/unified_bandi.json"
output_file_path = "/content/testo_segmentato_nltk.txt"


def _natural_key(path):
    """Sort key that orders digit runs in the file name numerically."""
    parts = re.split(r'(\d+)', os.path.basename(path))
    # re.split() with one capturing group alternates text / digits, so the
    # odd positions are exactly the digit runs.
    return [int(part) if index % 2 else part for index, part in enumerate(parts)]


# Input files: every regular file directly inside /content/plain. This
# replaces the earlier hard-coded list /content/plain_9 ... /content/plain_12.
# glob.glob() returns paths in arbitrary order and also matches directories,
# so the result is filtered and sorted to make the processing order
# reproducible (plain_9 before plain_10).
document_files = sorted(
    (path for path in glob.glob("/content/plain/*") if os.path.isfile(path)),
    key=_natural_key,
)
input_file_paths = list(document_files)

def clean_text(text):
    """Strip special characters and collapse whitespace.

    Word characters, whitespace and the punctuation marks ``. , ! ? ; :``
    are kept; every other character is deleted (it is not replaced by a
    space). Runs of whitespace, newlines included, are then collapsed into a
    single space and the result is trimmed at both ends.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text on a single line.
    """
    # ';' and ':' are preserved together with '.,!?': PUNCTUATION lists them
    # as preferred split points, and deleting them here would make those
    # entries unreachable for any text that is cleaned before segmentation.
    text = re.sub(r'[^\w\s.,!?;:]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Tokenize text with NLTK.
def nltk_tokenize(text):
    """Return the result of ``nltk.word_tokenize`` applied to ``text``."""
    tokens = nltk.word_tokenize(text)
    return tokens

def find_best_split(words, max_len, punctuation=None):
    """Return the index at which ``words`` should be cut.

    If ``words`` holds at most ``max_len`` tokens nothing has to be cut and
    ``len(words)`` is returned. Otherwise the tokens at indices
    ``max_len - 10`` down to ``max_len - 49`` are scanned backwards and the
    cut is placed right after the first punctuation token found, so the
    punctuation stays in the leading segment. When no punctuation token is
    found in that window the cut falls back to ``max_len``.

    Args:
        words: Sequence of tokens.
        max_len: Maximum number of tokens allowed before the cut.
        punctuation: Optional collection of tokens accepted as cut points.
            Defaults to the module-level ``PUNCTUATION``.

    Returns:
        The number of leading tokens that belong to the first segment.
    """
    if len(words) <= max_len:
        return len(words)  # nothing to split

    if punctuation is None:
        punctuation = PUNCTUATION

    # Never scan below index 0: with a small max_len the unclamped range
    # would yield negative indices, which wrap around to the END of the list
    # (returning a split of 0 or less) or raise IndexError.
    first = max_len - 10
    stop = max(max_len - 50, -1)
    for i in range(first, stop, -1):
        if words[i] in punctuation:
            return i + 1  # keep the punctuation token in the segment
    return max_len

def segment_text(text):
    """Split ``text`` into overlapping segments of at most MAX_LEN tokens.

    The text is tokenized with ``word_tokenize``. Segments are cut at the
    position chosen by ``find_best_split`` and each following segment starts
    OVERLAP tokens before the previous cut. Tokens inside a segment are
    joined with single spaces.

    Args:
        text: Text to segment.

    Returns:
        All segments joined by a blank line (``"\\n\\n"``); an empty string
        when the text contains no tokens.
    """
    tokens = word_tokenize(text)
    total = len(tokens)
    segments = []
    start = 0    # index of the first token of the next segment
    covered = 0  # tokens[:covered] already appear in an emitted segment

    # Keep cutting until fewer than MAX_LEN tokens remain. Cutting only once
    # per MAX_LEN-token refill lets the leftover grow by about OVERLAP tokens
    # each round, so the last segment could end up far longer than MAX_LEN.
    while total - start >= MAX_LEN:
        # One token beyond MAX_LEN tells find_best_split() that more text
        # follows, which enables its punctuation search for every cut.
        window = tokens[start:start + MAX_LEN + 1]
        end = find_best_split(window, MAX_LEN)
        segments.append(' '.join(window[:end]))
        covered = start + end
        # Always advance by at least one token so the loop terminates even
        # if OVERLAP is configured larger than the cut position.
        start += max(end - OVERLAP, 1)

    # Emit the tail only when it contains tokens that were not written yet;
    # a tail made only of overlap would duplicate the previous segment's end.
    if covered < total:
        segments.append(' '.join(tokens[start:]))
    return '\n\n'.join(segments)

# Chunked segmentation pass: every input file is streamed in CHUNK_SIZE
# pieces, tokenized, and cut into overlapping segments that are written both
# to a text file and to a JSON file next to it.


def _pick_encoding(path):
    """Return "utf-8" if the whole file decodes as UTF-8, else "ISO-8859-1"."""
    try:
        with open(path, "r", encoding="utf-8") as probe:
            while probe.read(CHUNK_SIZE):
                pass
    except UnicodeDecodeError:
        # ISO-8859-1 maps every byte value, so it can always be used.
        return "ISO-8859-1"
    return "utf-8"


def _iter_whole_word_chunks(handle):
    """Yield pieces of text read from handle that never end inside a word."""
    carry = ""
    while True:
        chunk = handle.read(CHUNK_SIZE)
        if not chunk:
            break
        # Locate the end of the last whitespace character in this chunk.
        cut = len(chunk)
        while cut > 0 and not chunk[cut - 1].isspace():
            cut -= 1
        if cut == 0:
            # No whitespace at all: the word continues in the next chunk.
            carry += chunk
            continue
        yield carry + chunk[:cut]
        carry = chunk[cut:]
    if carry:
        yield carry


def _write_segment(outfile, all_segments, number, tokens):
    """Write one segment to the text file and record it for the JSON dump."""
    text = ' '.join(tokens)
    outfile.write(f"Segment {number} (Token Count: {len(tokens)}):\n{text}\n\n")
    all_segments.append({
        "segment_number": number,
        "token_count": len(tokens),
        "text": text
    })


# The text file is opened in "w" mode like its JSON twin, so a rerun replaces
# both outputs instead of appending duplicates to the text file only.
with open(output_file_path, "w", encoding="utf-8") as outfile, \
     open(output_file_path.replace(".txt", ".json"), "w", encoding="utf-8") as jsonfile:

    all_segments = []  # every segment of every file, for the JSON dump

    for input_file_path in input_file_paths:
        if not os.path.isfile(input_file_path):
            print(f"Skipping {input_file_path}: not a regular file")
            continue

        buffer = []        # tokens that are not fully consumed yet
        written_upto = 0   # buffer[:written_upto] already appears in a segment
        segment_count = 0  # numbering restarts at 1 for every input file

        with open(input_file_path, "r", encoding=_pick_encoding(input_file_path)) as f:
            for piece in _iter_whole_word_chunks(f):
                buffer.extend(nltk_tokenize(piece))

                start = 0
                while len(buffer) - start >= MAX_LEN:
                    # One token beyond MAX_LEN tells find_best_split() that
                    # more tokens follow, exactly as passing the whole buffer
                    # would, without re-slicing the buffer for every segment.
                    window = buffer[start:start + MAX_LEN + 1]
                    end = find_best_split(window, MAX_LEN)
                    segment_count += 1
                    _write_segment(outfile, all_segments, segment_count, window[:end])
                    written_upto = start + end
                    # Advance by at least one token so the loop terminates.
                    start += max(end - OVERLAP, 1)

                # Drop the consumed tokens once per chunk.
                buffer = buffer[start:]
                written_upto = max(written_upto - start, 0)

        # Flush the tail only if it holds tokens that were not written yet;
        # a tail made only of overlap would duplicate the previous segment.
        if written_upto < len(buffer):
            segment_count += 1
            _write_segment(outfile, all_segments, segment_count, buffer)

    # Write all segments to the JSON file
    json.dump(all_segments, jsonfile, ensure_ascii=False, indent=4)

# Unification pass: every document is cleaned, segmented and written to one
# combined text file and to one JSON list of {"id", "content"} records.


def _read_document(path):
    """Return the text of path, decoded as UTF-8 or, failing that, ISO-8859-1."""
    try:
        with open(path, "r", encoding="utf-8") as handle:
            return handle.read()
    except UnicodeDecodeError:
        # ISO-8859-1 maps every byte value, so this fallback cannot raise.
        with open(path, "r", encoding="ISO-8859-1") as handle:
            return handle.read()


# Records for the JSON output
documents = []

with open(output_txt_path, "w", encoding="utf-8") as txt_file:
    for file_path in document_files:
        # isfile() also rejects directories, which exists() would let through
        # to open(); skipped paths are reported instead of ignored silently.
        if not os.path.isfile(file_path):
            print(f"Skipping {file_path}: not a regular file")
            continue

        content = _read_document(file_path)
        doc_id = os.path.basename(file_path).replace("plain_", "").strip()

        # Cleaning and segmentation
        cleaned_content = clean_text(content)
        segmented_content = segment_text(cleaned_content)

        # Text output: a header line followed by the segmented document
        txt_file.write(f"\n--- Document {doc_id} ---\n")
        txt_file.write(segmented_content + "\n")

        # JSON output
        documents.append({"id": doc_id, "content": segmented_content})

# Save the JSON
with open(output_json_path, "w", encoding="utf-8") as json_file:
    json.dump(documents, json_file, ensure_ascii=False, indent=4)

print(f"Processo completato! File salvati: {output_txt_path}, {output_json_path}")



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Processo completato! File salvati: /content/unified_bandi.txt, /content/unified_bandi.json
