In [None]:
!pip install transformers
!pip install gensim
!pip install nltk

In [None]:
# imports
from transformers import pipeline
from gensim import corpora
from gensim.models import LdaModel
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import re
import unicodedata

# Download the NLTK resources (stopword lists and WordNet) used to preprocess text before training the LDA
nltk.download("stopwords")
nltk.download("wordnet")

In [None]:
# fragmentador de chat
def fragmentar_chat_cualquiera(chat):
    """
    Split a raw chat export into a list of "author: content" strings.

    Three layouts are recognised through regular expressions: Telegram,
    WhatsApp copied from the phone, and WhatsApp exported chats. The matches
    are concatenated in that order.

    Returns a tuple (message_list, caso_normal). caso_normal is True when at
    least one pattern matched. When nothing matched, the chat is assumed to be
    in some other, date-less format: it is split on newlines (empty lines are
    kept) and caso_normal is False.
    """
    # Telegram: "Name, [m/d/yy h:mm AM]" on one line, content on the next one
    patron_telegram = (
        r"([A-Za-z\s]+), \[\d{1,2}/\d{1,2}/\d{2} \d{1,2}:\d{2}\s?[APM]{2}\]\n(.+?)\n"
    )
    # WhatsApp copied from the phone: "[d/m hh:mm] Name: content"
    patron_whatsapp_celular = r"\[(\d{1,2}/\d{1,2} \d{2}:\d{2})\] (\w+): (.+)"
    # WhatsApp exported chat: "d/m/yyyy, h:m - Name: content"
    patron_whatsapp_exportado = (
        r"(\d{1,2}/\d{1,2}/\d{4}, \d{1,2}:\d{1,2}) - (\w+): (.+)"
    )

    def formatear(autor, contenido):
        return f"{autor.strip()}: {contenido.strip()}"

    mensajes = [
        formatear(autor, contenido)
        for autor, contenido in re.findall(patron_telegram, chat, re.DOTALL)
    ]
    mensajes.extend(
        formatear(autor, contenido)
        for _fecha, autor, contenido in re.findall(patron_whatsapp_celular, chat)
    )
    mensajes.extend(
        formatear(autor, contenido)
        for _fecha, autor, contenido in re.findall(patron_whatsapp_exportado, chat)
    )

    if mensajes:
        return mensajes, True

    # No known layout matched: fall back to one message per line.
    return chat.split("\n"), False


In [None]:


def normalizar_texto(texto):
    """
    Return `texto` in lowercase, with diacritics removed and with the
    punctuation characters , . ! ? stripped out.
    """
    # NFD splits an accented letter into its base letter plus a combining
    # mark (category "Mn"); dropping those marks leaves the plain letter.
    descompuesto = unicodedata.normalize("NFD", texto)
    sin_marcas = "".join(
        caracter
        for caracter in descompuesto
        if unicodedata.category(caracter) != "Mn"
    )
    return re.sub(r"[,\.!?]", "", sin_marcas.lower())


def model_lda(lista_mensajes, caso_normal, num_topics=6, random_state=None):
    """
    Train an LDA topic model over a list of chat messages.

    Parameters
    ----------
    lista_mensajes : list of str
        Messages to model. When `caso_normal` is True each one is expected to
        look like "AUTHOR: CONTENT" (no date) and only CONTENT is used.
    caso_normal : bool
        True when the messages carry an "AUTHOR:" prefix that must be removed;
        False when each message is already plain content.
    num_topics : int
        Number of topics requested from the LDA model.
    random_state : optional
        Forwarded to gensim's LdaModel so that runs can be made reproducible.
        The default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    The trained gensim LdaModel.

    Raises
    ------
    ValueError
        If no usable token is left after preprocessing (for example, an empty
        message list), since a model cannot be trained on an empty vocabulary.
    """
    # Tokens are compared after normalizar_texto strips accents, so the stop
    # words must also be available in their accent-less form.
    stop_words = set(stopwords.words("spanish"))
    stop_words |= {normalizar_texto(palabra) for palabra in stop_words}
    # NOTE(review): WordNetLemmatizer works on English WordNet data; its effect
    # on Spanish tokens is presumably limited - confirm it is still wanted.
    lemmatizer = WordNetLemmatizer()

    texts = []
    for msj in lista_mensajes:
        contenido = msj
        if caso_normal:
            # Keep only what follows the first "author:" prefix; a message
            # without that shape contributes no tokens.
            match = re.search(r"^.*?:\s*(.+)", msj)
            contenido = match.group(1) if match else ""

        contenido_normalizado = normalizar_texto(contenido)
        texts.append(
            [
                lemmatizer.lemmatize(palabra)
                for palabra in re.findall(r"\b\w+\b", contenido_normalizado)
                if palabra not in stop_words and len(palabra) > 2
            ]
        )

    # Build the vocabulary and train ONCE, after every message was tokenized.
    # (Previously this ran inside the loop, retraining the model per message.)
    dictionary = corpora.Dictionary(texts)
    if len(dictionary) == 0:
        raise ValueError(
            "model_lda: no hay palabras utilizables para entrenar el modelo LDA"
        )
    corpus = [dictionary.doc2bow(text) for text in texts]

    return LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=15,
        random_state=random_state,
    )


def message_list_per_topic(topics_list, message_list, umbral_relevancia=0.5):
    """
    Group the messages of a chat by the most relevant words of each topic.

    Parameters
    ----------
    topics_list : iterable of (n_tema, [(word, weight), ...])
        Topics with their weighted words.
    message_list : list of str
        The chat messages.
    umbral_relevancia : float
        Words of a topic are taken in decreasing weight order until their
        accumulated weight reaches this value (default 0.5).

    Returns
    -------
    list of (n_tema, relevant_words, relevant_messages)
        `relevant_messages` are the messages whose normalized text contains at
        least one relevant word as a substring. The whole message is searched,
        so an author prefix, if present, can also produce a match.

    The input topic lists are not modified.
    """
    # Normalize each message once instead of once per topic.
    mensajes_normalizados = [(msj, normalizar_texto(msj)) for msj in message_list]

    final_list = []
    for n_tema, lista_temas in topics_list:
        # sorted() builds a new list, leaving the caller's data untouched.
        temas_ordenados = sorted(lista_temas, key=lambda par: par[1], reverse=True)

        suma_porcentaje = 0.0
        temas_relevantes = []
        for tema, porcentaje in temas_ordenados:
            # Stop once the words already taken cover the requested weight.
            if suma_porcentaje >= umbral_relevancia:
                break
            suma_porcentaje += porcentaje
            temas_relevantes.append(tema)

        mensajes_relevantes = [
            msj
            for msj, normalizado in mensajes_normalizados
            if any(tema in normalizado for tema in temas_relevantes)
        ]
        final_list.append((n_tema, temas_relevantes, mensajes_relevantes))

    return final_list


def chat_to_filtered_per_topic(chat):
    """
    Run the whole pipeline on a chat string: split it into messages, train the
    LDA model and group the messages by topic.

    `chat` is the chat text itself (reading a file or a paste happens before
    calling this). This is meant to be the single entry point used from outside.

    Returns the list produced by message_list_per_topic.
    """
    mensajes, es_formato_conocido = fragmentar_chat_cualquiera(chat)

    # The ideal number of topics is not known yet; 6 is used for now.
    modelo = model_lda(mensajes, es_formato_conocido, num_topics=6)

    return message_list_per_topic(modelo.show_topics(formatted=False), mensajes)


In [None]:
# Sample chat in the Telegram layout ("Name, [m/d/yy h:mm AM]" followed by the
# message on the next line); it is the demo input for the cells below.
# NOTE(review): this looks like a real conversation with real names - consider
# anonymizing it before sharing the notebook.
chat = """
Tomas Martinez, [10/17/24 12:03 AM]
Habria que utilizar tecnicas no supervisadas

Tomas Martinez, [10/17/24 12:05 AM]
exactamente como el colab que estaba en el notion

Tomas Martinez, [10/17/24 12:05 AM]
ahi estuve viendo

Gaston Bonfils, [10/17/24 12:33 AM]
God

Gaston Bonfils, [10/17/24 12:33 AM]
Yo toy saliendo del fulbo

Gaston Bonfils, [10/17/24 12:34 AM]
Me atajaron el único tiro al arco 😔

Tomas Martinez, [10/17/24 12:40 AM]
no me la contes

Tomas Martinez, [10/17/24 12:40 AM]
ganaron??

Gaston Bonfils, [10/17/24 12:44 AM]
Perdimos depe

Gaston Bonfils, [10/17/24 12:44 AM]
Pero buen partido

Tomas Martinez, [10/17/24 12:46 AM]
Buenisimo
"""

In [None]:
# RESUMIR CON MODELO PEQUEÑO

# Lazily created summarization pipeline, shared by every summarize_chat call.
_CHAT_SUMMARIZER = None


def summarize_chat(chat):
    """
    Summarize a chat with the small BART conversation-summary model.

    Parameters
    ----------
    chat : str
        Text to summarize. Only its first 1024 characters are sent to the model.

    Returns
    -------
    list of dict
        The pipeline output, a list with one {"summary_text": ...} entry.
        For empty or whitespace-only input (or None) the model is not run and
        [{"summary_text": ""}] is returned, because there is nothing to
        summarize and the model would otherwise be forced to invent text.
    """
    global _CHAT_SUMMARIZER

    if not chat or not chat.strip():
        return [{"summary_text": ""}]

    # Loading the model is expensive; build the pipeline once and reuse it
    # (summarize_per_topic calls this function once per topic).
    if _CHAT_SUMMARIZER is None:
        _CHAT_SUMMARIZER = pipeline(
            "summarization",
            model="kabita-choudhary/finetuned-bart-for-conversation-summary",
        )

    # NOTE(review): the input cap is measured in characters, not tokens.
    return _CHAT_SUMMARIZER(
        chat[:1024], max_length=1024, min_length=30, do_sample=False
    )


def summarize_per_topic(chat):
    """
    Summarize a chat topic by topic.

    The chat is grouped by topic with chat_to_filtered_per_topic and the
    messages of each topic, joined with newlines, are passed to summarize_chat.

    Returns a list of (n_topic, topics, summary, joined_messages) tuples.
    """
    resultados = []
    for n_topic, topics, message_list in chat_to_filtered_per_topic(chat):
        texto_del_tema = "\n".join(message_list)
        resultados.append(
            (n_topic, topics, summarize_chat(texto_del_tema), texto_del_tema)
        )
    return resultados


def pretty_print_summary_list(summary_list, show_messages=False):
    """
    Print the per-topic summaries in a readable layout.

    `summary_list` holds (n_tema, temas, resumen, messages) tuples. When
    `show_messages` is True the messages of each topic are printed after its
    summary.
    """
    bloques = []
    for n_tema, temas, resumen, messages in summary_list:
        encabezado = f"""
=============
Tema #{n_tema}
Temas relevantes = {temas}
Resumen:
{resumen}

"""
        cuerpo = messages if show_messages else ""
        bloques.append(encabezado + cuerpo + "\n=============\n")
    print("".join(bloques))


In [None]:
# Summarize the whole sample chat in a single pass.
summary = summarize_chat(chat)

# Summarize the same chat once per topic found by the LDA.
summaries = summarize_per_topic(chat)

print(summary)

pretty_print_summary_list(summaries, show_messages=True)

In [None]:
# Separate experiment: summaries with the FLAN-T5 model (whole chat and per topic)

In [133]:
# Lazily created FLAN-T5 pipeline, shared by every summarize_with_flan call.
_FLAN_SUMMARIZER = None


def _get_flan_summarizer():
    """Build the FLAN-T5 summarization pipeline on first use and reuse it afterwards."""
    global _FLAN_SUMMARIZER
    if _FLAN_SUMMARIZER is None:
        # torch is imported here because the notebook's import cell does not
        # bring it in and it is only needed to configure this model.
        import torch

        # Use the GPU when there is one; otherwise fall back to the CPU
        # instead of failing on the hard-coded "cuda" device.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        _FLAN_SUMMARIZER = pipeline(
            "summarization",
            "jordiclive/flan-t5-3b-summarizer",
            torch_dtype=torch.bfloat16,
            device=device,
        )
    return _FLAN_SUMMARIZER


def summarize_with_flan(chat):
    """
    Summarize a conversation with the FLAN-T5 3B summarizer.

    Parameters
    ----------
    chat : str
        Conversation text. It is sent to the model after an instruction asking
        for a brief third-person summary.

    Returns
    -------
    list of dict
        The pipeline output, a list with one {"summary_text": ...} entry.
        For empty or whitespace-only input (or None) the model is not run and
        [{"summary_text": ""}] is returned.
    """
    if not chat or not chat.strip():
        return [{"summary_text": ""}]

    # The 3B model is expensive to load, so the pipeline is created only once
    # (summarize_per_topic_flan calls this function once per topic).
    summarizer = _get_flan_summarizer()

    raw_document = chat
    prompt = "Briefly summarize in third person the following conversation:"
    results = summarizer(
        f"{prompt} {raw_document}",
        num_beams=5,
        min_length=5,
        no_repeat_ngram_size=3,
        truncation=True,
        max_length=512,
    )
    return results

In [134]:
# Summarize the whole sample chat with FLAN-T5 and show the raw pipeline output.
print(summarize_with_flan(chat))

Your max_length is set to 512, but your input_length is only 299. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=149)


[{'summary_text': 'Tomas Martinez, Gaston Bonfils and Yo toy saliendo del fulbo.'}]


In [135]:
def summarize_per_topic_flan(chat):
    """
    Summarize a chat topic by topic using the FLAN-T5 summarizer.

    The chat is grouped by topic with chat_to_filtered_per_topic and the
    messages of each topic, joined with newlines, are passed to
    summarize_with_flan.

    Returns a list of (n_topic, topics, summary, joined_messages) tuples.
    """

    def resumir_tema(n_topic, topics, message_list):
        texto_del_tema = "\n".join(message_list)
        return (n_topic, topics, summarize_with_flan(texto_del_tema), texto_del_tema)

    return [
        resumir_tema(n_topic, topics, message_list)
        for n_topic, topics, message_list in chat_to_filtered_per_topic(chat)
    ]

In [136]:
# Per-topic FLAN-T5 summaries of the sample chat, printed together with the
# messages that were assigned to each topic.
listitia = summarize_per_topic_flan(chat)
pretty_print_summary_list(listitia, show_messages=True)

Your max_length is set to 512, but your input_length is only 89. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=44)
Your max_length is set to 512, but your input_length is only 52. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=26)
Your max_length is set to 512, but your input_length is only 45. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)
Your max_length is set to 512, but your input_length is only 50. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=25)
Your


Tema #0
Temas relevantes = ['buenisimo', 'god', 'contes', 'ganaron', 'perdimos', 'viendo', 'ahi', 'partido', 'buen', 'depe']
Resumen:
[{'summary_text': 'Tomas Martinez and Gaston Bonfils are having a good time.'}]

Tomas Martinez: ahi estuve viendo
Gaston Bonfils: God
Tomas Martinez: no me la contes
Tomas Martinez: ganaron??
Gaston Bonfils: Perdimos depe
Gaston Bonfils: Pero buen partido
Tomas Martinez: Buenisimo

Tema #1
Temas relevantes = ['colab', 'notion', 'exactamente', 'viendo', 'ahi']
Resumen:
[{'summary_text': 'Tomas Martinez is talking about the colab that was in the notion.'}]

Tomas Martinez: exactamente como el colab que estaba en el notion
Tomas Martinez: ahi estuve viendo

Tema #2
Temas relevantes = ['habria', 'tecnicas', 'utilizar', 'supervisadas', 'partido']
Resumen:
[{'summary_text': 'Tomas Martinez and Gaston Bonfils are talking about a game.'}]

Tomas Martinez: Habria que utilizar tecnicas no supervisadas
Gaston Bonfils: Pero buen partido

Tema #3
Temas relevantes =

In [137]:
import os

def pretty_string_summary_list(summary_list, show_messages=False):
    """
    Build (and return, without printing) a readable text with the per-topic
    summaries.

    `summary_list` holds (n_tema, temas, resumen, messages) tuples. When
    `show_messages` is True the messages of each topic are included after its
    summary.
    """
    piezas = []
    for n_tema, temas, resumen, messages in summary_list:
        piezas.append(f"""
=============
Tema #{n_tema}
Temas relevantes = {temas}
Resumen:
{resumen}

""")
        if show_messages:
            piezas.append(messages)
        piezas.append("\n=============\n")
    return "".join(piezas)

def process_file(file_path):
    """
    Read a chat stored in a text file.

    Parameters
    ----------
    file_path : str
        Path of the file to read. It is decoded as UTF-8 so that accents and
        emoji are read the same way on every platform.

    Returns
    -------
    str or None
        The file contents, or None when the file does not exist or cannot be
        read or decoded (a message is printed in those cases).
    """
    try:
        with open(file_path, encoding="utf-8") as file:
            return file.read()
    except FileNotFoundError:
        print("El archivo no se encontró")
    except (OSError, UnicodeDecodeError) as error:
        # Report other read problems as what they are instead of claiming
        # the file is missing.
        print(f"No se pudo leer el archivo {file_path}: {error}")
    return None


def test_particular(archivo):
    """
    Summarize one example chat and write the results to a file.

    Given a file name (not a path) inside the "ejemplos" folder, this:
    - summarizes the whole chat at once,
    - summarizes it topic by topic,
    and writes both results to "ejemplos_resultados/res_<archivo>", creating
    that folder when it does not exist yet.

    If the chat file cannot be read, nothing is summarized or written.
    """
    chat = process_file(os.path.join("ejemplos", archivo))
    if chat is None:
        # process_file already reported the problem; there is nothing to
        # summarize, and passing None on would summarize the text "None".
        return

    resumen_crudo = summarize_with_flan(chat)
    print(resumen_crudo)

    lista_resumenes = summarize_per_topic_flan(chat)
    resumenes_por_tema = pretty_string_summary_list(
        lista_resumenes, show_messages=True
    )

    final_text = (
        "Resumen Crudo:\n"
        + str(resumen_crudo)
        + "\n\n"
        + "Resumenes por Tema:\n"
        + resumenes_por_tema
    )

    carpeta_salida = "ejemplos_resultados"
    os.makedirs(carpeta_salida, exist_ok=True)
    # UTF-8 is set explicitly because the chats contain accents and emoji.
    with open(
        os.path.join(carpeta_salida, f"res_{archivo}"), "w", encoding="utf-8"
    ) as output_file:
        output_file.write(final_text)


def full_tests():
    """Run test_particular on every ".txt" file found directly inside "ejemplos"."""
    archivos_txt = (
        nombre for nombre in os.listdir("ejemplos") if nombre.endswith(".txt")
    )
    for nombre in archivos_txt:
        test_particular(nombre)


In [None]:
# Run both summarization modes over every .txt chat in the "ejemplos" folder.
full_tests()

Your max_length is set to 512, but your input_length is only 385. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=192)


[{'summary_text': 'Mica, Gaston Bonfils, Benjamin Lozano and Gonzalo Canavesio are having a conversation.'}]


Your max_length is set to 512, but your input_length is only 82. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=41)
Your max_length is set to 512, but your input_length is only 139. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=69)
Your max_length is set to 512, but your input_length is only 67. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=33)
Your max_length is set to 512, but your input_length is only 62. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=31)
You

[{'summary_text': 'Tomas Martinez, Gaston Bonfils and Yo toy saliendo del fulbo.'}]


Your max_length is set to 512, but your input_length is only 57. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)
Your max_length is set to 512, but your input_length is only 80. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)
Your max_length is set to 512, but your input_length is only 104. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)
Your max_length is set to 512, but your input_length is only 104. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)
Yo

[{'summary_text': 'Gaston Bonfils and Gonzalo Canavesio are having a conversation.'}]


Your max_length is set to 512, but your input_length is only 113. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)
Your max_length is set to 512, but your input_length is only 144. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=72)
Your max_length is set to 512, but your input_length is only 171. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=85)
Your max_length is set to 512, but your input_length is only 169. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=84)


[{'summary_text': 'Gaston Bonfils and Tomas Martinez are having a conversation.'}]


Your max_length is set to 512, but your input_length is only 483. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=241)
Your max_length is set to 512, but your input_length is only 347. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=173)
Your max_length is set to 512, but your input_length is only 421. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=210)
Your max_length is set to 512, but your input_length is only 291. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=1

[{'summary_text': "Benjamin Lozano and Gaston Bonfils are having a chat about Benjamin's resume. Benjamin finds Gaston's bicho jijoante and finds it relevant."}]


Your max_length is set to 512, but your input_length is only 107. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=53)
Your max_length is set to 512, but your input_length is only 73. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=36)
Your max_length is set to 512, but your input_length is only 138. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=69)
Your max_length is set to 512, but your input_length is only 83. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=41)
Yo

[{'summary_text': 'Tomi, Mendo and Tomi are meeting in a restaurant. They are going to buy some food.'}]


Your max_length is set to 512, but your input_length is only 66. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=33)
Your max_length is set to 512, but your input_length is only 157. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=78)
Your max_length is set to 512, but your input_length is only 117. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
Your max_length is set to 512, but your input_length is only 162. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=81)
Y

[{'summary_text': 'Gaston Bonfils, [7/2/24 5:41 PM] acabo de ver un post que explica las matematicas del monopoly usando CADENAS DE MARKOV Gaston bonfils. Gonzalo Canavesio paga redirecciona a un post de reddit en el reddito de un video de youtube.'}]


Your max_length is set to 512, but your input_length is only 192. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=96)
Your max_length is set to 512, but your input_length is only 138. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=69)
Your max_length is set to 512, but your input_length is only 108. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=54)
Your max_length is set to 512, but your input_length is only 91. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)
Y