# Paso 1 - Preparar el ambiente instalando todo lo necesario

In [8]:
import pandas as pd
import numpy as np
import re
import string
import plotly.express as px
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

try:
  from bertopic import BERTopic
except:
  !pip install bertopic
  from bertopic import BERTopic

try:
  import es_core_news_sm
except:
  !python -m spacy download es_core_news_sm
  import es_core_news_sm

try:
  import mpld3
except:
  !pip install mpld3
  import mpld3

Collecting mpld3
  Downloading mpld3-0.5.9-py3-none-any.whl (201 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/201.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/201.2 kB[0m [31m917.8 kB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━[0m [32m143.4/201.2 kB[0m [31m2.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.2/201.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: mpld3
Successfully installed mpld3-0.5.9


# Paso 2 - Descarga de los files con noticias

In [None]:
def download(months=None, url=None):
    """Download the monthly news files and return them as one DataFrame.

    Each month ``m`` is read from ``url + m + '.csv.gz?raw=true'`` as a
    gzip-compressed CSV.

    Args:
        months: list of month identifiers (e.g. ``'202306'``). Months whose
            file cannot be read are removed from this list in place, as the
            original implementation did.
        url: base location the month identifier is appended to.

    Returns:
        All downloaded rows concatenated with a fresh index, excluding the
        rows whose ``category`` is ``'mundo'``.

    Raises:
        ValueError: if ``months`` or ``url`` is missing, or if no file could
            be downloaded at all.
    """
    if months is None or url is None:
        raise ValueError("download() requires both 'months' and 'url'")

    files = {}
    # Iterate over a snapshot: failed months are removed from `months` below,
    # and removing from the list being iterated would skip the next month.
    for m in list(months):
        data_url = url + m + '.csv.gz?raw=true'
        try:
            files[m] = pd.read_csv(data_url, compression='gzip')
        except Exception as exc:
            # Best effort: a missing or unreadable month must not abort the
            # remaining downloads, but the failure is reported.
            print('skipping', data_url, '-', exc)
            months.remove(m)
        else:
            print('downloading', data_url)

    if not files:
        raise ValueError('no news files could be downloaded from ' + url)

    df = pd.concat(files.values(), ignore_index=True)
    # Discard a category that is oversized for one of the newspapers
    df = df[df.category != 'mundo']
    return df

# Months to download, as 'YYYYMM' identifiers; the earlier 2023 months are
# only listed in the trailing comment and are not requested.
months = ['202306','202307','202308'] # '202301','202302','202303','202304','202305'
# Base location of the monthly files; download() appends '<month>.csv.gz?raw=true'.
url = 'https://github.com/fermasia/news-base/blob/main/files/'
news_df = download( months = months , url = url )

downloading https://github.com/fermasia/news-base/blob/main/files/


# Paso 3 - Preparación del Dataset y Visualizaciones iniciales

In [None]:
def prepare_dataset(df, keep_months=None):
    """Normalise dates, add a ``yyyymm`` column and keep the requested months.

    Args:
        df: news DataFrame with at least ``date`` and ``source`` columns.
            It is not modified; a new frame is returned.
        keep_months: iterable of ``'YYYYMM'`` identifiers to keep. When
            omitted, the notebook-level ``months`` list is used.

    Returns:
        Rows dated after 2021-11-30 (Buenos Aires time) that belong to the
        requested months, ordered by ``yyyymm`` and ``source``. Rows whose
        date cannot be parsed are dropped.
    """
    print('Preparing dataset')
    if keep_months is None:
        # Backward-compatible default: the list defined at notebook level.
        keep_months = months

    timezone = 'America/Argentina/Buenos_Aires'
    # Force datetime format; unparseable values become NaT
    local_dates = pd.to_datetime(df['date'], errors='coerce', utc=True).dt.tz_convert(timezone)
    # assign() builds a new frame, so the caller's `date` column stays untouched
    df = df.assign(date=local_dates)

    # Discard records before 2021/12/1 (NaT never compares greater, so it is dropped too).
    # The explicit copy avoids assigning the new column on a slice.
    df = df[df['date'] > pd.Timestamp('2021-11-30', tz=timezone)].copy()

    # Create yyyymm field
    df['yyyymm'] = df['date'].dt.year.astype(str) + '-' + df['date'].dt.month.astype(str).str.zfill(2)

    # Order rows by yyyymm and source, then reset the index
    df = df.sort_values(by=['yyyymm', 'source'], ascending=True).reset_index(drop=True)

    # Keep only the requested months ('YYYYMM' -> 'YYYY-MM')
    wanted = [m[:4] + '-' + str(int(m[-2:])).zfill(2) for m in keep_months]
    return df[df.yyyymm.isin(wanted)]

# Normalise dates and keep only the requested months of the downloaded news.
prepared_df = prepare_dataset(news_df)

In [None]:
# Articles per (month, newspaper), restricted to the four main sections.
temp = (
    prepared_df.loc[prepared_df.category.isin(['politica','economia','sociedad','deportes'])]
    .groupby(['yyyymm','source'])['text']
    .count()
    .reset_index()
)

def create_pivot_table(df, rows, columns, values):
    """Pivot ``df`` into a ``rows`` x ``columns`` table of ``values``.

    Uses ``DataFrame.pivot_table`` with its default aggregation.
    """
    return df.pivot_table(values=values, index=rows, columns=columns)

temp

# Months as rows, newspapers as columns; missing combinations shown as 0.
create_pivot_table(temp, 'yyyymm' , 'source', 'text').fillna(0).astype(int)

In [None]:
# (rows, columns) of the subset whose category is 'politica'.
prepared_df[prepared_df.category == 'politica'].shape

In [None]:
# Articles per (month, section), restricted to the four main sections.
temp = (
    prepared_df.loc[prepared_df.category.isin(['politica','economia','sociedad','deportes'])]
    .groupby(['yyyymm','category'])['text']
    .count()
    .reset_index()
)

def create_pivot_table(df, rows, columns, values):
    """Pivot ``df`` into a ``rows`` x ``columns`` table of ``values``.

    Uses ``DataFrame.pivot_table`` with its default aggregation.
    """
    return df.pivot_table(values=values, index=rows, columns=columns)

temp

# Months as rows, sections as columns; missing combinations shown as 0.
create_pivot_table(temp, 'yyyymm' , 'category', 'text').fillna(0).astype(int)

In [None]:
# Articles per (section, newspaper) over the whole prepared dataset.
temp = (
    prepared_df
    .groupby(['category','source'])['text']
    .count()
    .reset_index()
)

def create_pivot_table(df, rows, columns, values):
    """Pivot ``df`` into a ``rows`` x ``columns`` table of ``values``.

    Uses ``DataFrame.pivot_table`` with its default aggregation.
    """
    return df.pivot_table(values=values, index=rows, columns=columns)

temp

# Sections as rows, newspapers as columns; missing combinations shown as 0.
create_pivot_table(temp, 'category' , 'source', 'text').fillna(0).astype(int)

In [None]:
def plot_summary(df):
    """Show one grouped bar chart per news section.

    For every distinct value of ``df.category`` the number of articles per
    month (``yyyymm``) and newspaper (``source``) is counted and drawn as an
    unstacked bar chart. Axis labels, title and legend are in Spanish.

    Args:
        df: DataFrame with ``category``, ``yyyymm`` and ``source`` columns.
    """
    # NaN categories are skipped: `df.category == NaN` never matches, so they
    # would only yield an empty table.
    for sect in df.category.dropna().unique():
        # Count the articles for each (yyyymm, source) combination
        df_grouped = df[df.category == sect].groupby(['yyyymm', 'source']).size().unstack().fillna(0)
        # Unstacked bar plot, one colour per newspaper
        df_grouped.plot(kind='bar', stacked=False, figsize=(10, 3), color=['#EB172B', '#F68E1E', '#006998', '#32937f'])
        plt.xlabel('Año-Mes')
        plt.ylabel('N# Artículos')
        plt.title('Noticias por Mes y Periódico - Sección: ' + str(sect).capitalize())
        # One legend call only: a second plt.legend() replaces the first one,
        # which previously discarded the legend title.
        plt.legend(title='Periódico', bbox_to_anchor=(1.02, 1.0), loc='upper left')
        plt.xticks(rotation=0)  # Keep x-axis labels horizontal
        plt.tight_layout()
        plt.show()

# One chart per section found in the prepared dataset.
plot_summary(prepared_df)

In [None]:
# Section to model; an empty string keeps every section.
section = 'politica'
# Which text to use as the document: 'title', 'text', or anything else
# (e.g. 'all') for title and text concatenated.
content = 'title' # title, text, all

def create_corpus_df(df):
    """Build the modelling corpus and the metadata columns that go with it.

    Reads the notebook-level ``section`` and ``content`` settings.

    Args:
        df: prepared news DataFrame with ``category``, ``yyyymm``, ``source``,
            ``link``, ``date``, ``title`` and ``text`` columns. Not modified.

    Returns:
        A 4-tuple ``(corpus_df, sources_df, dates_df, links_df)``:
        ``corpus_df`` is a one-column (``text``, as ``str``) DataFrame;
        ``sources_df`` holds ``'<yyyymm>_<source>'`` labels; ``dates_df`` and
        ``links_df`` are the matching ``date`` and ``link`` columns. All four
        share the same 0..n-1 index.
    """
    print('Creating corpus')
    # Filter by section only when one was configured
    selected = df if section == '' else df[df.category == section]
    # reset_index returns a new frame in both cases: the caller's data is
    # left untouched and every output gets the same positional index.
    corpus_df = selected.reset_index(drop=True)

    # Concatenate month-year and newspaper
    corpus_df['source'] = corpus_df['yyyymm'] + '_' + corpus_df['source']

    # Keep link, source and date so they can be joined back after processing
    links_df = corpus_df['link']
    sources_df = corpus_df['source']
    dates_df = corpus_df['date']

    # Pick title, body, or both joined by a space
    if content == 'title' or content == 'text':
        texts = corpus_df[content]
    else:
        texts = corpus_df.title + ' ' + corpus_df.text

    # Single 'text' column, guaranteed to be str
    corpus_df = texts.astype(str).to_frame(name='text')

    return corpus_df, sources_df, dates_df, links_df

# Corpus text plus the aligned source, date and link columns.
corpus_df, sources_df, dates_df, links_df = create_corpus_df(prepared_df)

In [None]:
def solve_capital_middle(sentence):
    """Separate sentences that were glued together without a space.

    A word such as ``'actoLa'`` is split into ``'acto.'`` and ``'La'``. The
    split happens at the first uppercase character that directly follows a
    lowercase one, looking from the second to the second-to-last character.
    Requiring a lowercase predecessor keeps acronyms (``'YPF'``) and
    quote-prefixed words intact, which a plain "uppercase in the middle"
    test would break apart.

    Args:
        sentence: text to repair.

    Returns:
        The repaired text, with words joined by single spaces (runs of
        whitespace in the input are collapsed).
    """
    fixed_words = []
    for word in sentence.split():
        split_at = next(
            (i for i in range(1, len(word) - 1)
             if word[i].isupper() and word[i - 1].islower()),
            None,
        )
        if split_at is None:
            fixed_words.append(word)
        else:
            # Only the first boundary is split, as before
            fixed_words.extend([word[:split_at] + '.', word[split_at:]])

    return ' '.join(fixed_words)

def clean_text(text):
    """Return ``text`` with HTML tags and selected symbols turned into spaces.

    Every ``<...>`` tag becomes a single space; afterwards each of the
    characters ``@ # ^ & * ( ) ! " ' < > / - _`` is replaced by a space.
    """
    without_tags = re.sub(r'<[^>]+>', ' ', text)

    # Characters to blank out; extend this string if more are needed
    symbols = "@#^&*()!\"'<>/-_"
    blank_out = str.maketrans(symbols, ' ' * len(symbols))

    return without_tags.translate(blank_out)

def clean_corpus_df(df):
    """Clean the ``text`` column of ``df`` in place and return ``df``.

    Each document first has glued sentences separated
    (``solve_capital_middle``) and is then stripped of HTML tags and symbols
    (``clean_text``).
    """
    print('Cleaning Corpus')
    df['text'] = df['text'].map(lambda doc: clean_text(solve_capital_middle(doc)))
    return df

# Note: clean_corpus_df modifies corpus_df in place and returns the same object.
cleaned_df = clean_corpus_df(corpus_df)

In [None]:
def build_final_dataset_df(list_df):
    """Join the given DataFrames/Series side by side, aligned on their index."""
    print('Building Final Dataset')
    combined = pd.concat(list_df, axis=1)
    return combined

# Cleaned text next to its source label, date and link.
final_df = build_final_dataset_df( [cleaned_df, sources_df , dates_df, links_df] )

# Paso 4 - Modelado


In [None]:
def make_embeddings(df_field):
    """Encode documents with a multilingual sentence-transformers model.

    Args:
        df_field: the documents, as a single string or any iterable of
            strings (for example a DataFrame column).

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for the input, using
        the ``distiluse-base-multilingual-cased-v1`` model.
    """
    from sentence_transformers import SentenceTransformer

    sentence_model = SentenceTransformer("distiluse-base-multilingual-cased-v1")
    # encode() is documented for a string or a list of strings. Handing it a
    # plain list avoids label-based lookups on a pandas Series whose index is
    # not 0..n-1.
    sentences = df_field if isinstance(df_field, str) else list(df_field)
    return sentence_model.encode(sentences, show_progress_bar=True)

# Embeddings are computed once here and reused by the topic model and the document map.
embeddings = make_embeddings(final_df['text'])

In [None]:
#!python -m spacy download es_core_news_sm
import es_core_news_sm
from bertopic.representation import PartOfSpeech
from bertopic.representation import MaximalMarginalRelevance

# Representation models handed to BERTopic as a list: MMR first, then a
# part-of-speech model based on the Spanish spaCy pipeline.
parts_of_speech = PartOfSpeech("es_core_news_sm")
mmr = MaximalMarginalRelevance(diversity=0.3)
representation_models = [mmr, parts_of_speech]

from umap import UMAP
#umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.5, metric='cosine', random_state=912)
# Library-default UMAP settings with a fixed seed (presumably for reproducible runs).
umap_model = UMAP(random_state=912)
from hdbscan import HDBSCAN
# The custom HDBSCAN configuration below is disabled; no hdbscan_model is built in this cell.
#hdbscan_model = HDBSCAN(min_cluster_size=150, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
from sklearn.feature_extraction.text import CountVectorizer
# Spanish stop-word list, one word per line, fetched over the network.
stopwords_es = pd.read_csv('https://raw.githubusercontent.com/jbagnato/machine-learning/master/nlp/spanish.txt',header=None)[0].to_list()
# Unigrams and bigrams that occur in at least two documents, stop words removed.
vectorizer_model = CountVectorizer(stop_words=stopwords_es, min_df=2, ngram_range=(1, 2))

In [None]:
# Build the topic model from the components configured above.
# calculate_probabilities=True makes fit_transform return `probs`, which the
# plotting helpers later read to pick each document's most probable topic.
topic_model = BERTopic(language="multilingual",
                      representation_model = representation_models,
                      umap_model=umap_model,
                      vectorizer_model=vectorizer_model,
                      calculate_probabilities=True,
                      nr_topics= "auto")
# Fit on the cleaned documents, reusing the precomputed embeddings.
topics, probs = topic_model.fit_transform(final_df.text.to_list(),embeddings)


# Paso 5 - Visualización de los resultados

In [None]:
# Bar chart of the top 30 topics, saved as a standalone HTML file and shown inline.
fig = topic_model.visualize_barchart(top_n_topics=30)
fig.write_html("top30_topics.html")
fig.show()

In [None]:
def plot_bars(df,q,figsize,save=False):
    """Plot, for the ``q`` most frequent topics, their weight per month and newspaper.

    Reads the notebook-level ``section``, ``topic_model`` and ``probs``.
    Each document is assigned the topic with the highest probability in
    ``probs``. The plotted value is the number of documents of a topic for a
    (newspaper, month) pair, expressed relative to the most frequent topic of
    that same pair (which therefore scores 100).

    Args:
        df: prepared news DataFrame (``category``, ``source``, ``yyyymm``, ``text``).
        q: number of topics to plot, most frequent first.
        figsize: matplotlib figure size of each chart.
        save: when true, each chart is also written to
            ``plo_<section>_<position>.png``.
    """
    tabla = df[df.category == section].reset_index(drop=True)

    # Labels look like '<num>_<word>_<word>...'. The first label is skipped
    # (NOTE(review): presumably the outlier topic -1 - confirm).
    label_parts = pd.DataFrame(topic_model.generate_topic_labels()[1:])[0].str.split('_')
    labels = pd.DataFrame()
    labels['num'] = label_parts.apply(lambda x: x[0]).astype(str)
    labels['tema'] = label_parts.apply(lambda x: "-".join(x[1:]))

    # Most probable topic per document, joined to its readable label
    temas = pd.DataFrame(pd.DataFrame(probs).idxmax(axis=1)).rename({0:'num'},axis=1)
    temas['num'] = temas['num'].astype(str)
    temas = pd.merge(temas,labels,on='num',how='left')['tema']
    # Documents and topics are matched by position (both indexes are 0..n-1)
    tabla = pd.merge(tabla,temas,left_index=True, right_index=True)

    agrupado = tabla.groupby(['source','tema','yyyymm']).count()['text'].reset_index().rename({'text':'porc'},axis=1)
    # Scale so the largest topic of each (source, yyyymm) pair equals 100
    agrupado['porc'] /= agrupado.groupby(['source','yyyymm'])['porc'].transform('max').div(100)
    agrupado['porc'] = agrupado.porc.round(2)

    top = tabla.groupby(['tema']).count()['source'].reset_index().sort_values(by='source',ascending=False).head(q)['tema'].to_list()

    colors = ['#EB172B', '#F68E1E', '#006998', '#32937f']
    for pos, tema in enumerate(top):
        final = agrupado[agrupado.tema == tema ].drop(columns=['tema'])
        # pivot() only accepts keyword arguments from pandas 2.0 on
        ax = final.pivot(index='yyyymm', columns='source', values='porc').plot.bar(
            figsize=figsize, title=tema, alpha=0.9, rot=0, color=colors, xlabel='')
        if save:
            name = 'plo_'+ section + '_' + str(pos) + '.png'
            ax.get_figure().savefig(name)
            print(name,'stored')

import pandas as pd
from matplotlib import pyplot as plt

def plot_bars2(df, q, section, figsize, save=False):
    """Plot the ``q`` most frequent topics of ``section``, one chart per topic.

    Reads the notebook-level ``topic_model`` and ``probs``. Each document is
    assigned the topic with the highest probability in ``probs``. The plotted
    value is the number of documents of a topic for a (newspaper, month)
    pair, expressed relative to the most frequent topic of that same pair
    (which therefore scores 100). The legend is placed outside the axes.

    Args:
        df: prepared news DataFrame (``category``, ``source``, ``yyyymm``, ``text``).
        q: number of topics to plot, most frequent first.
        section: category to keep.
        figsize: matplotlib figure size of each chart.
        save: when true, bars are drawn with alpha 0.7 (0.9 otherwise) and
            each chart is written to ``plo_<section>_<position>.png``.
    """
    tabla = df[df.category == section].reset_index(drop=True)

    # Labels look like '<num>_<word>_<word>...'. The first label is skipped
    # (NOTE(review): presumably the outlier topic -1 - confirm).
    label_parts = pd.DataFrame(topic_model.generate_topic_labels()[1:])[0].str.split('_')
    labels = pd.DataFrame()
    labels['num'] = label_parts.apply(lambda x: x[0]).astype(str)
    labels['tema'] = label_parts.apply(lambda x: "-".join(x[1:]))

    # Most probable topic per document, joined to its readable label
    temas = pd.DataFrame(pd.DataFrame(probs).idxmax(axis=1)).rename({0: 'num'}, axis=1)
    temas['num'] = temas['num'].astype(str)
    temas = pd.merge(temas, labels, on='num', how='left')['tema']
    # Documents and topics are matched by position (both indexes are 0..n-1)
    tabla = pd.merge(tabla, temas, left_index=True, right_index=True)

    agrupado = tabla.groupby(['source', 'tema', 'yyyymm']).count()['text'].reset_index().rename({'text': 'porc'}, axis=1)
    # Scale so the largest topic of each (source, yyyymm) pair equals 100
    agrupado['porc'] /= agrupado.groupby(['source', 'yyyymm'])['porc'].transform('max').div(100)
    agrupado['porc'] = agrupado.porc.round(2)

    top = tabla.groupby(['tema']).count()['source'].reset_index().sort_values(by='source', ascending=False).head(q)['tema'].to_list()

    colors = ['#EB172B', '#F68E1E', '#006998', '#32937f']
    # Passing alpha to plot.bar already applies it to every bar
    alpha = 0.7 if save else 0.9
    for pos, tema in enumerate(top):
        final = agrupado[agrupado.tema == tema].drop(columns=['tema'])
        # pivot() only accepts keyword arguments from pandas 2.0 on
        final.pivot(index='yyyymm', columns='source', values='porc').plot.bar(
            figsize=figsize,
            title=tema,
            alpha=alpha,
            rot=0,
            color=colors,
            xlabel=''
        )
        plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))  # Place legend outside the plot space
        if save:
            name = 'plo_' + section + '_' + str(pos) + '.png'
            plt.savefig(name, bbox_inches='tight')  # 'tight' keeps the outside legend in the image
            print(name, 'stored')
            plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))  # Place legend outside the plot space

import matplotlib.pyplot as plt
import pandas as pd
import mpld3

def plot_bars3(df, q, section, figsize, save=False):
    """Plot the ``q`` most frequent topics of ``section`` on a two-column grid.

    Reads the notebook-level ``topic_model`` and ``probs``. Each document is
    assigned the topic with the highest probability in ``probs``. The plotted
    value is the number of documents of a topic for a (newspaper, month)
    pair, expressed relative to the most frequent topic of that same pair
    (which therefore scores 100).

    Args:
        df: prepared news DataFrame (``category``, ``source``, ``yyyymm``, ``text``).
        q: number of topics to plot, most frequent first.
        section: category to keep.
        figsize: size of one subplot; the figure is two subplots wide.
        save: when true, bars use alpha 0.7 (0.9 otherwise), the figure is
            written to ``plo_<section>_<position>.png`` after each topic is
            drawn, and the finished grid is written to ``plots.html``.
            When false, the figure is shown instead.

    Returns:
        None. Nothing is drawn when there is no topic to plot.
    """
    tabla = df[df.category == section].reset_index(drop=True)

    # Labels look like '<num>_<word>_<word>...'. The first label is skipped
    # (NOTE(review): presumably the outlier topic -1 - confirm).
    label_parts = pd.DataFrame(topic_model.generate_topic_labels()[1:])[0].str.split('_')
    labels = pd.DataFrame()
    labels['num'] = label_parts.apply(lambda x: x[0]).astype(str)
    labels['tema'] = label_parts.apply(lambda x: "-".join(x[1:]))

    # Most probable topic per document, joined to its readable label
    temas = pd.DataFrame(pd.DataFrame(probs).idxmax(axis=1)).rename({0: 'num'}, axis=1)
    temas['num'] = temas['num'].astype(str)
    temas = pd.merge(temas, labels, on='num', how='left')['tema']
    # Documents and topics are matched by position (both indexes are 0..n-1)
    tabla = pd.merge(tabla, temas, left_index=True, right_index=True)

    agrupado = tabla.groupby(['source', 'tema', 'yyyymm']).count()['text'].reset_index().rename({'text': 'porc'}, axis=1)
    # Scale so the largest topic of each (source, yyyymm) pair equals 100
    agrupado['porc'] /= agrupado.groupby(['source', 'yyyymm'])['porc'].transform('max').div(100)
    agrupado['porc'] = agrupado.porc.round(2)

    top = tabla.groupby(['tema']).count()['source'].reset_index().sort_values(by='source', ascending=False).head(q)['tema'].to_list()

    if not top:
        # A grid with zero rows cannot be created
        return

    rows = (len(top) + 1) // 2  # Two charts per row
    # squeeze=False always yields a 2-D array of axes, even for a single row
    fig, axes = plt.subplots(rows, 2, figsize=(figsize[0]*2, figsize[1]*rows), squeeze=False)
    grid = axes.ravel()

    colors = ['#EB172B', '#F68E1E', '#006998', '#32937f']
    # Passing alpha to plot.bar already applies it to every bar
    alpha = 0.7 if save else 0.9

    for pos, tema in enumerate(top):
        final = agrupado[agrupado.tema == tema].drop(columns=['tema'])
        current_ax = grid[pos]
        # pivot() only accepts keyword arguments from pandas 2.0 on
        final.pivot(index='yyyymm', columns='source', values='porc').plot.bar(
            ax=current_ax,
            alpha=alpha,
            rot=0,
            color=colors,
            legend=False,
            xlabel='',
            title=tema
        )
        current_ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))  # Place legend outside the plot space
        if save:
            # NOTE(review): this stores the whole grid as drawn so far, not
            # just this topic's subplot - confirm that is what is wanted.
            name = 'plo_' + section + '_' + str(pos) + '.png'
            fig.savefig(name, bbox_inches='tight')
            print(name, 'stored')

    # With an odd number of topics the last cell of the grid stays unused
    for unused_ax in grid[len(top):]:
        unused_ax.set_visible(False)

    plt.tight_layout()

    if save:
        # Serialise the finished figure once; doing it inside the loop wrote
        # one copy of the same figure per topic.
        with open('plots.html', 'w') as f:
            f.write(mpld3.fig_to_html(fig))
    else:
        plt.show()


In [None]:
# Grid of the 30 most frequent topics for the configured section, shown inline (nothing is saved).
plot_bars3(prepared_df,q=30, section=section, figsize=(5,1.5),save=False)

In [None]:
import plotly.express as px
import pandas as pd
import numpy as np

def plot_topics_over_time(topic_model, final_dataset, timestamp, bins, number_of_topics=10, save=False):
    """Draw topic frequency over time as a styled Plotly line chart.

    Args:
        topic_model: fitted model exposing ``topics_over_time`` and ``get_topic_info``.
        final_dataset: DataFrame whose ``text`` column holds the documents.
        timestamp: one date per document, convertible with ``pd.to_datetime``.
        bins: number of time bins passed as ``nr_bins``.
        number_of_topics: only topics numbered ``0 .. number_of_topics - 1`` are kept.
        save: when true, the chart is written to ``ovt_<section>.html``
            (``section`` is the notebook-level setting); otherwise it is shown.
    """
    # Calculate topics over time
    ovt_df = topic_model.topics_over_time(
        final_dataset.text.to_list(),
        pd.to_datetime(timestamp),
        nr_bins=bins
    )

    # Keep topics that have at least one frequency within 2 standard deviations of the mean
    filtered_topics = filter_classes_within_devest(ovt_df, 'Topic', 'Frequency', 2)
    ovt_df = ovt_df[ovt_df.Topic.isin(filtered_topics)]

    # Keep only the first `number_of_topics` topic numbers
    ovt_df = ovt_df[ovt_df.Topic.isin(np.arange(0, number_of_topics))]

    # Attach the readable topic names
    topic_names = topic_model.get_topic_info()[['Topic', 'Name']]
    ovt_df = pd.merge(ovt_df, topic_names, how='left', left_on='Topic', right_on='Topic')

    # Convert Timestamp to date
    ovt_df['Timestamp'] = pd.to_datetime(ovt_df['Timestamp']).dt.date

    fig = px.line(
        ovt_df,
        x="Timestamp",
        y="Frequency",
        color='Name',
        hover_name=None,
        hover_data=["Timestamp", "Frequency", "Words"]
    )

    # Thicker, semi-transparent lines. Opacity is a trace-level property;
    # `line` has no 'opacity' key, so passing it there is rejected by Plotly.
    fig.update_traces(line=dict(width=4), opacity=0.7)

    # Pastel colour cycle
    fig.update_layout(colorway=['#FFB6C1', '#87CEEB', '#FFD700', '#7FFFD4', '#FFA07A', '#ADD8E6'])

    # Soft grey background for visibility
    fig.update_layout({
        'plot_bgcolor': 'rgba(240, 240, 240, 0.7)',
        'paper_bgcolor': 'rgba(240, 240, 240, 0.7)'
    })

    if save:
        # Save the plot as an HTML file
        name = 'ovt_' + section + '.html'
        fig.write_html(name)
        print(name, 'stored')
    else:
        # Show the plot
        fig.show()

# Example usage:
# plot_topics_over_time(your_topic_model, your_final_dataset, your_timestamp, your_bins, number_of_topics=10, save=False)




In [None]:
  def filter_classes_within_devest(dataframe,classname,quantity,num_deviations):
    """Return the classes that have a value close to the overall mean.

    A row qualifies when its ``quantity`` lies within ``num_deviations``
    standard deviations of the mean of the whole ``quantity`` column.

    Args:
        dataframe: table holding both columns.
        classname: name of the column with the class identifiers.
        quantity: name of the numeric column to test.
        num_deviations: width of the accepted band, in standard deviations.

    Returns:
        Array with the distinct ``classname`` values of the qualifying rows,
        in order of first appearance.
    """
    values = dataframe[quantity]
    mean_quantity = values.mean()
    std_quantity = values.std()

    # With fewer than two rows the standard deviation is NaN and every
    # comparison below would be False, discarding the only class there is.
    if pd.isna(std_quantity):
      std_quantity = 0.0

    # Threshold for inclusion (n standard deviations from the mean)
    threshold = num_deviations * std_quantity
    within_band = (values - mean_quantity).abs() <= threshold

    return dataframe.loc[within_band, classname].unique()

def plot_topics_over_time(topic_model, final_dataset, timestamp, bins, number_of_topics=10, save=False):
    """Draw topic frequency over time as a Plotly line chart.

    Documents come from ``final_dataset.text`` and dates from ``timestamp``.
    Only topics numbered ``0 .. number_of_topics - 1`` that pass the
    2-standard-deviation frequency filter are drawn. With ``save`` the chart
    is written to ``ovt_<section>.html`` (``section`` is the notebook-level
    setting); otherwise it is shown.
    """
    documents = final_dataset.text.to_list()
    stamps = pd.to_datetime(pd.to_datetime(timestamp))
    ovt_df = topic_model.topics_over_time(documents, stamps, nr_bins=bins)

    # Frequency-based filter, then the cap on topic numbers
    kept_topics = filter_classes_within_devest(ovt_df, 'Topic', 'Frequency', 2)
    ovt_df = ovt_df[ovt_df.Topic.isin(kept_topics)]
    allowed_numbers = np.arange(0, number_of_topics)
    ovt_df = ovt_df[ovt_df.Topic.isin(allowed_numbers)]

    # Attach the readable topic names
    names = topic_model.get_topic_info()[['Topic', 'Name']]
    ovt_df = pd.merge(ovt_df, names, how='left', left_on='Topic', right_on='Topic')

    # Plot by calendar date rather than full timestamp
    ovt_df['Timestamp'] = pd.to_datetime(ovt_df['Timestamp']).dt.date

    hover_columns = ["Timestamp", "Frequency", "Words"]
    fig = px.line(ovt_df, x="Timestamp", y="Frequency", color='Name',
                  hover_name=None, hover_data=hover_columns)

    if not save:
        fig.show()
        return

    name = 'ovt_' + section + '.html'
    fig.write_html(name)
    print(name, 'stored')

In [None]:
# Topic evolution over 12 time bins for the first 10 topics, stored as HTML.
plot_topics_over_time(topic_model, final_df, dates_df, bins=12, number_of_topics=10, save=True)

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP

# # Prepare embeddings
# docs = final_df.text.to_list()
# sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

# topic_model.visualize_documents(docs, embeddings=embeddings)

# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
# NOTE(review): no random_state is passed to this UMAP, unlike the one used to
# fit the topic model, so the 2-D layout may differ between runs - confirm
# whether that is intended.
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
# Document map drawn from the precomputed 2-D coordinates.
topic_model.visualize_documents(final_df.text.to_list(), reduced_embeddings=reduced_embeddings)