In [1]:
import pandas as pd

In [3]:
# Ticker symbol whose news rows are analysed in the cells below.
target_ticker = 'NVDA'

# Full news table, read from a path relative to the current working directory.
original_df = pd.read_csv('../data/df_news.csv')

# Independent copy of the rows whose `ticker` equals the target, so that later
# assignments to `filtered_df` never write back into `original_df`.
filtered_df = original_df.loc[original_df['ticker'].eq(target_ticker)].copy()

In [4]:
numeric_columns = ['overall_sentiment_score', 'relevance_score', 'ticker_sentiment_score','affected_topic_relevance_score']

# Raw column name -> column name in the aggregated result.
aggregated_names = {
    'overall_sentiment_score': 'ticker_overall_sentiment_score_mean',
    'relevance_score': 'ticker_relevance_score_mean',
    'ticker_sentiment_score': 'ticker_sentiment_score_mean',
    'affected_topic_relevance_score': 'ticker_affected_topic_relevance_score_mean',
    'title': 'distinct_news_count',
}


def _mean_4dp(values):
    """Return the mean of `values` rounded to 4 decimal places."""
    return round(values.mean(), 4)


# This cell replaces `filtered_df` with its aggregated form. If the aggregated
# columns are already present (the cell was run before), there is nothing left
# to do, so the block is skipped rather than failing on the missing raw columns.
if not set(aggregated_names.values()).issubset(filtered_df.columns):
    # Coerce EVERY score column to numeric before aggregating; values that
    # cannot be parsed become NaN (errors='coerce'). The aggregation must stay
    # outside this loop: once it runs, the raw column names no longer exist, so
    # any column not yet coerced would be skipped by the membership check.
    for col in numeric_columns:
        if col in filtered_df.columns:
            filtered_df[col] = pd.to_numeric(filtered_df[col], errors='coerce')

    # One row per distinct `datetime` value: rounded mean of each score column
    # plus the number of unique titles seen at that datetime.
    aggregations = {score_col: _mean_4dp for score_col in numeric_columns}
    aggregations['title'] = 'nunique'

    filtered_df = (
        filtered_df
        .groupby('datetime')
        .agg(aggregations)
        .rename(columns=aggregated_names)
        .reset_index()
        .sort_values(by='datetime')
    )

In [5]:
topic = 'Technology'

# Titles that appear on at least one row whose ticker is target_ticker.
titles_with_target_ticker = original_df.loc[original_df['ticker'].eq(target_ticker), 'title'].unique()

# Drop every row carrying one of those titles, whatever ticker the row itself has.
non_related_news = original_df.loc[~original_df['title'].isin(titles_with_target_ticker)].copy()

# Keep only rows that have a value in affected_topic.
non_related_news = non_related_news.loc[non_related_news['affected_topic'].notna()]

# Rows belonging to the chosen topic.
topic_data = non_related_news.loc[non_related_news['affected_topic'].eq(topic)]

# Keep the columns used below and drop rows that are identical across all four
# of them (the de-duplication is on the full row, not on datetime alone).
topic_columns = ['datetime', 'title', 'overall_sentiment_score', 'affected_topic_relevance_score']
topic_data = topic_data[topic_columns].drop_duplicates()

# For each row, the number of remaining rows that share its datetime.
rows_per_datetime = topic_data.groupby('datetime')['datetime'].transform('count')
topic_data['news_count'] = rows_per_datetime

# Per-datetime aggregates: score means rounded to 4 decimals; news_count is
# constant within a group, so its mean is just that count (as a float).
topic_aggregations = {
    'overall_sentiment_score': lambda scores: round(scores.mean(), 4),
    'affected_topic_relevance_score': lambda scores: round(scores.mean(), 4),
    'news_count': lambda counts: counts.mean(),
}

# Prefix the output columns with the topic name.
topic_column_names = {
    'overall_sentiment_score': f'{topic}_overall_sentiment_score_mean',
    'affected_topic_relevance_score': f'{topic}_affected_topic_relevance_score_mean',
    'news_count': f'{topic}_distinct_news_count',
}

topic_metrics = topic_data.groupby('datetime').agg(topic_aggregations)
topic_metrics = topic_metrics.rename(columns=topic_column_names).reset_index()

print(topic_metrics)

                 datetime  Technology_overall_sentiment_score_mean  \
0     2022-03-05 01:00:00                                  -0.2232   
1     2022-03-05 05:00:00                                  -0.0325   
2     2022-03-07 09:00:00                                  -0.1322   
3     2022-03-07 11:00:00                                   0.2263   
4     2022-03-07 14:00:00                                  -0.1873   
...                   ...                                      ...   
6600  2025-02-01 04:00:00                                   0.2831   
6601  2025-02-01 06:00:00                                   0.2338   
6602  2025-02-01 08:00:00                                   0.2225   
6603  2025-02-01 09:00:00                                   0.2324   
6604  2025-02-01 10:00:00                                  -0.0220   

      Technology_affected_topic_relevance_score_mean  \
0                                             0.2000   
1                                             1

In [7]:
# Outer join on `datetime`: every datetime present in either the topic metrics
# or the ticker metrics is kept; columns from the side without a match are NaN.
merged_df = topic_metrics.merge(filtered_df, how='outer', on='datetime')

print(merged_df)

                 datetime  Technology_overall_sentiment_score_mean  \
0     2022-03-05 01:00:00                                  -0.2232   
1     2022-03-05 05:00:00                                  -0.0325   
2     2022-03-07 09:00:00                                  -0.1322   
3     2022-03-07 11:00:00                                   0.2263   
4     2022-03-07 14:00:00                                  -0.1873   
...                   ...                                      ...   
7214  2025-02-01 04:00:00                                   0.2831   
7215  2025-02-01 06:00:00                                   0.2338   
7216  2025-02-01 08:00:00                                   0.2225   
7217  2025-02-01 09:00:00                                   0.2324   
7218  2025-02-01 10:00:00                                  -0.0220   

      Technology_affected_topic_relevance_score_mean  \
0                                             0.2000   
1                                             1