In [3]:
from datetime import datetime
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud import bigquery
from google.cloud.exceptions import NotFound
import pandas as pd
from utils.utils_bigquery import *
from datetime import *
import nltk
from nltk.corpus import wordnet as wn
import numpy as np
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
import random

In [5]:
key_path = key_path
project = project_id

# Sources
table = 'silver_indicators'
dataset = 'silver'

# Tables id
table_conca = f'{project}.{dataset}.{table}'

# Loading tables
table_to_save = 'silver_news_data'
dataset_to_save = 'silver'
table_conca_to_save = f'{project}.{dataset_to_save}.{table_to_save}'

In [6]:
# Conectamos con Bigquery
bigquery = BigQueryUtils(key_path)

In [9]:
tickers_data = bigquery.run_query(
    f"""
    SELECT
        ticker,
        date,
        open,
        close,
        volume
    FROM {project}.{dataset}.{table}
    """
)
tickers_data

Unnamed: 0,ticker,date,open,close,volume
0,PCAR,2018-07-12 00:00:00+00:00,41.066666,41.153332,1613550
1,PCAR,2020-07-28 00:00:00+00:00,57.313332,56.966667,2995650
2,PCAR,2018-05-25 00:00:00+00:00,43.433334,43.093334,1427250
3,EQIX,2021-06-25 00:00:00+00:00,780.190002,783.400024,1128500
4,TRMB,2022-06-03 00:00:00+00:00,69.120003,68.629997,746900
...,...,...,...,...,...
7821003,SUSC,2017-11-27 00:00:00+00:00,25.434999,25.434999,0
7821004,THRY,2019-06-05 00:00:00+00:00,10.675000,10.675000,0
7821005,SMCP,2017-08-07 00:00:00+00:00,24.160000,24.160000,0
7821006,ENVB,2015-08-04 00:00:00+00:00,17600.000000,17550.000000,0


In [11]:
import pandas as pd
import random

# Generamos sinónimos
def get_synonyms(word, pos):
    synsets = wn.synsets(word, pos=pos)
    return list(set([lemma.name() for synset in synsets for lemma in synset.lemmas()]))


def generate_news_titles(df):
    news = []
    
    templates = [
        "{ticker} stock {movement} {percent}% as {market_condition}",
        "Investors {reaction} as {ticker} shares {movement} ${price_change}",
        "{ticker} {performance} in {volume} trading session",
        "{market_condition} leads to {movement} in {ticker} stock",
        "{ticker} shares {movement} amid {market_factor}"
    ]
    
    market_conditions = ['market volatility', 'economic uncertainty', 'sector trends', 'global factors']
    market_factors = ['earnings expectations', 'analyst reports', 'industry news', 'technological advancements']
    
    grouped = df.groupby(['ticker', 'date'])
    for (ticker, date), group in grouped:
        row = group.iloc[0]
        price_change = row['close'] - row['open']
        percent_change = (price_change / row['open']) * 100
        volume_change = row['volume'] - group['volume'].mean()
        
        movement_words = get_synonyms('increase', 'v') if price_change > 0 else get_synonyms('decrease', 'v')
        reaction_words = get_synonyms('optimistic', 'a') if price_change > 0 else get_synonyms('pessimistic', 'a')
        performance_words = get_synonyms('excel', 'v') if price_change > 0 else get_synonyms('struggle', 'v')
        volume_words = ['high-volume', 'active', 'busy'] if volume_change > 0 else ['low-volume', 'quiet', 'subdued']
        
        num_titles = random.randint(3, 6)
        titles = []
        for _ in range(num_titles):
            headline = random.choice(templates).format(
                ticker=ticker,
                movement=random.choice(movement_words).replace('_', ' '),
                percent=abs(round(percent_change, 2)),
                price_change=abs(round(price_change, 2)),
                market_condition=random.choice(market_conditions),
                reaction=random.choice(reaction_words).replace('_', ' '),
                performance=random.choice(performance_words).replace('_', ' '),
                volume=random.choice(volume_words),
                market_factor=random.choice(market_factors)
            )
            titles.append(headline)
        
        news.append({
            'ticker': ticker,
            'date': date,
            'titles': titles
        })
    
    return news


news = generate_news_titles(tickers_data)
news


KeyboardInterrupt: 

In [None]:
# Volvemos a analizar el sentimiento de las noticias sintéticas creadas

text_column = 'News_Title'  

def perform_sentiment_analysis(texts):
    tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
    model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

    results = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        outputs = model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        sentiment_score = probabilities[0][2].item() - probabilities[0][0].item()  # Positive - Negative
        results.append(sentiment_score)
    
    return results

sentiment_scores = perform_sentiment_analysis(news[text_column])
news['sentiment_score'] = sentiment_scores
news['sentiment'] = pd.cut(news['sentiment_score'], 
                              bins=[-np.inf, -0.05, 0.05, np.inf], 
                              labels=['NEGATIVE', 'NEUTRAL', 'POSITIVE'])

print("\nSentiment Distribution:")
print(news['sentiment'].value_counts(normalize=True))


Sentiment Distribution:
sentiment
NEGATIVE    0.558736
NEUTRAL     0.301100
POSITIVE    0.140164
Name: proportion, dtype: float64


In [None]:
news

Unnamed: 0,Date,News_Title,sentiment_score,sentiment
0,2019-08-26,sector trends leads to increase in Apple stock,-0.891113,NEGATIVE
1,2019-08-26,Investors affirmative as Apple shares increase...,-0.856307,NEGATIVE
2,2019-08-26,Investors affirmative as Apple shares increase...,-0.856307,NEGATIVE
3,2019-08-27,Apple stock fall 1.78% as global factors,0.017897,NEUTRAL
4,2019-08-27,Apple shares lessen amid earnings expectations,0.011305,NEUTRAL
...,...,...,...,...
5724,2024-08-22,global factors leads to fall in Apple stock,0.010335,NEUTRAL
5725,2024-08-23,Investors optimistic as Apple shares increase ...,-0.882363,NEGATIVE
5726,2024-08-23,Apple stock increase 0.52% as market volatility,-0.393367,NEGATIVE
5727,2024-08-23,Apple stand out in quiet trading session,0.633735,POSITIVE


In [None]:
grouped_sint = news.groupby('date')

agg_sint = pd.DataFrame({
        'positive_sentiment': grouped_sint.apply(lambda x: x[x['sentiment'] == 'POSITIVE']['sentiment_score'].mean()),
        'negative_sentiment': grouped_sint.apply(lambda x: x[x['sentiment'] == 'NEGATIVE']['sentiment_score'].mean()),
        'neutral_sentiment': grouped_sint.apply(lambda x: x[x['sentiment'] == 'NEUTRAL']['sentiment_score'].mean()),
        'positive_count': grouped_sint.apply(lambda x: (x['sentiment'] == 'POSITIVE').sum()),
        'negative_count': grouped_sint.apply(lambda x: (x['sentiment'] == 'NEGATIVE').sum()),
        'neutral_count': grouped_sint.apply(lambda x: (x['sentiment'] == 'NEUTRAL').sum()),
        'news_count': grouped_sint.size()
    })

agg_sint = agg_sint.fillna(0)
conditions = [
    (agg_sint['positive_count'] > agg_sint['negative_count']) & (agg_sint['neutral_count'] == agg_sint['positive_count']),
    (agg_sint['negative_count'] > agg_sint['positive_count']) & (agg_sint['neutral_count'] == agg_sint['negative_count']),
    (agg_sint['neutral_count'] > agg_sint['positive_count']) & (agg_sint['neutral_count'] > agg_sint['negative_count']),
    (agg_sint['positive_count'] == agg_sint['negative_count']) & (agg_sint['positive_count'] == agg_sint['neutral_count']),
    (agg_sint['neutral_count'] == agg_sint['positive_count']) & (agg_sint['positive_count'] > agg_sint['negative_count']),
    (agg_sint['neutral_count'] == agg_sint['negative_count']) & (agg_sint['negative_count'] > agg_sint['positive_count']),
    (agg_sint['positive_count'] > agg_sint['negative_count']),
    (agg_sint['negative_count'] > agg_sint['positive_count'])
]

choices = [1, -1, 0, 0, 1, -1, 1, -1]
agg_sint['sentiment_indicator'] = np.select(conditions, choices, default=0)

In [None]:
def combine_stock_and_news(stock_df, news_df):
    final_df = stock_df.join(news_df, how='left')
    final_df.sort_index(inplace=True)
    final_df['sentiment_indicator'] = final_df['sentiment_indicator'].ffill()

final_df = combine_stock_and_news(tickers_data, agg_sint)
final_df

In [None]:
# Fields that make up the ID
id_fields = ['ticker', 'date']

# Apply the function to the DataFrame to create the 'id' column
final_df['id'] = final_df.apply(generate_id, axis=1, fields=id_fields)

final_df

In [21]:
# try:
# Filtramos solamente los nuevos registros
df_incremental = bigquery.select_for_incremental(id='id', table=table_conca_to_save, new_df=final_df)

if not df_incremental.empty:
    # Guardamos los datos en Bigquery
    bigquery.save_dataframe(df_incremental, project, dataset_to_save, table_to_save, if_exists='append', schema=None)
    print(f'New records loaded')
else:
    print('No new records to load.')

# En el caso de no tener datos en Bigquery, guardamos todo el df
# except Exception as e:
#     bigquery.save_dataframe(final_df, project, dataset_to_save, table_to_save, if_exists='replace', schema=None)
#     print('New data persisted')
#     print(f'Exception encountered: {e}')

No new records to load.
