<a href="https://colab.research.google.com/github/erlonL/series-temporais/blob/main/ST_Projeto2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install yfinance snscrape pandas_market_calendars pandas_datareader snscrape transformers torch nltk tqdm IPython jupyter ipywidgets tensorflow tf-keras

Collecting tf-keras
  Downloading tf_keras-2.17.0-py3-none-any.whl.metadata (1.6 kB)
Downloading tf_keras-2.17.0-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: tf-keras
Successfully installed tf-keras-2.17.0


In [5]:
import yfinance as yf
#import snscrape.modules.twitter as sntwitter
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_market_calendars as mcal
import pandas_datareader.data as web
import pickle
import snscrape
from transformers import pipeline

In [6]:
new_model = pipeline(
    model="mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis",
    return_all_scores=True
)



## ETAPA 1: Coleta e Tratamento de Dados

S&P 500, abreviação de Standard & Poor's 500, ou simplesmente S&P, trata-se de um índice composto por quinhentos ativos cotados nas bolsas de NYSE ou NASDAQ, qualificados devido ao seu tamanho de mercado, sua liquidez e sua representação de grupo industrial.

### Dados do mercado de ações

In [7]:
# ticker S&P 500
ticker = '^GSPC'
index = yf.Ticker(ticker)

# de acordo com nossa escolha
start_date = '2020-01-01'
end_date = '2024-09-25'

data = index.history(start=start_date, end=end_date)

# deixar apenas a data (y/m/d) na coluna date
data.index = data.index.date

# Exiba os primeiros registros para verificação
print(data.head())

                   Open         High          Low        Close      Volume  \
2020-01-02  3244.669922  3258.139893  3235.530029  3257.850098  3459930000   
2020-01-03  3226.360107  3246.149902  3222.340088  3234.850098  3484700000   
2020-01-06  3217.550049  3246.840088  3214.639893  3246.280029  3702460000   
2020-01-07  3241.860107  3244.909912  3232.429932  3237.179932  3435910000   
2020-01-08  3238.590088  3267.070068  3236.669922  3253.050049  3726840000   

            Dividends  Stock Splits  
2020-01-02        0.0           0.0  
2020-01-03        0.0           0.0  
2020-01-06        0.0           0.0  
2020-01-07        0.0           0.0  
2020-01-08        0.0           0.0  


No artigo, foi filtrado os dados para os dias úteis do mercado. Faremos a seguir esse processamento:

In [8]:
# filtrando os dados de stock para os dias uteis
nyse = mcal.get_calendar('NYSE')

dias_uteis = nyse.valid_days(start_date=start_date, end_date=end_date)

# Pegar so a data
dias_uteis = [dia.date() for dia in dias_uteis]

In [9]:
sp500 = data.loc[data.index.intersection(dias_uteis)]

print(sp500.head())

sp500.to_csv("data/csv/sp500.csv")

                   Open         High          Low        Close      Volume  \
2020-01-02  3244.669922  3258.139893  3235.530029  3257.850098  3459930000   
2020-01-03  3226.360107  3246.149902  3222.340088  3234.850098  3484700000   
2020-01-06  3217.550049  3246.840088  3214.639893  3246.280029  3702460000   
2020-01-07  3241.860107  3244.909912  3232.429932  3237.179932  3435910000   
2020-01-08  3238.590088  3267.070068  3236.669922  3253.050049  3726840000   

            Dividends  Stock Splits  
2020-01-02        0.0           0.0  
2020-01-03        0.0           0.0  
2020-01-06        0.0           0.0  
2020-01-07        0.0           0.0  
2020-01-08        0.0           0.0  


In [10]:
sp500.describe()

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Stock Splits
count,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0
mean,4165.217473,4190.424592,4138.559578,4166.022263,4403520000.0,0.0,0.0
std,677.736933,675.67581,680.305417,677.989815,1049796000.0,0.0,0.0
min,2290.709961,2300.72998,2191.860107,2237.399902,1639500000.0,0.0,0.0
25%,3786.674988,3819.482483,3749.492493,3790.517395,3762062000.0,0.0,0.0
50%,4170.040039,4193.76001,4142.61499,4170.560059,4130170000.0,0.0,0.0
75%,4531.8302,4545.984863,4509.594971,4527.872559,4748670000.0,0.0,0.0
max,5727.660156,5735.319824,5704.220215,5732.930176,9976520000.0,0.0,0.0


### Dados de Notícias / Redes Sociais

In [11]:
# news articles
with open('data/pkl/df_news.pkl', 'rb') as f:
    news = pickle.load(f)
# bsky tweets
with open('data/pkl/df_socials.pkl', 'rb') as f:
    bsky = pickle.load(f)

In [12]:
news.head(2)

Unnamed: 0,title,text,label,links,symbol,company,date
0,Stocks making the biggest moves midday: Palo A...,A view of the exterior of the new Dutch head o...,-2,https://www.cnbc.com/2023/06/05/stocks-making-...,MMM,3M,2023-06-05
1,3M will spin off its health-care business into...,3M announced Tuesday it will spin off its heal...,-2,https://www.cnbc.com/2022/07/26/3m-will-spin-o...,MMM,3M,2022-07-26


In [13]:
bsky.head(2)

Unnamed: 0,name,user,date,text
0,Roberto Marsicano,@cannedcat.bsky.social,2024-09-21,a top-1% fund manager warns warren buffett is ...
1,copgamer,@copigamer.bsky.social,2024-09-21,microstrategy outperformed the s&p 500 with it...


# Etapa 2: Análise e Agregação de Sentimento através do DistilRoBERTa

In [14]:
# diminuir o número de sentenças de cada texto
'''
import requests
import json
import numpy as np

url = 'https://issam9-sumy-space.hf.space/api/predict/'
data = []
data = data + ['luhn'] + ['english'] + ['1'] + ['text']
data = data + [news['text'].iloc[0]]

json = {
    "cleared": False,
    "data": data,
    "example_id": None,
    "session_hash": "cb1l3gk9k7s"
}

news.loc[:, 'short_text'] = news['text']

for index, row in news.iterrows():
    data[4] = row['text']
    sentiments = {}
    sentiments['negative'] = [] 
    sentiments['neutral'] = []
    sentiments['positive'] = []

    for i in range(8, 15):
        try:
            data[2] = str(i)
            json['data'] = data
            
            response = requests.post(url, json=json)

            responseDICT = response.json()
            text = responseDICT['data'][0]
            print(f'iteração {i}')
            print(f'{index} of {len(news)}')
            # tentar aplicar o modelo
            result = new_model(text)

            sentiments['negative'].append(result[0][0]['score'])
            sentiments['neutral'].append(result[0][1]['score'])
            sentiments['positive'].append(result[0][2]['score'])

        except:
            print(f'Máximo de sentenças: {i}')
            news.loc[index, 'short_text'] = text
            print(result)
            # calcular a mediana dos sentimentos
            news.loc[index, 'sentiment_negative'] = np.median(sentiments['negative'])
            news.loc[index, 'sentiment_neutral'] = np.median(sentiments['neutral'])
            news.loc[index, 'sentiment_positive'] = np.median(sentiments['positive'])
            break
'''

'\nimport requests\nimport json\nimport numpy as np\n\nurl = \'https://issam9-sumy-space.hf.space/api/predict/\'\ndata = []\ndata = data + [\'luhn\'] + [\'english\'] + [\'1\'] + [\'text\']\ndata = data + [news[\'text\'].iloc[0]]\n\njson = {\n    "cleared": False,\n    "data": data,\n    "example_id": None,\n    "session_hash": "cb1l3gk9k7s"\n}\n\nnews.loc[:, \'short_text\'] = news[\'text\']\n\nfor index, row in news.iterrows():\n    data[4] = row[\'text\']\n    sentiments = {}\n    sentiments[\'negative\'] = [] \n    sentiments[\'neutral\'] = []\n    sentiments[\'positive\'] = []\n\n    for i in range(8, 15):\n        try:\n            data[2] = str(i)\n            json[\'data\'] = data\n            \n            response = requests.post(url, json=json)\n\n            responseDICT = response.json()\n            text = responseDICT[\'data\'][0]\n            print(f\'iteração {i}\')\n            print(f\'{index} of {len(news)}\')\n            # tentar aplicar o modelo\n            result

In [15]:
news['text'].iloc[0]



In [16]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import RobertaTokenizer, TFRobertaModel
from tqdm import tqdm
# import clear_output()
from IPython.display import clear_output

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
stop_words = set(stopwords.words('portuguese'))

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = TFRobertaModel.from_pretrained("roberta-base")

def remove_stopwords(text):
    words = text.split()
    return " ".join([word for word in words if word.lower() not in stop_words])

def tokenize(text):
    tokenizer(text, truncation=True, max_length=500, return_tensors='tf')

clear_output()

In [17]:
news.head()

Unnamed: 0,title,text,label,links,symbol,company,date
0,Stocks making the biggest moves midday: Palo A...,A view of the exterior of the new Dutch head o...,-2,https://www.cnbc.com/2023/06/05/stocks-making-...,MMM,3M,2023-06-05
1,3M will spin off its health-care business into...,3M announced Tuesday it will spin off its heal...,-2,https://www.cnbc.com/2022/07/26/3m-will-spin-o...,MMM,3M,2022-07-26
2,"Stocks making the biggest moves midday: 3M, Pa...",A woman walks near a Bed Bath & Beyond branch ...,0,https://www.cnbc.com/2023/01/24/stocks-making-...,MMM,3M,2023-01-24
3,"3M to cut 2,500 jobs as demand weakens, profit...",An employee unboxes N95 masks as part of a gov...,0,https://www.cnbc.com/2023/01/24/3m-to-cut-2500...,MMM,3M,2023-01-24
4,"3M combat earplug lawsuits to proceed, judge r...","3M must face more than 230,000 lawsuits accusi...",-2,https://www.cnbc.com/2022/08/26/3m-combat-earp...,MMM,3M,2022-08-26


In [18]:
tqdm.pandas()

tokens = []
for index, row in news.iterrows():
    tokens.append(tokenizer(row['text'], truncation=True, max_length=500, return_tensors='tf'))

decoded = []
for token in tokens:
    decoded.append(tokenizer.decode(token['input_ids'][0]))

news['decoded'] = decoded
news['decoded'] = news['decoded'].apply(remove_stopwords)

news.loc[:, 'short_text'] = news['decoded']
news.loc[:, 'sentiment'] = news['short_text'].progress_apply(lambda x: new_model(x))

news.head()

100%|██████████| 2411/2411 [20:08<00:00,  1.99it/s]


Unnamed: 0,title,text,label,links,symbol,company,date,decoded,short_text,sentiment
0,Stocks making the biggest moves midday: Palo A...,A view of the exterior of the new Dutch head o...,-2,https://www.cnbc.com/2023/06/05/stocks-making-...,MMM,3M,2023-06-05,<s>A view of the exterior of the new Dutch hea...,<s>A view of the exterior of the new Dutch hea...,"[[{'label': 'negative', 'score': 0.16225399076..."
1,3M will spin off its health-care business into...,3M announced Tuesday it will spin off its heal...,-2,https://www.cnbc.com/2022/07/26/3m-will-spin-o...,MMM,3M,2022-07-26,<s>3M announced Tuesday it will spin off its h...,<s>3M announced Tuesday it will spin off its h...,"[[{'label': 'negative', 'score': 0.99853587150..."
2,"Stocks making the biggest moves midday: 3M, Pa...",A woman walks near a Bed Bath & Beyond branch ...,0,https://www.cnbc.com/2023/01/24/stocks-making-...,MMM,3M,2023-01-24,<s>A woman walks near Bed Bath & Beyond branch...,<s>A woman walks near Bed Bath & Beyond branch...,"[[{'label': 'negative', 'score': 0.00726246507..."
3,"3M to cut 2,500 jobs as demand weakens, profit...",An employee unboxes N95 masks as part of a gov...,0,https://www.cnbc.com/2023/01/24/3m-to-cut-2500...,MMM,3M,2023-01-24,<s>An employee unboxes N95 masks part of gover...,<s>An employee unboxes N95 masks part of gover...,"[[{'label': 'negative', 'score': 0.99874913692..."
4,"3M combat earplug lawsuits to proceed, judge r...","3M must face more than 230,000 lawsuits accusi...",-2,https://www.cnbc.com/2022/08/26/3m-combat-earp...,MMM,3M,2022-08-26,"<s>3M must face more than 230,000 lawsuits acc...","<s>3M must face more than 230,000 lawsuits acc...","[[{'label': 'negative', 'score': 0.00137379649..."


In [19]:
with open('data/pkl/df_news_sentiment.pkl', 'wb') as f:
    pickle.dump(news, f)