In [43]:
from datetime import datetime, timedelta
from pathlib import Path

import pandas as pd
import yfinance as yf
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

from utils.news import get_news, get_url_content

In [22]:
# Run configuration: ticker symbol, the date window as YYYY-MM-DD strings,
# and the duration value handed to get_news on every request.
symbol = "NVDA"
start_date, end_date = "2024-01-02", "2024-08-13"
news_duration = 1

In [23]:
# Start from an empty frame; the fetch loop stores the collected news
# under this name.
all_news_df = pd.DataFrame()

# First calendar day of the window, parsed from the `start_date` string.
current_date = datetime.strptime(start_date, "%Y-%m-%d")

# `end_date` is a string in the parameters cell and is rebound to a datetime
# here. Parse it only while it is still a string, so that re-running this
# cell does not raise TypeError from strptime() on an already-parsed value.
if isinstance(end_date, str):
    end_date = datetime.strptime(end_date, "%Y-%m-%d")

# Number of calendar days in the window, counting both endpoints.
total_days = (end_date - current_date).days + 1

In [26]:
# Fetch news one calendar day at a time, with a progress bar.
# Each day is derived from `start_date` plus a loop offset instead of advancing
# a shared `current_date`, and `all_news_df` is rebuilt from scratch, so
# re-running this cell gives the same result rather than appending duplicate
# rows or continuing past the end of the window.
first_day = datetime.strptime(start_date, "%Y-%m-%d")
daily_frames = []

for day_offset in tqdm(range(total_days), desc="Fetching news"):
    date_str = (first_day + timedelta(days=day_offset)).strftime("%Y-%m-%d")

    # Fetch news for the current date
    df = get_news(symbol, date_str, news_duration)
    if df.empty:
        continue

    # Keep only articles whose `symbols` entry contains `symbol`. Copy the
    # filtered rows so the column assignments below write to an independent
    # frame instead of a slice (avoids SettingWithCopyWarning).
    df = df[df['symbols'].apply(lambda x: symbol in x)].copy()
    if df.empty:
        # Nothing survived the filter: zip(*[]) would yield no values and the
        # two-target unpacking below would raise ValueError.
        continue

    df['date'] = pd.to_datetime(df['updated_at'], format='%Y-%m-%dT%H:%M:%SZ', utc=True)
    df['symbol'] = symbol

    # Extract topics and content: get_url_content is called once per URL and
    # its results are transposed into one tuple per column.
    df['topic'], df['content'] = zip(*[get_url_content(url) for url in df['url'].values])

    daily_frames.append(df)

# Concatenate once at the end instead of growing the frame inside the loop.
all_news_df = pd.concat(daily_frames, ignore_index=True) if daily_frames else pd.DataFrame()

Fetching news: 100%|██████████████████████████| 225/225 [08:52<00:00,  2.37s/it]


In [34]:
# Relative CSV path for this symbol's collected news.
file_name = f'data/{symbol}_news_content.csv'

In [35]:
# Echo the CSV path assembled from `symbol`.
file_name

'data/NVDA_news_content.csv'

In [36]:
# Create the target directory if it is missing, then write the collected
# news to CSV without the integer row index.
Path(file_name).parent.mkdir(parents=True, exist_ok=True)
all_news_df.to_csv(file_name, index=False)

In [39]:
# Comma-separated list of Nvidia-related keywords, kept as a single string.
query = (
    "Nvidia, GPU, semiconductor, gaming, AI, data center, "
    "autonomous vehicle, edge computing, IoT, machine learning, "
    "autonomous driving, NVIDIA GeForce, NVIDIA DRIVE, NVIDIA Jetson"
)

In [44]:
# Download price history for `symbol` over the configured window.
# NOTE(review): yfinance documents `end` as exclusive — confirm whether the
# last day of the window is meant to be covered.
price = yf.download(
    tickers=symbol,
    start=start_date,
    end=end_date,
)

[*********************100%%**********************]  1 of 1 completed


In [46]:
# Turn the index into a regular column (the next cell reads it as `Date`).
# Guarded so that re-running this cell does not reset a second time and
# add a stray `index` column.
if 'Date' not in price.columns:
    price = price.reset_index()

In [47]:
# Add a YYYY-MM-DD string version of each `Date` value, using the vectorized
# .dt accessor instead of a per-row lambda.
price['date_str'] = price['Date'].dt.strftime('%Y-%m-%d')

In [62]:
# NOTE: despite its name, `nan_df` keeps the rows whose `content` is present
# (not NaN).
nan_df = all_news_df[all_news_df['content'].notna()]

In [50]:
# Number of rows that have non-missing `content`.
int(all_news_df['content'].notna().sum())

78

In [51]:
# Total number of collected news rows.
all_news_df.shape[0]

2761

In [57]:
# Boolean mask: True where `content` is missing.
all_news_df['content'].isna()

0       False
1       False
2       False
3       False
4       False
        ...  
2756     True
2757     True
2758     True
2759     True
2760     True
Name: content, Length: 2761, dtype: bool