In [1]:
import pandas as pd
import torch  # Make sure to import the torch module  
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
import time
import yfinance as yf
from datetime import datetime, timedelta
import re
import gc  # garbage collector  
from tqdm import tqdm
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('data/NVDA_news_content.csv')

In [3]:
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

In [4]:
model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.49it/s]


In [18]:
stock='Nvidia'
symbol='NVDA'
start_date = "2024-01-02"
end_date = "2024-08-14"

In [9]:
price = yf.download(symbol, start=start_date, end=end_date)

[*********************100%%**********************]  1 of 1 completed


In [10]:
price.reset_index(inplace=True)

In [11]:
price['date_str'] = price['Date'].apply(lambda x: x.date().strftime('%Y-%m-%d'))  

In [12]:
price['daily_returns'] = (price['Adj Close'] - price['Open']) / price['Adj Close']

In [13]:
df['date'] = pd.to_datetime(df['date'])  
df['date_str'] = df['date'].dt.strftime('%Y-%m-%d')  

In [14]:
def generate_response(input_text):  
    input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")  
    output = model.generate(**input_ids, max_new_tokens=200, temperature=0.1)  
    response = tokenizer.decode(output[0], skip_special_tokens=True)  
    del input_ids, output  
    torch.cuda.empty_cache()        
    return response 

In [15]:
overall_summary_template = """
You are a financial analyst tasked with summarizing key information that could impact the stock price of {stock}. 
Below is a list of news topics and their short content:
{summaries}
Based on these news items, please provide a concise summary of the major key points that might impact {stock}'s stock price. 
Focus on the following:
    1. Identify the most significant positive and negative factors.
    2. Highlight any recurring themes or trends across multiple news items.
    3. Mention any upcoming events or announcements that could influence the stock.
    4. Provide a brief overview of the general sentiment towards {stock} based on these news items.
Your summary should be clear, concise, and directly related to potential stock price impacts. Aim for a length of 3-5 paragraphs.

Overall Summary:"""

In [16]:
trade_template = """As an expert trading agent with extensive experience in trading {stock}, analyze the following information:
Date: {date}
Recent Stock Performance:
{table_string}
Recent News Summary:
{summary}

Based on the provided data, please offer an investment decision. Consider factors such as:
1. Current market trends in the technology sector
2. Nvidia's recent stock price momentum
3. Potential impact of the news on stock performance
    
Provide your analysis using the following format:
Decision: (Buy/Sell/Hold)
Confidence Level: (Low/Medium/High)
Reasoning: (Explain your decision in 3-5 concise points)

Answer:
"""

In [23]:
loop_start_date_str = "2024-05-06"      
loop_end_date_str = "2024-08-09"      
results_df = pd.DataFrame()  # Initialize new DataFrame      
  
loop_start_date = datetime.strptime(loop_start_date_str, "%Y-%m-%d")      
loop_end_date = datetime.strptime(loop_end_date_str, "%Y-%m-%d")    
  
total_days = (loop_end_date - loop_start_date).days + 1    
  
for single_date in tqdm([loop_start_date + timedelta(n) for n in range(total_days)]):    
      
    returns = price[price['Date'] == single_date]  
    if not returns.empty:  
        daily_return = returns.daily_returns.values[0]
        news_date = single_date - timedelta(days=5)      
        news_date_str = news_date.strftime("%Y-%m-%d")      
        curr_news_df = df[(df['date_str'] >= news_date_str)&(df['date_str'] < single_date.strftime("%Y-%m-%d"))]      
        curr_news_df = curr_news_df[(~curr_news_df['summary'].isna()) &  (~curr_news_df['topic'].isna())]      
  
        summaries_text = ""      
        for i in range(len(curr_news_df)):      
            summaries_text += f"Topic {i}: {curr_news_df.topic.values[i]}\n"      
            summaries_text += f"Content: {curr_news_df.summary.values[i]}\n\n"      
  
        overall_summary_prompt = overall_summary_template.format(stock=stock, summaries=summaries_text)      
        response = generate_response(overall_summary_prompt)      
        overall_summary = response.split('Overall Summary:')[-1]      
  
        stock_date = single_date - timedelta(days=7)      
        stock_date_str = stock_date.strftime("%Y-%m-%d")      
        current_price = price[(price['Date']>=stock_date_str)&(price['Date']<single_date.strftime("%Y-%m-%d"))]      
        table_string = price.to_string(index=False)      
        trade_prompt = trade_template.format(stock=stock, date = single_date.strftime("%Y-%m-%d"), table_string = table_string, summary=overall_summary)      
        response = generate_response(trade_prompt)      
        trade_decision = response.split('Answer:')[-1]      
        decision = re.search(r'Decision: (.*)\n', trade_decision).group(1)        
        confidence = re.search(r'Confidence Level: (.*)\n', trade_decision).group(1)        
        reasoning = re.search(r'Reasoning:\n(.*)', trade_decision, re.DOTALL).group(1)      
  
        new_row = pd.DataFrame({'Date': [single_date], 'News_Summary':[overall_summary], 'Input': [trade_prompt],  'Output':[trade_decision],  
                                'Decision': [decision], 'Confidence': [confidence],   'Reasoning': [reasoning],
                                'daily_return': [daily_return]})    
        results_df = pd.concat([results_df, new_row], ignore_index=True)      
  
        del news_date, news_date_str, curr_news_df, summaries_text, overall_summary_prompt, response, overall_summary, stock_date, stock_date_str, current_price, table_string, trade_prompt, trade_decision, decision, confidence, reasoning, new_row  
        gc.collect()  

  0%|          | 0/96 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 1/96 [00:31<50:17, 31.76s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  2%|▏         | 2/96 [01:01<47:58, 30.63s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  3%|▎         | 3/96 [01:31<46:54, 30.26s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  4%|▍         | 4/96 [02:01<46:08, 30.10s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  5%|▌         | 5/96 [02:31<45:30, 30.01s/it]Se

# Correct Decisions

In [29]:
len(results_df[(results_df['Decision'] == 'Buy') & (results_df['daily_return']>=0)])

34

In [31]:
len(results_df[(results_df['Decision'] == 'Sell') & (results_df['daily_return']<0)])

5

# Wrong Decisions

In [28]:
len(results_df[(results_df['Decision'] == 'Buy') & (results_df['daily_return']<0)])

26

In [30]:
len(results_df[(results_df['Decision'] == 'Sell') & (results_df['daily_return']>=0)])

0

# Filter wrong Decisions

In [34]:
results_df = results_df[(
    (results_df['Decision'] == 'Buy') & (results_df['daily_return']>=0)) | (
    (results_df['Decision'] == 'Sell') & (results_df['daily_return']>=0))]

In [36]:
len(results_df)

34

In [38]:
# results_df

In [39]:
dataset = Dataset.from_pandas(results_df)

In [40]:
dataset.push_to_hub("Arnab13/llama_trade_decisions")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 333.41ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.08it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/Arnab13/llama_trade_decisions/commit/985e521b50d6ac7dbd21e715430b289acedfff4c', commit_message='Upload dataset', commit_description='', oid='985e521b50d6ac7dbd21e715430b289acedfff4c', pr_url=None, pr_revision=None, pr_num=None)