## **DS 340W Modified Implementation**

### **Import Packages**

In [1]:
from dotenv import load_dotenv
import os
import requests
import pandas as pd
from time import sleep
from datetime import datetime, timedelta
import yfinance as yf
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
from sklearn.model_selection import KFold

### **Get Environment for API Secret**

In [2]:
load_dotenv()
api_key = os.getenv("API_KEY")

### **Connect to Guardian API for News Articles about Specific Companies**

In [3]:
def get_guardian_articles(query, start_date, end_date, api_key, max_pages=100):
    url = "https://content.guardianapis.com/search"
    news_data = []
    page_size = 50 
    
    page = 1
    while page <= max_pages:
        params = {
            'q': query,
            'from-date': start_date,
            'to-date': end_date,
            'api-key': api_key,
            'page': page,
            'page-size': page_size
        }
        
        response = requests.get(url, params=params)
        data = response.json()
        
        articles = data['response']['results']
        
        for article in articles:
            news_data.append({
                'title': article['webTitle'],
                'url': article['webUrl'],
                'publishedAt': article['webPublicationDate']
            })
        
        ## PAGINATION
        if not data['response']['pages'] > page:
            break
        
        page += 1
        sleep(0.5) # RATE LIMITS 
    
    return pd.DataFrame(news_data)

start_date = '2019-12-31' # START DATE
end_date = '2020-12-31' # END DATE (2020 FISCAL YEAR)
queries = ['Apple', 'Tesla', 'Amazon']

output_dir = "data"
os.makedirs(output_dir, exist_ok=True) # MAKE DATA DIRECTORY

for query in queries:
    year_start = datetime.strptime(start_date, "%Y-%m-%d") # PROPER QUERY FORMAT FOR DATES
    year_end = datetime.strptime(end_date, "%Y-%m-%d")
    all_articles = pd.DataFrame()

    while year_start < year_end:
        next_year_end = min(year_start + timedelta(days=365), year_end) # 365 DAYS + START DATE
        articles_df = get_guardian_articles(query, year_start.strftime("%Y-%m-%d"), 
                                            next_year_end.strftime("%Y-%m-%d"), 
                                            api_key, max_pages=100)
        all_articles = pd.concat([all_articles, articles_df], ignore_index=True)
        year_start = next_year_end + timedelta(days=1)

    # SAVE FILES TO CSVS
    file_name = f"guardian_{query.lower()}_articles.csv"
    file_path = os.path.join(output_dir, file_name)
    all_articles.to_csv(file_path, index=False)

    print(f"Saved {query} articles to {file_path}")


Saved Apple articles to data\guardian_apple_articles.csv
Saved Tesla articles to data\guardian_tesla_articles.csv
Saved Amazon articles to data\guardian_amazon_articles.csv


### **Download 2019-2020 Stock Data for Tesla, Apple, and Amazon**

In [None]:
def download_and_fill_stock_data(symbol, start_date, end_date):
    stock_data = yf.download(symbol, start=start_date, end=end_date) # YAHOO FINANCE STOCK DATA

    all_dates = pd.date_range(start=start_date, end=end_date, freq='B')  # ONLY BUSINESS DAYS (WEEKDAYS)
    stock_data = stock_data.reindex(all_dates)

    stock_data['Close'] = stock_data['Close'].fillna(method='ffill') # FORWARD FILL (FILL ANY MISSING DATES W PREVIOUS DATE)
 
    stock_data['Close'] = stock_data['Close'].fillna(method='bfill') # EXCEPTION HANDLER IN CASE FFILL DOESN'T WORK

    return stock_data

output_dir = "data"
os.makedirs(output_dir, exist_ok=True) # DATA FOLDER

symbols = ["AAPL", "AMZN", "TSLA"] # STOCKS ANALYZED
start_date = "2019-12-31"
end_date = "2020-12-31"

for symbol in symbols: # DOWNLOADER FOR EACH STOCK
    stock_data = download_and_fill_stock_data(symbol, start_date, end_date)
    
    file_name = f"{symbol.lower()}_stock_data_2019_2020.csv"
    file_path = os.path.join(output_dir, file_name)
    
    stock_data.to_csv(file_path) # SAVE TO CSV
    
    print(f"Saved {symbol} stock data to {file_path}")


In [3]:
tokenizer = BertTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment') # PRE TRAINED BERT MODEL FOR SENTIMENT ANALYSIS
model = BertForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

stock_data_apple = pd.read_csv('data/aapl_stock_data_2019_2020.csv', index_col='Date', parse_dates=True) # READ HISTORICAL STOCK DATA
stock_data_amazon = pd.read_csv('data/amzn_stock_data_2019_2020.csv', index_col='Date', parse_dates=True)
stock_data_tesla = pd.read_csv('data/tsla_stock_data_2019_2020.csv', index_col='Date', parse_dates=True)

stock_data_apple = stock_data_apple.sort_index() # SORT BY DATE (INDEX COLUMN)
stock_data_amazon = stock_data_amazon.sort_index()
stock_data_tesla = stock_data_tesla.sort_index()

# ARTICLES FOR ALL COMPANIES
df_apple = pd.read_csv("data/guardian_apple_articles.csv")
df_amazon = pd.read_csv("data/guardian_amazon_articles.csv")
df_tesla = pd.read_csv("data/guardian_tesla_articles.csv")

titles_apple = df_apple['title'].tolist()
dates_apple = df_apple['publishedAt'].tolist()

titles_amazon = df_amazon['title'].tolist()
dates_amazon = df_amazon['publishedAt'].tolist()

titles_tesla = df_tesla['title'].tolist()
dates_tesla = df_tesla['publishedAt'].tolist()

for stock_data in [stock_data_apple, stock_data_amazon, stock_data_tesla]:
    if stock_data.index.tz is not None: # ACCOUNT FOR TIME ZONE NAIVE/AWARE (FOR NO ERRORS)
        stock_data.index = stock_data.index.tz_convert(None)

### **EDA With Financial News Data**

In [7]:
print("Apple Financial News Shape: ", df_apple.shape)
print("Amazon Financial News Shape: ", df_amazon.shape)
print("Tesla Financial News Shape: ", df_tesla.shape)

Apple Financial News Shape:  (1940, 3)
Amazon Financial News Shape:  (2113, 3)
Tesla Financial News Shape:  (249, 3)


In [12]:
df_tesla.head()

Unnamed: 0,title,url,publishedAt
0,Tesla to raise another $5bn by selling shares,https://www.theguardian.com/technology/2020/de...,2020-12-08T15:09:32Z
1,Elon Musk: I tried to sell Tesla to Apple,https://www.theguardian.com/technology/2020/de...,2020-12-23T02:08:36Z
2,Tesla joins Wall Street's S&P 500 share index,https://www.theguardian.com/technology/2020/de...,2020-12-21T16:56:00Z
3,Tesla investor defends electric carmaker's soa...,https://www.theguardian.com/technology/2020/no...,2020-11-06T12:24:55Z
4,Tesla review – sparky biopic of the inventor,https://www.theguardian.com/film/2020/sep/20/t...,2020-09-20T10:00:43Z


### **Stock Price Data**

In [10]:
print("Stock Price Shape: ", stock_data_amazon.shape)

Stock Price Shape:  (263, 6)


In [11]:
stock_data_amazon.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-12-31,92.099998,92.663002,91.611504,92.391998,92.391998,50130000.0
2020-01-01,,,,92.391998,,
2020-01-02,93.75,94.900497,93.207497,94.900497,94.900497,80580000.0
2020-01-03,93.224998,94.309998,93.224998,93.748497,93.748497,75288000.0
2020-01-06,93.0,95.184502,93.0,95.143997,95.143997,81236000.0


In [5]:
def get_stock_price(symbol, date): # STOCK PRICE ON GIVEN DAY
    date = pd.to_datetime(date)
    
    if date.tz is not None:
        date = date.tz_convert(None)
    
    if symbol == "AAPL":
        stock_data = stock_data_apple
    elif symbol == "AMZN":
        stock_data = stock_data_amazon
    elif symbol == "TSLA":
        stock_data = stock_data_tesla
    
    if date in stock_data.index:
        return stock_data.loc[date, 'Close']
    else:
        previous_date = stock_data.index[stock_data.index.searchsorted(date) - 1] # USE PREVIOUS DATE IF DATA ISN'T AVAILABLE FOR CURRENT DATE
        return stock_data.loc[previous_date, 'Close']

def get_price_change(symbol, date): # GET PRICE DIFFERENCE FROM CURRENT DAY - PREVIOUS DAY
    today_price = get_stock_price(symbol, date)
    if today_price is None:
        return None
    next_day = pd.to_datetime(date) + pd.DateOffset(days=1)
    next_day_str = next_day.strftime('%Y-%m-%d')
    
    next_day_price = get_stock_price(symbol, next_day_str)
    if next_day_price is None:
        return None
    return 'up' if next_day_price > today_price else 'down'

def calculate_accuracy(predictions, articles, symbol): # ACCURACY OF MODEL
    correct_predictions = 0
    total_predictions = len(predictions)

    for idx, (prediction, article_date) in enumerate(zip(predictions, articles)):
        price_change = get_price_change(symbol, article_date)
        
        if price_change is None:
            continue

        if (prediction == 'positive' and price_change == 'up') or (prediction == 'negative' and price_change == 'down'): # CALCULATE IF CORRECT/INCORRECT
            correct_predictions += 1

    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
    return accuracy

### **Model with K-Fold Cross-Validation**

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # GPU OPTIMIZATION
model.to(device)

def get_sentiment(titles, model, device): # SENTIMENT TAGGING
    model.eval()
    sentiments = []
    
    for title in titles:
        inputs = tokenizer(title, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device) #FORMAT DATA FOR TENSOR
        with torch.no_grad():
            outputs = model(**inputs)
        prediction = torch.argmax(outputs.logits).item()
        
        if prediction == 4:
            sentiments.append('positive')
        else:
            sentiments.append('negative')
    
    return sentiments

def cross_validate(titles, dates, model, device, symbol, n_splits=5): # CROSS VALIDATION FOR N_SPLITS
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42) # KFOLD CV
    accuracies = []
    
    for train_index, test_index in kf.split(titles):
        train_titles, test_titles = [titles[i] for i in train_index], [titles[i] for i in test_index]
        train_dates, test_dates = [dates[i] for i in train_index], [dates[i] for i in test_index]
        
        train_predictions = get_sentiment(train_titles, model, device)
        test_predictions = get_sentiment(test_titles, model, device) 
        
        accuracy = calculate_accuracy(test_predictions, test_dates, symbol) # CALCULATE ACCURACY FOR KFOLD CV
        accuracies.append(accuracy)
    
    avg_accuracy = np.mean(accuracies) # AVG ACCURACY FOR ALL FOLDS
    return avg_accuracy

symbols = ["AAPL", "AMZN", "TSLA"]
datasets = [
    (titles_apple, dates_apple, "AAPL"),
    (titles_amazon, dates_amazon, "AMZN"),
    (titles_tesla, dates_tesla, "TSLA")
]

for titles, dates, symbol in datasets:
    avg_accuracy = cross_validate(titles, dates, model, device, symbol, n_splits=5) # 5 SPLITS
    print(f"Average sentiment prediction accuracy for {symbol} (cross-validation): {avg_accuracy}")


Average sentiment prediction accuracy for AAPL (cross-validation): 0.5025773195876289
Average sentiment prediction accuracy for AMZN (cross-validation): 0.5390373432825788
Average sentiment prediction accuracy for TSLA (cross-validation): 0.5462040816326531
