In [14]:
import os
import sys
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from typing import List, Dict, Any, Tuple, Optional
import polars as pl

from datasets import load_dataset
from bs4 import BeautifulSoup

from pydantic import BaseModel, Field

import warnings
load_dotenv()
warnings.filterwarnings("ignore")

# os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [17]:
def download_csv(url, output_path="nasdaq_external_data.csv", block_size=1024, timeout=30):
    """Stream a CSV from ``url`` to disk and load it into a DataFrame.

    Args:
        url: Address of the CSV file to download.
        output_path: Where the downloaded bytes are written. Defaults to the
            file name the notebook has always used.
        block_size: Number of bytes requested per ``iter_content`` chunk.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        The DataFrame parsed from the downloaded file, or ``None`` when the
        number of bytes received does not match the advertised
        ``content-length`` (an error line is printed in that case).

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    bytes_written = 0

    # Using the response as a context manager releases the streamed
    # connection even if writing to disk fails part-way through.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail early instead of saving an HTML error page as "CSV".
        response.raise_for_status()
        file_size = int(response.headers.get('content-length', 0))

        with tqdm(total=file_size, unit='iB', unit_scale=True, desc="Downloading") as progress_bar:
            with open(output_path, "wb") as file:
                for data in response.iter_content(block_size):
                    progress_bar.update(len(data))
                    file.write(data)
                    bytes_written += len(data)

    # A file_size of 0 means the server sent no content-length, so the check is skipped.
    # NOTE(review): iter_content yields decoded bytes; if the server applies a
    # Content-Encoding this count may differ from content-length - confirm.
    if file_size != 0 and bytes_written != file_size:
        print("ERROR: Download incomplete")
        return None

    nasdaq_df = pd.read_csv(output_path)
    print(nasdaq_df.head())
    return nasdaq_df



In [6]:
# Direct link to the CSV inside the FNSPID dataset repository on Hugging Face.
url = "https://huggingface.co/datasets/Zihan1004/FNSPID/resolve/main/Stock_news/nasdaq_exteral_data.csv"

# Fetch the file, then echo whatever download_csv handed back as the cell output.
nasdaq_df = download_csv(url)
nasdaq_df

In [20]:
# Ticker symbols to keep; rows are matched on the "Stock_symbol" column below.
stock_symbols = [
    "ADBE", "AMD", "ABNB", "GOOGL", "GOOG", "AMZN", "AEP", "AMGN", "ADI", "ANSS", "AAPL", "AMAT",
    "APP", "ARM", "ASML", "AZN", "TEAM", "ADSK", "ADP", "BKR", "BIIB", "BKNG", "AVGO", "CDNS",
    "CDW", "CHTR", "CTAS", "CSCO", "CCEP", "CTSH", "CMCSA", "CEG", "CPRT", "CSGP", "COST", "CRWD",
    "CSX", "DDOG", "DXCM", "FANG", "DASH", "EA", "EXC", "FAST", "FTNT", "GEHC", "GILD", "GFS",
    "HON", "IDXX", "ILMN", "INTC", "INTU", "ISRG", "KDP", "KLAC", "KHC", "LRCX", "LIN", "LULU",
    "MAR", "MRVL", "MELI", "META", "MCHP", "MU", "MSFT", "MRNA", "MDLZ", "MDB", "MNST", "NFLX",
    "NVDA", "NXPI", "ORLY", "ODFL", "ON", "PCAR", "PANW", "PAYX", "PYPL", "PDD", "PEP", "QCOM",
    "REGN", "ROP", "ROST", "SBUX", "SMCI", "SNPS", "TTWO", "TMUS", "TSLA", "TXN", "TTD", "VRSK",
    "VRTX", "WBD", "WDAY", "XEL", "ZS"
]

URL = "/kaggle/working/nasdaq_external_data.csv"

# Read the CSV in chunks of 100,000 rows rather than loading it in one call.
reader = pd.read_csv(URL, chunksize=100_000)

# Matching rows are gathered per chunk and concatenated once after the loop;
# calling pd.concat on every iteration re-copies all rows kept so far.
filtered_chunks = []
total_rows = 0

for df in tqdm(reader):
    filtered_df = df[df["Stock_symbol"].isin(stock_symbols)]
    filtered_chunks.append(filtered_df)
    total_rows += len(filtered_df)

    print(f"{filtered_df['Stock_symbol'].unique()} | Total Len: {total_rows} rows")

# pd.concat raises on an empty list, so fall back to an empty frame when the
# reader produced no chunks at all.
final_df = pd.concat(filtered_chunks) if filtered_chunks else pd.DataFrame()

1it [00:10, 10.29s/it]

['AAPL' 'ADBE' 'ADI' 'ADP' 'ADSK' 'AEP'] | Total Len: 19844 rows


2it [00:19,  9.74s/it]

['AMAT' 'AMD' 'AMGN'] | Total Len: 37382 rows


3it [00:29,  9.96s/it]

['AMZN' 'ANSS' 'ASML' 'AVGO' 'AZN'] | Total Len: 44771 rows


4it [00:39,  9.80s/it]

['BIIB' 'BKNG' 'BKR'] | Total Len: 53186 rows


5it [00:48,  9.65s/it]

['CCEP'] | Total Len: 53422 rows


6it [00:59, 10.02s/it]

['CDNS' 'CHTR' 'CMCSA'] | Total Len: 63452 rows


7it [01:10, 10.30s/it]

['COST' 'CPRT' 'CRWD' 'CSGP' 'CSX' 'CTAS' 'CTSH'] | Total Len: 77255 rows


8it [01:20, 10.13s/it]

['DDOG'] | Total Len: 78441 rows


9it [01:29, 10.00s/it]

['DXCM' 'EA'] | Total Len: 83130 rows


10it [01:39,  9.75s/it]

['EXC' 'FANG' 'FAST'] | Total Len: 87723 rows


11it [01:50, 10.36s/it]

['FTNT'] | Total Len: 89417 rows


12it [02:02, 10.88s/it]

['GILD' 'GOOG'] | Total Len: 105545 rows


13it [02:13, 10.92s/it]

[] | Total Len: 105545 rows


14it [02:24, 10.92s/it]

['INTC' 'KHC' 'KLAC'] | Total Len: 116663 rows


15it [02:36, 11.28s/it]

['LIN' 'LRCX' 'MELI' 'MNST' 'MRVL' 'MSFT'] | Total Len: 128653 rows


16it [02:48, 11.49s/it]

['MU'] | Total Len: 135109 rows


17it [03:00, 11.67s/it]

['NVDA' 'ODFL' 'ON' 'ORLY'] | Total Len: 147447 rows


18it [03:12, 11.72s/it]

['PANW' 'PAYX' 'PCAR' 'PDD' 'PEP'] | Total Len: 159919 rows


19it [03:24, 11.75s/it]

['PYPL' 'QCOM' 'REGN'] | Total Len: 172265 rows


20it [03:35, 11.62s/it]

['ROP' 'ROST' 'SBUX' 'SMCI'] | Total Len: 182601 rows


21it [03:47, 11.71s/it]

['TEAM'] | Total Len: 183502 rows


22it [03:58, 11.50s/it]

['TMUS' 'TSLA' 'TTWO' 'TXN'] | Total Len: 201895 rows


23it [04:10, 11.38s/it]

[] | Total Len: 201895 rows


24it [04:21, 11.42s/it]

['VRSK' 'VRTX' 'WDAY'] | Total Len: 207213 rows


25it [04:32, 11.16s/it]

['XEL' 'ZS' 'AAPL'] | Total Len: 210041 rows


26it [04:32,  8.07s/it]

['ADBE' 'ADI' 'ADP' 'ADSK' 'AMAT' 'AMD' 'AMGN' 'AMZN' 'ANSS' 'APP' 'ASML'] | Total Len: 217391 rows


27it [04:33,  5.88s/it]

['AVGO' 'AZN' 'BIIB'] | Total Len: 222943 rows


28it [04:35,  4.63s/it]

['CDNS' 'CDW' 'CHTR' 'CMCSA' 'CPRT' 'CSCO' 'CSGP' 'CSX'] | Total Len: 228048 rows


29it [04:37,  3.91s/it]

['CSX' 'CTAS' 'CTSH' 'DXCM' 'EA'] | Total Len: 232887 rows


30it [04:39,  3.42s/it]

['FANG' 'FAST'] | Total Len: 233527 rows


31it [04:42,  3.08s/it]

['FTNT' 'GILD' 'GOOG' 'GOOGL'] | Total Len: 240160 rows


32it [04:44,  2.81s/it]

['IDXX' 'ILMN' 'INTC' 'INTU'] | Total Len: 242816 rows


33it [04:46,  2.53s/it]

['KLAC' 'LRCX' 'LULU' 'MAR' 'MCHP'] | Total Len: 246395 rows


34it [04:48,  2.38s/it]

['MDLZ' 'MNST' 'MRVL' 'MU' 'NFLX'] | Total Len: 255239 rows


35it [04:50,  2.31s/it]

['NVDA' 'NXPI' 'ORLY' 'PANW' 'PAYX' 'PEP'] | Total Len: 264228 rows


36it [04:52,  2.32s/it]

['QCOM' 'REGN' 'ROP'] | Total Len: 269029 rows


37it [04:53,  1.87s/it]

['SBUX' 'SNPS'] | Total Len: 270321 rows


38it [04:54,  1.58s/it]

['TMUS' 'TSLA' 'TXN'] | Total Len: 274389 rows


39it [04:55,  1.40s/it]

['VRSK' 'VRTX' 'WDAY' 'XEL'] | Total Len: 277393 rows


40it [05:02,  3.06s/it]

[] | Total Len: 277393 rows


41it [05:09,  4.37s/it]

[] | Total Len: 277393 rows


42it [05:16,  5.14s/it]

[] | Total Len: 277393 rows


43it [05:24,  5.92s/it]

[] | Total Len: 277393 rows


44it [05:32,  6.56s/it]

[] | Total Len: 277393 rows


45it [05:40,  6.92s/it]

[] | Total Len: 277393 rows


46it [05:47,  6.92s/it]

[] | Total Len: 277393 rows


47it [05:54,  6.99s/it]

[] | Total Len: 277393 rows


48it [05:55,  5.22s/it]

['AAPL' 'ADBE' 'ADI' 'ADP' 'ADSK' 'AMAT' 'AMGN' 'ANSS'] | Total Len: 280023 rows


49it [05:56,  3.97s/it]

['APP' 'ASML' 'AVGO' 'AZN' 'BIIB'] | Total Len: 283508 rows


50it [05:57,  3.09s/it]

['CDNS' 'CDW' 'CHTR'] | Total Len: 284437 rows


51it [05:58,  2.49s/it]

['CMCSA' 'CPRT' 'CSCO' 'CSGP' 'CSX' 'CTAS' 'CTSH'] | Total Len: 290577 rows


52it [05:59,  2.07s/it]

['DXCM' 'EA'] | Total Len: 293145 rows


53it [06:00,  1.77s/it]

['FANG' 'FAST'] | Total Len: 295130 rows


54it [06:01,  1.57s/it]

['FTNT' 'GILD'] | Total Len: 297783 rows


55it [06:03,  1.43s/it]

['GOOG' 'GOOGL'] | Total Len: 297963 rows


56it [06:04,  1.33s/it]

['IDXX' 'ILMN' 'INTC' 'INTU'] | Total Len: 303483 rows


57it [06:05,  1.26s/it]

['KLAC'] | Total Len: 304655 rows


58it [06:06,  1.22s/it]

['LRCX' 'LULU' 'MAR' 'MCHP' 'MDLZ' 'MNST' 'MRVL'] | Total Len: 311784 rows


59it [06:07,  1.19s/it]

['MU' 'NXPI'] | Total Len: 311801 rows


60it [06:08,  1.17s/it]

['ORLY' 'PANW' 'PAYX' 'PEP'] | Total Len: 316634 rows


61it [06:09,  1.16s/it]

['QCOM' 'REGN' 'ROP'] | Total Len: 319688 rows


62it [06:10,  1.15s/it]

['SBUX' 'SNPS'] | Total Len: 320364 rows


63it [06:12,  1.15s/it]

[] | Total Len: 320364 rows


64it [06:13,  1.15s/it]

['TMUS' 'TXN'] | Total Len: 322538 rows


65it [06:14,  1.11s/it]

['VRSK' 'VRTX' 'WDAY' 'XEL'] | Total Len: 326150 rows


66it [06:15,  1.09s/it]

[] | Total Len: 326150 rows


67it [06:16,  1.06s/it]

[] | Total Len: 326150 rows


68it [06:17,  1.02s/it]

[] | Total Len: 326150 rows


69it [06:18,  1.01it/s]

[] | Total Len: 326150 rows


70it [06:19,  1.00s/it]

[] | Total Len: 326150 rows


71it [06:20,  1.01s/it]

[] | Total Len: 326150 rows


72it [06:21,  1.03s/it]

[] | Total Len: 326150 rows


73it [06:22,  1.05s/it]

[] | Total Len: 326150 rows


74it [06:23,  1.06s/it]

[] | Total Len: 326150 rows


75it [06:24,  1.06s/it]

[] | Total Len: 326150 rows


76it [06:25,  1.05s/it]

[] | Total Len: 326150 rows


77it [06:26,  1.05s/it]

[] | Total Len: 326150 rows


78it [06:27,  1.05s/it]

[] | Total Len: 326150 rows


79it [06:28,  1.04s/it]

[] | Total Len: 326150 rows


80it [06:29,  1.04s/it]

[] | Total Len: 326150 rows


81it [06:30,  1.05s/it]

[] | Total Len: 326150 rows


82it [06:31,  1.04s/it]

[] | Total Len: 326150 rows


83it [06:32,  1.03s/it]

[] | Total Len: 326150 rows


84it [06:33,  1.01s/it]

[] | Total Len: 326150 rows


85it [06:34,  1.02s/it]

[] | Total Len: 326150 rows


86it [06:35,  1.02s/it]

[] | Total Len: 326150 rows


87it [06:36,  1.03s/it]

[] | Total Len: 326150 rows


88it [06:37,  1.03s/it]

[] | Total Len: 326150 rows


89it [06:38,  1.04s/it]

[] | Total Len: 326150 rows


90it [06:39,  1.04s/it]

[] | Total Len: 326150 rows


91it [06:40,  1.02s/it]

[] | Total Len: 326150 rows


92it [06:41,  1.01it/s]

[] | Total Len: 326150 rows


93it [06:42,  1.02it/s]

[] | Total Len: 326150 rows


94it [06:43,  1.01s/it]

[] | Total Len: 326150 rows


95it [06:44,  1.02s/it]

[] | Total Len: 326150 rows


96it [06:46,  1.03s/it]

[] | Total Len: 326150 rows


97it [06:47,  1.03s/it]

[] | Total Len: 326150 rows


98it [06:48,  1.03s/it]

[] | Total Len: 326150 rows


99it [06:49,  1.03s/it]

[] | Total Len: 326150 rows


100it [06:49,  4.10s/it]

[] | Total Len: 326150 rows





In [28]:
# Convert the Date column to datetimes, discard rows that have no Article
# value, and write the remaining rows to a brotli-compressed Parquet file
# without the index.
final_df["Date"] = final_df["Date"].pipe(pd.to_datetime)
final_df = final_df.dropna(subset=["Article"])
final_df.to_parquet(
    "filtered_news.parquet",
    compression="brotli",
    index=False,
)

In [33]:
# Remove the columns that are not needed downstream and show the result.
dropped_df = final_df.drop(
    columns=[
        "Lsa_summary",
        "Luhn_summary",
        "Lexrank_summary",
        "Publisher",
        "Author",
        "Unnamed: 0",
    ]
)
dropped_df

Unnamed: 0,Date,Article_title,Stock_symbol,Url,Article,Textrank_summary
12025,2023-12-16 22:00:00+00:00,My 6 Largest Portfolio Holdings Heading Into 2...,AAPL,https://www.nasdaq.com/articles/my-6-largest-p...,"After an absolute disaster of a year in 2022, ...",3: Apple There's little question that Apple (N...
12026,2023-12-16 22:00:00+00:00,Brokers Suggest Investing in Apple (AAPL): Rea...,AAPL,https://www.nasdaq.com/articles/brokers-sugges...,"When deciding whether to buy, sell, or hold a ...",Let's take a look at what these Wall Street he...
12027,2023-12-16 21:00:00+00:00,"Company News for Dec 19, 2023",AAPL,https://www.nasdaq.com/articles/company-news-f...,Shares of Apple Inc. AAPL lost 0.9% on China’s...,Click to get this free report Apple Inc. (AAPL...
12028,2023-12-16 21:00:00+00:00,NVIDIA (NVDA) Up 243% YTD: Will It Carry Momen...,AAPL,https://www.nasdaq.com/articles/nvidia-nvda-up...,NVIDIA Corporation NVDA has witnessed a remark...,Other Stocks in the $1T Club Apart from NVIDIA...
12029,2023-12-16 21:00:00+00:00,"Pre-Market Most Active for Dec 19, 2023 : BMY,...",AAPL,https://www.nasdaq.com/articles/pre-market-mos...,The NASDAQ 100 Pre-Market Indicator is up 10.1...,"Apple Inc. (AAPL) is +0.86 at $196.75, with 1,..."
...,...,...,...,...,...,...
2488761,2018-03-07 00:00:00+00:00,Here's a first: The Praetorian Group files $75...,ZS,https://www.nasdaq.com/articles/heres-first-pr...,The Praetorian Group filed what appears to be ...,"However, IPO Intelligence clients can access o..."
2488762,2018-03-05 00:00:00+00:00,Zscaler leads the unicorn charge of 2018 - her...,ZS,https://www.nasdaq.com/articles/zscaler-leads-...,"Zscaler will be the year's first ""unicorn"" IPO...","¤ Leading cybersecurity provider ¤ 50% growth,..."
2488763,2018-03-02 00:00:00+00:00,US IPO Week Ahead: 1 biotech expected in the w...,ZS,https://www.nasdaq.com/articles/us-ipo-week-ah...,Only one small biotech is on the calendar for ...,"That said, four deals are able to launch on Mo..."
2488764,2018-02-23 00:00:00+00:00,US IPO Weekly Recap: Dropbox files for a $500 ...,ZS,https://www.nasdaq.com/articles/us-ipo-weekly-...,"The year's first ""decacorn"" is here: Dropbox (...",But if Dropbox and cybersecurity unicorn Zscal...


In [37]:
# Make sure Date holds datetimes, then keep only the first row for each
# distinct Textrank_summary value.
dropped_df['Date'] = pd.to_datetime(dropped_df['Date'])
dropped_df = dropped_df.drop_duplicates(subset=["Textrank_summary"])

# For every stock symbol, order its rows newest-first and keep the leading
# 10% of them (never fewer than one row per symbol).
sampled_dfs = [
    symbol_rows
    .sort_values(by="Date", ascending=False)
    .head(max(1, int(len(symbol_rows) * 0.1)))
    for _, symbol_rows in dropped_df.groupby("Stock_symbol")
]

# Stack the per-symbol slices into one frame with a fresh 0..n-1 index.
document_df = pd.concat(sampled_dfs, ignore_index=True)
document_df

Unnamed: 0,Date,Article_title,Stock_symbol,Url,Article,Textrank_summary
0,2023-12-16 22:00:00+00:00,My 6 Largest Portfolio Holdings Heading Into 2...,AAPL,https://www.nasdaq.com/articles/my-6-largest-p...,"After an absolute disaster of a year in 2022, ...",3: Apple There's little question that Apple (N...
1,2023-12-16 22:00:00+00:00,Brokers Suggest Investing in Apple (AAPL): Rea...,AAPL,https://www.nasdaq.com/articles/brokers-sugges...,"When deciding whether to buy, sell, or hold a ...",Let's take a look at what these Wall Street he...
2,2023-12-16 21:00:00+00:00,"Company News for Dec 19, 2023",AAPL,https://www.nasdaq.com/articles/company-news-f...,Shares of Apple Inc. AAPL lost 0.9% on China’s...,Click to get this free report Apple Inc. (AAPL...
3,2023-12-16 21:00:00+00:00,NVIDIA (NVDA) Up 243% YTD: Will It Carry Momen...,AAPL,https://www.nasdaq.com/articles/nvidia-nvda-up...,NVIDIA Corporation NVDA has witnessed a remark...,Other Stocks in the $1T Club Apart from NVIDIA...
4,2023-12-16 21:00:00+00:00,"Pre-Market Most Active for Dec 19, 2023 : BMY,...",AAPL,https://www.nasdaq.com/articles/pre-market-mos...,The NASDAQ 100 Pre-Market Indicator is up 10.1...,"Apple Inc. (AAPL) is +0.86 at $196.75, with 1,..."
...,...,...,...,...,...,...
18987,2023-09-06 00:00:00+00:00,Oppenheimer Maintains Zscaler (ZS) Outperform ...,ZS,https://www.nasdaq.com/articles/oppenheimer-ma...,"Fintel reports that on September 6, 2023, Oppe...",Additional reading: Zscaler Reports Fourth Qua...
18988,2023-09-06 00:00:00+00:00,Rosenblatt Reiterates Zscaler (ZS) Buy Recomme...,ZS,https://www.nasdaq.com/articles/rosenblatt-rei...,"Fintel reports that on September 6, 2023, Rose...",Additional reading: Zscaler Reports Fourth Qua...
18989,2023-09-06 00:00:00+00:00,Wedbush Maintains Zscaler (ZS) Outperform Reco...,ZS,https://www.nasdaq.com/articles/wedbush-mainta...,"Fintel reports that on September 6, 2023, Wedb...",Additional reading: Zscaler Reports Fourth Qua...
18990,2023-09-06 00:00:00+00:00,BTIG Maintains Zscaler (ZS) Buy Recommendation,ZS,https://www.nasdaq.com/articles/btig-maintains...,"Fintel reports that on September 6, 2023, BTIG...",Additional reading: Zscaler Reports Fourth Qua...


In [40]:
# Write document_df to a brotli-compressed Parquet file, then display it.
document_df.to_parquet("document.parquet", compression="brotli")
document_df

Unnamed: 0,Date,Article_title,Stock_symbol,Url,Article,Textrank_summary
0,2023-12-16 22:00:00+00:00,My 6 Largest Portfolio Holdings Heading Into 2...,AAPL,https://www.nasdaq.com/articles/my-6-largest-p...,"After an absolute disaster of a year in 2022, ...",3: Apple There's little question that Apple (N...
1,2023-12-16 22:00:00+00:00,Brokers Suggest Investing in Apple (AAPL): Rea...,AAPL,https://www.nasdaq.com/articles/brokers-sugges...,"When deciding whether to buy, sell, or hold a ...",Let's take a look at what these Wall Street he...
2,2023-12-16 21:00:00+00:00,"Company News for Dec 19, 2023",AAPL,https://www.nasdaq.com/articles/company-news-f...,Shares of Apple Inc. AAPL lost 0.9% on China’s...,Click to get this free report Apple Inc. (AAPL...
3,2023-12-16 21:00:00+00:00,NVIDIA (NVDA) Up 243% YTD: Will It Carry Momen...,AAPL,https://www.nasdaq.com/articles/nvidia-nvda-up...,NVIDIA Corporation NVDA has witnessed a remark...,Other Stocks in the $1T Club Apart from NVIDIA...
4,2023-12-16 21:00:00+00:00,"Pre-Market Most Active for Dec 19, 2023 : BMY,...",AAPL,https://www.nasdaq.com/articles/pre-market-mos...,The NASDAQ 100 Pre-Market Indicator is up 10.1...,"Apple Inc. (AAPL) is +0.86 at $196.75, with 1,..."
...,...,...,...,...,...,...
18987,2023-09-06 00:00:00+00:00,Oppenheimer Maintains Zscaler (ZS) Outperform ...,ZS,https://www.nasdaq.com/articles/oppenheimer-ma...,"Fintel reports that on September 6, 2023, Oppe...",Additional reading: Zscaler Reports Fourth Qua...
18988,2023-09-06 00:00:00+00:00,Rosenblatt Reiterates Zscaler (ZS) Buy Recomme...,ZS,https://www.nasdaq.com/articles/rosenblatt-rei...,"Fintel reports that on September 6, 2023, Rose...",Additional reading: Zscaler Reports Fourth Qua...
18989,2023-09-06 00:00:00+00:00,Wedbush Maintains Zscaler (ZS) Outperform Reco...,ZS,https://www.nasdaq.com/articles/wedbush-mainta...,"Fintel reports that on September 6, 2023, Wedb...",Additional reading: Zscaler Reports Fourth Qua...
18990,2023-09-06 00:00:00+00:00,BTIG Maintains Zscaler (ZS) Buy Recommendation,ZS,https://www.nasdaq.com/articles/btig-maintains...,"Fintel reports that on September 6, 2023, BTIG...",Additional reading: Zscaler Reports Fourth Qua...
