In [None]:
import pandas as pd

# FNSPID news file on the Hugging Face Hub. Only the four columns used
# downstream are read, in chunks of 100k rows.
url = "https://huggingface.co/datasets/Zihan1004/FNSPID/resolve/main/Stock_news/nasdaq_exteral_data.csv"

chunks = []
for chunk in pd.read_csv(url, usecols=["Date", "Article_title", "Stock_symbol", "Textrank_summary"],
                         chunksize=100_000, low_memory=False):

    chunk = chunk.rename(columns={"Article_title": "Title", "Stock_symbol": "Ticker"})

    # Upper-case the symbols but keep missing ones missing. A bare astype(str)
    # turns NaN into the literal string "nan", which would make the dropna on
    # Ticker further down a no-op.
    raw_ticker = chunk["Ticker"]
    chunk["Ticker"] = raw_ticker.astype(str).str.upper().where(raw_ticker.notna())

    # Unparseable dates become NaT; utc=True gives every chunk the same
    # tz-aware dtype so the UTC range filter below is always valid.
    chunk["Date"] = pd.to_datetime(chunk["Date"], errors="coerce", utc=True)

    # Rows without a summary are kept on purpose: a later cell falls back to
    # the title. This only removes duplicates within the current chunk.
    chunk = chunk.drop_duplicates(subset=["Date", "Ticker", "Textrank_summary"])
    chunks.append(chunk)

FNSID = pd.concat(chunks, ignore_index=True)
FNSID = FNSID.dropna(subset=["Ticker"])
# Untouched snapshot of the full data set, taken before the date filter.
FNSID_archive = FNSID.copy(deep=True)

# Keep 2009-01-01 till the end of 2023 (the dataset extends a couple of days into 2024; discard those).
start = "2009-01-01"
end = "2023-12-31"
range_start = pd.Timestamp(start, tz="UTC")
# `end` is an inclusive calendar day, so compare against the following midnight.
# A plain `<= Timestamp(end)` would cut off every article published after
# 00:00 on the last day.
range_end_exclusive = pd.Timestamp(end, tz="UTC") + pd.Timedelta(days=1)
FNSID = FNSID[(FNSID["Date"] >= range_start) & (FNSID["Date"] < range_end_exclusive)]

print(FNSID.shape)

(6840808, 4)


In [None]:
def count_ticker(df: pd.DataFrame,
                 tic: str) -> int:
    """Return the number of rows in ``df`` whose ``Ticker`` equals ``tic``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a ``Ticker`` column.
    tic : str
        Ticker symbol to match exactly (the comparison is case-sensitive).

    Returns
    -------
    int
        Number of matching rows; 0 when the symbol does not occur.

    Raises
    ------
    KeyError
        If ``df`` has no ``Ticker`` column.
    """
    # Summing the boolean mask counts the matches without materialising a
    # filtered, re-indexed copy of the frame.
    return int((df['Ticker'] == tic).sum())

# Symbols of interest, kept in rows of four.
# NOTE(review): each row presumably corresponds to one sector -- confirm.
_ticker_rows = (
    ("ABT", "UNH", "JNJ", "CVS"),
    ("AMZN", "MCD", "F", "GM"),
    ("JPM", "BRK.B", "AXP", "BLK"),
    ("CVX", "XOM", "DVN", "CTRA"),
    ("NVDA", "AAPL", "AMD", "CSCO"),
    ("T", "CMCSA", "TMUS", "VZ"),
    ("KO", "PG", "COST", "KMB"),
    ("GE", "MMM", "DAL", "BA"),
)

# Flatten row by row so the list order matches the layout above.
tickers = [symbol for row in _ticker_rows for symbol in row]

# Print one "<symbol>: <row count>" line per symbol of interest.
for tic in tickers:
    n_rows = count_ticker(FNSID, tic)
    print(tic, n_rows, sep=": ")

ABT: 4025
UNH: 157
JNJ: 1378
CVS: 946
AMZN: 4779
MCD: 953
F: 1485
GM: 118
JPM: 564
BRK.B: 249
AXP: 1854
BLK: 2269
CVX: 8604
XOM: 7004
DVN: 3540
CTRA: 437
NVDA: 9844
AAPL: 8886
AMD: 8941
CSCO: 588
T: 9403
CMCSA: 7858
TMUS: 4417
VZ: 1319
KO: 9027
PG: 658
COST: 6725
KMB: 3576
GE: 8616
MMM: 5213
DAL: 2457
BA: 5972


In [None]:
# Only keep the stocks I am interested in: one main stock per sector plus three
# sector "dummy" stocks that act as a stand-in for sector sentiment.
# .copy() makes the selection an independent frame, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
FNSID = FNSID[FNSID['Ticker'].isin(tickers)].copy()
print(FNSID.shape)

# Fall back to the article title when no TextRank summary is available.
FNSID["Textrank_summary"] = FNSID["Textrank_summary"].fillna(FNSID["Title"])

# Strip the timezone and truncate to the calendar day *before* de-duplicating
# and grouping. The goal is one row per ticker per day; grouping on the full
# timestamp would only merge articles published at the exact same instant.
FNSID["Date"] = FNSID["Date"].dt.tz_localize(None).dt.normalize()

# Drop rows that still have no text at all (neither summary nor title).
FNSID = FNSID.dropna(subset=["Textrank_summary"])
print(FNSID.shape)

# Drop duplicate rows (same day, ticker and text).
FNSID = FNSID.drop_duplicates(subset=["Date", "Ticker", "Textrank_summary"])
print(FNSID.shape)

# Some companies have multiple news items in one day. To simulate a realistic
# trading scenario (several news signals per day) these are concatenated into
# one big summary per (Date, Ticker), which is what sentiment analysis will see.
FNSID = (
    FNSID.groupby(["Date", "Ticker"])["Textrank_summary"]
    .agg(" ".join)
    .reset_index()
)
print(FNSID.shape)

# Rename the concatenated column for clarity.
FNSID = FNSID.rename(columns={"Textrank_summary": "Concatenated_Textrank_Summaries"})

# Keep only rows whose text contains at least one Latin letter (some summaries
# are apparently in Russian and should be filtered out). Because this runs
# after concatenation, a day is dropped only if *none* of its text has a Latin
# letter. reset_index gives the next cell a clean, independent frame.
has_latin = FNSID["Concatenated_Textrank_Summaries"].str.contains(r'[a-zA-Z]', na=False)
FNSID = FNSID[has_latin].reset_index(drop=True)
print(FNSID.shape)

(131862, 4)
(131862, 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FNSID["Textrank_summary"] = FNSID["Textrank_summary"].fillna(FNSID["Title"])


(131848, 4)
(48332, 3)
(48332, 3)


In [None]:
from huggingface_hub import HfApi

# Target dataset repo on the Hugging Face Hub and the CSV file uploaded to it.
portfolio_repo_id = "tpapa/FNSPID_portfolio"
portfolio_csv = "FNSPID_portfolio.csv"

# No token is hard-coded in the notebook. With `token` omitted, huggingface_hub
# resolves the credential itself (HF_TOKEN environment variable / Colab secret,
# or a cached login).
api = HfApi()
api.create_repo(
    portfolio_repo_id,
    repo_type="dataset",
    private=True,
    exist_ok=True,  # re-running the cell must not fail once the repo exists
)

# Write the concatenated summaries to disk, then push that file to the repo.
FNSID.to_csv(portfolio_csv, index=False, encoding="utf-8")

api.upload_file(
    path_or_fileobj=portfolio_csv,
    path_in_repo=portfolio_csv,
    repo_id=portfolio_repo_id,
    repo_type="dataset",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/tpapa/FNSPID_portfolio/commit/c06874014802f838bcfb41661f82fda6470b088f', commit_message='Upload FNSPID_portfolio.csv with huggingface_hub', commit_description='', oid='c06874014802f838bcfb41661f82fda6470b088f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/tpapa/FNSPID_portfolio', endpoint='https://huggingface.co', repo_type='dataset', repo_id='tpapa/FNSPID_portfolio'), pr_revision=None, pr_num=None)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# Load the FinBERT tone classifier (three output labels) and its tokenizer.
finbert = BertForSequenceClassification.from_pretrained(
    'yiyanghkust/finbert-tone',
    num_labels=3,
)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

# Wrap both in a sentiment pipeline; inputs are truncated to 512 tokens.
nlp = pipeline(
    "sentiment-analysis",
    model=finbert,
    tokenizer=tokenizer,
    truncation=True,
    max_length=512,
)

# One text per row, in row order, so the predictions line up with the frame.
sentences = FNSID['Concatenated_Textrank_Summaries'].tolist()
results = nlp(sentences)

# Split each prediction dict into its label and its score.
labels = []
scores = []
for prediction in results:
    labels.append(prediction['label'])
    scores.append(prediction['score'])

FNSID['sentiment'] = labels
FNSID['score'] = scores

config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
from huggingface_hub import HfApi

# Target dataset repo on the Hugging Face Hub and the CSV file uploaded to it.
sentiment_repo_id = "tpapa/FNSPID_portfolio_finbert_sentiment_score"
sentiment_csv = "FNSPID_portfolio_finbert_sentiment_score.csv"

# No token is hard-coded in the notebook. With `token` omitted, huggingface_hub
# resolves the credential itself (HF_TOKEN environment variable / Colab secret,
# or a cached login).
api = HfApi()
api.create_repo(
    sentiment_repo_id,
    repo_type="dataset",
    private=True,
    exist_ok=True,  # re-running the cell must not fail once the repo exists
)

# The raw text is not part of the published scores; only the remaining columns
# are written. `columns=` already selects the axis, so no `axis` argument is
# needed.
FNSID = FNSID.drop(columns=['Concatenated_Textrank_Summaries'])

FNSID.to_csv(sentiment_csv, index=False, encoding="utf-8")

api.upload_file(
    path_or_fileobj=sentiment_csv,
    path_in_repo=sentiment_csv,
    repo_id=sentiment_repo_id,
    repo_type="dataset",
)

CommitInfo(commit_url='https://huggingface.co/datasets/tpapa/FNSPID_portfolio_finbert_sentiment_score/commit/06634f091dfe0be6e8706d5199b48edd25fa2585', commit_message='Upload FNSPID_portfolio_finbert_sentiment_score.csv with huggingface_hub', commit_description='', oid='06634f091dfe0be6e8706d5199b48edd25fa2585', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/tpapa/FNSPID_portfolio_finbert_sentiment_score', endpoint='https://huggingface.co', repo_type='dataset', repo_id='tpapa/FNSPID_portfolio_finbert_sentiment_score'), pr_revision=None, pr_num=None)