In [1]:
# imports

import os
from dotenv import load_dotenv
from huggingface_hub import login
from datasets import load_dataset, Dataset, DatasetDict
from items import Item
import matplotlib.pyplot as plt
import pandas as pd
import re
import pickle

In [2]:
# environment

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Make sure each key exists in os.environ: an already-present value is kept,
# otherwise the placeholder string is stored.
for key_name in ('OPENAI_API_KEY', 'ANTHROPIC_API_KEY', 'HF_TOKEN'):
    os.environ.setdefault(key_name, 'your-key-if-not-using-env')

In [3]:
# Log in to HuggingFace

# The token is read from the environment populated by the environment cell.
hf_token = os.environ['HF_TOKEN']
login(token=hf_token, add_to_git_credential=True)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [4]:
# Load in our dataset

# Fetch the financial-news tweet sentiment dataset from the Hugging Face Hub
# and print its structure (splits, features, row counts).
dataset = load_dataset(path="zeroshot/twitter-financial-news-sentiment")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9543
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2388
    })
})


In [5]:
# build training dataframe
df_train = pd.DataFrame(dataset['train'])

# Translate the numeric label into a readable category; any value other than
# 0, 1 or 2 falls back to 'unknown'.
train_label_names = {0: 'bearish', 1: 'bullish', 2: 'neutral'}
df_train['label_category'] = df_train['label'].apply(
    lambda label_id: train_label_names.get(label_id, 'unknown')
)
df_train.head()

Unnamed: 0,text,label,label_category
0,$BYND - JPMorgan reels in expectations on Beyo...,0,bearish
1,$CCL $RCL - Nomura points to bookings weakness...,0,bearish
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",0,bearish
3,$ESS: BTIG Research cuts to Neutral https://t....,0,bearish
4,$FNKO - Funko slides after Piper Jaffray PT cu...,0,bearish


In [6]:
# build test dataframe
# The 'validation' split of the loaded dataset is used as the test set.
df_test = pd.DataFrame(dataset['validation'])

# Translate the numeric label into a readable category; any value other than
# 0, 1 or 2 falls back to 'unknown'.
test_label_names = {0: 'bearish', 1: 'bullish', 2: 'neutral'}
df_test['label_category'] = df_test['label'].apply(
    lambda label_id: test_label_names.get(label_id, 'unknown')
)
df_test.head()

Unnamed: 0,text,label,label_category
0,$ALLY - Ally Financial pulls outlook https://t...,0,bearish
1,"$DELL $HPE - Dell, HPE targets trimmed on comp...",0,bearish
2,$PRTY - Moody's turns negative on Party City h...,0,bearish
3,$SAN: Deutsche Bank cuts to Hold,0,bearish
4,$SITC: Compass Point cuts to Sell,0,bearish


In [7]:
# build training and test datasets
def build_items(frame):
    """Return one Item per row of `frame`, in row order.

    Each Item is constructed from the row itself (fetched positionally with
    `.iloc`) and that row's 'label_category' value.
    """
    rows = (frame.iloc[position] for position in range(len(frame)))
    return [Item(row, row['label_category']) for row in rows]


# build training dataset
train = build_items(df_train)
print(f'Training points: {len(train)}')

# build test dataset
test = build_items(df_test)
print(f'Testing points: {len(test)}')

Training points: 9543
Testing points: 2388


In [8]:
# build test and train prompts
# Only items with a truthy `prompt` are kept; filtering once per split keeps
# the three parallel lists (text / label / label_category) aligned.
train_items = [item for item in train if item.prompt]
train_prompts = [item.prompt for item in train_items]
train_labels = [item.label for item in train_items]
train_label_categories = [item.label_category for item in train_items]

# Test rows use item.test_prompt() for their text instead of item.prompt.
test_items = [item for item in test if item.prompt]
test_prompts = [item.test_prompt() for item in test_items]
test_labels = [item.label for item in test_items]
test_label_categories = [item.label_category for item in test_items]

# Create a Dataset from the lists
train_dataset = Dataset.from_dict({
    "text": train_prompts,
    "label": train_labels,
    "label_category": train_label_categories,
})
test_dataset = Dataset.from_dict({
    "text": test_prompts,
    "label": test_labels,
    "label_category": test_label_categories,
})

# NOTE: this rebinds `dataset` (previously the raw download with 'train' and
# 'validation' splits) to the prompt dataset with 'train' and 'test' splits.
# Re-run the loading cell before re-running the dataframe cells.
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

In [None]:
# upload data to HF

# Target repo on the Hub is "<user>/<dataset name>".
HF_USER = "falakjain"
DATASET_NAME = f"{HF_USER}/financial_sentiment_data"

# Push every split of `dataset`, requesting a private repo.
# The returned commit info is displayed as the cell result.
dataset.push_to_hub(repo_id=DATASET_NAME, private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        : 100%|##########|  503kB /  503kB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        : 100%|##########|  137kB /  137kB            

README.md:   0%|          | 0.00/447 [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/falakjain/financial_sentiment_data/commit/8fe6095b0c76cc685dce063294cc4f628612272c', commit_message='Upload dataset', commit_description='', oid='8fe6095b0c76cc685dce063294cc4f628612272c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/falakjain/financial_sentiment_data', endpoint='https://huggingface.co', repo_type='dataset', repo_id='falakjain/financial_sentiment_data'), pr_revision=None, pr_num=None)

In [10]:
# pickle the training and test dataset so we don't have to execute all this code next time!

# Each list of items is written to its own file in the working directory.
for pickle_path, items in (('train.pkl', train), ('test.pkl', test)):
    with open(pickle_path, 'wb') as file:
        pickle.dump(items, file)

In [15]:
# Sanity check: display the first record of the test split (text, label, label_category).
test_dataset[0]

{'text': 'What is the financial sentiment of this tweet?\n\n$DELL $HPE - Dell, HPE targets trimmed on compute headwinds https\n\nSentiment is ',
 'label': 0,
 'label_category': 'bearish'}