In [1]:
# imports

import os
from dotenv import load_dotenv
from huggingface_hub import login
from datasets import load_dataset, Dataset, DatasetDict
from items import Item
import matplotlib.pyplot as plt
import pandas as pd
import re

In [2]:
# environment
# Load settings via python-dotenv, then make sure every API key the notebook
# expects exists in os.environ; a key that is still missing is given a
# placeholder string.

load_dotenv()
os.environ.update({
    key_name: os.getenv(key_name, 'your-key-if-not-using-env')
    for key_name in ('OPENAI_API_KEY', 'ANTHROPIC_API_KEY', 'HF_TOKEN')
})

In [3]:
# Log in to HuggingFace
# The token is read from the HF_TOKEN environment variable and kept in
# `hf_token`; the login call is also asked to add it to the git credential
# store.

hf_token = os.environ['HF_TOKEN']
login(token=hf_token, add_to_git_credential=True)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [4]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [5]:
# Load in our dataset
# Fetch the Twitter financial-news sentiment dataset and print a summary of
# the splits it contains.

dataset = load_dataset(path="zeroshot/twitter-financial-news-sentiment")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9543
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2388
    })
})


In [6]:
# build training dataframe
# Readable sentiment name for each integer class id in the `label` column.
# Any id that is not listed here (or is missing) is labelled 'unknown'.
label_names = {0: 'bearish', 1: 'bullish', 2: 'neutral'}

df_train = pd.DataFrame(dataset['train'])
# A dict lookup via Series.map replaces the nested conditional lambda;
# unmapped ids come back as NaN and are filled with the 'unknown' fallback.
df_train['label_category'] = df_train['label'].map(label_names).fillna('unknown')
df_train.head()

Unnamed: 0,text,label,label_category
0,$BYND - JPMorgan reels in expectations on Beyo...,0,bearish
1,$CCL $RCL - Nomura points to bookings weakness...,0,bearish
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",0,bearish
3,$ESS: BTIG Research cuts to Neutral https://t....,0,bearish
4,$FNKO - Funko slides after Piper Jaffray PT cu...,0,bearish


In [7]:
# explore label categories
# Share of rows per sentiment category (fractions, not raw counts)
df_train['label_category'].value_counts(normalize=True)

label_category
neutral    0.647386
bullish    0.201509
bearish    0.151106
Name: proportion, dtype: float64

In [8]:
# Investigate a particular datapoint
# .iloc[2] selects the third row by position; printing it lists that row's
# value for each column.
datapoint = df_train.iloc[2]
print(datapoint)

text              $CX - Cemex cut at Credit Suisse, J.P. Morgan ...
label                                                             0
label_category                                              bearish
Name: 2, dtype: object


In [9]:
# testing prompt construction
# Item is imported from the local `items` module (not shown in this notebook).
# It is constructed from the row and the row's readable label.
# NOTE(review): presumably it builds its prompt text from the row — confirm
# in items.py.
item = Item(datapoint, datapoint['label_category'])
item.label_category

'bearish'

In [10]:
# Show the prompt with the answer stripped: keep everything before the first
# 'Sentiment is' marker, then add the marker itself back on the end.
item.prompt.partition('Sentiment is')[0] + 'Sentiment is'

'What is the financial sentiment of this tweet?\n\n$CX - Cemex cut at Credit Suisse, J.P. Morgan on weak building outlook https\n\nSentiment is'

In [11]:
# Build one Item per training row, in row order, then report how many were
# created.
# NOTE: the loop rebinds `datapoint` (set earlier to row 2) on every pass, so
# after it finishes `datapoint` and `i` refer to the last row.
items = []
for i in range(len(df_train)):
  datapoint = df_train.iloc[i]
  items.append(Item(datapoint, datapoint['label_category']))
print(len(items))

9543


In [12]:
# Print the prompt of every Item that has one and tally how many were printed.
# Items whose prompt is empty or None are skipped and not counted.
# `item` stays bound to the last element of `items` after the loop.
count = 0
for item in items:
    if item.prompt:
        count += 1
        print(item.prompt)

print(count)

What is the financial sentiment of this tweet?

$BYND - JPMorgan reels in expectations on Beyond Meat https

Sentiment is bearish
What is the financial sentiment of this tweet?

$CCL $RCL - Nomura points to bookings weakness at Carnival and Royal Caribbean https

Sentiment is bearish
What is the financial sentiment of this tweet?

$CX - Cemex cut at Credit Suisse, J.P. Morgan on weak building outlook https

Sentiment is bearish
What is the financial sentiment of this tweet?

$FNKO - Funko slides after Piper Jaffray PT cut https

Sentiment is bearish
What is the financial sentiment of this tweet?

$FTI - TechnipFMC downgraded at Berenberg but called Top Pick at Deutsche Bank https //t.co/XKcPDilIuU

Sentiment is bearish
What is the financial sentiment of this tweet?

$HNHAF $HNHPD $AAPL - Trendforce cuts iPhone estimate after Foxconn delay https //t.co/rlnEwzlzzS

Sentiment is bearish
What is the financial sentiment of this tweet?

$HOG - Moody's warns on Harley-Davidson https //t.co/Lu

In [13]:
# `item` here is the loop variable left over from the `for item in items` loop
# above, i.e. the last Item in `items`, not the row-2 example built earlier.
item.test_prompt()

'What is the financial sentiment of this tweet?\n\nYNDX, I, QD and OESX among tech movers\n\nSentiment is '