In [53]:
from datasets import load_dataset
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# word_tokenize relies on the Punkt sentence models. Older NLTK releases look
# for the "punkt" resource while newer ones look for "punkt_tab", so fetch
# both; "stopwords" provides the English stop-word list used below.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /Users/yangyang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yangyang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## FOMC

In [54]:
dataset = load_dataset("gtfintechlab/fomc_communication")

# English stop-word lookup; the later dataset cells reuse this set.
stop_words = set(stopwords.words("english"))

# Parallel per-row lists: sentence text, label, and the two word counts.
train_context = []
train_actual_labels = []
context_stop_words_counts = []
context_non_stop_words_counts = []

for row in dataset["train"]:
    train_context.append(row["sentence"])
    train_actual_labels.append(row["label"])

    tokens = word_tokenize(row["sentence"])
    stop_total = len([tok for tok in tokens if tok.lower() in stop_words])

    context_stop_words_counts.append(stop_total)
    # Every token that is not a stop word, plus 1 to account for the label.
    context_non_stop_words_counts.append(len(tokens) - stop_total + 1)

fomc_columns = {
    "context": train_context,
    "context_stop_words_count": context_stop_words_counts,
    "context_non_stop_words_count": context_non_stop_words_counts,
}
df_fomc = pd.DataFrame(fomc_columns)

In [55]:
# Display the FOMC word-count frame built in the previous cell.
df_fomc

Unnamed: 0,context,context_stop_words_count,context_non_stop_words_count
0,Broad equity price indexes fell sharply over t...,3,11
1,"For example, although a number of features of ...",16,30
2,"On balance, most participants still expected i...",6,12
3,Low readings on overall and core consumer pric...,10,27
4,But I want to emphasize that we do have a comm...,10,9
...,...,...,...
1979,"Consumer price inflation, as measured by the 1...",9,22
1980,We said that we expect to maintain an accommod...,38,54
1981,Increased rates and a smaller balance sheet ra...,6,15
1982,Participants judged that an appropriate firmin...,15,34


## Financial Phrasebank

In [56]:
configs = [
    "sentences_50agree",
    "sentences_66agree",
    "sentences_75agree",
    "sentences_allagree",
]

# One word-count frame per config. Building `df_fpb` directly inside the loop
# would overwrite it on every pass and silently discard all but the last
# config, so each result is kept here under its config name instead.
fpb_frames = {}

for config in configs:
    dataset = load_dataset("financial_phrasebank", config)
    train_sentence, train_actual_labels = [], []
    sentence_stop_words_counts, sentence_non_stop_words_counts = [], []

    for sentence in dataset["train"]:
        text = sentence["sentence"]
        train_sentence.append(text)
        train_actual_labels.append(sentence["label"])

        sentence_words = word_tokenize(text)
        sentence_stop_words_count = sum(
            1 for word in sentence_words if word.lower() in stop_words
        )
        sentence_non_stop_words_count = len(sentence_words) - sentence_stop_words_count

        sentence_stop_words_counts.append(sentence_stop_words_count)
        # +1 presumably accounts for the label, as in the FOMC cell.
        sentence_non_stop_words_counts.append(sentence_non_stop_words_count + 1)

    fpb_frames[config] = pd.DataFrame(
        {
            "context": train_sentence,
            "context_stop_words_count": sentence_stop_words_counts,
            "context_non_stop_words_count": sentence_non_stop_words_counts,
        }
    )

# Later cells read `df_fpb`. Bind it explicitly to the last config
# ("sentences_allagree"), which is the frame those cells have always received.
df_fpb = fpb_frames[configs[-1]]

In [57]:
# Display the Financial Phrasebank word-count frame.
df_fpb

Unnamed: 0,context,context_stop_words_count,context_non_stop_words_count
0,"According to Gran , the company has no plans t...",12,14
1,"For the last quarter of 2010 , Componenta 's n...",16,24
2,"In the third quarter of 2010 , net sales incre...",8,22
3,Operating profit rose to EUR 13.1 mn from EUR ...,6,19
4,"Operating profit totalled EUR 21.1 mn , up fro...",4,19
...,...,...,...
2259,Operating result for the 12-month period decre...,11,17
2260,HELSINKI Thomson Financial - Shares in Cargote...,11,30
2261,LONDON MarketWatch -- Share prices ended lower...,7,20
2262,Operating profit fell to EUR 35.4 mn from EUR ...,4,20


## Numclaim

In [58]:
dataset = load_dataset("gtfintechlab/Numclaim")

# Parallel per-row lists: text, label, and the two word counts.
context = []
actual_labels = []
context_stop_words_counts = []
context_non_stop_words_counts = []

# Walk the train split one row at a time.
for row in dataset["train"]:
    text = row["context"]
    context.append(text)
    # Assuming 'response' is the correct key for the label.
    actual_labels.append(row["response"])

    tokens = word_tokenize(text)
    stop_total = len([tok for tok in tokens if tok.lower() in stop_words])

    context_stop_words_counts.append(stop_total)
    # +1 presumably accounts for the label, as in the FOMC cell.
    context_non_stop_words_counts.append(len(tokens) - stop_total + 1)

# Assemble the per-row results into a frame.
numclaim_columns = {
    "context": context,
    "context_stop_words_count": context_stop_words_counts,
    "context_non_stop_words_count": context_non_stop_words_counts,
}
df_numcliam = pd.DataFrame(numclaim_columns)

## FinBench

In [59]:
dataset = load_dataset("gtfintechlab/FinBench")

instructions = []
instruction_stop_words_counts = []
instruction_non_stop_words_counts = []

# Metadata fields whose lengths are added to the non-stop-word count.
metadata_fields = ("num_idx", "cat_idx", "cat_dim", "cat_str", "col_name")

for entry in dataset["train"]:
    text = entry["X_instruction_for_profile"]
    instructions.append(text)

    tokens = word_tokenize(text)
    stop_total = len([tok for tok in tokens if tok.lower() in stop_words])

    # Length of every metadata field, plus 1 each for num_classes and
    # num_features (hence the + 2).
    metadata_total = sum(len(entry[field]) for field in metadata_fields) + 2

    instruction_stop_words_counts.append(stop_total)
    instruction_non_stop_words_counts.append(
        len(tokens) - stop_total + metadata_total
    )

finbench_columns = {
    "context": instructions,
    "context_stop_words_count": instruction_stop_words_counts,
    "context_non_stop_words_count": instruction_non_stop_words_counts,
}
df_finbench = pd.DataFrame(finbench_columns)

## FinEntity

In [60]:
subsets = ["5768", "78516", "944601"]  # Actual subset names

# One word-count frame per subset. Building `df_finentity` directly inside the
# loop would overwrite it on every pass and silently discard all but the last
# subset, so each result is kept here under its subset name instead.
finentity_frames = {}

for subset in subsets:
    dataset = load_dataset("gtfintechlab/finentity", subset)

    texts, stop_words_counts, non_stop_words_counts = [], [], []
    for entry in dataset["train"]:
        text = entry["content"]
        texts.append(text)

        content_words = word_tokenize(text)
        content_stop_words_count = sum(
            1 for word in content_words if word.lower() in stop_words
        )
        content_non_stop_words_count = len(content_words) - content_stop_words_count

        # Non-stop words inside each annotation value are counted on top of
        # the non-stop words of the content itself.
        annotation_non_stop_words_count = sum(
            1
            for annotation in entry["annotations"]
            for word in word_tokenize(annotation["value"])
            if word.lower() not in stop_words
        )

        stop_words_counts.append(content_stop_words_count)
        non_stop_words_counts.append(
            content_non_stop_words_count + annotation_non_stop_words_count
        )

    # Frame for this subset's train split.
    finentity_frames[subset] = pd.DataFrame(
        {
            "context": texts,
            "context_stop_words_count": stop_words_counts,
            "context_non_stop_words_count": non_stop_words_counts,
        }
    )

# Later cells read `df_finentity`. Bind it explicitly to the last subset
# ("944601"), which is the frame those cells have always received.
df_finentity = finentity_frames[subsets[-1]]

In [61]:
# Display the FinEntity word-count frame.
df_finentity

Unnamed: 0,context,context_stop_words_count,context_non_stop_words_count
0,Refinitiv projected average U.S. gas demand in...,14,35
1,"""Moderating inflation is key to higher equity ...",15,36
2,"So far, the slump has been more than offset by...",33,55
3,"In the spot market, next-day power for Monday ...",17,37
4,The former is helped out by the Netherlands' l...,11,33
...,...,...,...
680,The biggest macro force at play was the strong...,13,42
681,"Milan, 9th edition conference to award the ""Pr...",8,91
682,Germany-based footwear maker Adidas AG <ADSGn....,8,23
683,That sent Micron's shares and the Philadelphia...,15,52


## ECTsum

In [62]:
dataset = load_dataset("gtfintechlab/ECTsum")

contexts = []
responses = []  # initialised here but never filled by this cell
total_stop_words_counts = []
total_non_stop_words_counts = []

# Walk the train split one row at a time.
for entry in dataset["train"]:
    # Context and response are counted together as a single text.
    combined_text = " ".join((entry["context"], entry["response"]))
    contexts.append(combined_text)

    tokens = word_tokenize(combined_text)
    stop_total = len([tok for tok in tokens if tok.lower() in stop_words])

    total_stop_words_counts.append(stop_total)
    total_non_stop_words_counts.append(len(tokens) - stop_total)

# The frame is assembled once, after the loop has finished.
ectsum_columns = {
    "context": contexts,
    "context_stop_words_count": total_stop_words_counts,
    "context_non_stop_words_count": total_non_stop_words_counts,
}
df_ectsum = pd.DataFrame(ectsum_columns)

## Finqa 

In [63]:
dataset = load_dataset("gtfintechlab/finqa")

combined_texts = []
total_stop_words_counts = []
total_non_stop_words_counts = []

for entry in dataset["train"]:
    # Flatten each table row into text, then join the rows into one string.
    table_rows = [" ".join(row) for row in entry["table_ori"]]

    # Text before the table, text after it, the table, then the question.
    combined_text = "{} {} {} {}".format(
        " ".join(entry["pre_text"]),
        " ".join(entry["post_text"]),
        " ".join(table_rows),
        entry["question"],
    )
    combined_texts.append(combined_text)

    tokens = word_tokenize(combined_text)
    stop_total = len([tok for tok in tokens if tok.lower() in stop_words])

    total_stop_words_counts.append(stop_total)
    total_non_stop_words_counts.append(len(tokens) - stop_total)

finqa_columns = {
    "context": combined_texts,
    "context_stop_words_count": total_stop_words_counts,
    "context_non_stop_words_count": total_non_stop_words_counts,
}
df_finqa = pd.DataFrame(finqa_columns)

In [64]:
# Display the FinQA word-count frame.
df_finqa

Unnamed: 0,context,context_stop_words_count,context_non_stop_words_count
0,interest rate to a variable interest rate base...,207,498
1,"abiomed , inc . and subsidiaries notes to cons...",220,509
2,the following table shows annual aircraft fuel...,183,430
3,the fair value of our grants receivable is det...,144,341
4,"entergy louisiana , llc management's financial...",115,315
...,...,...,...
6246,a lump sum buyout cost of approximately $ 1.1 ...,276,551
6247,item 7 . management 2019s discussion and analy...,225,630
6248,notes to consolidated financial statements 201...,98,245
6249,taxing authorities could challenge our histori...,248,480


## Convfinqa

In [65]:
dataset = load_dataset("gtfintechlab/ConvFinQa")

combined_texts = []
total_stop_words_counts = []
total_non_stop_words_counts = []

# Question/answer fields, in the order they are appended to the text.
qa_keys = ("question_0", "answer_0", "question_1", "answer_1")

for entry in dataset["train"]:
    pre_text = " ".join(entry["pre_text"])
    post_text = " ".join(entry["post_text"])

    # Table cells are converted with str() before joining.
    table_text = " ".join(" ".join(str(cell) for cell in row) for row in entry["table_ori"])

    # A missing (None) question or answer contributes an empty string.
    qa_parts = ["" if entry[key] is None else str(entry[key]) for key in qa_keys]

    combined_text = " ".join([pre_text, post_text, table_text, *qa_parts])
    combined_texts.append(combined_text)

    tokens = word_tokenize(combined_text)
    stop_total = len([tok for tok in tokens if tok.lower() in stop_words])

    total_stop_words_counts.append(stop_total)
    total_non_stop_words_counts.append(len(tokens) - stop_total)

convfinqa_columns = {
    "context": combined_texts,
    "context_stop_words_count": total_stop_words_counts,
    "context_non_stop_words_count": total_non_stop_words_counts,
}
df_convfinqa = pd.DataFrame(convfinqa_columns)

## Finer

In [66]:
# Load the finer dataset; the next cell iterates over its train split.
dataset = load_dataset("gtfintechlab/finer")

In [67]:
combined_texts = []
total_stop_words_counts = []
total_non_stop_words_counts = []

# Fields that are stringified and joined into the text that gets counted.
finer_fields = ("gold_token", "gold_label", "doc_idx", "sent_idx")

for entry in dataset["train"]:
    # NOTE(review): every field is turned into text as-is. If gold_token is a
    # list, its repr (brackets, quotes, commas) is tokenized as well — confirm
    # that this is intended.
    combined_text = " ".join(str(entry[field]) for field in finer_fields)
    combined_texts.append(combined_text)

    tokens = word_tokenize(combined_text)
    stop_total = len([tok for tok in tokens if tok.lower() in stop_words])

    total_stop_words_counts.append(stop_total)
    total_non_stop_words_counts.append(len(tokens) - stop_total)


finer_columns = {
    "context": combined_texts,
    "context_stop_words_count": total_stop_words_counts,
    "context_non_stop_words_count": total_non_stop_words_counts,
}
df_finer = pd.DataFrame(finer_columns)

## Banking 77

In [68]:
# Load the banking77 dataset; the next cell iterates over its train split.
dataset = load_dataset("gtfintechlab/banking77")

In [69]:
texts = []
total_stop_words_counts = []
total_non_stop_words_counts = []

# Walk the train split one row at a time.
for entry in dataset["train"]:
    text = entry["text"]
    texts.append(text)

    tokens = word_tokenize(text)
    stop_total = len([tok for tok in tokens if tok.lower() in stop_words])

    total_stop_words_counts.append(stop_total)
    # +1 presumably accounts for the label, as in the FOMC cell.
    total_non_stop_words_counts.append(len(tokens) - stop_total + 1)

# Frame holding the text alongside its two word counts.
banking77_columns = {
    "context": texts,
    "context_stop_words_count": total_stop_words_counts,
    "context_non_stop_words_count": total_non_stop_words_counts,
}
df_banking77 = pd.DataFrame(banking77_columns)

In [70]:
# Display the Banking77 word-count frame.
df_banking77

Unnamed: 0,context,context_stop_words_count,context_non_stop_words_count
0,I am still waiting on my card?,4,5
1,What can I do if my card still hasn't arrived ...,8,8
2,I have been waiting over a week. Is the card s...,7,8
3,Can I track my card while it is in the process...,9,6
4,"How do I know if I will get my card, or if it ...",11,7
...,...,...,...
9998,You provide support in what countries?,3,5
9999,What countries are you supporting?,3,4
10000,What countries are getting support?,2,5
10001,Are cards available in the EU?,3,5


In [71]:
## Prompt

# Combine all datasets into one sheet

In [72]:
# Tag every frame (in place) with the name of the dataset it was built from.
for frame, dataset_name in (
    (df_fomc, "fomc"),
    (df_fpb, "fpb"),
    (df_numcliam, "numclaim"),
    (df_finbench, "finbench"),
    (df_finentity, "finentity"),
    (df_ectsum, "ectsum"),
    (df_finqa, "finqa"),
    (df_convfinqa, "convfinqa"),
    (df_finer, "finer"),
    (df_banking77, "banking77"),
):
    frame["dataset"] = dataset_name

In [73]:
# Constant per-dataset value for the "prompt" column
# (presumably the word count of each dataset's prompt — TODO confirm).
for frame, prompt_value in (
    (df_numcliam, 84),
    (df_fomc, 97),
    (df_fpb, 88),
    (df_finbench, 70),
    (df_finentity, 147),
    (df_ectsum, 73),
    (df_finqa, 34),
    (df_convfinqa, 60),
    (df_finer, 149),
    (df_banking77, 264),
):
    frame["prompt"] = prompt_value

In [76]:
# Turn the row position into an "index" column. A plain reset_index() is not
# idempotent: re-running the cell would add a "level_0" column, and a third
# run would raise. When "index" already exists, the current index is dropped
# instead, so the cell can be re-run safely with the same result.
df_numcliam = df_numcliam.reset_index(drop="index" in df_numcliam.columns)
df_fomc = df_fomc.reset_index(drop="index" in df_fomc.columns)

In [79]:
# Turn the row position into an "index" column. A plain reset_index() is not
# idempotent: re-running the cell would add a "level_0" column, and a third
# run would raise. When "index" already exists, the current index is dropped
# instead, so the cell can be re-run safely with the same result.
df_fpb = df_fpb.reset_index(drop="index" in df_fpb.columns)
df_finbench = df_finbench.reset_index(drop="index" in df_finbench.columns)
df_finentity = df_finentity.reset_index(drop="index" in df_finentity.columns)
df_ectsum = df_ectsum.reset_index(drop="index" in df_ectsum.columns)
df_finqa = df_finqa.reset_index(drop="index" in df_finqa.columns)
df_convfinqa = df_convfinqa.reset_index(drop="index" in df_convfinqa.columns)
df_finer = df_finer.reset_index(drop="index" in df_finer.columns)
df_banking77 = df_banking77.reset_index(drop="index" in df_banking77.columns)

In [77]:
# Preview two of the frames stacked together; the result is only displayed,
# not stored.
pd.concat([df_numcliam, df_fomc])

Unnamed: 0,index,context,context_stop_words_count,context_non_stop_words_count,dataset,prompt
0,0,the stock increased 50.3% in this period compa...,7,13,numclaim,84
1,1,provision expenses are expected to be in the r...,6,11,numclaim,84
2,2,"in october 2018, endocyte entered into an agre...",15,29,numclaim,84
3,3,financial flexibility: tapestry ended first-qu...,8,26,numclaim,84
4,4,"third-quarter revenues of $1,835 million incre...",12,26,numclaim,84
...,...,...,...,...,...,...
1979,1979,"Consumer price inflation, as measured by the 1...",9,22,fomc,97
1980,1980,We said that we expect to maintain an accommod...,38,54,fomc,97
1981,1981,Increased rates and a smaller balance sheet ra...,6,15,fomc,97
1982,1982,Participants judged that an appropriate firmin...,15,34,fomc,97


In [82]:
# df_banking77

Unnamed: 0,index,context,context_stop_words_count,context_non_stop_words_count,dataset,prompt
0,0,I am still waiting on my card?,4,5,banking77,264
1,1,What can I do if my card still hasn't arrived ...,8,8,banking77,264
2,2,I have been waiting over a week. Is the card s...,7,8,banking77,264
3,3,Can I track my card while it is in the process...,9,6,banking77,264
4,4,"How do I know if I will get my card, or if it ...",11,7,banking77,264
...,...,...,...,...,...,...
9998,9998,You provide support in what countries?,3,5,banking77,264
9999,9999,What countries are you supporting?,3,4,banking77,264
10000,10000,What countries are getting support?,2,5,banking77,264
10001,10001,Are cards available in the EU?,3,5,banking77,264


In [80]:
# Stack all per-dataset frames into one table. Each frame keeps its own row
# labels, so labels repeat across datasets in the result.
frames_in_order = [
    df_numcliam,
    df_fomc,
    df_fpb,
    df_finbench,
    df_finentity,
    df_ectsum,
    df_finqa,
    df_convfinqa,
    df_finer,
    df_banking77,
]
word_count = pd.concat(frames_in_order)

In [83]:
# Drop the raw text column; the remaining columns are what gets exported.
word_count1 = word_count.drop(columns="context")

In [84]:
# Write the counts to word_count.csv in the working directory. The frame's
# row labels are written too, as an unnamed first column (to_csv default).
# NOTE(review): that column likely duplicates the "index" column — consider
# index=False if no downstream reader depends on it.
word_count1.to_csv("word_count.csv")