<a href="https://colab.research.google.com/github/Incredible88/FinBERT-FOMC/blob/main/Finbert-finetuned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Get Started

In [None]:
# Import different python libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# !pip install chardet
## chardet is used for detecting file encoding before reading raw bytes.
## No need for chardet — Parquet stores text as UTF-8 internally.

# import chardet
# result = chardet.detect(parquet["text_pr"])
# encoding = result['encoding']

## To find what encoding type of data
# encoding

# Load SEC Press Releases

In [None]:
# Read Parquet File `sec.parquet`
parquet = pd.read_parquet('sec.parquet')
# Only the last expression of a cell is rendered, so the preview of the full
# frame has to be displayed explicitly (`display` is provided by the Jupyter kernel).
display(parquet.head())

# Focus on Sec Press Releases - `text_pr` column
parquet_text = parquet["text_pr"]
parquet_text.head()

In [None]:
parquet.info()

# Load Finbert 

In [None]:
# pip install transformers==4.28.1

from transformers import BertTokenizer, BertForSequenceClassification, pipeline

# Checkpoint of the off-the-shelf FinBERT tone classifier (3 sentiment classes).
FINBERT_TONE_CHECKPOINT = 'yiyanghkust/finbert-tone'

tokenizer = BertTokenizer.from_pretrained(FINBERT_TONE_CHECKPOINT)
# `finbert` is the ready-to-call text-classification pipeline used by later cells.
finbert = pipeline(
    "text-classification",
    model=BertForSequenceClassification.from_pretrained(FINBERT_TONE_CHECKPOINT, num_labels=3),
    tokenizer=tokenizer,
)

## Using spacy to split sentences

In [None]:
import spacy

nlp = spacy.load('en_core_web_sm')

# Split every press release into sentences with spaCy's sentence segmenter.
# The texts come from `parquet_text` (the `text_pr` column loaded above);
# missing entries are dropped because `nlp` only accepts strings.
sentences = []
for press_release in tqdm(parquet_text.dropna()):
    doc = nlp(press_release)
    sentences.extend(sent.text for sent in doc.sents)


# Predict the original dataset


In [None]:
# Sanity check: parse one of the split sentences (the list is named `sentences`).
nlp(sentences[1])

In [None]:
# Run the FinBERT pipeline on every sentence and keep its top prediction.
results = []
for sentence in tqdm(sentences):
    top_prediction = finbert(sentence)[0]
    results.append({
        'sentence': sentence,
        'label': top_prediction['label'],
        'score': top_prediction['score'],
    })

df = pd.DataFrame(results)

In [None]:
df.head()

In [None]:
# Save the results
df.to_csv('FOMC_results.csv', index=False)

# Text Simplification (sentiment focus)

**Step1.**  
**Remove the comma** when the two clauses it separates belong together. For example:
 
However, the apparent pickup in longer-term expectations, while worrisome, was relatively small  
⇓  
However, the apparent pickup in longer-term expectations, while worrisome was relatively small 

---
**Step2.**  
If the sentence contains a concessive transition such as **although**, **though** or **while**, the focus is on the clauses other than the one introduced by the transition; if it contains **but**, the focus is on the clause that follows the **but**. For example:

**Although** some scattered signs of cooling of the housing sector had emerged, the pace of construction activity and sales remained brisk.  
⇓  
the pace of construction activity and sales remained brisk.

Starts of new single-family homes dropped back somewhat in October from September's very strong pace, **but** permit issuance remained elevated.   
⇓  
permit issuance remained elevated.



In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

def remove_comma(sentence):
    """Drop punctuation tokens that directly precede a ROOT or conj token.

    The sentence is parsed with the module-level spaCy pipeline `nlp`. Every
    token whose dependency label is "punct" and whose next token is labelled
    "ROOT" or "conj" is removed, and the remaining stretches of text are
    joined with single spaces.

    Returns the original `sentence` unchanged when no such token is found.
    """
    doc = nlp(sentence)
    n_tokens = len(doc)

    # Positions of punctuation tokens to cut out; the bounds check skips a
    # punctuation token that is the last token of the document.
    cut_points = [
        pos
        for pos, tok in enumerate(doc)
        if tok.dep_ == "punct"
        and pos + 1 < n_tokens
        and doc[pos + 1].dep_ in ("ROOT", "conj")
    ]
    if not cut_points:
        return sentence

    # Each kept segment runs from just after the previous cut up to the next cut.
    segment_starts = [0] + [cut + 1 for cut in cut_points]
    segment_ends = cut_points + [n_tokens]
    pieces = [
        doc[start:end].text.strip()
        for start, end in zip(segment_starts, segment_ends)
    ]
    return " ".join(pieces)


In [None]:
# Example of remove_comma
remove_comma("The personal saving rate--while still slightly negative,moved up in October.")

In [None]:
def sentiment_focus(sentence):
    """Reduce a sentence to the part that carries its sentiment.

    The sentence is parsed with the module-level spaCy pipeline `nlp` and the
    rules below are tried in order; the first one that matches decides the result.

    1. "but": return everything after the first "but" token.
    2. "although" / "though": drop the clause around the first such token,
       using the nearest commas before and after it as clause boundaries.
    3. Leading "while": return everything after the first comma.

    Returns a tuple ``(text, focus_changed)``. ``focus_changed`` is 1 whenever
    one of the rules matched and 0 otherwise.

    NOTE(review): when a transition word is found but no comma exists, the
    whole sentence is returned with ``focus_changed == 1`` even though the text
    is unchanged -- confirm this is intended.
    NOTE(review): ``doc[0]`` presumably raises IndexError for an empty
    sentence -- confirm that callers never pass one.
    """
    doc = nlp(sentence)
    focus = ""
    focus_changed = 1
    # Rule 1: the last token is excluded so that something always follows "but".
    for token in doc[:-1]:
      if token.lower_ == "but":
          focus = doc[token.i + 1:]
          return str(focus).strip(),focus_changed

    # Rule 2: handle the first "although"/"though" token encountered.
    for sent in doc.sents:
        sent_tokens = [token for token in sent]
        for token in sent_tokens:
            if token.lower_ == "although" or token.lower_ == "though":
                try:
                    # First comma at or after the transition word.
                    comma_index_back = [token1.i for token1 in doc[token.i:] if token1.text == ','][0]
                except IndexError:
                    # No comma after the transition word: look for one before it.
                    try:
                      comma_index_front = [token1.i for token1 in doc[:token.i] if token1.text == ','][-1]
                    except IndexError:
                      # No comma at all: nothing can be cut.
                      return str(doc).strip(),focus_changed
                    # Keep only the text before the last preceding comma.
                    focus = doc[:comma_index_front].text
                    return str(focus).strip(),focus_changed
                try:
                      # Last comma before the transition word.
                      comma_index_front = [token1.i for token1 in doc[:token.i] if token1.text == ','][-1]
                except IndexError:
                  # Comma only after the transition word: keep what follows it.
                  focus = doc[comma_index_back+1:].text
                  return str(focus).strip(),focus_changed
                # Commas on both sides: drop the clause between them; the
                # trailing comma is kept in the concatenated text.
                focus = doc[:comma_index_front].text+doc[comma_index_back:].text
                return str(focus).strip(),focus_changed

    # Rule 3: only a "while" in first position is treated as a transition.
    if doc[0].lower_ == "while":
      try:
        comma_index_back1 = [token2.i for token2 in doc if token2.text == ','][0]
      except IndexError:
        return str(doc).strip(),focus_changed
      focus = doc[comma_index_back1+1:].text
      return str(focus).strip(),focus_changed

    # No rule matched: return the sentence unchanged.
    focus_changed = 0 
    return str(doc).strip(),focus_changed

## The same sentence after focus processing

Example: Actually works well

In [None]:
finbert("While light vehicle sales had slowed in the fall, consumer spending outside the auto sector appeared to have remained vigorous")

In [None]:
finbert("consumer spending outside the auto sector appeared to have remained vigorous")

# Processing focus sentiment sentences

In [None]:
# load original finbert results
# Read from the same relative path the results were saved to with `df.to_csv`,
# so the notebook also works outside Colab's `/content` working directory.
df = pd.read_csv('FOMC_results.csv')

In [None]:
import spacy
from tqdm.auto import tqdm
tqdm.pandas()

# Processing remove comma
df["sentence_simple"] = df["sentence"].progress_apply(remove_comma)

In [None]:
# Processing sentiment focus
# `sentiment_focus` returns a (focused sentence, changed flag) pair per row.
focus_pairs = df['sentence_simple'].progress_apply(sentiment_focus)

df['sentence_simple'] = [focused for focused, _ in focus_pairs]
# 1 when a focus rule was applied to the sentence, 0 otherwise.
df['focus_ornot'] = [1 if changed else 0 for _, changed in focus_pairs]

Now we have the sentiment-focus sentence for every sentence in the original dataset.

In [None]:
df.head()

## Predict with Finbert(only select changed sentences)

We only need complex sentences to fine tune FinBERT

In [None]:
# Take an explicit copy: columns are added to `df_change` later, and assigning
# into a slice of `df` triggers pandas' SettingWithCopyWarning.
df_change = df.loc[df['focus_ornot'] == 1].copy()

In [None]:
df_change.info()

In [None]:
df_change.head()

For these complex sentences we need new labels, obtained by running FinBERT on their sentiment-focus versions.

In [None]:
tqdm.pandas()

# Run FinBERT once per simplified sentence and keep its top prediction.
finbert_outputs = df_change['sentence_simple'].progress_apply(lambda x: finbert(x))

# `assign` builds a new frame instead of writing into what may be a slice of
# `df`, which avoids SettingWithCopyWarning and keeps the cell re-runnable.
df_change = df_change.assign(
    label_new=finbert_outputs.apply(lambda x: x[0]['label']),
    score_new=finbert_outputs.apply(lambda x: x[0]['score']),
)

In [None]:
df_change.head()

In [None]:
# Save training data
df_change.to_csv('training_data.csv',index = False)

# Fine tuning FinBERT



Import the libraries needed for fine-tuning FinBERT.

In [None]:
!pip install transformers==4.28.1
!pip install datasets
from transformers import BertTokenizer, Trainer, BertForSequenceClassification, TrainingArguments
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import transformers
torch.__version__, transformers.__version__

In [None]:
torch.cuda.is_available()

In [None]:
# load training data
# Read from the same relative path `training_data.csv` was saved to, so the
# notebook also works outside Colab's `/content` working directory.
df = pd.read_csv('training_data.csv')
df.head()

In [None]:
# We only need new labels
df = df[['sentence', 'label_new']].rename(columns={'label_new': 'label'})
df.head()

In [None]:
df['label'] = df['label'].replace({'Neutral': 0, 'Positive': 1, 'Negative': 2})
df.head()

## preparing training/validation/testing

In [None]:
# Hold out 10% for testing, then 10% of the remainder for validation.
# Both splits are stratified on the label and use the same fixed seed.
df_train_val, df_test = train_test_split(
    df, test_size=0.1, stratify=df['label'], random_state=42
)
df_train, df_val = train_test_split(
    df_train_val, test_size=0.1, stratify=df_train_val['label'], random_state=42
)
print(df_train.shape, df_test.shape, df_val.shape)

## load FinBERT pretrained model
The pretrained FinBERT model path on Huggingface is https://huggingface.co/yiyanghkust/finbert-pretrain


In [None]:
model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-pretrain',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-pretrain')

## prepare dataset for fine-tuning

In [None]:
def _to_torch_dataset(frame):
    """Convert a DataFrame split into a tokenized `Dataset` in torch format."""
    dataset = Dataset.from_pandas(frame)
    # Pad/truncate every sentence to 128 tokens.
    dataset = dataset.map(
        lambda e: tokenizer(e['sentence'], truncation=True, padding='max_length', max_length=128),
        batched=True,
    )
    dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
    return dataset


dataset_train = _to_torch_dataset(df_train)
dataset_val = _to_torch_dataset(df_val)
dataset_test = _to_torch_dataset(df_test)

## define training options

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy for the `Trainer` evaluation loop.

    Args:
        eval_pred: pair ``(predictions, labels)``; ``predictions`` holds one
            row of class scores per example and is reduced with ``argmax``
            over axis 1.

    Returns:
        dict with the single key ``'accuracy'``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=1)
    # sklearn's signature is (y_true, y_pred). Accuracy is symmetric, but the
    # correct order matters if this is ever swapped for another metric.
    return {'accuracy': accuracy_score(y_true=labels, y_pred=predicted_classes)}

# Training configuration: evaluate and checkpoint once per epoch, and reload
# the checkpoint with the best validation accuracy when training finishes.
args = TrainingArguments(
        output_dir = 'temp/',
        evaluation_strategy = 'epoch',
        save_strategy = 'epoch',
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=5,
        weight_decay=0.005,
        load_best_model_at_end=True,
        # Must match the key returned by `compute_metrics`.
        metric_for_best_model='accuracy',
)

# Fine-tune `model` on the training split, validating on `dataset_val`.
trainer = Trainer(
        model=model,                        
        args=args,                  
        train_dataset=dataset_train,         
        eval_dataset=dataset_val,           
        compute_metrics=compute_metrics
)

trainer.train()  

## evaluate on testing set

In [None]:
model.eval()
trainer.predict(dataset_test).metrics

In [None]:
dataset_test

## save the fine-tuned model

In [None]:
trainer.save_model('finbert-sentiment/')

# Evaluate model

Load the manually labeled test data.

In [None]:
data = pd.read_csv('https://raw.githubusercontent.com/Incredible88/FinBERT-FOMC/main/data/dataset_label.csv')

In [None]:
data.info()

In [None]:
data = data.iloc[:1375,: ]
data.head()

In [None]:
data = data[['sentence','Ziwei']].rename(columns={'Ziwei': 'label'})

In [None]:
data['label'] = data['label'].replace({'neutral': 0, 'positive': 1, 'negative': 2})

In [None]:
def _tokenize_sentences(batch):
    """Tokenize a batch of sentences, padded/truncated to 128 tokens."""
    return tokenizer(batch['sentence'], truncation=True, padding='max_length', max_length=128)


# Manually labeled sentences as a tokenized dataset in torch format.
data_test = Dataset.from_pandas(data).map(_tokenize_sentences, batched=True)
data_test.set_format(
    type='torch',
    columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'],
)

## Accuracy with manual labels







In [None]:
model.eval()
# Run inference once and reuse the output for both the raw predictions and the
# metrics; the original called `trainer.predict` twice on the same dataset.
prediction_output = trainer.predict(data_test)
pre = prediction_output.predictions
prediction_output.metrics

In [None]:
labels = np.argmax(pre, axis=1)
data['label_pre'] = labels

In [None]:
data.head()

In [None]:
tqdm.pandas()

# Label every manually annotated sentence with the original FinBERT pipeline,
# keeping only the label of its top prediction.
data['label_new'] = data['sentence'].progress_apply(lambda x: finbert(x)[0]['label'])

In [None]:
data['label_pre'] = data['label_pre'].replace({0:'Neutral',  1: 'Positive', 2:'Negative'})

## Original Finbert results

In [None]:
data['label_new'] = data['label_new'].replace({'Neutral': 0, 'Positive': 1, 'Negative': 2})
(data['label'] == data['label_new']).sum() / len(data)

## Sentiment-focus preprocessing of the data, for comparison

In [None]:
import spacy
from tqdm.auto import tqdm
tqdm.pandas()

# Apply the two simplification steps in sequence: comma removal, then
# sentiment focus, which yields a (focused sentence, changed flag) pair per row.
focus_results = data["sentence"].progress_apply(remove_comma).progress_apply(sentiment_focus)

data["sentence_simple"] = focus_results.map(lambda pair: pair[0])
# 1 when a focus rule was applied to the sentence, 0 otherwise.
data["focus_ornot"] = focus_results.map(lambda pair: 1 if pair[1] else 0)

In [None]:
tqdm.pandas()

data['label_new_s'] = data['sentence_simple'].progress_apply(lambda x: finbert(x)[0]['label'])


In [None]:
data['label_new_s'] = data['label_new_s'].replace({'Neutral': 0, 'Positive': 1, 'Negative': 2})
(data['label'] == data['label_new_s']).sum() / len(data)

In [None]:
data['label_pre'] = data['label_pre'].replace({'Neutral': 0, 'Positive': 1, 'Negative': 2})

In [None]:
data.head()

In [None]:
fine_tuned_model = (data['label_pre'] == data['label']).sum() / len(data)

In [None]:
finbert_acc = (data['label_new'] == data['label']).sum() / len(data)

In [None]:
"{:.2%}".format(finbert_acc)

In [None]:
# Overall accuracy of the original and the fine-tuned model on the manual labels.
data_d = pd.DataFrame({
    'Model': ['FinBERT', 'Fine-tuned'],
    'Accuracy': [finbert_acc, fine_tuned_model],
})

ax = sns.barplot(x='Model', y='Accuracy', width=0.3, data=data_d)

# Print each accuracy as a percentage on top of its bar.
for position, accuracy in enumerate(data_d['Accuracy']):
    ax.annotate("{:.2%}".format(accuracy), (position, accuracy), ha='center', va='bottom')

ax.set_ylim(0, 1)
ax.set_title('Comparison of Finbert and Fine-tuned')
ax.set_xlabel('Model')
ax.set_ylabel('Accuracy')

plt.show()


## Compare with only changed sentences to manual labeling

In [None]:
data_change = data.loc[data.sentence != data.sentence_simple]


In [None]:
data_change.info()

In [None]:
data_change1 = data.loc[data.label_new != data.label_new_s]


In [None]:
fine_tuned_model_1 = (data_change['label'] == data_change['label_pre']).sum() / len(data_change)

In [None]:
finbert_1 = (data_change['label'] == data_change['label_new']).sum() / len(data_change)

In [None]:
finbert_sf = (data_change['label'] == data_change['label_new_s']).sum() / len(data_change)

In [None]:
data_change.head()

In [None]:
# Accuracy on the sentences changed by sentiment focus, for three setups:
# original FinBERT, the fine-tuned model, and FinBERT on simplified sentences.
data_s = pd.DataFrame({'Model': ['FinBERT', 'Fine-tuned','FinBERT after SF'],
                     'Accuracy': [finbert_1, fine_tuned_model_1,finbert_sf]})

# Create the axes explicitly and draw on them, so the hatch below is applied
# to THIS figure's bars; the original read `ax.patches` from the axes of the
# previous plot.
fig, ax = plt.subplots(figsize=(8, 6))

sns.barplot(x='Model', y='Accuracy', width=0.4, data=data_s, ax=ax)

hatch_pattern = '////'
for bar in ax.patches:
    bar.set_hatch(hatch_pattern)

# Print each accuracy as a percentage on top of its bar.
for i, score in enumerate(data_s['Accuracy']):
    ax.annotate("{:.2%}".format(score), (i, score), ha='center', va='bottom')

ax.set_ylim(0, 1)
ax.set_title('Comparison of Finbert and Fine-tuned in only sentiment focus')
ax.set_xlabel('Model')
ax.set_ylabel('Accuracy')

plt.show()

## Performance for sentences containing "but"

In [None]:
# Match "but" as a whole word, case-insensitively: a plain substring test also
# hits words such as "contribute" and misses a capitalised "But".
new_df_but = data_change[data_change['sentence'].str.contains(r'\bbut\b', case=False, regex=True, na=False)]

In [None]:
(new_df_but['label'] == new_df_but['label_pre']).sum() / len(new_df_but)

In [None]:
(new_df_but['label'] == new_df_but['label_new']).sum() / len(new_df_but)

In [None]:
(new_df_but['label'] == new_df_but['label_new_s']).sum() / len(new_df_but)

## Performance for sentences containing "while"

In [None]:
# Match "while" as a whole word, case-insensitively. `sentiment_focus` only
# rewrites sentences whose FIRST token is "while", which is normally the
# capitalised "While" that a case-sensitive test misses; the word boundary
# also excludes "meanwhile" and "worthwhile".
new_df_while = data_change[data_change['sentence'].str.contains(r'\bwhile\b', case=False, regex=True, na=False)]

In [None]:
(new_df_while['label'] == new_df_while['label_pre']).sum() / len(new_df_while)

In [None]:
(new_df_while['label'] == new_df_while['label_new']).sum() / len(new_df_while)

In [None]:
(new_df_while['label'] == new_df_while['label_new_s']).sum() / len(new_df_while)

## Performance for sentences containing "though" or "although"

In [None]:
# Match "though"/"although" as whole words, case-insensitively. These are the
# two tokens `sentiment_focus` handles, and the word boundary excludes
# "thought" and similar words.
new_df_though = data_change[data_change['sentence'].str.contains(r'\b(?:al)?though\b', case=False, regex=True, na=False)]

In [None]:
(new_df_though['label'] == new_df_though['label_pre']).sum() / len(new_df_though)

In [None]:
(new_df_though['label'] == new_df_though['label_new']).sum() / len(new_df_though)

In [None]:
(new_df_though['label'] == new_df_though['label_new_s']).sum() / len(new_df_though)

In [None]:
def _label_accuracy(frame, predicted_col):
    """Share of rows in `frame` whose `predicted_col` equals the manual `label`."""
    return (frame['label'] == frame[predicted_col]).sum() / len(frame)


# One row per (transition word, model) pair. The table gets its own name so
# the labelled `data` frame is not overwritten and earlier cells stay re-runnable.
category_accuracy = pd.DataFrame([
    {'Category': category, 'Group': group, 'Accuracy': _label_accuracy(frame, predicted_col)}
    for category, frame in (('But', new_df_but), ('While', new_df_while), ('Though', new_df_though))
    for group, predicted_col in (('FinBERT', 'label_new'), ('Fine-tune', 'label_pre'))
])

plt.figure(figsize=(8, 6))

sns.barplot(x='Accuracy', y='Category', hue='Group', width=0.5, data=category_accuracy)

# Leave room on the right of the bars for the legend.
plt.xlim([0, 1.15])

plt.title('Comparison of Categories')
plt.xlabel('Accuracy')
plt.ylabel('Category')

plt.show()


# load model with transformers

In [None]:
!pip install transformers==4.28.1
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

# Load the published fine-tuned checkpoint. The bare model gets its own name
# so `finbert`, the original pipeline used by earlier cells, is not rebound
# to a non-callable model object.
finbert_fomc_model = BertForSequenceClassification.from_pretrained('ZiweiChen/FinBERT-FOMC',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('ZiweiChen/FinBERT-FOMC')
finbert_fomc = pipeline("text-classification", model=finbert_fomc_model, tokenizer=tokenizer)


In [None]:
finbert_fomc('Spending on cars and light trucks increased somewhat in July after a lackluster pace in the second quarter but apparently weakened in August')

In [None]:
data = pd.read_csv('https://raw.githubusercontent.com/Incredible88/FinBERT-FOMC/main/data/dataset_label.csv')

In [None]:
data = data.iloc[:1375,: ]
data.head()

In [None]:
data = data[['sentence','Ziwei']].rename(columns={'Ziwei': 'label'})

In [None]:
data

In [None]:
tqdm.pandas()

data['label_p'] = data['sentence'].progress_apply(lambda x: finbert_fomc(x)[0]['label'])

In [None]:
data['label'] = data['label'].replace({'neutral':'Neutral', 'positive':'Positive', 'negative':'Negative'})

In [None]:
(data['label'] == data['label_p']).sum() / len(data)