# FinBert Fine-Tuning and Streamlit Deploy

Date: March, 2024

Authors:
- Daniel Espinoza
- Fausto Bravo 

**Dependencies**

In [1]:
#%%capture
#!pip install pandas
#!pip install pyarrow
#!pip install numpy
#!pip install torch torchvision torchaudio
#!pip install transformers
#!pip install matplotlib
#!pip install nltk
#!pip install spacy
#!pip install scikit-learn
#!pip install tqdm



**Libraries**

In [2]:
# General
import re
import os
import json
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Tokenization
import spacy
import string
from nltk.tokenize import word_tokenize
from spacy.lang.en import English

# Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Models
# Base Model - Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

#Base Model - Other Models
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

# Transformers Model
import torch
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification,BertTokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Evaluation Metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

  from .autonotebook import tqdm as notebook_tqdm


**Import data**

- Twitter Data

In [4]:
twitter_train = pd.read_csv('./categorizacion/topic_train.csv')
twitter_test = pd.read_csv('./categorizacion/topic_valid.csv')
twitter = pd.concat([twitter_train,twitter_test],axis=0)

# with open('./categorizacion/labels.json', 'r') as json_file:
#     labels = pd.DataFrame(list(json.load(json_file).items()),columns=['Label','Description'])

- Sentiment Data

In [4]:
directory_sent='./sentimiento'
data_list = []

for filename in os.listdir(directory_sent):
    if filename.endswith(".txt"):
        file_path = os.path.join(directory_sent,filename)
        with open(file_path,'r',encoding='latin1') as file:
            lines = file.readlines()
            for line in lines:
                sentence, sentiment = line.rsplit('@',1)
                sentiment=sentiment.strip()
                data_list.append({'sentence':sentence,'sentiment': sentiment})

sent = pd.DataFrame(data_list)
sent = sent.drop_duplicates()


- Examples

**Neutral:** Tikkurila Powder Coatings has some 50 employees at its four paint plants , which generated revenues of EUR2 .4 m USD3 .3 m in 2010 

**Positive:** Sales rose 10 pct to 566 mln eur on the back of strong volume and favourable currency effects 

**Negative:** Pharmaceuticals group Orion Corp reported a fall in its third-quarter earnings that were hit by larger expenditures on R&D and marketing

**Pre-processing functions**

We create our own cleaning function to get rid of stopwords and puctuation

In [22]:
def clean_stop_words(data):
    stop_words = spacy.lang.en.stop_words.STOP_WORDS
    lista = [palabra for palabra in data if palabra not in stop_words and palabra not in string.punctuation and palabra != '``']
    return lista

In [24]:
# Processing Text
sent_base=sent.copy()
sent_base['tokenized_sentence'] = sent_base['sentence'].apply(word_tokenize)
sent_base['cleaned_sentence']=sent_base['tokenized_sentence'].apply(clean_stop_words)
sent_base['processed_sentence'] = sent_base['cleaned_sentence'].apply(lambda x: ' '.join(x))

# Labeling "sentiment" (target variable)
value_mapping = {'negative': 1, 'neutral': 2, 'positive': 0}
sent_base.loc[:, 'sentiment_numeric'] = sent_base['sentiment'].map(value_mapping)

# Train-Test Split (80% - 20%)
sent_train, sent_test = train_test_split(sent_base, test_size=0.2, stratify=sent_base['sentiment'], random_state=42)

# Vectorizing
vectorizer = TfidfVectorizer(max_features=1000)

# train (sent_train_tfid -> features, y_sent_train -> target)
sent_train_tfid = vectorizer.fit_transform(sent_train['processed_sentence'])
y_sent_train=sent_train['sentiment_numeric']

# test (sent_test_tfid -> features, y_sent_test -> target)
sent_test_tfid = vectorizer.transform(sent_test['processed_sentence'])
y_sent_test=sent_test['sentiment_numeric']

**Base Line Model (Logistic Regression)**

In [28]:
# Logistic Regression Model
model = LogisticRegression()
model.fit(sent_train_tfid, y_sent_train)

predictions = model.predict(sent_test_tfid)

accuracy = accuracy_score(y_sent_test, predictions)
print(f'Accuracy: {accuracy:.2f}')
print("-----------------Logistic Regression-----------------")
print("Classification Report:")
print(classification_report(y_sent_test, predictions))

Accuracy: 0.73
-----------------Logistic Regression-----------------
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.52      0.60       272
           1       0.64      0.32      0.43       121
           2       0.74      0.91      0.82       575

    accuracy                           0.73       968
   macro avg       0.69      0.58      0.61       968
weighted avg       0.72      0.73      0.71       968



**Base Line Model (Other Models)**

In [29]:
models = [
    LogisticRegression(),
    RandomForestClassifier(),
    SVC(),
    MultinomialNB(),
    GradientBoostingClassifier(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    MLPClassifier()
]

for model in models:
    model.fit(sent_train_tfid, y_sent_train)
    
    y_train_pred = model.predict(sent_train_tfid)
    train_accuracy = accuracy_score(y_sent_train, y_train_pred)
    
    y_test_pred = model.predict(sent_test_tfid)
    test_accuracy = accuracy_score(y_sent_test, y_test_pred)
    
    print(f"{model.__class__.__name__}:")
    print(f"  Training Accuracy: {train_accuracy}")
    print(f"  Test Accuracy: {test_accuracy}")
    print("------")

LogisticRegression:
  Training Accuracy: 0.8081095041322314
  Test Accuracy: 0.7262396694214877
------
RandomForestClassifier:
  Training Accuracy: 0.9974173553719008
  Test Accuracy: 0.7200413223140496
------
SVC:
  Training Accuracy: 0.918904958677686
  Test Accuracy: 0.7200413223140496
------
MultinomialNB:
  Training Accuracy: 0.7551652892561983
  Test Accuracy: 0.6859504132231405
------
GradientBoostingClassifier:
  Training Accuracy: 0.8083677685950413
  Test Accuracy: 0.731404958677686
------
KNeighborsClassifier:
  Training Accuracy: 0.6771694214876033
  Test Accuracy: 0.6342975206611571
------
DecisionTreeClassifier:
  Training Accuracy: 0.9974173553719008
  Test Accuracy: 0.6580578512396694
------
MLPClassifier:
  Training Accuracy: 0.9974173553719008
  Test Accuracy: 0.6828512396694215
------




### Finbert (Basic Model) 

**Initiate Model FinBert**

In [None]:
tokenizer = BertTokenizer.from_pretrained("ProsusAI/finbert")
model = BertForSequenceClassification.from_pretrained("ProsusAI/finbert")

**Calculate Sentiment (FinBert)**

In [31]:
def get_sentiment(txt):
    tokens = tokenizer.encode_plus(txt, add_special_tokens=False,return_tensors = 'pt')
    outputs = model(**tokens)
    probabilities = torch.nn.functional.softmax(outputs[0], dim=-1)
    category = torch.argmax(probabilities).item()
    return category

In [42]:
# Setting a copy and encoding categorical labels
sent_testing = sent.copy()
value_mapping = {'negative': 1, 'neutral': 2, 'positive': 0}
sent_testing.loc[:, 'sentiment_numeric'] = sent_testing['sentiment'].map(value_mapping)

# Generating a random sample 
sent_testing_sample = sent_testing.sample(n=4840, random_state=42)

In [43]:
# Using the model to assign sentiment
sent_testing_sample['prediction']=sent_testing_sample['sentence'].apply(get_sentiment)

In [44]:
# Evaluate accuracy
accuracy = accuracy_score(sent_testing_sample['sentiment_numeric'], sent_testing_sample['prediction'])
print(f'Accuracy: {accuracy:.2f}')

# Display classification report
print('Classification Report:')
print(classification_report(sent_testing_sample['sentiment_numeric'], sent_testing_sample['prediction']))

# Display confusion matrix
conf_matrix = confusion_matrix(sent_testing_sample['sentiment_numeric'], sent_testing_sample['prediction'])
print('Confusion Matrix:')
print(conf_matrix)

Accuracy: 0.78
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.63      0.68      1363
           1       0.81      0.63      0.71       604
           2       0.80      0.89      0.84      2873

    accuracy                           0.78      4840
   macro avg       0.78      0.71      0.74      4840
weighted avg       0.78      0.78      0.78      4840

Confusion Matrix:
[[ 857   28  478]
 [  48  380  176]
 [ 270   60 2543]]


In [45]:
sent_testing_sample.head(5)

Unnamed: 0,sentence,sentiment,sentiment_numeric,prediction
3206,Nordea Bank AB publ holds 6.000 Alma Media sha...,neutral,2,2
2531,Includes company and brand share data by categ...,neutral,2,2
4107,"Commission income decreased to EUR 3.8 mn , co...",negative,1,1
1928,The fund at fair value will increase correspon...,positive,0,1
1538,`` We are delighted to welcome Elisa to our Bo...,positive,0,0


### Finbert (for Streamlit)

In [32]:
# Importing model and tokenizer
tokenizer = BertTokenizer.from_pretrained("ProsusAI/finbert")
model = BertForSequenceClassification.from_pretrained("ProsusAI/finbert")

**Packaging the Model for easy use in the Streamlit App**

In [25]:
from joblib import dump

In [26]:
# Package the tokenizer and model into a dictionary
data_to_save = {
    "tokenizer": tokenizer,
    "model": model
}

# Save the packaged data to a file
dump(data_to_save, 'transformer_model_tokenizer.joblib')

['transformer_model_tokenizer.joblib']

In [27]:
from joblib import load

# Load the data from the file
loaded_data = load('transformer_model_tokenizer.joblib')

# Extract the tokenizer and model
tokenizer_loaded = loaded_data["tokenizer"]
model_loaded = loaded_data["model"]

**Calculate Sentiment (FinBert)**

In [33]:
def get_sentiment(txt):
    tokens = tokenizer.encode_plus(txt, add_special_tokens=False,return_tensors = 'pt')
    outputs = model(**tokens)
    probabilities = torch.nn.functional.softmax(outputs[0], dim=-1)
    category = torch.argmax(probabilities).item()
    return category

In [34]:
# Setting a copy and encoding categorical labels
sent_testing = twitter.copy()
value_mapping = {'negative': 1, 'neutral': 2, 'positive': 0}
# sent_testing.loc[:, 'sentiment_numeric'] = sent_testing['sentiment'].map(value_mapping)

# Generating a random sample 
sent_testing_sample = sent_testing.sample(n=10, random_state=42)

# Using the model to assign sentiment
sent_testing_sample['prediction']=sent_testing_sample['text'].apply(get_sentiment)

In [36]:
sent_testing_sample.tail(100)

Unnamed: 0,text,label,prediction
10459,"Roughly 60,000 of home deals fell through in J...",14,2
6084,RBA Warns Unanchored Inflation Expectations Wo...,1,1
1635,Autonomy’s Electric Vehicle Subscription Now A...,2,2
1788,ICR Co-Founder and CEO Tom Ryan Named to PRWee...,2,2
16125,"$SOND $SJ $JOAN - MCRB, MRSN and JOAN are amon...",19,2
8769,VIDEO CORRECTION: From @Breakingviews: Big ban...,9,2
15870,Coinbase pops 17% as cryptocurrencies like bit...,19,0
7809,"Greg Fleming, head of wealth adviser Rockefell...",9,2
6223,In the latest Central Banker newsletter: How i...,1,2
12151,Chinese tech stocks advance following a report...,15,0


In [38]:
sent_testing.iloc[4116]["text"]

'Apple Hospitality REIT Announces August 2022 Distribution  https://t.co/LI47CycIqo'

### Finbert (Fine-Tuning)

**Calculate Sentiment (Fine Tuning - FinBert)**

In [5]:
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Setting a copy and encoding categorical labels
twitter_tuning = twitter.copy()
#value_mapping = {'negative': 1, 'neutral': 2, 'positive': 0}
#sent_tuning.loc[:, 'sentiment_numeric'] = sent_tuning['sentiment'].map(value_mapping)

# Split the dataset into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(twitter_tuning['text'], twitter_tuning['label'],stratify=twitter_tuning['label'], test_size=0.2,)

# Convert the texts and labels into tensors
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=128,return_tensors='pt')
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=128,return_tensors='pt')

# Create attention masks
train_masks = train_encodings['attention_mask']
val_masks = val_encodings['attention_mask']

train_inputs = train_encodings['input_ids']
val_inputs = val_encodings['input_ids']

train_labels = torch.tensor(train_labels.tolist())
val_labels = torch.tensor(val_labels.tolist())

# Create the DataLoader
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)



In [7]:
train_labels

tensor([ 2, 16,  5,  ...,  6, 14, 18])

In [13]:
from transformers import BertForSequenceClassification, AdamW
train=0

if train==1:
    # Load the pre-trained BERT model for sequence classification
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=20) 
    # Define optimizer and loss function
    optimizer = AdamW(model.parameters(), lr=2e-5)
    criterion = torch.nn.CrossEntropyLoss()
    
    # Move model to the device (GPU if available)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    # Training loop
    num_epochs = 1  # number of epochs
    
    for epoch in range(num_epochs):
        model.train()
        
        # Use tqdm to display a progress bar
        data_loader = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}', unit='batch')
    
        for batch in data_loader:
            inputs = {'input_ids': batch[0].to(device),
                      'attention_mask': batch[1].to(device),
                      'labels': batch[2].to(device)}
    
            optimizer.zero_grad()
            outputs = model(**inputs)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
    
            # Update the progress bar with the current loss
            data_loader.set_postfix({'Loss': loss.item()}, refresh=True)
    
    # Save the fine-tuned model
    # model.save_pretrained('fine_tuned_bert_model')
    
    # Save the model
    with open('fine_tuned_bert_model.pkl', 'wb') as f:
        pickle.dump(model, f)

In [5]:
# Load the fine-tuned model
with open('fine_tuned_bert_model.pkl', 'rb') as f:
    model_fine_tuning = pickle.load(f)

In [6]:
def get_sentiment_tuned(txt):
    tokens = tokenizer.encode_plus(txt, add_special_tokens=False,return_tensors = 'pt')
    outputs = model_fine_tuning(**tokens)
    probabilities = torch.nn.functional.softmax(outputs[0], dim=-1)
    category = torch.argmax(probabilities).item()
    return category

In [7]:
df_test_tun = pd.DataFrame({'text': val_texts, 'label': val_labels})

In [8]:
df_test_tun['prediction']=df_test_tun['text'].apply(get_sentiment_tuned)

In [12]:
# Evaluate accuracy
accuracy = accuracy_score(df_test_tun['label'], df_test_tun['prediction'])
print(f'Accuracy: {accuracy:.2f}')

# Display classification report
print('Classification Report:')
print(classification_report(df_test_tun['label'], df_test_tun['prediction']))

# Display confusion matrix
#conf_matrix = confusion_matrix(df_test_tun['label'], df_test_tun['prediction'])
#print('Confusion Matrix:')
#print(conf_matrix)

Accuracy: 0.62
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        66
           1       0.86      0.17      0.29       210
           2       0.68      0.80      0.74       880
           3       1.00      0.11      0.20        80
           4       1.00      0.59      0.74        91
           5       0.96      0.65      0.78       246
           6       0.89      0.25      0.40       134
           7       0.84      0.74      0.79       157
           8       0.00      0.00      0.00        40
           9       0.31      0.85      0.46       379
          10       0.00      0.00      0.00        16
          11       0.00      0.00      0.00        12
          12       0.94      0.49      0.64       121
          13       1.00      0.23      0.38       117
          14       0.79      0.40      0.53       447
          15       0.53      0.41      0.46       125
          16       0.71      0.72      0.72

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
