<a href="https://colab.research.google.com/github/eitanrosenfelder/first/blob/main/training_finbert_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip install transformers
from pathlib import Path
import shutil
import os
import logging
import sys
sys.path.append('..')

from textblob import TextBlob
from pprint import pprint
from sklearn.metrics import classification_report
import pandas as pd

from transformers import AutoModelForSequenceClassification

import nltk
nltk.download('punkt')
from google.colab import files
from google.colab import drive
drive.mount('/content/gdrive')

open('finbert.py','wb')
from finbert import *

open('utils.py','wb')
import utils as tools

# %load_ext autoreload
# %autoreload 2


pd.set_option('max_colwidth', -1)

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.ERROR)

lm_path = '/content/gdrive/MyDrive/content/'
cl_path = '/content/gdrive/MyDrive/content/'
cl_data_path = '/content/gdrive/MyDrive/content/'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).




In [14]:
import json
uploaded = files.upload()
Config = json.loads(uploaded["config.json"].decode("utf-8"))

Saving config.json to config (2).json


In [17]:
# Clean the cl_path
try:
    shutil.rmtree(cl_path)
except:
    pass

bertmodel = AutoModelForSequenceClassification.from_pretrained('/content/gdrive/MyDrive/',cache_dir=None, num_labels=3)


config = Config(   data_dir=cl_data_path,
                   bert_model=bertmodel,
                   num_train_epochs=4,
                   model_dir=cl_path,
                   max_seq_length = 48,
                   train_batch_size = 32,
                   learning_rate = 2e-5,
                   output_mode='classification',
                   warm_up_proportion=0.2,
                   local_rank=-1,
                   discriminate=True,
                   gradual_unfreeze=True)

finbert = FinBert(config)
finbert.base_model = 'bert-base-uncased'
finbert.config.discriminate=True
finbert.config.gradual_unfreeze=True

finbert.prepare_model(label_list=['positive','negative','neutral'])

#Fine-tune the model

# Get the training examples
train_data = finbert.get_data('train')
model = finbert.create_the_model()

# [Optional]
# Fine - tune only a subset of the model
# The variable freeze determines the last layer(out of 12) to be freezed.You can skip this part if you want to fine - tune the whole model.
#
# Important: Execute this step if you want a shorter training time in the expense of accuracy.
# # This is for fine-tuning a subset of the model.
#
# freeze = 6
#
# for param in model.bert.embeddings.parameters():
#     param.requires_grad = False
#
# for i in range(freeze):
#     for param in model.bert.encoder.layer[i].parameters():
#         param.requires_grad = False

trained_model = finbert.train(train_examples = train_data, model = model)

test_data = finbert.get_data('test')
results = finbert.evaluate(examples=test_data, model=trained_model)

def report(df, cols=['label','prediction','logits']):
    #print('Validation loss:{0:.2f}'.format(metrics['best_validation_loss']))
    cs = CrossEntropyLoss(weight=finbert.class_weights)
    loss = cs(torch.tensor(list(df[cols[2]])),torch.tensor(list(df[cols[0]])))
    print("Loss:{0:.2f}".format(loss))
    print("Accuracy:{0:.2f}".format((df[cols[0]] == df[cols[1]]).sum() / df.shape[0]) )
    print("\nClassification Report:")
    print(classification_report(df[cols[0]], df[cols[1]]))

results['prediction'] = results.predictions.apply(lambda x: np.argmax(x,axis=0))
report(results,cols=['labels','prediction','predictions'])

#Get predictions
text = "Later that day Apple said it was revising down its earnings expectations in \
the fourth quarter of 2018, largely because of lower sales and signs of economic weakness in China. \
The news rapidly infected financial markets. Apple’s share price fell by around 7% in after-hours \
trading and the decline was extended to more than 10% when the market opened. The dollar fell \
by 3.7% against the yen in a matter of minutes after the announcement, before rapidly recovering \
some ground. Asian stockmarkets closed down on January 3rd and European ones opened lower. \
Yields on government bonds fell as investors fled to the traditional haven in a market storm."

cl_path = project_dir
model = AutoModelForSequenceClassification.from_pretrained(cl_path, cache_dir=None, num_labels=3)
result = predict(text,model)
blob = TextBlob(text)
result['textblob_prediction'] = [sentence.sentiment.polarity for sentence in blob.sentences]
print(result)
print('/n')
print('/n')
print(f'Average sentiment is %.2f.' % (result.sentiment_score.mean()))


TypeError: ignored

# New Section