# 0. Setup

In [None]:

# # install required libraries
# !pip3 install transformers[torch]                  # HuggingFace library for interacting with BERT (and multiple other models)
# !pip3 install datasets                      # HuggingFace library to process dataframes
# !pip3 install sentence-transformers         # library to use Sentence Similarity BERT
# !pip3 install bertviz                       # visualize BERT's attention weights
# # !pip3 install annoy                         # Spotify's library for finding nearest neighbours
# !pip3 install ipywidgets

In [None]:
# import libraries
import gdown
import pandas as pd
import numpy as np
import gdown
import random
from tqdm.auto import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import torch

from transformers import AutoModel, BertModel, AutoTokenizer, BertForSequenceClassification, pipeline, TrainingArguments, Trainer, utils
from transformers.pipelines.base import KeyDataset
from datasets import load_dataset, load_metric, Dataset, DatasetDict

from gensim.models import Word2Vec
import gensim.downloader as api

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

from google.colab import output
# Colab-specific helper: turns on the custom widget manager
# (presumably required so ipywidgets-based output renders in Colab — confirm)
output.enable_custom_widget_manager()

# check whether PyTorch can see a CUDA-capable GPU
print(f"GPU: {torch.cuda.is_available()}")

GPU: True


In [None]:
# Google Drive download links for the input files: local file name -> (url, file extension)
urls_dict = {
    "beigebook": (
        "https://drive.google.com/uc?export=download&id=1PJd-huIdtYAJoyQmxvmMGWVZKULoa36j",
        "csv",
    ),
    "unemploymentrate_q": (
        "https://drive.google.com/uc?export=download&id=1qixCmybbi78VsxNM5dAY0tZ4ocEnPrZ8",
        "csv",
    ),
    "unemploymentrate_m": (
        "https://drive.google.com/uc?export=download&id=1l-jWIQRfHuVSlqpsBGRNuruzQ6XQD2wV",
        "csv",
    ),
}

# fetch every file into the working directory as ./<file_name>.<extension>
for file_name, (url, extension) in urls_dict.items():
    gdown.download(url, f"./{file_name}.{extension}", quiet=False)

Downloading...
From: https://drive.google.com/uc?export=download&id=1PJd-huIdtYAJoyQmxvmMGWVZKULoa36j
To: /content/beigebook.csv
100%|██████████| 42.3M/42.3M [00:01<00:00, 27.7MB/s]
Downloading...
From: https://drive.google.com/uc?export=download&id=1qixCmybbi78VsxNM5dAY0tZ4ocEnPrZ8
To: /content/unemploymentrate_q.csv
100%|██████████| 68.4k/68.4k [00:00<00:00, 95.3MB/s]
Downloading...
From: https://drive.google.com/uc?export=download&id=1l-jWIQRfHuVSlqpsBGRNuruzQ6XQD2wV
To: /content/unemploymentrate_m.csv
100%|██████████| 321k/321k [00:00<00:00, 78.8MB/s]


In [None]:
# read the three downloaded files (parsed as tab-separated) into dataframes
df, unemp_q, unemp_m = (
    pd.read_csv(f"{stem}.csv", sep="\t")
    for stem in ("beigebook", "unemploymentrate_q", "unemploymentrate_m")
)

In [None]:
# work on a copy and discard every row whose "sentence" parses as a number
data = df.copy()
sentence_as_number = pd.to_numeric(data["sentence"], errors="coerce")  # NaN when not numeric
data = data.drop(index=data.index[sentence_as_number.notna()])

In [None]:

# keep only sentences with more than `min_words` whitespace-separated words
# (removes empty sentences, one-word fragments and section titles)
min_words = 1
data["sentencelen"] = [len(str(text).split()) for text in data["sentence"]]
data["keep_sent"] = data["sentencelen"] > min_words
data = data[data["keep_sent"]].reset_index(drop=True)

### BERT


In [None]:
# continue from a copy of the cleaned data and count the sentences available per district
df = data.copy()
df.groupby("district").size()

district
at    23821
bo    23326
ch    26309
cl    25394
da    27142
kc    23684
mi    23913
ny    20465
ph    22282
ri    24525
sf    21191
sl    19953
su    33180
dtype: int64

In [None]:
df.head()

Unnamed: 0,yearmonth,district,sentence,sentencelen,keep_sent
0,197005,mi,Although indications of softening in the Ninth...,41,True
1,197005,mi,"At the same time, however, there does not seem...",20,True
2,197005,mi,Consumers apparently are cutting back on their...,30,True
3,197005,mi,"Currently, a number of construction workers in...",24,True
4,197005,mi,Information relating to the Teamsters' strike ...,20,True


In [None]:
# remove boilerplate sentences that start with the "For more information ..." phrase;
# na=True makes a missing sentence count as a match, so it is dropped as well
footer_prefix = "For more information about District economic conditions visit"
is_footer = df["sentence"].str.startswith(footer_prefix, na=True)
df = df[~is_footer]

In [None]:
# attach to each sentence the unemployment rate of its district and year-month,
# then discard sentences for which no rate was found (missing "value").
# Note: the index is not reset afterwards, so its labels may have gaps.
df = df.merge(unemp_m, how="left", on=["yearmonth", "district"])
df = df.dropna(subset=["value"])

In [None]:
# let's explore a random sentence from our corpus.
# `i` is a position in [0, len(df)), so the row must be fetched positionally with
# .iloc: df's index labels are not contiguous after the filtering above, and a
# label-based df.loc[i] can raise KeyError (and can never reach the highest labels).
i = np.random.randint(0, len(df))
sample_row = df.iloc[i]
print(f"Sentence from: {sample_row['district']}\n")
print(sample_row["sentence"])

Sentence from: mi

A large mining company announced that it will idle some of its production   at operations in Minnesota and the Upper Peninsula in 2013, citing lower ore   prices and reduced global demand


In [None]:
# save the dataset
# df.to_parquet("bb_unemp.parquet", index=False)

# 2. Accessing BERT through HuggingFace

### Text tokenization

We will start by using the ```AutoTokenizer``` class to load the tokenizer from ```bert-base-uncased```. BERT's tokenizer was trained on English Wikipedia and the Book Corpus and contains a total of 30,522 unique tokens.

In [None]:
# load the pretrained tokenizer by model name; it must match the model loaded later ("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# inspect the configuration of the tokenizer
tokenizer

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

Passing a list of sequences to the tokenizer object will apply the following steps to each sequence:

1. Break down the sequence into individual tokens that are part of BERT's vocabulary
2. Transform tokens into their ids
3. Add special tokens
4. Apply truncation and padding (optional)

In [None]:
# # pass all sequences through the tokenizer
# encoded_sentences = tokenizer(list(df["sentence"].values),     # list of sequences we want to tokenize
#                               truncation=True,                  # truncate sequences longer than specified length
#                               max_length=60,                    # maximum number of tokens per sequence
#                               padding="max_length",             # pad all sequences to the same size
#                               return_tensors='pt'               # data type of results
#                               )
# # inspect the results
# encoded_sentences.keys()

In [None]:
# # examine BERT's tokenization in detail for a random sentence
# i = np.random.randint(0, len(df))
# print("Original sentence:")
# print(df.loc[i, "sentence"])
# print("\n------------------------------------------\n")
# print("Tokens:")
# temp_tokens = encoded_sentences["input_ids"][i]
# print(tokenizer.convert_ids_to_tokens(temp_tokens))
# print("\n------------------------------------------\n")
# print("Tokens IDs:")
# print(temp_tokens)

There are several important features of the tokenization process that are worth highlighting:
1. **Special Tokens**: BERT's tokenizer introduces three types of special tokens to each sentence it tokenizes.
    - *Class token* ```[CLS]```: Gets introduced at the start of each sequence and, broadly speaking, it is intended to capture the relevant information of a sequence for a particular prediction task
    - *End of sequence token* ```[SEP]```: Demarcates the end of a sequence. This token becomes very relevant in scenarios where a single sequence contains two distinct pieces of information (e.g. question/answer)
    - *Padding token* ```[PAD]```: Facilitates the use of arrays and tensors by making all sequences of equal length
2. **Punctuation marks** get their own tokens
3. **Subwords**: Words that are not included in BERT's vocabulary get divided into subwords that are part of the vocabulary.

### Loading and using a model

We will now use the ```AutoModel``` class to load our model and transform our tokenized sequences into their embedded representations.


In [None]:
# # HuggingFace´s generic class for working with language models out-of-the-box
# AutoModel

In [None]:
# load the pretrained model by name and switch it straight to evaluation mode
# (we will not do any training); .eval() returns the model itself
model = AutoModel.from_pretrained(
    "bert-base-uncased",         # our choice of model
    output_hidden_states=True,   # output all hidden states so that we can fully explore the model
    output_attentions=True,      # output attention weights so that we can fully explore the model
).eval()

In [None]:
# inspect the model's full configuration through its `config` attribute
print(model.config)

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_attentions": true,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.33.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



### Passing a sequence through the model

Generating an embedded representation of a sequence with BERT requires passing its tokens through multiple layers of trained weights.

In [None]:
# # lets first get a single sentence as an example
# sent_position = 7
# sent = df.iloc[7].sentence
# print( sent + "\n")

# # tokenize
# sent_encoded = tokenizer(sent,  max_length=60, padding="max_length", truncation=True, return_tensors='pt')
# sent_encoded["input_ids"]

In [None]:
# # apply forward pass through the model (do not accumulate gradients; we are not training)
# with torch.no_grad():
#     result = model(**sent_encoded)

In [None]:
# what is "result" ?

In [None]:
# # explore output from model
# print(f"Number of hidden layers: {len(result.hidden_states)}")
# print(f"Shape of output of each hidden layer: {result.hidden_states[5].shape}") # batch_size, number of tokens, embedding dimension
# print(f"Shape of pooler output: {result.pooler_output.shape}") # batch_size, embedding dimension

### Creating a sequence representation

There are several ways in which the output of BERT can be used to generate an embedded representation of a sequence of text. We will show some of them below. However, following the way in which BERT was pre-trained, we will focus on the embedded representation of the ```[CLS]``` token in the last hidden layer. This representation is the one used to fulfil the next sentence prediction task on which BERT is trained.

In [None]:
# # use the embedding of the [CLS] token as the representation of the sequence
# cls_emb = result.hidden_states[-1][0][0]
# print(f"Shape of [CLS] embedding: {cls_emb.shape}")

In [None]:
# # we can also average the embeddings of all tokens in a given hidden state (e.g. layer 11)
# avg_emb = torch.mean(result.hidden_states[-2][0], dim=0)
# print(f"Shape of average embedding: {avg_emb.shape}")

# 3. Generating features for a regression model

We will use the embedded sequence representation that we have constructed as a covariate of a prediction model. Concretely, we will estimate a regularized multinomial logistic regression where, for each sentence $i$ in our corpus, we predict the district that the sentence comes from.

### BERT features

HuggingFace provides a very convenient interface for tokenizing and passing sequences through the model with few lines of code. To do this, we will use the ```pipeline()``` function. It allows us to choose a particular task (e.g. feature-extraction, text-classification) and, with the appropriate model and tokenizer, it will generate the correct output for the task (e.g. embedded features, classification probabilities). All the available pipelines can be explored [here](https://huggingface.co/docs/transformers/main_classes/pipelines).

In [None]:
# generate features from text using a pipeline object.
# Use the first GPU when PyTorch can see one and fall back to the CPU otherwise,
# so the cell does not fail on a machine without CUDA.
pipeline_device = 0 if torch.cuda.is_available() else -1   # -1 for CPU and 0 for GPU

feature_extraction = pipeline(task="feature-extraction",      # define the task
                              model=model,                    # model should be appropriate for the task
                              tokenizer=tokenizer,            # selected tokenizer
                              batch_size=16,                  # number of sentences per forward pass
                              device=pipeline_device,         # selected above
                              framework="pt"                  # data type (pt = pytorch)
                              )
# verify the task we selected
feature_extraction.task

'feature-extraction'

In [None]:
# keep only the rows from index label 216208 onwards (label-based slice; the index is not reset).
# NOTE(review): 216208 is a hard-coded label — presumably the first sentence of 201001,
# going by the timing note in the extraction cell; confirm, or filter on "yearmonth" instead.
df = df.loc[216208:]

In [None]:
# transform dataframe into Dataset class (easier to work with pipeline);
# the dataframe's index is carried along as the extra "__index_level_0__" feature
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['yearmonth', 'district', 'sentence', 'sentencelen', 'keep_sent', 'year', 'month', 'value', 'change', '__index_level_0__'],
    num_rows: 86519
})

In [None]:
# stream every sentence through the pipeline and keep only the [CLS] embedding
# (takes a couple of minutes; only using text from 201001, batch size 16,
# took 16 minutes on T4 GPU Colab)
sentence_outputs = feature_extraction(
    KeyDataset(dataset, "sentence"),
    truncation=True,
    max_length=60,
    padding=False,
)

# out[0] holds the token embeddings of one sentence; its first token is [CLS]
all_cls = [out[0][0] for out in tqdm(sentence_outputs)]

  0%|          | 0/5408 [00:00<?, ?it/s]

In [None]:
# build a dataframe with the features obtained using BERT (num_docs x 768);
# rows are in the same order as the sentences in df and get a fresh 0..n-1 index
df_features_bert = pd.DataFrame(all_cls)
df_features_bert

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.174412,-0.624525,-0.162623,-0.010187,0.190232,-0.306029,-0.117452,0.358987,-0.164427,-0.518598,...,-0.078941,0.110602,-0.234183,-0.041415,-0.162482,-0.125804,-0.160138,-0.490149,0.551221,0.042481
1,-0.469563,-0.262048,-0.462355,-0.095217,-0.601478,0.338046,0.698058,0.952954,-0.279171,-0.192739,...,0.071862,0.094950,0.081828,0.222709,0.532710,-0.427495,-0.187725,-0.618710,0.519265,-0.412500
2,-0.341705,-0.241713,0.490195,-0.549440,-0.167592,-0.119806,0.178801,0.615342,-0.163971,-0.168407,...,-0.169051,-0.348475,0.080954,0.310150,0.160906,-0.305528,-0.381552,-0.228056,0.193880,0.183090
3,-0.428109,-0.725834,-0.377600,-0.276084,-0.288773,0.505337,0.182204,0.758563,0.297468,-0.394165,...,0.129805,-0.039064,0.584993,0.496433,0.194329,-0.076100,-0.037932,-0.806802,0.281585,-0.156309
4,-0.478854,-0.757280,-0.005666,-0.194791,-0.482716,0.394825,-0.166216,0.610825,-0.036999,-0.011187,...,0.055578,-0.307464,0.116982,-0.206165,0.190313,-0.200107,-0.813707,-0.183190,0.571792,0.132012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86514,-0.433307,-0.433407,-0.967174,0.017751,-0.490015,0.086356,0.549783,0.811569,-0.199876,-0.169756,...,0.224933,0.052450,0.266871,0.164655,0.316175,-0.036697,-0.121637,-0.640279,0.822378,-0.108541
86515,-0.519334,0.053685,-1.135710,0.434873,-0.102463,0.151388,0.135491,0.792355,-0.337779,-0.197576,...,-0.053217,-0.472103,0.051998,0.233606,0.334826,0.294379,-0.238644,-0.626327,0.405813,0.006394
86516,-0.604897,-0.841650,-0.293491,-0.273255,0.119491,-0.121542,0.013606,0.956161,0.259269,-0.003842,...,0.040477,-0.021661,0.362763,0.616415,0.123729,0.608757,0.064516,-0.557803,0.287203,-0.337126
86517,-0.251447,-0.169100,-0.530550,0.087250,-0.653295,-0.094658,0.581185,1.027400,-0.118385,-0.171084,...,-0.287887,0.042789,-0.049742,0.197822,0.407080,-0.128762,-0.177245,-0.631871,0.194791,0.020785


In [None]:
# get only the district of each sentence to use as the labels for our regression
# (note: labels keeps df's original index, it is not renumbered from 0)
labels = df[["district"]]

In [None]:
print(df_features_bert.shape, labels.shape)  # row counts should match: one embedding per labelled sentence

(86519, 768) (86519, 1)


### Word embeddings features

We will also generate an embedded representation of sequences using word embeddings from a pre-trained model. This will give us a reference point with respect to which we can assess the quality of the features generated with BERT.


We will use word embeddings estimated with the GloVe algorithm on Wikipedia and a large news corpus. In order to generate a single representation for a whole sentence, we will average the individual word embeddings of all words from the sentence.


In [None]:
# download the pre-trained embeddings and return them as an object ready to use (takes a couple of minutes)
# other available models can be found here: https://kavita-ganesan.com/easily-access-pre-trained-word-embeddings-with-gensim/
#w2v_model = api.load("word2vec-google-news-300")    # model is too large to load in Colab
# note: despite the variable name, the embeddings loaded here are GloVe ("glove-wiki-gigaword-300")
w2v_model = api.load("glove-wiki-gigaword-300")



### Estimate regressions

In [None]:
# positions 0..n-1 of all available sentences (one per row of `labels`)
n_sentences = len(labels)
sent_idxs = list(range(n_sentences))

86519

In [None]:
# perform a train/test split on the sentence positions: 20% held out for testing,
# fixed random_state so the split is reproducible
train_idxs, test_idxs = train_test_split(sent_idxs, test_size=0.2, random_state=92)
print(f" Train sentences: {len(train_idxs)}\n", f"Test sentences: {len(test_idxs)}")

 Train sentences: 69215
 Test sentences: 17304


In [None]:
# select rows for training and testing.
# train_idxs / test_idxs are positions (0..n-1), so both frames are indexed
# positionally with .iloc: df_features_bert has a 0..n-1 index, but `labels`
# keeps df's original index, where a label-based .loc lookup would fail.

# BERT
train_bert_features = df_features_bert.iloc[train_idxs]
test_bert_features = df_features_bert.iloc[test_idxs]

# Labels (taken from `labels`; the previously referenced `labels_w2v` is never defined)
train_labels = labels.iloc[train_idxs]
test_labels = labels.iloc[test_idxs]

In [None]:
# BERT: fit an L2-regularised multinomial logistic regression to predict the district of each sentence.
# `multi_class` is deliberately not passed: it is deprecated in recent scikit-learn
# releases, and with the lbfgs solver a multiclass target is already fitted as a
# multinomial model by default.
lr_bert = LogisticRegression(penalty="l2",
                             solver="lbfgs",
                             max_iter=100000)

# .values.ravel() turns the single-column label frame into the 1-D array sklearn expects
lr_bert.fit(train_bert_features, train_labels.values.ravel())


In [None]:
# in-sample predictions of the model fitted on BERT features
pred_train_bert = lr_bert.predict(train_bert_features)

# baseline: guessing uniformly among the districts present in df
n_districts = df["district"].nunique()
print(f"Random guess accuracy: {1/n_districts}\n")

print("===========================\nBERT results:")

# note: with average="micro" on single-label multiclass data, precision, recall
# and F1 all coincide with accuracy
train_acc_bert = accuracy_score(train_labels, pred_train_bert)
train_f1_bert, train_precision_bert, train_recall_bert = (
    scorer(train_labels, pred_train_bert, average="micro")
    for scorer in (f1_score, precision_score, recall_score)
)

print(f"In sample accuracy: {train_acc_bert}")
print(f"In sample F1 score: {train_f1_bert}")
print(f"In sample precision score: {train_precision_bert}")
print(f"In sample recall score: {train_recall_bert}")


In [None]:
# out-of-sample predictions of the model fitted on BERT features
pred_test_bert = lr_bert.predict(test_bert_features)

# baseline: guessing uniformly among the districts present in df
n_districts = df["district"].nunique()
print(f"Random guess accuracy: {1/n_districts}\n")

print("===========================\nBERT results:")

# note: with average="micro" on single-label multiclass data, precision, recall
# and F1 all coincide with accuracy
test_acc_bert = accuracy_score(test_labels, pred_test_bert)
test_f1_bert, test_precision_bert, test_recall_bert = (
    scorer(test_labels, pred_test_bert, average="micro")
    for scorer in (f1_score, precision_score, recall_score)
)

print(f"Test accuracy: {test_acc_bert}")
print(f"Test F1 score: {test_f1_bert}")
print(f"Test precision score: {test_precision_bert}")
print(f"Test recall score: {test_recall_bert}")


Random guess accuracy: 0.08333333333333333

BERT results:
Test accuracy: 0.5745023587619377
Test F1 score: 0.5745023587619377
Test precision score: 0.5745023587619377
Test recall score: 0.5745023587619377

Word embeddings results:
Test accuracy: 0.5140950408468531
Test F1 score: 0.5140950408468531
Test precision score: 0.5140950408468531
Test recall score: 0.5140950408468531


### Out-of-corpus sentence

We can now also use our estimated regression to predict the district of any sequence of text we might imagine. We will demonstrate this using BERT.

In [None]:
# look up one example sentence by its index label (label-based: 314269 is a hard-coded
# label of df's index, not a row position)
# df.loc[df.district=='at'].sentence
df.loc[314269].sentence

'The cattle market remained strong as demand for beef remained high amid low supply'

In [None]:
# define a target sentence
outside_target = "The cattle market remained strong as demand for beef remained high amid low supply"

# tokenize the sentence and move the tensors to whichever device the model is on,
# instead of hard-coding "cuda" (which fails on a machine without a GPU)
outside_tokens = tokenizer(outside_target, return_tensors='pt').to(model.device)

# apply forward pass through the model (do not accumulate gradients; we are not training)
with torch.no_grad():
    result = model(**outside_tokens, output_attentions=True)

# extract the [CLS] token embedding (first token) from the last hidden layer and
# bring it back to the CPU so it can be converted to numpy later (no-op if already there)
outside_cls_emb = result.hidden_states[-1][0][0].cpu()
# print(f"Shape of [CLS] embedding: {outside_cls_emb.shape}")

Shape of [CLS] embedding: torch.Size([768])


In [None]:
# the classifier expects a 2-D array with one row per sentence; build it once
outside_features = outside_cls_emb.numpy().reshape(1, -1)

# generate class prediction for out-of-corpus sentence.
# The predicted class already is the district code, so no lookup in df is needed
# (the former district -> district lookup returned its own input and printed the
# prediction twice).
outside_prediction = lr_bert.predict(outside_features)
prediction_name = outside_prediction[0]
print(f"Predicted district: {prediction_name}\n--------\n")

# generate probability predictions for out-of-corpus sentence;
# predict_proba columns follow the order of lr_bert.classes_
outside_probs = lr_bert.predict_proba(outside_features)

for prob, code in zip(outside_probs[0], lr_bert.classes_):
    print(f"Predicted probability for {code}: {np.round(prob,3)}")

Predicted district: sf
Predicted district: sf
--------

Predicted probability for at: 0.165
Predicted probability for bo: 0.005
Predicted probability for ch: 0.066
Predicted probability for cl: 0.006
Predicted probability for da: 0.132
Predicted probability for kc: 0.051
Predicted probability for mi: 0.209
Predicted probability for ny: 0.001
Predicted probability for ph: 0.001
Predicted probability for ri: 0.123
Predicted probability for sf: 0.229
Predicted probability for sl: 0.012
