In [1]:
!pip install transformers[torch]==4.19.2 -qU

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.2/73.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m77.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m98.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
'''
Code for measurement analysis of Distilbert model for Out of Vocabulary Words
Measurements -
1. Classification metrics - Accuracy, Precision, Recall, F1
2. AUC-ROC
3. Lift Chart
4. Calibration Plot
'''


In [2]:
import pandas as pd
import numpy as np
import time
from sklearn import metrics
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix, precision_score
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW
from tqdm import tqdm, trange
import torch
import torch.nn as nn
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from time import time
from sklearn import metrics
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib.transforms as mtransforms
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
import torch.nn as nn
import torch.optim as optim
import seaborn as sns
plt.rcParams["figure.dpi"] = 200

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device available: ",device)
seed = 42
np.random.seed(seed)

Device available:  cuda


In [4]:
# Input

# Trained Model
distilbert_model = '/content/drive/MyDrive/NLP_LSS/distinctiveness_indicator/distilbert/model/distilbert_unprocessed_v1.pth'

# Input Data
data_path = '/content/drive/MyDrive/NLP_LSS/distinctiveness_indicator/distilbert/data/df_distilbert_input_unprocessed.pkl'

# DistilBERT Tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/465 [00:00<?, ?B/s]

In [5]:
# Function to generate batches and reshape
def batching(np_array, max_idx, batch_size):
    np_array = np_array[:max_idx]
    np_array = np_array.reshape(-1, batch_size)
    batched_list = np_array.tolist()
    return batched_list

In [6]:
# Temperature Scaling
def T_scaling(logits, args):
    temperature = args.get('temperature', None)
    return torch.div(logits, temperature)

In [7]:
# Function for predicting y
def bert_pred(model_path, X, y, th=0.5, temp=None):
    loaded_model = torch.load(model_path)
    predictions, targets = [], []
    pred_prob = []
    loaded_model.eval()
    with torch.no_grad():
        for text, labels in tqdm(zip(X, y), total=len(X)):
            #try:
            model_inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
            model_inputs = {k:v.to(device) for k,v in model_inputs.items()}
            output = loaded_model(**model_inputs)
            logits = output[0]
            if temp is not None:
                logits = T_scaling(logits, {'temperature':temp})
            probs = torch.sigmoid(logits)
            pred_prob.extend(probs[:,1].tolist())
            if th==0.5:
                # prediction is the argmax of the logits
                predictions.extend(logits.argmax(dim=1).tolist())
            else:
                predictions.extend(((probs[:,1]>=th)*1).tolist())
            targets.extend(labels)
            #except:
            #    print("Unable to process: ", text)
            #    continue
    return targets, pred_prob, predictions

In [8]:
# Function to evaluate predictions - classification report
def classification_metrics(targets, predictions, conf_matrix_loc):
    accuracy = metrics.accuracy_score(targets, predictions)
    print ("accuracy", accuracy)
    classification_report = metrics.classification_report(targets, predictions)
    print (classification_report)
    conf_matrix = confusion_matrix(targets, predictions)
    # Save confusion matrix
    ax = sns.heatmap(conf_matrix, annot=True, cmap='Blues')
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted Values')
    ax.set_ylabel('Actual Values ')

    ## Ticket labels - List must be in alphabetical order
    ax.xaxis.set_ticklabels(['False','True'])
    ax.yaxis.set_ticklabels(['False','True'])

    ## Display the visualization of the Confusion Matrix.
    plt.savefig(conf_matrix_loc)
    return accuracy, classification_report

In [9]:
# Function to evaluate AUC ROC scores
def auc_roc(y_true, y_score):
    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    auc = roc_auc_score(y_true, y_score)
    return auc, fpr, tpr

In [10]:
# Function to plot ROC Curve
def plot_auc_roc(fpr, tpr, roc_auc, save_loc):
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.savefig(save_loc)
    plt.close()

In [11]:
# Function to generate decile chart over probabilities
def decile_chart(df, pred_prob, pred, targets, cuts=10):
    # Sort by predicted probabilites
    df.sort_values(by=pred_prob, ascending=False, inplace=True)
    # Split df into subsets
    df_split = np.array_split(df, cuts)
    # Create output dataframe - (#cuts, accuracy, precision, recall, f1)
    df_output = pd.DataFrame(columns=['n','n_0','n_1','pred_0','pred_1','acc','prec','recall','f1'])
    n_list = []
    n_0_list = []
    n_1_list = []
    pred_0_list = []
    pred_1_list = []
    acc_list = []
    prec_list = []
    recall_list = []
    f1_list = []
    for i in range(cuts):
        df_to_process = df_split[i]
        n = df_to_process.shape[0]
        n_1 = df_to_process[targets].sum()
        n_0 = n - n_1
        pred_1 = df_to_process[pred].sum()
        pred_0 = n - pred_1
        tn, fp, fn, tp = confusion_matrix(df_to_process[targets].tolist(), df_to_process[pred].tolist(), labels=[0, 1]).ravel()
        acc = np.round(100*(tp + tn)/(tn+fp+fn+tp),2)
        prec = np.round(100*(tp)/(fp+tp),2)
        recall = np.round(100*(tp)/(fn+tp),2)
        f1 = np.round((2*prec*recall)/(prec+recall))

        n_list.append(n)
        n_1_list.append(n_1)
        n_0_list.append(n_0)
        pred_0_list.append(pred_0)
        pred_1_list.append(pred_1)
        acc_list.append(acc)
        prec_list.append(prec)
        recall_list.append(recall)
        f1_list.append(f1)
    df_output['n'] = n_list
    df_output['n_0'] = n_0_list
    df_output['n_1'] = n_1_list
    df_output['pred_0'] = pred_0_list
    df_output['pred_1'] = pred_1_list
    df_output['acc'] = acc_list
    df_output['prec'] = prec_list
    df_output['recall'] = recall_list
    df_output['f1'] = f1_list
    return df_output

In [12]:
# Function to get threshold  - max difference b/w tpr and fpr
def get_threshold(val_targets, val_pred):
    fpr_val, tpr_val, threshold_val = roc_curve(val_targets, val_pred)
    th_val = threshold_val[np.argmax(tpr_val-fpr_val)]
    return th_val

In [13]:
# Function to get calibration plots - 10 bins
def get_calibration_curve10(target, pred_prob, save_loc, nbins=10):
    plot_y, plot_x = calibration_curve(target, pred_prob, n_bins=nbins)
    # calibration curves
    fig, ax = plt.subplots()
    plt.plot(plot_y, plot_x, marker='o', linewidth=1, label='logreg')

    # reference line, legends, and axis labels
    line = mlines.Line2D([0, 1], [0, 1], color='black')
    transform = ax.transAxes
    line.set_transform(transform)
    ax.add_line(line)
    #ax.hist(pred_prob, weights=np.ones(len(pred_prob)) / len(pred_prob), bins=nbins, color='papayawhip')
    fig.suptitle('Calibration plot for DistilBERT')
    ax.set_xlabel('Mean Predicted probability')
    ax.set_ylabel('True probability in each bin')
    plt.savefig(save_loc)
    plt.close()

    # Get Histogram
    fig, ax = plt.subplots()
    ax.hist(pred_prob, weights=np.ones(len(pred_prob)) / len(pred_prob), bins=nbins, color='green')
    fig.suptitle('Histogram for probability distribution')
    ax.set_xlabel('Mean predicted probability')
    ax.set_ylabel('Counts')
    plt.savefig(save_loc[:-4] + '_hist.png')
    plt.close()

In [14]:
# Function to get calibration plots - 10 bins and overlayed
def get_calibration_curve10_overlay(target, pred_prob, save_loc, nbins=10):
    plot_y, plot_x = calibration_curve(target, pred_prob, n_bins=nbins)
    # calibration curves
    #fig, ax = plt.subplots()
    plt.plot(plot_y, plot_x, marker='o', linewidth=1, label='logreg')
    # reference line, legends, and axis labels
    ref_x = np.arange(0, 1.1, 0.1)
    ref_y = np.arange(0, 1.1, 0.1)
    plt.xlim(0,1)
    plt.ylim(0,1)
    plt.xlim(left=0)
    plt.plot(ref_x, ref_y, color='black', linestyle='dashed')
    # Histogram
    plt.hist(pred_prob, weights=np.ones(len(pred_prob)) / len(pred_prob), bins=nbins, color='green')
    # Title and labels
    plt.xlabel('Mean Predicted probability')
    plt.ylabel('True probability in each bin')
    plt.title('Calibration plot for DistilBERT')
    plt.savefig(save_loc)
    plt.close()


In [15]:
# Function to get precision at different thresholds
def get_precision_at_thresholds(target, pred_prob, save_loc, thresholds=np.arange(0.1,1,0.1)):
    prec_dict = {'threshold':[], 'prec0':[], 'prec1':[], 'n0_actual':[], 'n1_actual':[], 'n0_pred':[], 'n1_pred':[], 'n_total':[]}
    for th in thresholds:
        predictions = [1 if x>=th else 0 for x in pred_prob]
        precisions = precision_score(target, predictions, average=None)
        prec0 = precisions[0]
        prec1 = precisions[1]
        n_total = len(target)
        n1_actual = sum(target)
        n0_actual = n_total - n1_actual
        n1_pred = sum(predictions)
        n0_pred = n_total - n1_pred
        prec_dict['threshold'].append(th)
        prec_dict['prec0'].append(prec0)
        prec_dict['prec1'].append(prec1)
        prec_dict['n0_actual'].append(n0_actual)
        prec_dict['n1_actual'].append(n1_actual)
        prec_dict['n0_pred'].append(n0_pred)
        prec_dict['n1_pred'].append(n1_pred)
        prec_dict['n_total'].append(n_total)
    df_prec = pd.DataFrame(prec_dict)
    df_prec.to_csv(save_loc, index=False)

In [16]:
# Function to convert pred probabilities into logits
def prob_to_logits(pred_prob):
    probabilities = np.array(pred_prob, dtype=np.float64)
    logits_array = np.log(probabilities/(1-probabilities))
    return logits_array

In [17]:
# Function to calibrate probabilities using temperature scaling
def calibrate_classifier(pred_prob, labels):
    temperature = nn.Parameter(torch.ones(1).cuda())
    args = {'temperature': temperature}
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.LBFGS([temperature], lr=0.001, max_iter=10000, line_search_fn='strong_wolfe')
    logits_array = prob_to_logits(pred_prob)
    labels_array = np.array(labels, dtype=np.float64)
    temps = []
    losses = []

    # Create tensors
    logits_list = torch.from_numpy(logits_array).to(device)
    labels_list = torch.from_numpy(labels_array).to(device)

    # Optimize for temperature
    def _eval():
        loss = criterion(T_scaling(logits_list, args), labels_list)
        loss.backward()
        temps.append(temperature.item())
        losses.append(loss)
        #print("Loss-List: ", loss)
        #print("Temperature-List: ", temps)
        return loss

    optimizer.step(_eval)
    print('Final T_scaling factor: {:.2f}'.format(temperature.item()))
    return temperature.item()

In [18]:
def report_metrics(op_path, temperature=None):
    # Prediction on test data
    if temperature is None:
        labels, pred_prob, predictions = bert_pred(distilbert_model, X_test, y_test)
    else:
        labels, pred_prob, predictions = bert_pred(distilbert_model, X_test, y_test, temp=temperature)

    # Classification Metrics
    acc_model, classification_report_model = classification_metrics(labels, predictions, op_path['conf_matrix_loc'])
    print("Accuracy: ", acc_model)
    print("\n")
    print("Classification Report: ", classification_report_model)
    print("\n")

    # ROC Score
    auc_model, fpr, tpr = auc_roc(labels, pred_prob)
    print("AUC: ", auc_model)
    print("\n")

    # Plot ROC
    plot_auc_roc(fpr, tpr, auc_model, op_path['roc_auc_path'])

    # Calibration Plot
    get_calibration_curve10(labels, pred_prob, op_path['calib_plot_path'])
    get_calibration_curve10_overlay(labels, pred_prob, op_path['calib_plot_path_overlay'])

    # Precision at different thresholds
    get_precision_at_thresholds(labels, pred_prob, op_path['prec_at_th_path'])

    # Decile Chart
    df_decile_input = pd.DataFrame(columns = ['pred_prob','pred','targets'])
    df_decile_input['pred_prob'] = pred_prob
    df_decile_input['pred'] = predictions
    df_decile_input['targets'] = labels
    decile = decile_chart(df_decile_input, 'pred_prob', 'pred', 'targets', cuts=10)
    decile.to_csv(op_path['decile_path'])
    print("Decile Report: ", decile)
    print("\n")

    return labels, pred_prob, predictions

In [21]:
# Read data
df = pd.read_pickle(data_path)
df['filing_dt'] = pd.to_datetime(df['filing_dt'])
# Taking only marks with wn_ind = 0
df = df[df['wn_ind']==0].copy()
df.reset_index(drop=True, inplace=True)

# Divide data into train, test and validation
df_train = df[(df['filing_dt']>=pd.to_datetime('2012-01-01')) & (df['filing_dt']<=pd.to_datetime('2017-12-31'))]
df_val = df[(df['filing_dt']>=pd.to_datetime('2018-01-01')) & (df['filing_dt']<=pd.to_datetime('2018-12-31'))]
df_test = df[(df['filing_dt']>=pd.to_datetime('2019-01-01')) & (df['filing_dt']<=pd.to_datetime('2019-12-31'))]
print("Train data shape: ", df_train.shape)
print("Validation data shape: ", df_val.shape)
print("Test data shape: ", df_test.shape)

Train data shape:  (399288, 22)
Validation data shape:  (105049, 22)
Test data shape:  (126324, 22)


In [24]:
# X_test
X_test = np.array(df_test['bert_input_unprocessed'])
# y_test
y_test = np.array(df_test['distinct_ind'])


batch_size = 16
# Batching
test_max_idx = batch_size * (len(X_test)//batch_size)
X_test = batching(X_test, test_max_idx, batch_size=16)
y_test = batching(y_test, test_max_idx, batch_size=16)

In [25]:
# Output Path Dictionary
op_no_temp = {'roc_auc_path':'/content/drive/MyDrive/NLP_LSS/distinctiveness_indicator/distilbert/oov/plots-run-unprocessed/auc_dbert.png',
              'calib_plot_path':'/content/drive/MyDrive/NLP_LSS/distinctiveness_indicator/distilbert/oov/plots-run-unprocessed/calib_plot_dbert.png',
              'calib_plot_path_overlay':'/content/drive/MyDrive/NLP_LSS/distinctiveness_indicator/distilbert/oov/plots-run-unprocessed/calib_plot_dbert_overlay10.png',
              'prec_at_th_path':'/content/drive/MyDrive/NLP_LSS/distinctiveness_indicator/distilbert/oov/plots-run-unprocessed/threshold_precisions.csv',
              'decile_path':'/content/drive/MyDrive/NLP_LSS/distinctiveness_indicator/distilbert/oov/plots-run-unprocessed/decile_bert.csv',
              'conf_matrix_loc':'/content/drive/MyDrive/NLP_LSS/distinctiveness_indicator/distilbert/oov/plots-run-unprocessed/confusion_matrix.png'}


In [26]:
# Get metrics
labels, pred_prob, predictions = report_metrics(op_no_temp)

# Get Optimal Temperature
temperature = calibrate_classifier(pred_prob, labels)
# Output Path Dictionary - With T_scaling
op_temp = {'roc_auc_path':'/content/drive/MyDrive/NLP_LSS/distinctiveness_indicator/distilbert/oov/plots-run-unprocessed/auc_dbert_tscaled.png',
          'calib_plot_path':'/content/drive/MyDrive/NLP_LSS/distinctiveness_indicator/distilbert/oov/plots-run-unprocessed/calib_plot_dbert_tscaled.png',
          'calib_plot_path_overlay':'/content/drive/MyDrive/NLP_LSS/distinctiveness_indicator/distilbert/oov/plots-run-unprocessed/calib_plot_dbert_overlay10_tscaled.png',
          'prec_at_th_path':'/content/drive/MyDrive/NLP_LSS/distinctiveness_indicator/distilbert/oov/plots-run-unprocessed/threshold_precisions_tscaled.csv',
          'decile_path':'/content/drive/MyDrive/NLP_LSS/distinctiveness_indicator/distilbert/oov/plots-run-unprocessed/decile_bert_tscaled.csv',
          'conf_matrix_loc':'/content/drive/MyDrive/NLP_LSS/distinctiveness_indicator/distilbert/oov/plots-run-unprocessed/confusion_matrix_tscaled.png'}

# Get New Metrics - with temperature scaling
if np.isnan(temperature) == False:
    report_metrics(op_temp, temperature=temperature)
else:
    print("Temperature is NaN")


  loaded_model = torch.load(model_path)
100%|██████████| 7895/7895 [13:41<00:00,  9.61it/s]


accuracy 0.8916481950601647
              precision    recall  f1-score   support

           0       0.58      0.07      0.12     13942
           1       0.90      0.99      0.94    112378

    accuracy                           0.89    126320
   macro avg       0.74      0.53      0.53    126320
weighted avg       0.86      0.89      0.85    126320

Accuracy:  0.8916481950601647


Classification Report:                precision    recall  f1-score   support

           0       0.58      0.07      0.12     13942
           1       0.90      0.99      0.94    112378

    accuracy                           0.89    126320
   macro avg       0.74      0.53      0.53    126320
weighted avg       0.86      0.89      0.85    126320



AUC:  0.7261837995205634




  return bound(*args, **kwds)


Decile Report:         n   n_0    n_1  pred_0  pred_1    acc   prec  recall    f1
0  12632   234  12398       0   12632  98.15  98.15  100.00  99.0
1  12632   339  12293       0   12632  97.32  97.32  100.00  99.0
2  12632   578  12054       0   12632  95.42  95.42  100.00  98.0
3  12632   877  11755       0   12632  93.06  93.06  100.00  96.0
4  12632  1115  11517       0   12632  91.17  91.17  100.00  95.0
5  12632  1337  11295       0   12632  89.42  89.42  100.00  94.0
6  12632  1481  11151       0   12632  88.28  88.28  100.00  94.0
7  12632  1781  10851       0   12632  85.90  85.90  100.00  92.0
8  12632  2202  10430       0   12632  82.57  82.57  100.00  90.0
9  12632  3998   8634    1631   11001  70.37  72.23   92.03  81.0


Final T_scaling factor: 0.73


  loaded_model = torch.load(model_path)
100%|██████████| 7895/7895 [13:46<00:00,  9.55it/s]


accuracy 0.8916481950601647
              precision    recall  f1-score   support

           0       0.58      0.07      0.12     13942
           1       0.90      0.99      0.94    112378

    accuracy                           0.89    126320
   macro avg       0.74      0.53      0.53    126320
weighted avg       0.86      0.89      0.85    126320

Accuracy:  0.8916481950601647


Classification Report:                precision    recall  f1-score   support

           0       0.58      0.07      0.12     13942
           1       0.90      0.99      0.94    112378

    accuracy                           0.89    126320
   macro avg       0.74      0.53      0.53    126320
weighted avg       0.86      0.89      0.85    126320



AUC:  0.7261837947336576


Decile Report:         n   n_0    n_1  pred_0  pred_1    acc   prec  recall    f1
0  12632   234  12398       0   12632  98.15  98.15  100.00  99.0
1  12632   339  12293       0   12632  97.32  97.32  100.00  99.0
2  12632   578  120

  return bound(*args, **kwds)
