# BERT and Path signature

In [1]:
# solve issue with autocomplete
%config Completer.use_jedi = False

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
import pickle
import sys
import re

# sys.path.insert(0, "../../timeline_generation/")  # Adds higher directory to python modules path
# import src.data_handler

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from datasets import load_dataset
dataset = load_dataset("newspop")

Using custom data configuration default
Found cached dataset newspop (/Users/khosseini/.cache/huggingface/datasets/newspop/default/0.0.0/9904d4082ffd3c0953efa538ff926c43d27da8f37c9b5d6a13f51ab96740474e)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 167.28it/s]


In [4]:
dataset_df_all = pd.DataFrame(dataset["train"])

In [5]:
dataset_df_all.head()

Unnamed: 0,id,title,headline,source,topic,publish_date,facebook,google_plus,linked_in
0,99248,Obama Lays Wreath at Arlington National Cemetery,Obama Lays Wreath at Arlington National Cemete...,USA TODAY,obama,2002-04-02 00:00:00,-1,-1,-1
1,10423,A Look at the Health of the Chinese Economy,"Tim Haywood, investment director business-unit...",Bloomberg,economy,2008-09-20 00:00:00,-1,-1,-1
2,18828,Nouriel Roubini: Global Economy Not Back to 2008,"Nouriel Roubini, NYU professor and chairman at...",Bloomberg,economy,2012-01-28 00:00:00,-1,-1,-1
3,27788,Finland GDP Expands In Q4,Finland's economy expanded marginally in the t...,RTT News,economy,2015-03-01 00:06:00,-1,-1,-1
4,27789,"Tourism, govt spending buoys Thai economy in J...",Tourism and public spending continued to boost...,The Nation - Thailand&#39;s English news,economy,2015-03-01 00:11:00,-1,-1,-1


In [6]:
use_cols = ["id", "headline", "publish_date", "topic"]

In [7]:
dataset_df = dataset_df_all[use_cols]

In [8]:
dataset_df["postid"] = dataset_df["id"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_df["postid"] = dataset_df["id"]


In [9]:
dataset_df.head()

Unnamed: 0,id,headline,publish_date,topic,postid
0,99248,Obama Lays Wreath at Arlington National Cemete...,2002-04-02 00:00:00,obama,99248
1,10423,"Tim Haywood, investment director business-unit...",2008-09-20 00:00:00,economy,10423
2,18828,"Nouriel Roubini, NYU professor and chairman at...",2012-01-28 00:00:00,economy,18828
3,27788,Finland's economy expanded marginally in the t...,2015-03-01 00:06:00,economy,27788
4,27789,Tourism and public spending continued to boost...,2015-03-01 00:11:00,economy,27789


In [10]:
rename_cols = {
    "headline": "content",
    "publish_date": "datetime",
    "topic": "label",
    "id": "timeline_id",
    "postid": "postid"
}

In [11]:
dataset_df = dataset_df.rename(columns=rename_cols)
dataset_df.head()

Unnamed: 0,timeline_id,content,datetime,label,postid
0,99248,Obama Lays Wreath at Arlington National Cemete...,2002-04-02 00:00:00,obama,99248
1,10423,"Tim Haywood, investment director business-unit...",2008-09-20 00:00:00,economy,10423
2,18828,"Nouriel Roubini, NYU professor and chairman at...",2012-01-28 00:00:00,economy,18828
3,27788,Finland's economy expanded marginally in the t...,2015-03-01 00:06:00,economy,27788
4,27789,Tourism and public spending continued to boost...,2015-03-01 00:11:00,economy,27789


In [34]:
dataset_df['datetime'] =  pd.to_datetime(dataset_df['datetime'], format='%Y-%m-%d %H:%M:%S')

In [35]:
dataset_df.head()

Unnamed: 0,timeline_id,content,datetime,label,postid
0,99248,Obama Lays Wreath at Arlington National Cemete...,2002-04-02 00:00:00,0,99248
1,10423,"Tim Haywood, investment director business-unit...",2008-09-20 00:00:00,1,10423
2,18828,"Nouriel Roubini, NYU professor and chairman at...",2012-01-28 00:00:00,1,18828
3,27788,Finland's economy expanded marginally in the t...,2015-03-01 00:06:00,1,27788
4,27789,Tourism and public spending continued to boost...,2015-03-01 00:11:00,1,27789


In [36]:
dataset_df["label"].value_counts()

0    678
1    322
Name: label, dtype: int64

In [37]:
encode_labels = {
    "label": 
        {"economy": 1,
         "obama": 0,
         "microsoft": 0,
         "palestine": 0
        }
}

In [38]:
dataset_df = dataset_df.replace(encode_labels)
dataset_df.head()

Unnamed: 0,timeline_id,content,datetime,label,postid
0,99248,Obama Lays Wreath at Arlington National Cemete...,2002-04-02 00:00:00,0,99248
1,10423,"Tim Haywood, investment director business-unit...",2008-09-20 00:00:00,1,10423
2,18828,"Nouriel Roubini, NYU professor and chairman at...",2012-01-28 00:00:00,1,18828
3,27788,Finland's economy expanded marginally in the t...,2015-03-01 00:06:00,1,27788
4,27789,Tourism and public spending continued to boost...,2015-03-01 00:11:00,1,27789


In [39]:
dataset_df["label"].value_counts()

0    678
1    322
Name: label, dtype: int64

In [46]:
dataset_df = dataset_df[:100]
dataset_df["label"].value_counts()

0    73
1    27
Name: label, dtype: int64

In [47]:
# ================================
#model specifics
model_specifics = {"global_embedding_tp": 'SBERT', #options: SBERT, BERT_cls , BERT_mean, BERT_max
    "dimensionality_reduction_tp": 'umap', #options: ppapca, ppapcappa, umap
    "dimensionality_reduction_components": 10, # options: any int number between 1 and embedding dimensions
    "time_injection_history_tp": 'timestamp', #options: timestamp, None
    "time_injection_post_tp": 'timestamp', #options: timestamp, timediff, None
    "signature_dimensions": 3, #options: any int number larger than 1
    "post_embedding_tp": 'sentence', #options: sentence, reduced
    "feature_combination_method": 'attention', #options concatenation, attention 
    "signature_tp": 'log', # options: log, sig
    "classifier_name": 'FFN2hidden', # options: FFN2hidden (any future classifiers added)
    "classes_num": '3class', #options: 3class (5class to be added in the future)
}

In [48]:
from sentence_transformers import SentenceTransformer
sentences = dataset_df["content"].to_list()

In [49]:
sentences[:10]

['Obama Lays Wreath at Arlington National Cemetery. President Barack Obama has laid a wreath at the Tomb of the Unknowns to honor',
 'Tim Haywood, investment director business-unit head for fixed income at Gam, discusses the China beige book and the state of the economy.',
 "Nouriel Roubini, NYU professor and chairman at Roubini Global Economics, explains why the global economy isn't facing the same conditions",
 "Finland's economy expanded marginally in the three months ended December, after contracting in the previous quarter, preliminary figures from Statistics Finland showed Monday. ",
 'Tourism and public spending continued to boost the economy in January, in light of contraction in private consumption and exports, according to the Bank of Thailand data. ',
 'Over 100 attendees expected to see latest version of Microsoft Dynamics SL and Dynamics GP (PRWeb February 29, 2016) Read the full story at http://www.prweb.com/releases/2016/03/prweb13238571.htm ',
 'RAMALLAH, February 25, 2

In [50]:
st_model = SentenceTransformer('all-MiniLM-L6-v2')

In [51]:
embeddings_sentence = st_model.encode(sentences,
                                      batch_size=64,
                                      show_progress_bar=True,
                                      output_value='sentence_embedding', 
                                      convert_to_numpy=True,
                                      convert_to_tensor=False,
                                      device=None,
                                      normalize_embeddings=False,
                                     )

Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.59it/s]


In [52]:
embeddings_sentence

array([[ 0.04053084,  0.12677234,  0.03034632, ...,  0.01346123,
         0.00509904, -0.02261563],
       [ 0.00132182, -0.00187154,  0.04704985, ..., -0.12755233,
        -0.00235982, -0.01032065],
       [-0.00446173, -0.077429  , -0.02367794, ..., -0.12149306,
        -0.02713376, -0.01476173],
       ...,
       [-0.03346207,  0.00325029, -0.02158457, ..., -0.10729703,
        -0.00453167,  0.11721924],
       [-0.00459458,  0.03299496,  0.01338389, ..., -0.05713576,
         0.03441487,  0.05552595],
       [-0.03066568, -0.00086962, -0.04076744, ..., -0.06957944,
         0.03964642, -0.02920447]], dtype=float32)

In [53]:
#dimensionality reduction
from src import DimensionalityReduction

reduction = DimensionalityReduction(method= model_specifics['dimensionality_reduction_tp'], 
                                    components=model_specifics['dimensionality_reduction_components'])

In [54]:
embeddings_reduced = reduction.fit_transform(embeddings_sentence)

print(embeddings_reduced.shape)

(100, 10)


In [55]:
embeddings_reduced

array([[12.302852  ,  9.243699  ,  4.7989    ,  8.696953  ,  1.9157919 ,
         8.339737  ,  1.6259124 ,  8.808037  ,  4.209265  ,  2.6453235 ],
       [10.622737  ,  8.703552  ,  4.381641  ,  8.20753   ,  1.5959508 ,
         7.63687   ,  0.9893982 ,  9.183858  ,  2.5997453 ,  2.2063086 ],
       [10.738586  ,  8.464451  ,  4.7766623 ,  8.50733   ,  2.5095427 ,
         7.652453  ,  0.5903145 ,  9.699511  ,  3.0608497 ,  1.8164501 ],
       [10.764558  ,  8.008527  ,  5.3117905 ,  6.945367  ,  1.5039841 ,
         6.959218  ,  0.83099455,  9.958994  ,  3.4660575 ,  2.706539  ],
       [10.238422  ,  8.304144  ,  5.2809787 ,  7.338672  ,  1.6640078 ,
         7.855867  ,  0.52792203,  9.99826   ,  3.1431499 ,  1.5868814 ],
       [12.051455  ,  9.241452  ,  4.1045156 ,  7.296282  ,  1.271073  ,
         7.511548  ,  2.974853  ,  9.398601  ,  3.3590808 ,  3.2355175 ],
       [11.702003  ,  8.836721  ,  4.4399977 ,  8.312136  ,  1.4363387 ,
         8.447865  ,  2.152368  ,  9.543498  

In [56]:
#concatenate new dataframe
from src.dataset import get_modeling_dataframe
df = get_modeling_dataframe(dataset_df, embeddings_sentence, embeddings_reduced)

#get time features
from src.timeinjection import TimeFeatures, Padding
tf = TimeFeatures()
df = tf.get_time_features(df)


#padding
pad = Padding()
df_padded = pad.pad_timelines(df)
df_padded.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['timeline_index'][first_index:last_index] = np.arange(t_id_len)


(100, 1, 13)

In [59]:
if (model_specifics['time_injection_history_tp'] == 'timestamp'):
    path = torch.from_numpy(df_padded[: , : , 2:].astype(float))
else:
    path = torch.from_numpy(df_padded[: , : , 3:].astype(float))
  
if (model_specifics['time_injection_post_tp']== 'timestamp'):
    time_feature = torch.tensor((df[['time_encoding']].values - df['time_encoding'].mean()) / df['time_encoding'].std() )
    post_time = True
elif (model_specifics['time_injection_post_tp']== 'timediff'):
    time_feature = torch.tensor( (df[['time_diff']].values - df['time_diff'].mean()) / df['time_diff'].std()  )
    post_time = True  
else: 
    time_feature = None
    post_time = False

if (model_specifics['post_embedding_tp'] == 'sentence'):
    bert_embeddings = torch.tensor(df[[c for c in df.columns if re.match("^e\w*[0-9]", c)]].values)
else:
    bert_embeddings = None

#calculate paths
from src.dyadic_path import DyadicSignatures

dsig = DyadicSignatures(original_size = df.shape[0], d = path.shape[2], sig_d = model_specifics['signature_dimensions'], \
    intervals = 1/12, k_history= None, embedding_tp = model_specifics['post_embedding_tp'], \
    method = model_specifics['feature_combination_method'], \
    history_tp = model_specifics['signature_tp'] , add_time = post_time)

sig, last_index_dt_all = dsig.compute_signatures(path)
sig_combined = dsig.combine_signatures(sig)
x_data = dsig.create_features(path, sig_combined, last_index_dt_all, bert_embeddings, time_feature)

sig.shape, last_index_dt_all.shape, sig_combined.shape, x_data.shape

ModuleNotFoundError: No module named 'signatory'

Read raw data

In [None]:
# TalkLifeDataset = data_handler.TalkLifeDataset()
# annotations = TalkLifeDataset.return_annotated_timelines(load_from_pickle=False)
# annotations = annotations[annotations['content']!='nan']

# sample_size = annotations.shape[0]
# print(sample_size)
# annotations.head()
# #column format: "timeline_id"	"postid"	"content"	"label"	"datetime"

Define Model

In [None]:
# # ================================
# #model specifics
# model_specifics = {"global_embedding_tp": 'SBERT', #options: SBERT, BERT_cls , BERT_mean, BERT_max
#     "dimensionality_reduction_tp": 'ppapca', #options: ppapca, ppapcappa, umap
#     "dimensionality_reduction_components": 10, # options: any int number between 1 and embedding dimensions
#     "time_injection_history_tp": 'timestamp', #options: timestamp, None
#     "time_injection_post_tp": 'timestamp', #options: timestamp, timediff, None
#     "signature_dimensions": 3, #options: any int number larger than 1
#     "post_embedding_tp": 'sentence', #options: sentence, reduced
#     "feature_combination_method": 'attention', #options concatenation, attention 
#     "signature_tp": 'log', # options: log, sig
#     "classifier_name": 'FFN2hidden', # options: FFN2hidden (any future classifiers added)
#     "classes_num": '3class', #options: 3class (5class to be added in the future)
# }

Post Embeddings, Dimensionality Reduction

In [None]:
# #post embedding
# from embeddings import Representations

# rep = Representations(type = model_specifics['global_embedding_tp'])
# embeddings_sentence = rep.get_embeddings()

# print(embeddings_sentence.shape)

# #dimensionality reduction
# from dimensionality_reduction import DimensionalityReduction

# reduction = DimensionalityReduction(method= model_specifics['dimensionality_reduction_tp'], components=model_specifics['dimensionality_reduction_components'])
# embeddings_reduced = reduction.fit_transform(embeddings_sentence)

# print(embeddings_reduced.shape)

Time features

In [None]:
# #concatenate new dataframe
# from dataset import get_modeling_dataframe
# df = get_modeling_dataframe(annotations, embeddings_sentence, embeddings_reduced)

# #get time features
# from timeinjection import TimeFeatures, Padding
# tf = TimeFeatures()
# df = tf.get_time_features(df)


# #padding
# pad = Padding()
# df_padded = pad.pad_timelines(df)
# df_padded.shape

Dyadic paths and data combination

In [None]:
if (model_specifics['time_injection_history_tp'] == 'timestamp'):
    path = torch.from_numpy(df_padded[: , : , 2:].astype(float))
else:
    path = torch.from_numpy(df_padded[: , : , 3:].astype(float))
  
if (model_specifics['time_injection_post_tp']== 'timestamp'):
    time_feature = torch.tensor((df[['time_encoding']].values - df['time_encoding'].mean()) / df['time_encoding'].std() )
    post_time = True
elif (model_specifics['time_injection_post_tp']== 'timediff'):
    time_feature = torch.tensor( (df[['time_diff']].values - df['time_diff'].mean()) / df['time_diff'].std()  )
    post_time = True  
else: 
    time_feature = None
    post_time = False

if (model_specifics['post_embedding_tp'] == 'sentence'):
    bert_embeddings = torch.tensor(df[[c for c in df.columns if re.match("^e\w*[0-9]", c)]].values)
else:
    bert_embeddings = None

#calculate paths
from dyadic_path import DyadicSignatures

dsig = DyadicSignatures(original_size = df.shape[0], d = path.shape[2], sig_d = model_specifics['signature_dimensions'], \
    intervals = 1/12, k_history= None, embedding_tp = model_specifics['post_embedding_tp'], \
    method = model_specifics['feature_combination_method'], \
    history_tp = model_specifics['signature_tp'] , add_time = post_time)

sig, last_index_dt_all = dsig.compute_signatures(path)
sig_combined = dsig.combine_signatures(sig)
x_data = dsig.create_features(path, sig_combined, last_index_dt_all, bert_embeddings, time_feature)

sig.shape, last_index_dt_all.shape, sig_combined.shape, x_data.shape

K fold cross validation with random seeds in FFN

In [None]:
from sklearn import metrics
import random
from datetime import date
import math

from classification_utils import Folds, set_seed, validation, training, testing
from ffn import FeedforwardNeuralNetModel, FocalLoss

# ================================
save_results = False
# ================================

#GLOBAL MODEL PARAMETERS
input_dim = x_data.shape[1]
hidden_dim = 200 #200
output_dim = 3
dropout_rate = 0.35 #0.35 #higher dropout than 0.25 and specifically 0.35 is very promising
num_epochs = 100
learning_rate = 0.0003 #0.0003 #empirically optimal lr value: 0.0001
gamma = 2 #3 #empirically optimal gamma value: 3
BATCH_SIZE = 64
NUM_folds = 5
patience = 2
weight_decay_adam = 0.0001
RANDOM_SEED_list = [0, 1, 12, 123, 1234]

classifier_params = {"input_dim": input_dim,
  "hidden_dim": hidden_dim,
  "output_dim": output_dim,
  "dropout_rate": dropout_rate,
  "num_epochs": num_epochs,
  "learning_rate": learning_rate,
  "gamma": gamma,
  "BATCH_SIZE": BATCH_SIZE,
  "NUM_folds": NUM_folds,
  "patience": patience,
  "weight_decay_adam": weight_decay_adam,
  "RANDOM_SEED_list": RANDOM_SEED_list,
}
# ================================
model_code_name = model_specifics["global_embedding_tp"]  \
+ "_" + str(model_specifics['dimensionality_reduction_tp']) + str(model_specifics['dimensionality_reduction_components']) \
+ "_" + str(model_specifics['time_injection_history_tp']) + str(model_specifics['time_injection_post_tp']) \
+ "_" + str(model_specifics['post_embedding_tp']) + "_" + str(model_specifics['feature_combination_method']) \
+ "_" + str(model_specifics['signature_tp']) + "_" + str(model_specifics['signature_dimensions']) \
+ "_" + str(model_specifics['classifier_name']) + "_" + str(model_specifics['classes_num']) 

FOLDER_models = '/storage/ttseriotou/pathbert/models/v1/'
FOLDER_results = '/storage/ttseriotou/pathbert/results/v1/'

# ================================
KFolds = Folds(num_folds=NUM_folds)
y_data = KFolds.get_labels(df)
# ================================
#K FOLD RUNS

for my_ran_seed in RANDOM_SEED_list:
    set_seed(my_ran_seed)
    myGenerator = torch.Generator()
    myGenerator.manual_seed(my_ran_seed)    
    for test_fold in range(NUM_folds):

        print('Starting random seed #',my_ran_seed, ' and fold #', test_fold)
        #get ith-fold data
        x_test, y_test, x_valid, y_valid, x_train , y_train, test_tl_ids, test_pids = KFolds.get_splits(df, x_data, y_data, test_fold= test_fold)

        #data loaders with batches
        train = torch.utils.data.TensorDataset( x_train, y_train)
        valid = torch.utils.data.TensorDataset( x_valid, y_valid)
        test = torch.utils.data.TensorDataset( x_test, y_test)

        train_loader = torch.utils.data.DataLoader(dataset=train, batch_size = BATCH_SIZE, shuffle = True)
        valid_loader = torch.utils.data.DataLoader(dataset=valid, batch_size = BATCH_SIZE, shuffle = True)
        test_loader = torch.utils.data.DataLoader(dataset=test, batch_size = BATCH_SIZE, shuffle = True)

        #calculate alpha for focal loss
        alpha_values = torch.Tensor([math.sqrt(1/(y_train[y_train==0].shape[0]/y_train.shape[0])), math.sqrt(1/(y_train[y_train==1].shape[0]/y_train.shape[0])), math.sqrt(1/(y_train[y_train==2].shape[0]/y_train.shape[0]))])

        #early stopping params
        last_metric = 0
        trigger_times = 0
        best_metric = 0

        #model definitions
        model = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim, dropout_rate)
        criterion = FocalLoss(gamma = gamma, alpha = alpha_values)
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay= weight_decay_adam)

        #model train/validation per epoch
        for epoch in range(num_epochs):

            training(model, train_loader, criterion, optimizer, epoch, num_epochs)
        
            # Early stopping
            
            _ , f1_v = validation(model, valid_loader, criterion)
            print('Current Macro F1:', f1_v)

            if f1_v > best_metric :
                best_metric = f1_v

                #test and save so far best model
                predicted_test, labels_test = testing(model, test_loader)

                results = {
                    "model_code_name": model_code_name, 
                    "model_specifics": model_specifics, 
                    "classifier_params": classifier_params, 
                    "date_run": date.today().strftime("%d/%m/%Y"),
                    "test_tl_ids": test_tl_ids,
                    "test_pids": test_pids,
                    "labels": labels_test,
                    "predictions": predicted_test,
                    "test_fold": test_fold,
                    "random_seed": my_ran_seed,
                    "epoch": epoch,
                }

                if (save_results==True):
                    file_name_results = FOLDER_results + model_code_name + "_" + str(my_ran_seed) + "seed" + "_" + str(test_fold) + "fold" +'.pkl'
                    file_name_model = FOLDER_models + model_code_name + "_" + str(my_ran_seed) + "seed" + "_" + str(test_fold) + "fold" +'.pkl'
                    pickle.dump(results, open(file_name_results, 'wb'))
                    torch.save(model.state_dict(), file_name_model)

            if f1_v < last_metric:
                trigger_times += 1
                print('Trigger Times:', trigger_times)

                if trigger_times >= patience:
                    print('Early stopping!')
                    break

            else:
                print('Trigger Times: 0')
                trigger_times = 0

            last_metric = f1_v
     
        

Process Model results

In [None]:
from classification_utils import process_model_results
process_model_results(model_code_name, FOLDER_results)