Libraries

Libraries

In [1]:
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
import pickle
import sys
import re

sys.path.insert(0, "../../timeline_generation/")  # Adds higher directory to python modules path
import data_handler

  from .autonotebook import tqdm as notebook_tqdm


Read raw data

In [None]:
# Load the annotated TalkLife timelines (called with load_from_pickle=False).
# Column format: "timeline_id", "postid", "content", "label", "datetime"
TalkLifeDataset = data_handler.TalkLifeDataset()
annotations = TalkLifeDataset.return_annotated_timelines(load_from_pickle=False)

# Keep only the rows whose content is not the literal string 'nan'.
has_content = annotations['content'] != 'nan'
annotations = annotations[has_content]

# Report how many posts remain, then preview the first rows.
sample_size = len(annotations)
print(sample_size)
annotations.head()

Define Model

In [None]:
# ================================
# Model specifics: one entry per pipeline stage. The valid options for each
# entry are listed in the comment directly above it.
model_specifics = {
    # options: SBERT, BERT_cls, BERT_mean, BERT_max
    "global_embedding_tp": 'SBERT',
    # options: ppapca, ppapcappa, umap
    "dimensionality_reduction_tp": 'ppapca',
    # options: any int number between 1 and embedding dimensions
    "dimensionality_reduction_components": 10,
    # options: timestamp, None
    "time_injection_history_tp": 'timestamp',
    # options: timestamp, timediff, None
    "time_injection_post_tp": 'timestamp',
    # options: any int number larger than 1
    "signature_dimensions": 3,
    # options: sentence, reduced
    "post_embedding_tp": 'sentence',
    # options: concatenation, attention
    "feature_combination_method": 'attention',
    # options: log, sig
    "signature_tp": 'log',
    # options: FFN2hidden (any future classifiers added)
    "classifier_name": 'FFN2hidden',
    # options: 3class (5class to be added in the future)
    "classes_num": '3class',
}

Post Embeddings, Dimensionality Reduction

In [None]:
from embeddings import Representations
from dimensionality_reduction import DimensionalityReduction

# Post embeddings: the representation type comes from the model configuration.
rep = Representations(type=model_specifics['global_embedding_tp'])
embeddings_sentence = rep.get_embeddings()
print(embeddings_sentence.shape)

# Dimensionality reduction: method and component count also come from the
# model configuration; the reducer is fitted on the sentence embeddings.
reduction = DimensionalityReduction(
    method=model_specifics['dimensionality_reduction_tp'],
    components=model_specifics['dimensionality_reduction_components'],
)
embeddings_reduced = reduction.fit_transform(embeddings_sentence)
print(embeddings_reduced.shape)

Time features

In [None]:
from dataset import get_modeling_dataframe
from timeinjection import TimeFeatures, Padding

# Build the modelling dataframe from the annotations and both embedding sets,
# then let TimeFeatures add its time features to it.
tf = TimeFeatures()
df = tf.get_time_features(
    get_modeling_dataframe(annotations, embeddings_sentence, embeddings_reduced)
)

# Pad the timelines and show the shape of the padded result.
pad = Padding()
df_padded = pad.pad_timelines(df)
df_padded.shape

Dyadic paths and data combination

In [None]:
from dyadic_path import DyadicSignatures

# --- History path -----------------------------------------------------------
# Slice the last axis of the padded timelines: keep everything from index 2
# when the history uses the timestamp, otherwise from index 3.
# NOTE(review): presumably index 2 holds the time feature and the remaining
# indices hold the reduced embeddings -- confirm against Padding.pad_timelines.
history_start = 2 if model_specifics['time_injection_history_tp'] == 'timestamp' else 3
path = torch.from_numpy(df_padded[:, :, history_start:].astype(float))

# --- Post-level time feature ------------------------------------------------
# Map the configured injection type to the dataframe column that is
# standardised (zero mean, unit std) and handed to the feature builder.
# Any other setting (including None) disables the post-level time feature.
post_time_columns = {'timestamp': 'time_encoding', 'timediff': 'time_diff'}
post_time_column = post_time_columns.get(model_specifics['time_injection_post_tp'])

if post_time_column is None:
    time_feature = None
    post_time = False
else:
    # Double brackets keep the values 2-D (one column), as in the original code.
    time_feature = torch.tensor(
        (df[[post_time_column]].values - df[post_time_column].mean()) / df[post_time_column].std()
    )
    post_time = True

# --- Post embeddings --------------------------------------------------------
if model_specifics['post_embedding_tp'] == 'sentence':
    # Raw string: "\w" inside a normal string literal is an invalid escape
    # sequence (DeprecationWarning, SyntaxWarning on recent Python versions).
    embedding_column_pattern = re.compile(r"^e\w*[0-9]")
    embedding_columns = [c for c in df.columns if embedding_column_pattern.match(c)]
    bert_embeddings = torch.tensor(df[embedding_columns].values)
else:
    bert_embeddings = None

# --- Signatures and feature combination -------------------------------------
dsig = DyadicSignatures(
    original_size=df.shape[0],
    d=path.shape[2],
    sig_d=model_specifics['signature_dimensions'],
    intervals=1/12,
    k_history=None,
    embedding_tp=model_specifics['post_embedding_tp'],
    method=model_specifics['feature_combination_method'],
    history_tp=model_specifics['signature_tp'],
    add_time=post_time,
)

sig, last_index_dt_all = dsig.compute_signatures(path)
sig_combined = dsig.combine_signatures(sig)
x_data = dsig.create_features(path, sig_combined, last_index_dt_all, bert_embeddings, time_feature)

# Shapes of every intermediate, displayed as the cell output.
sig.shape, last_index_dt_all.shape, sig_combined.shape, x_data.shape

K-fold cross-validation over multiple random seeds with the FFN classifier

In [None]:
from sklearn import metrics
import random
from datetime import date
import math

from classification_utils import Folds, set_seed, validation, training, testing
from ffn import FeedforwardNeuralNetModel, FocalLoss

# ================================
# Set to True to write the results pickle and model weights of each run to disk.
save_results = False
# ================================

#GLOBAL MODEL PARAMETERS
input_dim = x_data.shape[1]
hidden_dim = 200 #200
output_dim = 3
dropout_rate = 0.35 #0.35 #higher dropout than 0.25 and specifically 0.35 is very promising
num_epochs = 100
learning_rate = 0.0003 #0.0003 #empirically optimal lr value: 0.0001
gamma = 2 #3 #empirically optimal gamma value: 3
BATCH_SIZE = 64
NUM_folds = 5
patience = 2
weight_decay_adam = 0.0001
RANDOM_SEED_list = [0, 1, 12, 123, 1234]

# Snapshot of the hyper-parameters, stored alongside every saved result.
classifier_params = {
    "input_dim": input_dim,
    "hidden_dim": hidden_dim,
    "output_dim": output_dim,
    "dropout_rate": dropout_rate,
    "num_epochs": num_epochs,
    "learning_rate": learning_rate,
    "gamma": gamma,
    "BATCH_SIZE": BATCH_SIZE,
    "NUM_folds": NUM_folds,
    "patience": patience,
    "weight_decay_adam": weight_decay_adam,
    "RANDOM_SEED_list": RANDOM_SEED_list,
}
# ================================
# Identifier built from the model configuration; used in the output file names.
model_code_name = "_".join([
    model_specifics["global_embedding_tp"],
    str(model_specifics['dimensionality_reduction_tp']) + str(model_specifics['dimensionality_reduction_components']),
    str(model_specifics['time_injection_history_tp']) + str(model_specifics['time_injection_post_tp']),
    str(model_specifics['post_embedding_tp']),
    str(model_specifics['feature_combination_method']),
    str(model_specifics['signature_tp']),
    str(model_specifics['signature_dimensions']),
    str(model_specifics['classifier_name']),
    str(model_specifics['classes_num']),
])

# NOTE(review): absolute, machine-specific output locations -- consider making
# these configurable before sharing the notebook.
FOLDER_models = '/storage/ttseriotou/pathbert/models/v1/'
FOLDER_results = '/storage/ttseriotou/pathbert/results/v1/'

# ================================
KFolds = Folds(num_folds=NUM_folds)
y_data = KFolds.get_labels(df)
# ================================
#K FOLD RUNS
# Outer loop: one complete K-fold run per random seed.
# Inner loop: each fold takes a turn as the held-out test fold.

for my_ran_seed in RANDOM_SEED_list:
    set_seed(my_ran_seed)
    # Dedicated, explicitly seeded generator that drives the shuffling of the
    # training batches (it is passed to the training DataLoader below).
    myGenerator = torch.Generator()
    myGenerator.manual_seed(my_ran_seed)
    for test_fold in range(NUM_folds):

        print('Starting random seed #',my_ran_seed, ' and fold #', test_fold)
        #get ith-fold data
        x_test, y_test, x_valid, y_valid, x_train , y_train, test_tl_ids, test_pids = KFolds.get_splits(df, x_data, y_data, test_fold= test_fold)

        #data loaders with batches
        train = torch.utils.data.TensorDataset(x_train, y_train)
        valid = torch.utils.data.TensorDataset(x_valid, y_valid)
        test = torch.utils.data.TensorDataset(x_test, y_test)

        # Only the training data is shuffled. Validation and test batches keep
        # the dataset order so that the stored predictions/labels stay aligned
        # with test_tl_ids / test_pids (assumes `testing` returns its outputs
        # in loader order -- confirm in classification_utils).
        train_loader = torch.utils.data.DataLoader(dataset=train, batch_size=BATCH_SIZE, shuffle=True, generator=myGenerator)
        valid_loader = torch.utils.data.DataLoader(dataset=valid, batch_size=BATCH_SIZE, shuffle=False)
        test_loader = torch.utils.data.DataLoader(dataset=test, batch_size=BATCH_SIZE, shuffle=False)

        #calculate alpha for focal loss
        # One weight per class: sqrt(1 / relative frequency of the class in the
        # training labels), so rarer classes get larger weights.
        num_train = y_train.shape[0]
        alpha_values = torch.Tensor([
            math.sqrt(1 / (y_train[y_train == class_id].shape[0] / num_train))
            for class_id in range(output_dim)
        ])

        #early stopping params
        last_metric = 0    # validation macro F1 of the previous epoch
        trigger_times = 0  # consecutive epochs in which the F1 dropped
        best_metric = 0    # best validation macro F1 seen so far in this run

        #model definitions
        model = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim, dropout_rate)
        criterion = FocalLoss(gamma = gamma, alpha = alpha_values)
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay= weight_decay_adam)

        #model train/validation per epoch
        for epoch in range(num_epochs):

            training(model, train_loader, criterion, optimizer, epoch, num_epochs)

            # Early stopping
            # Only the second value returned by `validation` is used; it is
            # reported below as the macro F1.
            _ , f1_v = validation(model, valid_loader, criterion)
            print('Current Macro F1:', f1_v)

            if f1_v > best_metric:
                best_metric = f1_v

                #test and save so far best model
                predicted_test, labels_test = testing(model, test_loader)

                results = {
                    "model_code_name": model_code_name,
                    "model_specifics": model_specifics,
                    "classifier_params": classifier_params,
                    "date_run": date.today().strftime("%d/%m/%Y"),
                    "test_tl_ids": test_tl_ids,
                    "test_pids": test_pids,
                    "labels": labels_test,
                    "predictions": predicted_test,
                    "test_fold": test_fold,
                    "random_seed": my_ran_seed,
                    "epoch": epoch,
                }

                if save_results:
                    # Results and weights share the same file name and differ
                    # only in the folder they are written to.
                    run_file_name = model_code_name + "_" + str(my_ran_seed) + "seed" + "_" + str(test_fold) + "fold" +'.pkl'
                    file_name_results = FOLDER_results + run_file_name
                    file_name_model = FOLDER_models + run_file_name
                    # Context manager closes the file handle even if dumping fails.
                    with open(file_name_results, 'wb') as results_file:
                        pickle.dump(results, results_file)
                    torch.save(model.state_dict(), file_name_model)

            # Stop once the validation F1 has dropped below the previous
            # epoch's value for `patience` consecutive epochs.
            if f1_v < last_metric:
                trigger_times += 1
                print('Trigger Times:', trigger_times)

                if trigger_times >= patience:
                    print('Early stopping!')
                    break

            else:
                print('Trigger Times: 0')
                trigger_times = 0

            last_metric = f1_v
     
        

Process Model results

In [None]:
# Post-process the results for this model configuration.
# NOTE(review): presumably this reads the result pickles that the training cell
# writes under FOLDER_results, which only happens when save_results is True --
# confirm, since with save_results = False there may be nothing (or stale
# files from an earlier run) to process.
from classification_utils import process_model_results
process_model_results(model_code_name, FOLDER_results)