In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

import os
import pickle
from ast import literal_eval
from itertools import chain

import joblib
import nltk
import numpy as np
import pandas as pd
import sklearn
import sklearn_crfsuite
import torch
from sklearn.metrics import classification_report
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
from transformers import AutoTokenizer, AutoModel


########################### MODEL PARAMETERS ############################
train_to_test_ratio = 0.9 # 10% test and 90% train

bert_model = "vinai/bertweet-base" # "bert-large-uncased"; "roberta-large"


##### DATA TO LOAD ######
# The CSV location is machine specific. It can be overridden without editing the
# notebook by setting the CAUSAL_DATA_PATH environment variable; the original
# absolute path is kept as the fallback so existing setups keep working.
dataPath = os.environ.get(
    "CAUSAL_DATA_PATH",
    "/Users/adrianahne/PhD/causality/Causal-associations-diabetes-twitter/data/cause_effect_sentences_with_IO_tags.csv",
)

# The "tokenized" and "bio_tags" columns are parsed from their string form back
# into Python objects with literal_eval while reading the ';'-separated file.
data = pd.read_csv(dataPath, sep=";", converters={"tokenized":literal_eval, "bio_tags":literal_eval})
print(data.shape)
data.head()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/adrianahne/PhD/causality/Causal-associations-diabetes-twitter/data/cause_effect_sentences_with_IO_tags.csv'

In [2]:
########################### Check if cuda available ############################
print("Cuda available: ", torch.cuda.is_available())
# Keep preferring the second GPU (cuda:1) as before, but only when it exists:
# on a single-GPU machine 'cuda:1' is an invalid device ordinal, so use cuda:0 there.
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')
print("Selected {} for this notebook".format(device))

Cuda available:  False
Selected cpu for this notebook


In [3]:
############ Keep only the sentences annotated with BOTH a cause and an effect ############
# Rows where either the "Cause" or the "Effect" column is missing are discarded.
dataSentFiltered = data.dropna(subset=["Cause", "Effect"])

print(dataSentFiltered.shape)
dataSentFiltered.head()

(2118, 7)


Unnamed: 0,sentence,Intent,Cause,Effect,Causal association,tokenized,bio_tags
0,USER Additionally the medicines are being char...,,medicines are being charged at MRP,costing much higher,1.0,"[USER, Additionally, the, medicines, are, bein...","[O, O, O, I-C, I-C, I-C, I-C, I-C, I-C, O, O, ..."
1,"I hear "" I hate being a diabetic "" .",,diabetic,hate,1.0,"[I, hear, "", I, hate, being, a, diabetic, "", .]","[O, O, O, O, I-E, O, O, I-C, O, O]"
2,"i got lime for my glucose test , was n't that ...",,glucose test,nauseous,1.0,"[i, got, lime, for, my, glucose, test, ,, was,...","[O, O, O, O, O, I-C, I-C, O, O, O, O, O, O, O,..."
3,Sounds like Willow 's blood sugar level is rea...,,blood sugar level is real low,reduce her insulin shots,1.0,"[Sounds, like, Willow, 's, blood, sugar, level...","[O, O, O, O, I-C, I-C, I-C, I-C, I-C, I-C, O, ..."
4,USER I 've always found it too sweet mustvsay ...,,dreaded diabetes,sauces are used sparingly,1.0,"[USER, I, 've, always, found, it, too, sweet, ...","[O, O, O, O, O, O, O, O, O, O, O, O, I-E, I-E,..."


In [4]:
# Use the full filtered set; append .sample(n=100) here for a quick test run only.
trainingDataSample = dataSentFiltered

# Reproducible random split (fixed random_state): a `train_to_test_ratio` fraction
# of the rows goes to `train`, every row whose index was not drawn goes to `test`.
train = trainingDataSample.sample(frac=train_to_test_ratio, random_state=0)
test = trainingDataSample.loc[~trainingDataSample.index.isin(train.index)]
print("Train:", train.shape)
print("Test:", test.shape)
train.head()

Train: (1906, 7)
Test: (212, 7)


Unnamed: 0,sentence,Intent,Cause,Effect,Causal association,tokenized,bio_tags
1445,Benefit to having a child when diabetic : you ...,,blood sugar is low,lollies,1.0,"[Benefit, to, having, a, child, when, diabetic...","[O, O, O, O, O, O, O, O, O, O, O, O, O, I-E, O..."
1476,@USER my father suffering from diabetes INSULI...,,diabetes,suffering,1.0,"[@USER, my, father, suffering, from, diabetes,...","[O, O, O, I-E, O, I-C, O, O, O, O, O, O, O, O,..."
2130,I did n't knew that diabetes can be one of sid...,,Thyroid disorder,diabetes,1.0,"[I, did, n't, knew, that, diabetes, can, be, o...","[O, O, O, O, O, I-E, O, O, O, O, O, O, O, I-C,..."
1320,My blood sugar has been high all evening and m...,,blood sugar has been high,body hurts,1.0,"[My, blood, sugar, has, been, high, all, eveni...","[O, I-C, I-C, I-C, I-C, I-C, O, O, O, O, O, I-..."
2247,"@USER Cool , I spent the holiday talking about...",,struggling,#insulin4all,1.0,"[@USER, Cool, ,, I, spent, the, holiday, talki...","[O, O, O, O, O, O, O, O, O, I-E, O, O, O, I-C, O]"


In [6]:
# Load the pretrained tokenizer and encoder selected by `bert_model`.
# NOTE(review): padding / truncation / max_length / return_offsets_mapping are usually
# arguments of the tokenizer *call*, not of `from_pretrained`. The tokenizer repr shown
# in this notebook reports model_max_len=128 and is_fast=False, so these settings
# presumably have no effect on later `encode` calls — TODO confirm.
tokenizer = AutoTokenizer.from_pretrained(bert_model, padding = "max_length", truncation = True, max_length = 60, return_offsets_mapping=True )
model = AutoModel.from_pretrained(bert_model)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def _find_sublist(haystack, needle, start=0):
    """Return the first index >= start where `needle` occurs contiguously in `haystack`, else None."""
    if not needle:
        return None
    width = len(needle)
    for pos in range(start, len(haystack) - width + 1):
        if haystack[pos:pos + width] == needle:
            return pos
    return None


def get_word_embeddings(sentence, sentence_tokenised, tok=None, encoder=None):
    """Compute one contextual embedding per word of a sentence.

    The whole sentence is encoded once and passed through the encoder. Each word
    of `sentence_tokenised` is then encoded on its own to obtain its sub-token
    ids, these ids are located in the encoded sentence, and the corresponding
    hidden states are averaged into a single vector for the word.

    Words are aligned left to right: the search for a word starts where the
    previous word ended, so a repeated word gets the embedding of its own
    occurrence instead of always the first one.

    Parameters
    ----------
    sentence : str
        Raw sentence text that is fed to the encoder.
    sentence_tokenised : list of str
        The words of the sentence, in order.
    tok : optional
        Tokenizer providing `encode`, `cls_token_id` and `sep_token_id`.
        Defaults to the notebook-level `tokenizer`.
    encoder : optional
        Callable mapping a (1, seq_len) id tensor to a tuple whose first item is
        the (1, seq_len, hidden) hidden states. Defaults to the notebook-level `model`.

    Returns
    -------
    list of torch.Tensor
        One 1-D tensor of size `hidden` per entry of `sentence_tokenised`. A word
        that produces no sub-tokens gets an all-zero vector.

    Raises
    ------
    ValueError
        If a sub-token id of a word does not occur in the encoded sentence.
    """
    tok = tokenizer if tok is None else tok
    encoder = model if encoder is None else encoder

    ids = tok.encode(sentence)
    ids_tensor = torch.tensor(ids).unsqueeze(0) # Batch size: 1
    # Inference only: no autograd graph is needed for feature extraction.
    with torch.no_grad():
        # squeeze(0) removes only the batch dimension -> (seq_len, hidden)
        word_vectors = encoder(ids_tensor)[0].squeeze(0)
    # Read the embedding width from the output instead of assuming 768,
    # so that larger checkpoints work as well.
    hidden_size = word_vectors.shape[-1]

    # get single vector for each word. (Sub words are averaged)
    word_embeddings_all = []
    cursor = 0  # position in `ids` right after the previously matched word
    for word in sentence_tokenised:
        word_encoded = tok.encode(word)
        # we don't want the <CLS> / <SEP> token embeddings
        if word_encoded and word_encoded[0] == tok.cls_token_id:
            word_encoded = word_encoded[1:]
        if word_encoded and word_encoded[-1] == tok.sep_token_id:
            word_encoded = word_encoded[:-1]

        start = _find_sublist(ids, word_encoded, cursor)
        if start is not None:
            word_indices = list(range(start, start + len(word_encoded)))
            cursor = start + len(word_encoded)
        else:
            # Fallback to the previous behaviour (first occurrence of every id);
            # raises ValueError when an id is absent from the sentence encoding.
            word_indices = [ids.index(encoded_id) for encoded_id in word_encoded]

        # average all sub_word vectors of word
        if word_indices:
            word_vector = word_vectors[word_indices].mean(dim=0)
        else:
            # No sub-tokens: avoid a 0/0 division that would yield NaNs.
            word_vector = torch.zeros(hidden_size, dtype=word_vectors.dtype)

        word_embeddings_all.append(word_vector)

    return word_embeddings_all

def word2features(word, i, wordembedding):
    """Build the CRF feature dict of a single token.

    Parameters
    ----------
    word : str
        The token text.
    i : int
        Position of the token in its sentence.
    wordembedding : sequence of numbers (e.g. 1-D torch tensor, numpy array, list)
        Embedding vector of the token; one feature 'v<k>' is emitted per component.

    Returns
    -------
    dict
        Surface features of the token plus the embedding components, the latter
        as plain Python floats.
    """
    features = {
#        'bias': 1.0,
        'word.lower()': word.lower(),
#        'word[-3:]': word[-3:],
#        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.isdigit()': word.isdigit(),
#        'postag': postag,
#        'postag[:2]': postag[:2],
        'wordlength': len(word),
        # slicing instead of word[0] so that an empty token does not raise IndexError
        'wordinitialcap': word[:1].isupper(),
        'wordmixedcap': any(ch.isupper() for ch in word[1:]),
        'wordallcap': all(ch.isupper() for ch in word),
        'distfromsentbegin': i
    }

    # One feature per vector component. Values are converted to plain floats:
    # iterating a torch tensor yields 0-dim tensors, which are not valid numeric
    # feature values for the CRF trainer.
    components = wordembedding.tolist() if hasattr(wordembedding, 'tolist') else wordembedding
    for iv, value in enumerate(components):
        features['v{}'.format(iv)] = float(value)

    return features



def sent2features(sentence, tokenized):
    """Return the list of per-token feature dicts for one sentence.

    The word embeddings of the sentence are computed once with
    `get_word_embeddings`; every token is then combined with its position and
    its embedding through `word2features`.
    """
    embeddings = get_word_embeddings(sentence, tokenized)
    sentence_features = []
    for position, token in enumerate(tokenized):
        sentence_features.append(word2features(token, position, embeddings[position]))
    return sentence_features


def _split_to_xy(frame):
    """Turn one data split into (per-sentence feature lists, per-sentence tag lists)."""
    sentences = frame.sentence.values.tolist()
    token_lists = frame.tokenized.values.tolist()
    feature_seqs = [sent2features(text, tokens) for text, tokens in zip(sentences, token_lists)]
    tag_seqs = list(frame.bio_tags)
    return feature_seqs, tag_seqs


# Featurise the training split and report number of sentences / length of the first one.
X_train, y_train = _split_to_xy(train)
print("X_train:", len(X_train), len(X_train[0]))
print("y_train:", len(y_train), len(y_train[0]))

# Same for the held-out split.
X_test, y_test = _split_to_xy(test)

print("X_test:", len(X_test), len(X_test[0]))
print("y_test:", len(y_test), len(y_test[0]))


In [8]:
# Display the tokenizer's repr to inspect the configuration it was loaded with.
tokenizer

PreTrainedTokenizer(name_or_path='vinai/bertweet-base', vocab_size=64000, model_max_len=128, is_fast=False, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'})

In [None]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=False
)
crf.fit(X_train, y_train)   ### Error message when try to train


In [None]:
# Predict one tag sequence per test sentence with the fitted CRF.
predictions = crf.predict(X_test)

In [None]:
# Flatten the per-sentence tag sequences into one long list each, so that the
# report is computed over individual tokens.
test_true_tag = list(np.concatenate(y_test))
test_predict_tag = list(np.concatenate(predictions))
print(classification_report(test_true_tag, test_predict_tag))

In [None]:
# Token-by-token listing of gold vs. predicted tag, one block per test sentence.
sentence_rows = zip(test.tokenized, test.bio_tags, predictions)
for sent_tokens, gold_tags, predicted_tags in sentence_rows:
    print("\n")
    for token_row in zip(sent_tokens, gold_tags, predicted_tags):
        tok_text, gold, guess = token_row
        print(tok_text, "true:", gold, "predic:", guess)