In [None]:
#@title
# Author: Lou Pemberton (Intern)
# Date: August 2022
# Description: This notebook takes a file as input, creates a Pandas DataFrame, then uses the SRL-BERT model from allennlp to annotate the sentences in a specific column.
# The SRL annotations are then extracted (verbs, arg0, and arg1 annotations, plus the sentence Id and text).
# These extracted SRL annotations are then encoded using BERT before saving to a CSV file to use in embeddings_projector.ipynb.
# THIS NOTEBOOK CAN BE USED INSTEAD OF SRL_Vectors.ipynb IF YOU NEED SENTENCES SRL ANNOTATED PRIOR TO EXTRACTING AND ENCODING. IF YOU ALREADY HAVE SRL ANNOTATED DATA, USE SRL_Vectors.ipynb BEFORE embeddings_projector.ipynb

In [None]:
# import libraries
import pandas as pd
import numpy as np
import os
import json
import glob
import re
import string

In [None]:
# Mount Google Drive inside the Colab runtime so the input file can be read from it
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [None]:
# Create Pandas DataFrame from file
# `filepath` must be set to the CSV location inside the mounted Drive before
# this cell is run; an empty placeholder is rejected with a clear message
# instead of leaving the cell syntactically invalid.
filepath = ""  # enter filepath in GDrive
if not filepath:
    raise ValueError("Set `filepath` to the CSV file in Google Drive before running this cell.")
df = pd.read_csv(filepath)

In [None]:
# Preview the first 40 rows of the loaded data
df.iloc[:40]

In [None]:
# some data wrangling and preprocessing

In [None]:
# Add a sequential 1-based Id column, but only when the data has no Id column
# of its own (an existing Id column is left untouched rather than overwritten).
if 'Id' not in df.columns:
    df['Id'] = range(1, len(df) + 1)

In [None]:
# Move the Id column to the front, keeping the other columns in their current order.
# Selecting 'Id' by name (rather than rotating whichever column is last) keeps this
# correct when Id was not the last column, and makes the cell safe to re-run.
cols = ['Id'] + [col for col in df.columns.tolist() if col != 'Id']
df = df[cols]

In [None]:
# Remove columns that are not needed further on (amend this list as needed)
unused_columns = [
    'publishMonth',
    'Matched Interesting Commercial Actions',
    'topics_ICA_title_num_match',
    'cluster',
]
df = df.drop(columns=unused_columns)

In [None]:
# start of SRL annotations
# installs SRL-BERT model from allennlp and spacy-transformers

#! pip install allennlp==2.1.0 allennlp-models==2.1.0 #uncomment if required
#! pip install git+https://github.com/explosion/spacy-transformers #uncomment if required

In [None]:
# import predictor from allennlp
from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging

In [None]:
# The cells below read the sentences from a column named 'SentenceText',
# so rename the source column accordingly.
column_renames = {'Cluster Representative': 'SentenceText'}
df = df.rename(columns=column_renames)

In [None]:
# Collect the sentences to annotate as a plain Python list
cf_list = df['SentenceText'].tolist()

In [None]:
# Load the pretrained SRL-BERT predictor from the AllenNLP public model archive
SRL_MODEL_URL = "https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz"
predictor = Predictor.from_path(SRL_MODEL_URL)

# Run the predictor on every sentence; results are kept in the same order as cf_list
SemanticRoleLabels = [predictor.predict(text) for text in cf_list]

In [None]:
# AMEND BELOW AS NEEDED - Notice spellings of 'MLPreProcessedSentences' and 'MLPreprocessedSentences' #

In [None]:
# Attach the SRL predictions to the DataFrame, one prediction per row
df = df.assign(SemanticRoleLabels=SemanticRoleLabels)

In [None]:
def _row_as_dict(row):
    """Return one DataFrame row as a dict keyed by column name."""
    return row.to_dict()


# One dict per row of df (including that row's SemanticRoleLabels entry)
MLPreProcessedSentences = df.apply(_row_as_dict, axis='columns')

In [None]:
# Store the per-row dicts in their own DataFrame column
df = df.assign(MLPreProcessedSentences=MLPreProcessedSentences)

In [None]:
# Wrap each row dict in a single-element list, then bind that same list both to
# the variable MLPreprocessedSentences (lower-case "p" in "processed") and to the
# DataFrame column MLPreProcessedSentences (upper-case "P").
wrapped_rows = [[row_dict] for row_dict in df['MLPreProcessedSentences']]
MLPreprocessedSentences = wrapped_rows
df['MLPreProcessedSentences'] = wrapped_rows

In [None]:
# Extract the SRL annotations together with the sentence text and Id, building
# the list final_dataset with one row per verb frame that has both an ARG0 and
# an ARG1 span.
#
# The patterns match the literal role labels. The previous form `[ARG0]+?` was a
# character class (any run of the characters A, R, G, 0), so it could also match
# other labels built from those letters such as `[ARGA: ...]`. Raw strings avoid
# the invalid `\[` escape in a normal string literal.
ARG0_PATTERN = re.compile(r'\[ARG0:(.*?)\]')
ARG1_PATTERN = re.compile(r'\[ARG1:(.*?)\]')

final_dataset = []

for doc in MLPreprocessedSentences:
    for sent_dict in doc:
        sent_text = sent_dict['SentenceText']
        sent_id = sent_dict['Id']
        srl = sent_dict['SemanticRoleLabels']
        # An empty 'verbs' list simply yields no rows for this sentence
        for srl_annotations in srl['verbs']:
            verb_annotation = srl_annotations['verb']
            description_annotation = srl_annotations['description']
            arg0_des = ARG0_PATTERN.findall(description_annotation)
            arg1_des = ARG1_PATTERN.findall(description_annotation)
            if arg0_des and arg1_des:
                # Only the first ARG0/ARG1 span of the frame is kept; strip() drops
                # the space that follows the "ARG0:" / "ARG1:" label in the description.
                final_dataset.append({
                    'sent_id': sent_id,
                    'sentence': sent_text,
                    'verb': verb_annotation,
                    'arg0_des': arg0_des[0].strip(),
                    'arg1_des': arg1_des[0].strip(),
                })

In [None]:
# Put the final_dataset list into a Pandas DataFrame.
# The column list is given explicitly so the frame still has the expected
# columns (in this order) when final_dataset is empty; without it an empty list
# produces a frame with no columns and later column access fails.
FINAL_COLUMNS = ['sent_id', 'sentence', 'verb', 'arg0_des', 'arg1_des']
finaldf = pd.DataFrame(final_dataset, columns=FINAL_COLUMNS)

In [None]:
# Limit the length of the arg0 and arg1 spans: keep only rows where both spans
# have at most MAX_ARG_WORDS whitespace-separated words.
MAX_ARG_WORDS = 3

arg0_is_short = finaldf['arg0_des'].str.split().str.len() <= MAX_ARG_WORDS
arg1_is_short = finaldf['arg1_des'].str.split().str.len() <= MAX_ARG_WORDS

# .copy() makes the filtered result an independent DataFrame, so adding the
# embedding columns to it later does not raise SettingWithCopyWarning.
finaldf = finaldf[arg0_is_short & arg1_is_short].copy()

In [None]:
# Preview the first five extracted rows
finaldf.iloc[:5]

In [None]:
#puts each annotation column into a list
verb_list = list(finaldf.verb)
arg0_list = list(finaldf.arg0_des)
arg1_list = list(finaldf.arg1_des)

In [None]:
# start of BERT embeddings
#!pip install -U sentence-transformers #uncomment if required to install

In [None]:
# Import sentence-transformers and load the encoder used for all embeddings below
from sentence_transformers import SentenceTransformer

EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
bert = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [None]:
# Encode the verbs in slices of batch_size and collect one embedding per verb.
# The printed number is the start index of the slice that just finished.
batch_size = 64
verb_embeddings = []
for start in range(0, len(verb_list), batch_size):
    verb_chunk = verb_list[start:start + batch_size]
    chunk_vectors = bert.encode(verb_chunk, batch_size=batch_size)
    verb_embeddings.extend(chunk_vectors)
    print(f"Batch {start} encoding finished")

Batch 0 encoding finished


In [None]:
# Encode the arg0 spans in slices of batch_size and collect one embedding per span.
# The printed number is the start index of the slice that just finished.
batch_size = 64
agent_embeddings = []
for start in range(0, len(arg0_list), batch_size):
    arg0_chunk = arg0_list[start:start + batch_size]
    chunk_vectors = bert.encode(arg0_chunk, batch_size=batch_size)
    agent_embeddings.extend(chunk_vectors)
    print(f"Batch {start} encoding finished")

Batch 0 encoding finished


In [None]:
# Encode the arg1 spans in slices of batch_size and collect one embedding per span.
# The printed number is the start index of the slice that just finished.
batch_size = 64
theme_embeddings = []
for start in range(0, len(arg1_list), batch_size):
    arg1_chunk = arg1_list[start:start + batch_size]
    chunk_vectors = bert.encode(arg1_chunk, batch_size=batch_size)
    theme_embeddings.extend(chunk_vectors)
    print(f"Batch {start} encoding finished")

Batch 0 encoding finished


In [None]:
# Add one embedding column per annotation type to finaldf
embedding_columns = {
    'verb_embeddings': verb_embeddings,
    'arg0_embeddings': agent_embeddings,
    'arg1_embeddings': theme_embeddings,
}
for column_name, vectors in embedding_columns.items():
    finaldf[column_name] = vectors

In [None]:
# Preview the first five rows, now including the embedding columns
finaldf.iloc[:5]

In [None]:
# save SRL annotations and BERT embeddings to csv #

#finaldf.to_csv(FILE LOCATION GOES HERE)