# **1. Import Libraries**

In [1]:
import json
import os
import pprint
from collections import Counter

import gensim.downloader as api
import numpy as np
import pandas as pd
import spacy
from joblib import load, dump
from scipy.sparse import csr_matrix, hstack
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, make_scorer, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, MaxAbsScaler
from sklearn.svm import SVC

# **2. Load Dataset**

In [2]:
#clone fewrel repository
!git clone https://github.com/thunlp/FewRel
#change directory
%cd FewRel

Cloning into 'FewRel'...
remote: Enumerating objects: 565, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 565 (delta 1), reused 0 (delta 0), pack-reused 559[K
Receiving objects: 100% (565/565), 24.68 MiB | 7.01 MiB/s, done.
Resolving deltas: 100% (340/340), done.
/content/FewRel


In [3]:
#load training data
#read explicitly as UTF-8 so the result does not depend on the platform's default encoding
with open('./data/train_wiki.json', 'r', encoding='utf-8') as file:
    fewrel = json.load(file)

# **3. Data Preprocessing**

## Set up Data & Environment

In [4]:
#set up nlp environment
#load the spaCy 'en_core_web_sm' model once; the custom transformers in this notebook
#call this module-level `nlp` object directly
nlp = spacy.load('en_core_web_sm')

In [5]:
#build two parallel lists: the sentence of every instance and the relation id it is filed under
texts, relations = [], []
for relation_id, examples in fewrel.items():
    #rebuild each sentence by joining its tokens with single spaces
    texts.extend(' '.join(example['tokens']) for example in examples)
    #repeat the relation id once per instance so both lists stay aligned
    relations.extend([relation_id] * len(examples))

In [6]:
#flatten the {relation: [instances]} mapping into one list, keeping the dictionary order
samples = [sample for group in fewrel.values() for sample in group]

#entity surface text is the first element of the 'h' / 't' entries
heads = [sample['h'][0] for sample in samples]
tails = [sample['t'][0] for sample in samples]
#only the first index list stored under ['h'][2] / ['t'][2] is used as the entity span
h_seq = [sample['h'][2][0] for sample in samples]
t_seq = [sample['t'][2][0] for sample in samples]
#raw token list of every sentence
tokens = [sample['tokens'] for sample in samples]

In [7]:
#assemble the extracted lists into one dataframe (one row per instance)
features = dict(
    head=heads,
    tail=tails,
    token=tokens,
    text=texts,
    h_seq=h_seq,
    t_seq=t_seq,
    relation=relations,
)
fewrel_df = pd.DataFrame(features)

## Named Entity Recognition

In [9]:
#create a class of NER count transformer for the pipeline
class ner_count_transformer(BaseEstimator, TransformerMixin):
    """Count named-entity labels found in the 'head' and 'tail' strings.

    fit collects the sorted list of entity labels that the module-level `nlp`
    pipeline assigns to any head or tail string; transform then emits, for
    every row, one 'H_<label>' and one 'T_<label>' count per collected label.
    """

    def fit(self, X, y=None):
        """Learn the NER label vocabulary from the 'head' and 'tail' columns of X."""
        seen_labels = set()

        for head_text, tail_text in zip(X['head'], X['tail']):
            #head is processed before tail, one spaCy call per string
            for phrase in (head_text, tail_text):
                seen_labels.update(ent.label_ for ent in nlp(phrase).ents)

        #sorted so the column order produced by transform is stable
        self.unique_ner_labels = sorted(seen_labels)
        return self

    def transform(self, X):
        """Return a DataFrame of per-row label counts (all H_ columns, then all T_ columns)."""
        records = []

        for head_text, tail_text in zip(X['head'], X['tail']):
            head_counts = Counter(ent.label_ for ent in nlp(head_text).ents)
            tail_counts = Counter(ent.label_ for ent in nlp(tail_text).ents)

            #labels learned in fit but absent from this row are recorded as 0
            record = {}
            for label in self.unique_ner_labels:
                record[f'H_{label}'] = head_counts.get(label, 0)
            for label in self.unique_ner_labels:
                record[f'T_{label}'] = tail_counts.get(label, 0)

            records.append(record)

        return pd.DataFrame(records)

## Part of Speech

In [10]:
#create a class of POS Tag count transformer for the pipeline
class pos_count_transformer(BaseEstimator, TransformerMixin):
    """Count the part-of-speech tags of the head and tail entity tokens.

    fit collects the sorted list of POS tags that the module-level `nlp`
    pipeline assigns to any token of the 'text' column; transform then emits,
    for every row, one 'H_<tag>' and one 'T_<tag>' count per collected tag.

    NOTE(review): h_seq / t_seq are applied to spaCy's own tokenisation of
    `text`; if they were produced by a different tokeniser the positions may
    not line up exactly - confirm against the data source.
    """

    def fit(self, X, y=None):
        """Learn the POS tag vocabulary from the 'text' column of X."""
        unique_tags = set()

        for text in X['text']:
            #every token of the sentence is visited, which already covers the
            #head and tail positions, so no separate pass over h_seq / t_seq is needed
            unique_tags.update(token.pos_ for token in nlp(text))

        #sorted so the column order produced by transform is stable
        self.unique_tags = sorted(unique_tags)
        return self

    def transform(self, X):
        """Return a DataFrame of per-row tag counts (all H_ columns, then all T_ columns)."""
        pos_features_list = []

        for text, h_seq, t_seq in zip(X['text'], X['h_seq'], X['t_seq']):
            doc = nlp(text)
            doc_length = len(doc)

            #positions beyond the parsed sentence are skipped instead of raising
            #IndexError, matching the bounds check the original fit applied
            head_pos_count = Counter(doc[i].pos_ for i in h_seq if i < doc_length)
            tail_pos_count = Counter(doc[i].pos_ for i in t_seq if i < doc_length)

            #tags learned in fit but absent from this row are recorded as 0
            combined_counts = {f'H_{tag}': head_pos_count.get(tag, 0) for tag in self.unique_tags}
            combined_counts.update({f'T_{tag}': tail_pos_count.get(tag, 0) for tag in self.unique_tags})

            pos_features_list.append(combined_counts)

        #convert list of dictionaries to a DataFrame
        return pd.DataFrame(pos_features_list)

## Dependency

In [11]:
#create a class of dependency count transformer for the pipeline
class dependency_count_transformer(BaseEstimator, TransformerMixin):
    """Count the dependency labels of the head and tail entity tokens.

    fit collects the sorted list of dependency labels that the module-level
    `nlp` pipeline assigns to any token of the 'text' column; transform then
    emits, for every row, one 'H_<dep>' and one 'T_<dep>' count per label.
    """

    def fit(self, X, y=None):
        """Learn the dependency-label vocabulary from the 'text' column of X."""
        seen_deps = set()

        for sentence in X['text']:
            seen_deps.update(token.dep_ for token in nlp(sentence))

        #sorted so the column order produced by transform is stable
        self.unique_deps = sorted(seen_deps)
        return self

    def transform(self, X):
        """Return a DataFrame of per-row label counts (all H_ columns, then all T_ columns)."""
        records = []

        for sentence, head_positions, tail_positions in zip(X['text'], X['h_seq'], X['t_seq']):
            parsed = nlp(sentence)

            #tally the label of every token position listed for the head, then the tail
            head_counts = Counter()
            for position in head_positions:
                head_counts[parsed[position].dep_] += 1

            tail_counts = Counter()
            for position in tail_positions:
                tail_counts[parsed[position].dep_] += 1

            #labels learned in fit but absent from this row are recorded as 0
            record = {}
            for dep in self.unique_deps:
                record[f'H_{dep}'] = head_counts.get(dep, 0)
            for dep in self.unique_deps:
                record[f'T_{dep}'] = tail_counts.get(dep, 0)

            records.append(record)

        return pd.DataFrame(records)

## Distance

In [12]:
#create a class of distance calculation transformer for the pipeline
class distance_transformer(BaseEstimator, TransformerMixin):
    """Compute how many tokens separate the head span from the tail span.

    transform reads the 'h_seq' and 't_seq' columns (lists of token positions)
    and returns a single-column DataFrame named 'distance'.
    """

    def fit(self, X, y=None):
        #distance does not require learning anything from the training data
        return self  #return self to allow pipeline

    @staticmethod
    def _gap(h_seq, t_seq):
        """Return the number of tokens strictly between the two spans.

        Overlapping spans give 0. A missing (empty) span also gives 0: the
        previous code left the value undefined in that case, which either
        raised on the first row or silently reused the previous row's distance.
        """
        if not (h_seq and t_seq):
            return 0

        head_start, head_end = min(h_seq), max(h_seq)
        tail_start, tail_end = min(t_seq), max(t_seq)

        if head_end < tail_start:  #head comes before tail
            return tail_start - head_end - 1  #adjacent spans have distance 0
        if tail_end < head_start:  #tail comes before head
            return head_start - tail_end - 1
        return 0  #overlapping spans

    def transform(self, X):
        """Return a DataFrame with one 'distance' value per row of X."""
        distances = [self._gap(h_seq, t_seq) for h_seq, t_seq in zip(X['h_seq'], X['t_seq'])]

        #convert the list of distances to a DataFrame
        return pd.DataFrame(distances, columns=['distance'])

## Word Embeddings

In [13]:
#create a class of word embedding transformer for the pipeline
class word_embedding_transformer(BaseEstimator, TransformerMixin):
    """Represent the head and tail phrases by averaged word vectors.

    Parameters
    ----------
    word_vectors : mapping-like vector store
        Must support `word in word_vectors`, `word_vectors[word]` and expose
        `vector_size` (these are the only members used here).

    transform returns a DataFrame with 2 * vector_size columns named
    'embedding_<i>': the averaged head vector followed by the averaged tail vector.
    """

    def __init__(self, word_vectors):
        self.word_vectors = word_vectors

    def get_vector(self, word):
        """Return the vector of `word`, or a zero vector when it is not in the store."""
        #the lookup uses the word exactly as given (no case normalisation is applied)
        return self.word_vectors[word] if word in self.word_vectors else np.zeros(self.word_vectors.vector_size)

    def get_avg_vector(self, phrase):
        """Return the mean vector of the whitespace-separated words of `phrase`."""
        vectors = [self.get_vector(word) for word in phrase.split()]
        #an empty phrase has no words to average, so it maps to the zero vector
        return np.mean(vectors, axis=0) if vectors else np.zeros(self.word_vectors.vector_size)

    def fit(self, X, y=None):
        #word embeddings does not require learning anything from the training data
        return self

    def transform(self, X):
        """Return the concatenated head/tail embeddings for every row of X."""
        word_embed = [
            np.concatenate([self.get_avg_vector(head_text), self.get_avg_vector(tail_text)])
            for head_text, tail_text in zip(X['head'], X['tail'])
        ]

        #the width is known from the vector store, so the column names no longer depend
        #on the first row existing (an empty X previously raised IndexError)
        feature_names = [f'embedding_{i}' for i in range(2 * self.word_vectors.vector_size)]
        return pd.DataFrame(word_embed, columns=feature_names)


## TF-IDF

In [14]:
#TF-IDF vectorizer for the sentence text: English stop words removed, at most 10000 features kept
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,
    stop_words='english',
)

## Scaling

In [15]:
#scale using MaxAbsScaler
#this instance is used as the 'scale' step of the pipeline, right after feature extraction
scaler = MaxAbsScaler()

# **4. Modelling**

## Training

In [22]:
#model inputs: every column except the raw token list and the label ('text' is kept for TF-IDF)
X = fewrel_df.drop(columns=['token', 'relation'])
#target: the relation id of each instance
y = fewrel_df['relation']

In [23]:
#hold out 20% of the rows as a test set; stratifying on y keeps the relation labels
#in the same proportion in both splits, and random_state=42 makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
#route each group of columns to its own feature extractor; columns not listed are dropped
column_transformer = ColumnTransformer(
    transformers=[
        #entity-label counts from the head / tail strings
        ('ner', ner_count_transformer(), ['head', 'tail']),
        #POS-tag and dependency-label counts at the head / tail positions of the sentence
        ('pos', pos_count_transformer(), ['text', 'h_seq', 't_seq']),
        ('dependency', dependency_count_transformer(), ['text', 'h_seq', 't_seq']),
        #token distance between the two entity spans
        ('distance', distance_transformer(), ['h_seq', 't_seq']),
        #averaged word vectors of head and tail, loaded through gensim's downloader
        (
            'word_embedding',
            word_embedding_transformer(word_vectors=api.load('glove-wiki-gigaword-100')),
            ['head', 'tail'],
        ),
        #TF-IDF takes the single 'text' column (a column name, not a list of names)
        ('tf-idf', tfidf_vectorizer, 'text'),
    ],
    remainder='drop',
)



In [None]:
#assemble the relation-extraction pipeline: feature extraction -> scaling -> SVM classifier
pipeline = Pipeline(steps=[
    ('feature_extraction', column_transformer),
    ('scale', scaler),
    #probability=True enables soft classification, so the prediction probability can be read
    ('svm_model', SVC(kernel='rbf', probability=True, random_state=42)),
])

In [None]:
#train the model: fits every pipeline step (feature extraction, scaling, SVM) on the training split
pipeline.fit(X_train, y_train)

## Prediction on Training Data

In [None]:
#predict on the training set (the same rows the pipeline was fitted on)
y_pred_train = pipeline.predict(X_train)

In [None]:
#evaluate the model: per-relation precision, recall, F1 and support on the training set
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

       P1001       1.00      1.00      1.00         7
        P101       1.00      1.00      1.00         7
        P102       1.00      1.00      1.00         7
        P105       1.00      1.00      1.00         7
        P106       1.00      1.00      1.00         7
        P118       1.00      1.00      1.00         7
        P123       1.00      1.00      1.00         7
        P127       1.00      1.00      1.00         7
       P1303       1.00      1.00      1.00         7
        P131       1.00      1.00      1.00         7
       P1344       1.00      1.00      1.00         7
       P1346       1.00      1.00      1.00         7
        P135       1.00      1.00      1.00         7
        P136       1.00      1.00      1.00         7
        P137       1.00      1.00      1.00         7
        P140       1.00      1.00      1.00         7
       P1408       1.00      1.00      1.00         7
       P1411       1.00    

## Prediction on Held-Out Data

In [24]:
#predict on the held-out test set (rows not used for fitting)
y_pred_test = pipeline.predict(X_test)

In [25]:
#evaluate the model: per-relation precision, recall, F1 and support on the held-out test set
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

       P1001       0.77      0.82      0.80       140
        P101       0.77      0.76      0.77       140
        P102       0.96      0.94      0.95       140
        P105       0.99      0.99      0.99       140
        P106       0.95      0.90      0.92       140
        P118       0.96      0.96      0.96       140
        P123       0.63      0.70      0.66       140
        P127       0.48      0.48      0.48       140
       P1303       0.97      0.99      0.98       140
        P131       0.56      0.62      0.59       140
       P1344       0.95      0.96      0.96       140
       P1346       0.73      0.82      0.77       140
        P135       0.83      0.94      0.88       140
        P136       0.91      0.85      0.88       140
        P137       0.72      0.79      0.75       140
        P140       0.92      0.96      0.94       140
       P1408       0.89      0.96      0.92       140
       P1411       0.99    

# **5. Hyper-Parameter Tuning**

In [None]:
#hyper-parameter grid for the 'svm_model' step of the pipeline
#only a few values are combined due to computational limitation
param_grid = dict(
    svm_model__C=[0.1, 1],
    svm_model__kernel=['linear', 'rbf'],
)

#3-fold stratified cross-validation, shuffled with a fixed seed
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
#candidates are compared by weighted F1
f1_scorer = make_scorer(f1_score, average='weighted')

#initialise grid search (every combination in param_grid is evaluated)
gs = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=skf,
    verbose=2,
)

In [None]:
#fit the grid search (gs is a GridSearchCV, not a random search) to the training data
gs.fit(X_train, y_train)

In [None]:
#best hyper-parameters found by the grid search
print('Best Hyper-Parameters:', gs.best_params_)
#best f1 score (gs was configured with the weighted-F1 scorer)
print('Best F1-Score:', gs.best_score_)
#best model: the estimator exposed by the grid search as best_estimator_
best_svm = gs.best_estimator_

In [None]:
#save the model
#NOTE(review): this serialises `pipeline`, not the tuned `best_svm` taken from the grid search;
#confirm which of the two is meant to be saved
dump(pipeline, 'svm_fewrel_pipeline.pkl')

#download
#the google.colab import means this cell only runs inside Google Colab
from google.colab import files
files.download('svm_fewrel_pipeline.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **6. Real-Time Test**

In [16]:
#input new sentence and its head and tail
#each variable is a list, so position i of head, tail and text together form one example
head = ['margaret ekpo international airport']
tail = ['calabar']
text = ['Nearby margaret airport include Akwa Ibom Airport at Okobo and Margaret Ekpo International Airport in Calabar .']

In [17]:
#clone fewrel repository
!git clone https://github.com/thunlp/FewRel
#change directory
%cd FewRel
#load relation explanation
with open('./data/pid2name.json', 'r') as file:
    relation = json.load(file)

Cloning into 'FewRel'...
remote: Enumerating objects: 565, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 565 (delta 1), reused 0 (delta 0), pack-reused 559[K
Receiving objects: 100% (565/565), 24.68 MiB | 14.92 MiB/s, done.
Resolving deltas: 100% (340/340), done.
/content/FewRel/FewRel


In [18]:
#load pipeline.pkl
#mount Google Drive (Colab only) so the saved pipeline file can be read from it
from google.colab import drive
drive.mount('/content/drive')

#NOTE(review): joblib's load unpickles the file, so only load a file you trust; presumably the
#custom transformer classes must already be defined in this session for loading to succeed - confirm
model_path = '/content/drive/My Drive/svm_fewrel_pipeline.pkl'
pipeline = load(model_path)

Mounted at /content/drive


In [19]:
#set up environment
#load the spaCy 'en_core_web_sm' model; prediction_re below uses this module-level `nlp`
#to tokenise the sentence and the head / tail phrases
nlp = spacy.load('en_core_web_sm')

#helper: locate a phrase inside a tokenised sentence
def _first_span(doc_words, phrase_words):
    """Return the token positions of the first occurrence of phrase_words in doc_words.

    An empty list is returned when the phrase is empty or does not occur.
    """
    width = len(phrase_words)
    if width == 0:
        return []
    for start in range(len(doc_words) - width + 1):
        if doc_words[start:start + width] == phrase_words:
            return list(range(start, start + width))  #only the first occurrence is used
    return []


#helper: load the pipeline stored on Google Drive (Colab only)
def _load_saved_pipeline():
    """Mount Google Drive and return the pipeline saved as svm_fewrel_pipeline.pkl."""
    from google.colab import drive
    drive.mount('/content/drive')

    #NOTE: load() unpickles the file, which can execute code - only load a file you trust
    model_path = '/content/drive/My Drive/svm_fewrel_pipeline.pkl'
    return load(model_path)


#define a function to predict relation extraction in real time
#make sure the required libraries and relation dataset are imported
def prediction_re(head, tail, text, model=None):
    """Predict and print the relation between each head/tail pair in its sentence.

    Parameters
    ----------
    head, tail, text : str or list of str
        Head entity, tail entity and sentence. Lists describe several examples
        (position i of each list forms one example); a plain string is treated
        as a single example.
    model : fitted pipeline, optional
        Object exposing `predict_proba` and `classes_`. When omitted, the
        pipeline stored on Google Drive is mounted and loaded on every call
        (the original behaviour); pass an already loaded pipeline to avoid that.

    Returns
    -------
    None. One message per example is printed, with the predicted relation id,
    its name and description (read from the module-level `relation` mapping)
    and the predicted probability.

    Raises
    ------
    ValueError
        If a head or tail phrase cannot be found in its sentence. Matching is
        case-insensitive and done on the tokens produced by the module-level `nlp`.
    """
    #accept a single example given as plain strings as well as lists of examples
    if isinstance(head, str):
        head = [head]
    if isinstance(tail, str):
        tail = [tail]
    if isinstance(text, str):
        text = [text]

    #convert the input to a dataframe
    df = pd.DataFrame({'head': head, 'tail': tail, 'text': text})

    #extract head and tail sequences (token positions inside the sentence)
    h_seqs, t_seqs = [], []
    for head_text, tail_text, sentence in zip(df['head'], df['tail'], df['text']):
        #lowercase token strings so the comparison is case-insensitive
        doc_words = [token.text.lower() for token in nlp(sentence)]
        head_words = [token.text.lower() for token in nlp(head_text)]
        tail_words = [token.text.lower() for token in nlp(tail_text)]

        h_seq = _first_span(doc_words, head_words)
        t_seq = _first_span(doc_words, tail_words)

        #an entity that is not in the sentence would yield an empty span and break
        #(or silently distort) the positional features, so fail with a clear message
        for role, phrase, span in (('head', head_text, h_seq), ('tail', tail_text, t_seq)):
            if not span:
                raise ValueError(f"{role} entity {phrase!r} was not found in the text {sentence!r}")

        h_seqs.append(h_seq)
        t_seqs.append(t_seq)

    df['h_seq'] = h_seqs
    df['t_seq'] = t_seqs

    #load stored pipeline only when the caller did not supply one
    if model is None:
        model = _load_saved_pipeline()

    #use predict_proba to get the probability of the prediction
    y_pred_proba = model.predict_proba(df)
    max_probabilities = np.max(y_pred_proba, axis=1)
    #map the most probable column of every row back to its relation id
    predicted_labels = model.classes_[np.argmax(y_pred_proba, axis=1)]

    for i, (label, probability) in enumerate(zip(predicted_labels, max_probabilities)):
        name, description = relation[label][0], relation[label][1]
        print(
            f"The relation of '{head[i]}' and '{tail[i]}' in the sentence \n'{text[i]}' is '{label}' "
            f"\n -Relation Name: {name} \n -Description: {description} \nwith probability {probability:.2f}"
        )

In [20]:
#run the prediction on the example defined above; the result is printed by the function
prediction_re(head, tail, text)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
The relation of 'margaret ekpo international airport' and 'calabar' in the sentence 
'Nearby margaret airport include Akwa Ibom Airport at Okobo and Margaret Ekpo International Airport in Calabar .' is 'P931' 
 -Relation Name: place served by transport hub 
 -Description: territorial entity or entities served by this transport hub (airport, train station, etc.) 
with probability 1.00
