In [None]:
# optional: run if not already installed
!pip install numpy
!pip install scikit-learn
!pip install nltk
!pip install pandas
!pip install gensim
!pip install spacy

In [1]:
# import stuff
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import f1_score
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk import pos_tag
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from nltk.corpus import wordnet as wn
from collections import defaultdict
from nltk.corpus import stopwords
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.sparse import csr_matrix
from sklearn.preprocessing import StandardScaler
import re
import spacy
nlp = spacy.load("en_core_web_sm")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Clint\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Clint\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Clint\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Clint\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  _C._set_default_tensor_type(t)


In [2]:
# download pretrained word2vec model
# !!might take a while!!
import gensim.downloader as api
word2vec = api.load("word2vec-google-news-300")

## Prepare Data

In [3]:

def read_file(filename):
    """
    Read and return the lines from a file.

    Parameters:
    filename: The path to the file to be read.

    Returns:
    A list of lines read from the file.
    """
    with open(filename, 'r',  encoding='utf-8') as file:
        lines = [line.strip() for line in file.readlines()]
        return lines
    
    
def create_relation_dict(line):
    """
    Generate a dictionary of entity relations from a line.

    Parameters:
    line: A string containing triplets of (entity, related entity, relation) separated by '|'.
    e.g. Chromecast ; Google ; manufacturer | Chromecast ; Google ; developer

    Returns:
    A dictionary where keys are tuples of the entities in the triplet and values are lists of relations between them.
    e.g. (Chromecast, Google) : [manufacturer, developer]
    """
    relation = {}
    triplets = line.split('|') # split triplets
    for triplet in triplets:
        items = triplet.strip().split(';') # split entity relations
        key = tuple([item.strip() for item in items[:2]]) # make a key out of a tuple of the entities
        value = items[2].strip() # get relation
        if key in relation:
            relation[key].append(value)
        else:
            relation[key] = [value]
    return relation

def read_triplets(filename):
    """
    Read triplets from a file and create relation dictionaries.

    Parameters:
    filename: The path to the file containing triplets.

    Returns:
    A list of relation dictionaries created from the triplets in the file.
    """
    lines = read_file(filename)
    return [create_relation_dict(line) for line in lines]

def extract_X_Y(sentences, relation_dicts):
    """
    Generate X and Y data from sentences and relation dictionaries.

    Parameters:
    sentences: A list of sentences.
    relation_dicts: A list of relation dictionaries.

    Returns:
    tuple: A tuple containing X data (input data) and Y data (output data) in the form of numpy arrays, 
    along with the MultiLabelBinarizer object for Y data.

    X data: Each row represents a combination of a sentence and a key from the relation dictionary.
    Y data: Each row represents the corresponding values from the relation dictionary for the respective sentence and key.

    """
    if len(sentences) != len(relation_dicts):
        raise Exception("sentences and relation files do not match")
    
    X, Y = [], []
    for i in range(len(sentences)):
        sentence = sentences[i]
        for key in relation_dicts[i].keys():
            X.append([sentence]  + list(key)) # lines in X of the format [sentence, entity, related entity]
            Y.append(relation_dicts[i][key])
    # Convert list of target labels to binary matrix, where each column represents whether that label is applicable or not 
    mlb = MultiLabelBinarizer()
    Y = mlb.fit_transform(Y)
    return np.array(X), np.array(Y), mlb


In [4]:
# look at possible labels
labels = read_file("./datasets/relations.txt")
labels

['product/material produced',
 'manufacturer',
 'distributed by',
 'industry',
 'position held',
 'original broadcaster',
 'owned by',
 'founded by',
 'distribution format',
 'headquarters location',
 'stock exchange',
 'currency',
 'parent organization',
 'chief executive officer',
 'director/manager',
 'owner of',
 'operator',
 'member of',
 'employer',
 'chairperson',
 'platform',
 'subsidiary',
 'legal form',
 'publisher',
 'developer',
 'brand',
 'business division',
 'location of formation',
 'creator']

In [5]:
# Read Training data

train_sentences = read_file("./datasets/train.sent")
train_relation_dicts = read_triplets("./datasets/train.tup")

# Extract X and Y from training data files
X_train, Y_train, mlb = extract_X_Y(sentences=train_sentences, relation_dicts=train_relation_dicts)

# Print some information about the data
print("labels:", len(labels), "sentences:", len(train_sentences), "relation_dicts:", len(train_relation_dicts))
print("X_train shape:", X_train.shape, "Example X_train data:", X_train[0])
print("Y_train shape:", Y_train.shape, "Example Y_train data:", Y_train[0])


labels: 29 sentences: 5700 relation_dicts: 5700
X_train shape: (7070, 3) Example X_train data: ['NEW YORK (Reuters) - Apple Inc Chief Executive Steve Jobs sought to soothe investor concerns about his health on Monday, saying his weight loss was caused by a hormone imbalance that is relatively simple to treat.'
 'Apple Inc' 'Steve Jobs']
Y_train shape: (7070, 29) Example Y_train data: [0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


## Feature Engineering

#### 1. Preprocessing

In [6]:
def get_tokens_between_entities(row):
    """
    Calculate the tokens between two entities in a sentence.

    Parameters:
    row: A list containing the sentence and two entities.

    Returns:
    The tokens between the two entities.
    """
    sentence_tokens = word_tokenize(row[0])
    entity1_tokens = word_tokenize(row[1])
    entity2_tokens = word_tokenize(row[2])
    if sentence_tokens.index(entity1_tokens[-1]) < sentence_tokens.index(entity2_tokens[0]): #entity1 comes before entity2
        from_index = sentence_tokens.index(entity1_tokens[-1])
        to_index = sentence_tokens.index(entity2_tokens[0])
    else: #entity2 comes before entity1
        from_index = sentence_tokens.index(entity2_tokens[-1]) 
        to_index = sentence_tokens.index(entity1_tokens[0])
    words_between = sentence_tokens[from_index + 1:to_index]
    return ' '.join(words_between)

def entity_neighbourhood(sentence, entity, window_size=3):
    """
    Extract the tokens around a specific entity in a sentence.

    Parameters:
    sentence: The input sentence.
    entity: The entity to extract tokens around.
    window_size: The size of the window around the entity.

    Returns:
    The tokens around the specified entity (of a certain window size, inclusive of the entity)
    """
    sentence_tokens = word_tokenize(sentence)
    entity_tokens = word_tokenize(entity)
    
    entity_start_index = sentence_tokens.index(entity_tokens[0]) #locate where first token of entity lies
    entity_end_index = sentence_tokens.index(entity_tokens[-1]) # locate where the last token lies
    
    start_index = max(0, entity_start_index - window_size)
    end_index = min(len(sentence_tokens), entity_end_index + window_size + 1)
    
    tokens_around_entity = sentence_tokens[start_index:end_index]
    
    return ' '.join(tokens_around_entity)



def preprocessing(X):
    """
    Preprocess the input data by extracting features.

    Parameters:
    X: Input data containing sentences and entities.

    Returns:
    Processed data - extract entity neighbourhoods and tokens between entities
    """
    
    # get tokens between entities
    tokens_between_entities= np.array([get_tokens_between_entities(row) for row in X])
    # get entity neighbourhoods
    entity1_neighbourhood = np.array([entity_neighbourhood(row[0], row[1]) for row in X])
    entity2_neighbourhood = np.array([entity_neighbourhood(row[0], row[2]) for row in X])
    
    # add as new features to X
    X = np.hstack((X, tokens_between_entities.reshape(-1, 1)))
    X = np.hstack((X, entity1_neighbourhood.reshape(-1, 1)))
    X = np.hstack((X, entity2_neighbourhood.reshape(-1, 1)))
    X = pd.DataFrame(X)
    X.columns = ['sentence', 'entity1', 'entity2', 'tokens_between_entities', 'entity1_neighbourhood', 'entity2_neighbourhood']
    return X



In [7]:
X_train_preprocessed = preprocessing(X_train)
X_train_preprocessed.head()

Unnamed: 0,sentence,entity1,entity2,tokens_between_entities,entity1_neighbourhood,entity2_neighbourhood
0,NEW YORK (Reuters) - Apple Inc Chief Executive...,Apple Inc,Steve Jobs,Chief Executive,Reuters ) - Apple Inc Chief Executive Steve,Inc Chief Executive Steve Jobs sought to soothe
1,"Last week, Citigroup Inc's ( C.N ) Chief Execu...",Vikram Pandit,Citigroup,Inc 's ( C.N ) Chief Executive,) Chief Executive Vikram Pandit said that he,"Last week , Citigroup Inc 's ("
2,Lehman Brothers LEH.N shares fell sharply on M...,Lehman Brothers,investment bank,LEH.N shares fell sharply on Monday on specula...,Lehman Brothers LEH.N shares fell,speculation that the investment bank could be ...
3,Lehman Brothers LEH.N shares fell sharply on M...,Lehman Brothers,investment,LEH.N shares fell sharply on Monday on specula...,Lehman Brothers LEH.N shares fell,speculation that the investment bank could be
4,Franz told Reuters that Fiat Chief Executive S...,Sergio Marchionne,Fiat,Chief Executive,Fiat Chief Executive Sergio Marchionne had sai...,told Reuters that Fiat Chief Executive Sergio


#### 2. Lexical Features

2.1 Entity distances

In [8]:
class EntityDistance(BaseEstimator, TransformerMixin):
    """
    Custom transformer class for calculating the distance between the given two entities in a sentence.
    
    Parameters:
    absolute_values:  whether to return absolute distance values or not
    
    """
    
    def __init__(self, absolute_values=False):
        self.absolute_values = absolute_values
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        distance_matrix = [[self.entity_distance(row['sentence'], row['entity1'], row['entity2'])] for index, row in X.iterrows()]
        distance_matrix_scaled = StandardScaler().fit_transform(distance_matrix)
        return csr_matrix(distance_matrix_scaled) # convert to sparse coded matrix
    
    def entity_distance(self, sentence, entity1, entity2):
        """
        Calculate the distance (in number of tokens) between two entities in a sentence.

        Parameters:
        sentence: The input sentence.
        entity1: The first entity.
        entity2: The second entity.

        Returns:
        The distance between the two entities.
        """
        tokens = word_tokenize(sentence)
        if self.absolute_values:
            return abs(tokens.index(word_tokenize(entity2)[0]) - tokens.index(word_tokenize(entity1)[0]))
        else:
            return tokens.index(word_tokenize(entity2)[0]) - tokens.index(word_tokenize(entity1)[0])



In [9]:
entity_distances = EntityDistance().fit_transform(X_train_preprocessed)
entity_distances

<7070x1 sparse matrix of type '<class 'numpy.float64'>'
	with 7070 stored elements in Compressed Sparse Row format>

2.2 Tf-idf Vectorization


In [9]:
def get_pos_tag_map():
    """
    Returns a mapping of POS tags to WordNet tags.
    
    Returns:
    Mapping of POS tags to WordNet tags.
    """
    pos_tag_map = defaultdict(lambda : wn.NOUN)
    pos_tag_map['J'] = wn.ADJ
    pos_tag_map['V'] = wn.VERB
    pos_tag_map['R'] = wn.ADV
    return pos_tag_map

def tokenizer(text):
    """
    custom tokenizer that 
    - tokenizes the input text
    - converts to lowercase
    - removes stopwords, 
    - LEMMATIZES based on pos-tags

    Parameters:
    text: Input text to tokenize.

    Returns:
    List of tokens after preprocessing
    """
    text = text.lower() #lowercase
    tokens = word_tokenize(text) #tokenize
    pos_tags = pos_tag(tokens) # get pos_tags
    lemmatizer = WordNetLemmatizer()
    tokens = []
    pos_tag_map = get_pos_tag_map()
    for token, tag in pos_tags:
        if token.lower() not in stopwords.words('english'):
            token = lemmatizer.lemmatize(token, pos=pos_tag_map[tag[0]])
            # Remove punctuations
            token = re.sub(r'[^\w\s]', '', token)
            if token:  # if token is not empty
                tokens.append(token)
    return tokens
  


class TfidfFeatures(BaseEstimator, TransformerMixin):
    """
    Custom transformer class for extracting TF-IDF vectorization of a sentence using TfidfVectorizer.
    
    Parameters:
    max_features: Maximum number of features to consider. Takes top n most frequent words as features
    ngram_range: Range for n-grams to consider.
    """
    def __init__(self, max_features=None, ngram_range=(1,3)):
        self.max_features = max_features
        self.ngram_range = ngram_range

    
    def fit(self, X, y=None):
        # we use a custom tokenzier because the default TfidfVectorizer does not lemmatize
        self.tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenizer, max_features=self.max_features, ngram_range=self.ngram_range)
        sentences = X['sentence']
        self.tfidf_vectorizer.fit(sentences)
        return self

    def transform(self, X):
        sentences = X['sentence']
        return self.tfidf_vectorizer.transform(sentences)



In [11]:
X_tfidf = TfidfFeatures().fit_transform(X_train_preprocessed)
X_tfidf



<7070x228032 sparse matrix of type '<class 'numpy.float64'>'
	with 620774 stored elements in Compressed Sparse Row format>

tf-idf vectorization with upto 3 n-grams leads to a dimension of 228,032. This is too much and very sparse. We will use the top 3000 most frequent terms for vectorization.

#### 3. Syntactic Features

3.1 POS tags

In [10]:
class POSFeatures(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def extract_pos_features(self, X):
        pos_features_list = []
        for sentence in X:
            tokens = word_tokenize(sentence)
            pos_tags = pos_tag(tokens)
            pos_features = {}
            for word, tag in pos_tags:
                pos_features[tag] = pos_features.get(tag, 0) + 1
            pos_features_list.append(pos_features)
        return pos_features_list
    
    def fit(self, X, y=None):
        pos_features = self.extract_pos_features(X['sentence'])
        self.dict_vectorizer = DictVectorizer(sparse=True)
        self.dict_vectorizer.fit(pos_features)
        return self
    
    def transform(self, X):
        pos_features = self.extract_pos_features(X['sentence'])
        pos_features_vectorized = self.dict_vectorizer.transform(pos_features)
        return pos_features_vectorized


In [39]:
X_pos_vectorized = POSFeatures().fit_transform(X_train_preprocessed)
X_pos_vectorized

<7070x44 sparse matrix of type '<class 'numpy.float64'>'
	with 103586 stored elements in Compressed Sparse Row format>

In [11]:
def dependency_parser(sentence, entity):
    entity = entity.strip().replace("-"," ").split(" ")[0]
    doc = nlp(sentence)
    entity_token = None
    for token in doc:
        if token.text == entity:
            entity_token = token
    if entity_token is None:
        raise Exception("Entity not present in the sentence")
    return entity_token
    

class ShortestPathFeature(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def get_common_ancestor(self, entity1_ancestors, entity2_ancestors):
        common_ancestor_index = 0
        for i in range(1, min(len(entity1_ancestors), len(entity2_ancestors))):
            if entity1_ancestors[i] == entity2_ancestors[i]:
                common_ancestor_index = i
            else:
                break
        return common_ancestor_index

    def calculate_shortest_path(self, sentence, entity1, entity2):
        try:
            entity1_token = dependency_parser(sentence, entity1)
            entity2_token = dependency_parser(sentence, entity2)
        except Exception:
            return 0
        entity1_ancestors = list(entity1_token.ancestors)[::-1]
        entity2_ancestors = list(entity2_token.ancestors)[::-1]
        common_ancestor_index = self.get_common_ancestor(entity1_ancestors, entity2_ancestors) 
        shortest_path = entity1_ancestors[-1:common_ancestor_index:-1] + entity2_ancestors[common_ancestor_index:] 
        return len(shortest_path)
    
    def transform(self, X):
        shortest_path_matrix = [[self.calculate_shortest_path(row['sentence'], row['entity1'], row['entity2'])] for _, row in X.iterrows()]
        shortest_path_matrix_scaled = StandardScaler().fit_transform(shortest_path_matrix)
        return csr_matrix(shortest_path_matrix_scaled) # convert to sparse coded matrix
    
        


In [24]:
X_shortest_paths = ShortestPathFeature().fit_transform(X_train_preprocessed)
X_shortest_paths

<7070x1 sparse matrix of type '<class 'numpy.float64'>'
	with 7070 stored elements in Compressed Sparse Row format>

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

class PathToRootFeature(BaseEstimator, TransformerMixin):
    def __init__(self, entity):
        self.entity = entity
        self.vectorizer = CountVectorizer()
        
    
    def fit(self, X, y=None):
        path_list = [self.path_to_root_word(row['sentence'], row[self.entity]) for _, row in X.iterrows()]
        path_strings = [' '.join(path) for path in path_list]
        path_matrix = self.vectorizer.fit(path_strings)
        return self
    
    def transform(self, X):
        path_list = [self.path_to_root_word(row['sentence'], row[self.entity]) for _, row in X.iterrows()]
        path_strings = [' '.join(path) for path in path_list]
        path_matrix = self.vectorizer.fit_transform(path_strings)
        scaler = StandardScaler()
        path_matrix_scaled = scaler.fit_transform(path_matrix)
        return path_matrix_scaled
    
    def path_to_root_word(self, sentence, entity):
        try:
            entity_token = dependency_parser(sentence, entity)
            return [ancestor.dep_ for ancestor in entity_token.ancestors[:-1]]
        except Exception:
            return []
      

In [26]:
X_paths = PathToRootFeature('entity1').fit_transform(X_train_preprocessed)
X_paths

ValueError: empty vocabulary; perhaps the documents only contain stop words

#### 4. Semantic Features

In [12]:
class SemanticEmbeddings(BaseEstimator, TransformerMixin):
    """
    Transformer class to generate semantic embeddings for phrases using pre-trained Word2Vec embeddings.
    
    Parameters:
    column: The column containing the phrases to generate embeddings for.
    word2vec: Pre-trained Word2Vec embeddings.
    """
    
    def __init__(self, column):
        self.column = column
        self.word2vec = word2vec
    
    def get_phrase_context(self, phrase):
        """
        Generate the semantic embedding context for a given phrase.
        
        Parameters:
        phrase: The input phrase to generate the embedding for.
        
        Returns:
        The semantic embedding context for the input phrase.
        """
        phrase_tokens = tokenizer(phrase) # tokenize
        context_vectors = []
        
        for token in phrase_tokens:
            if token in self.word2vec:
                token_vector = self.word2vec[token] # get embedding
                context_vectors.append(token_vector)
        
        if len(context_vectors) == 0:
            return np.zeros(300)  # Assuming word vectors are of size 300
        
        phrase_context = np.mean(context_vectors, axis=0) # create context by averaging the word vectors
        return phrase_context
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        """
        Transform method to generate semantic embeddings for all phrases in the input data.
        
        Parameters:
        X: Input data containing the phrases.
        
        Returns:
        Sparse matrix of semantic embeddings for the input phrases.
        """
        return csr_matrix([self.get_phrase_context(phrase) for phrase in X[self.column]])


In [13]:
tokens_between_entities_embedded = SemanticEmbeddings(column='tokens_between_entities').fit_transform(X_train_preprocessed)
tokens_between_entities_embedded

<7070x300 sparse matrix of type '<class 'numpy.float64'>'
	with 1645530 stored elements in Compressed Sparse Row format>

#### 5. Combine extracted features

In [14]:
entity_distances_feat = EntityDistance(absolute_values=True)

sentences_tfidf_feat = TfidfFeatures(max_features=5000, ngram_range=(1,3)) # 5000 features

# pos_feat = POSFeatures() #Removed

shortest_path_feature = ShortestPathFeature()

tokens_between_entities_feat = SemanticEmbeddings(column='tokens_between_entities') # 300 features

entity1_neighbourhood_feat = SemanticEmbeddings(column='entity1_neighbourhood') # 300 features

entity2_neighbourhood_feat = SemanticEmbeddings(column='entity2_neighbourhood') # 300 features


combined_features = FeatureUnion([
        ("entity_distances_feat", entity_distances_feat),
        ("sentences_tfidf_feat", sentences_tfidf_feat),
        ("shortest_path_feature", shortest_path_feature),
        ("tokens_between_entities_feat", tokens_between_entities_feat),
        ("entity1_neighbourhood_feat", entity1_neighbourhood_feat),
        ("entity2_neighbourhood_feat", entity2_neighbourhood_feat)
    ])


The vectorized parts of speech feature did not provide good results, probably because the structure is not encapsulated properly by simple vectorization. Therefore, it was removed.

In [15]:
X_train_features = combined_features.fit(X_train_preprocessed).transform(X_train_preprocessed)
X_train_features




<7070x5902 sparse matrix of type '<class 'numpy.float64'>'
	with 6063011 stored elements in Compressed Sparse Row format>

## Model selection and Fine Tuning

Perform grid search over hyperparameters

In [None]:
# NOTE: Takes a while. Skip to Train section for training with tuned hyperparameters

# Define base SVM classifier
svm_classifier = SVC()

# Create OneVsRestClassifier
ovr_classifier = OneVsRestClassifier(svm_classifier, n_jobs=-1)

# define a pipeline with feature extraction and the svm classifier
pipeline = Pipeline([("features", combined_features), ("ovr_classifier", ovr_classifier)])

parameter_grid = dict(
    features__sentences_tfidf_feat__max_features=[2000, 3000, 5000],
    features__sentences_tfidf_feat__ngram_range=[(1,2), (1,3), (1,4)],
    features__entity_distances_feat__absolute_values=[True,False],
    ovr_classifier__estimator__C=[1.1, 1.2],
    ovr_classifier__estimator__gamma=["scale","auto"],
    ovr_classifier__estimator__probability=[True,False],
    ovr_classifier__estimator__shrinking=[True,False],
    ovr_classifier__estimator__break_ties=[True,False],
    ovr_classifier__estimator__kernel=["rbf", "sigmoid", "linear"],
)

# Perform grid search
grid_search = GridSearchCV(pipeline, param_grid=parameter_grid, verbose=10)
grid_search.fit(X_train_preprocessed[:100], Y_train[:100,:])
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

## Train

After hyperparameterization, train and validate model with the found hyperparameters

In [16]:
# Get and Combine features

entity_distances_feat = EntityDistance(absolute_values=False)

sentences_tfidf_feat = TfidfFeatures(max_features=3000, ngram_range=(1,3)) # 3000 features

shortest_path_feature = ShortestPathFeature()

tokens_between_entities_feat = SemanticEmbeddings(column='tokens_between_entities') # 300 features

entity1_neighbourhood_feat = SemanticEmbeddings(column='entity1_neighbourhood') # 300 features

entity2_neighbourhood_feat = SemanticEmbeddings(column='entity2_neighbourhood') # 300 features


combined_features = FeatureUnion([
        ("entity_distances_feat", entity_distances_feat),
        ("sentences_tfidf_feat", sentences_tfidf_feat),
        ("shortest_path_feature", shortest_path_feature),
        ("tokens_between_entities_feat", tokens_between_entities_feat),
        ("entity1_neighbourhood_feat", entity1_neighbourhood_feat),
        ("entity2_neighbourhood_feat", entity2_neighbourhood_feat)
    ])

# Define base SVM classifier
svm_classifier = SVC(C=1.1, break_ties=True, gamma='scale',kernel='linear',probability=True, shrinking=True)
# Create OneVsRestClassifier
ovr_classifier = OneVsRestClassifier(svm_classifier)
# define a pipeline with feature extraction and the svm classifier
pipeline = Pipeline([("features", combined_features), ("ovr_classifier", ovr_classifier)])

# Train the OneVsRestClassifier
pipeline.fit(X_train_preprocessed, Y_train)




In [17]:
# Calculate and print training accuracy
Y_train_pred = pipeline.predict(X_train_preprocessed)
accuracy = accuracy_score(Y_train, Y_train_pred)
print("Training Accuracy:", accuracy)
f1_score_result = f1_score(Y_train, Y_train_pred, average='micro')
print("F1 Score:", f1_score_result)

Training Accuracy: 0.8521923620933521
F1 Score: 0.9181564064226744


## Validate

In [18]:
# using dev files as  validation set
val_sentences = read_file("./datasets/dev.sent")
val_relation_dicts = read_triplets("./datasets/dev.tup")
X_val, Y_val, _ = extract_X_Y(sentences=val_sentences, relation_dicts=val_relation_dicts)
# preprocess X
X_val_preprocessed = preprocessing(X_val)
# Predictions
Y_val_pred = pipeline.predict(X_val_preprocessed)
accuracy = accuracy_score(Y_val, Y_val_pred)
print("Accuracy:", accuracy)
f1_score_result = f1_score(Y_val, Y_val_pred, average='micro')
print("F1 Score:", f1_score_result)
print("\nClassification Report:\n", classification_report(Y_val, Y_val_pred,target_names=mlb.classes_))


Accuracy: 0.4772382397572079
F1 Score: 0.6253934942287512

Classification Report:
                               precision    recall  f1-score   support

                       brand       0.82      0.70      0.75        33
           business_division       0.50      0.33      0.40         3
                 chairperson       1.00      0.17      0.29         6
     chief_executive_officer       0.67      0.38      0.48        32
                     creator       0.67      0.53      0.59        19
                    currency       1.00      1.00      1.00         9
                   developer       0.44      0.57      0.50        21
          director_/_manager       1.00      0.25      0.40         4
              distributed_by       1.00      0.24      0.38        17
         distribution_format       0.75      0.55      0.63        11
                    employer       0.72      0.61      0.66        87
                  founded_by       0.52      0.29      0.37        45
      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Test

prepare test data

In [19]:
# read and extract X & Y
test_sentences = read_file("./datasets/test.sent")
test_relation_dicts = read_triplets("./datasets/test.tup")
X_test, Y_test, _ = extract_X_Y(sentences=test_sentences, relation_dicts=test_relation_dicts)
# preprocess X
X_test_preprocessed = preprocessing(X_test)


Predictions

In [20]:
# Predictions
Y_pred = pipeline.predict(X_test_preprocessed)
# Y_pred = model.predict(X_test_features)
print(Y_pred[0])
print(Y_pred.shape)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
(1277, 29)


Evaluation

In [21]:

accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy:", accuracy)
f1_score_result = f1_score(Y_test, Y_pred, average='micro')
print("F1 Score:", f1_score_result)
print("\nClassification Report:\n", classification_report(Y_test, Y_pred,target_names=mlb.classes_))

Accuracy: 0.4729835552075176
F1 Score: 0.6284437825763217

Classification Report:
                               precision    recall  f1-score   support

                       brand       0.91      0.53      0.67        19
           business_division       1.00      0.80      0.89        10
                 chairperson       1.00      0.89      0.94        19
     chief_executive_officer       0.67      0.64      0.65        28
                     creator       1.00      0.67      0.80        21
                    currency       0.78      0.88      0.82         8
                   developer       0.72      0.58      0.64        48
          director_/_manager       1.00      0.30      0.46        10
              distributed_by       0.62      0.36      0.45        14
         distribution_format       0.89      0.62      0.73        13
                    employer       0.74      0.78      0.76        80
                  founded_by       0.73      0.54      0.62        41
      

  _warn_prf(average, modifier, msg_start, len(result))


## Inference Mode

In [20]:
def predict_with_model(sentence, entity1, entity2):
  X = np.array([[sentence,entity1,entity2]])
  X_preprocessed = preprocessing(X)
  Y_pred = pipeline.predict(X_preprocessed)
  return mlb.inverse_transform(Y_pred)

In [21]:
sample_sentence = "Wednesday, July 8, 2015 10:30AM IST (5:00AM GMT) Rimini Street Comment on Oracle Litigation Las Vegas, United States Rimini Street, Inc., the leading independent provider of enterprise software support for SAP AG’s (NYSE:SAP) Business Suite and BusinessObjects software and Oracle Corporation’s (NYSE:ORCL) Siebel , PeopleSoft , JD Edwards , E-Business Suite , Oracle Database , Hyperion and Oracle Retail software, today issued a statement on the Oracle litigation."
entity1, entity2 = "PeopleSoft",  "JD Edwards"
print(predict_with_model(sample_sentence,entity1, entity2))

[('subsidiary',)]
