In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util
import xml.etree.ElementTree as ET
import numpy as np

Load the sentence-embedding model and the article-title-to-category mapping used for article alignment

In [2]:
# Sentence-embedding model shared by every similarity computation in this notebook.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Spreadsheet sheet that pairs each article title with a category.
df = pd.read_excel(
    'data/category_configuration_09-08-2022_08-08-01.xlsx',
    sheet_name='article_names_matching',
)

# Lookup: article title -> category ("Category 2" column).
title_category = {
    title: category
    for title, category in zip(df["Article Title"], df["Category 2"])
}

Helper functions for article alignment

In [3]:
def get_article_text(article):
    '''
    Concatenate the text of every direct child of an article.
    article: xml element whose direct children hold the article's text
    return: a single string in which each child's text is preceded by one
            space (so a non-empty result starts with a space); "" when the
            element has no children
    '''
    # child.text is None for empty elements such as <p/>; treat it as an empty
    # string instead of failing on `str + None`.
    return "".join(" " + (child.text or "") for child in article)

def article_alignment(doc1_path, doc2_path, threshold=0.7):
    '''
    Map article numbers of document 1 to the matching article numbers of document 2.

    doc1_path, doc2_path: paths of the two xml documents; the articles are
                          expected to be the children of root[1][2]
    threshold: minimum cosine similarity between two article texts for them
               to be considered aligned
    return: dict {"num" of an article in doc1: "num" of the aligned article in doc2},
            or None when either document does not have the expected layout

    Two strategies are used:
    * if the first article of both documents has a "title" attribute, two
      articles are aligned when their (lower-cased) titles map to the same
      category in `title_category` and their texts are similar enough;
    * otherwise every article of doc1 is aligned with its most similar
      article of doc2, provided the similarity exceeds `threshold`.
    '''
    root1, root2 = ET.parse(doc1_path).getroot(), ET.parse(doc2_path).getroot()
    try:
        body1, body2 = root1[1][2], root2[1][2]
    except IndexError:
        # The document has no element at root[1][2]: nothing to align.
        return

    article_dict = {}
    if len(body1) == 0 or len(body2) == 0:
        return article_dict

    # Embed every article exactly once and compare all pairs in one call;
    # scores[i][j] is the similarity between body1[i] and body2[j].
    embd1 = model.encode([get_article_text(article) for article in body1])
    embd2 = model.encode([get_article_text(article) for article in body2])
    scores = util.cos_sim(embd1, embd2)

    def category_of(article):
        # Articles without a title, or with a title missing from the mapping,
        # have no category (None) and are never aligned by title.
        title = article.get("title")
        return None if title is None else title_category.get(title.lower())

    if "title" in body1[0].attrib and "title" in body2[0].attrib:
        # Both documents carry titles: align through the title categories.
        categories2 = [category_of(article) for article in body2]
        for i, article1 in enumerate(body1):
            category1 = category_of(article1)
            if category1 is None:
                continue
            for j, article2 in enumerate(body2):
                # The similarity check guards against articles that share a
                # category but have unrelated content. When several articles
                # of doc2 qualify, the last one in document order is kept.
                if categories2[j] == category1 and scores[i][j].item() > threshold:
                    article_dict[article1.get("num")] = article2.get("num")
    else:
        # Titles are not available: rely on text similarity only.
        for i, article1 in enumerate(body1):
            best = int(scores[i].argmax())
            if scores[i][best].item() > threshold:
                article_dict[article1.get("num")] = body2[best].get("num")

    return article_dict

Extract similar sentences from two documents

In [4]:
def extract_similar(doc1_path, doc2_path, lower=0.5, upper=0.95):
    '''
    Extract similar (but not near-identical) sentences from aligned articles of two documents.

    doc1_path, doc2_path: paths of the two xml documents
    lower, upper: a sentence pair is kept when lower < similarity < upper
    return: list of (sentence1, sentence2, similarity) where sentence2 is the
            sentence of the aligned article that is most similar to sentence1
            and similarity is a float; None when either document does not
            have the expected layout
    '''
    ret = []

    # First get the article alignment between the two documents.
    article_dict = article_alignment(doc1_path, doc2_path)
    root1, root2 = ET.parse(doc1_path).getroot(), ET.parse(doc2_path).getroot()
    try:
        body1, body2 = root1[1][2], root2[1][2]
    except IndexError:
        return
    if not article_dict:
        return ret

    for article1 in body1:
        article2_num = article_dict.get(article1.get("num"))
        if not article2_num:
            # The current article has no counterpart in the other document.
            continue

        article2 = body2.find(".//div[@num='" + article2_num + "']")
        if article2 is None:
            # The aligned article is not reachable as a <div>; skip it.
            continue

        # Elements without text cannot be embedded, so leave them out.
        article2_sents = [child.text for child in article2 if child.text]
        if not article2_sents:
            continue
        sentences2_embd = model.encode(article2_sents)

        for sentence1 in article1:
            if not sentence1.text:
                continue
            scores = util.cos_sim(model.encode(sentence1.text), sentences2_embd)
            index = int(scores[0].argmax())
            maxi = scores[0][index].item()

            if lower < maxi < upper:
                ret.append((sentence1.text, article2_sents[index], maxi))
    return ret


In [5]:
# Compare the Canada - Russian Federation (1989) document with the Canada - Czech Republic (1990) document.
extract_similar('data/full data/t1989-9-canada-russian-federation-bit-1989.xml', 'data/full data/t1990-14-canada-czech-republic-bit-1990.xml')

[('(a) The term "territory" means the territory of Canada or the territory of the Union of Soviet Socialist Republics respectively, as well as those maritime areas, including the seabed and subsoil adjacent to the outer limit of the territorial sea of either of the above territories, over which the State concerned exercises, in accordance with international law, sovereign rights for the purpose of exploration and exploitation of the natural resources of such areas;',
  '(i) In respect of Canada, the territory of Canada, as well as those maritime areas, including the seabed and subsoil adjacent to the outer limit of the territorial sea, over which Canada exercises, in accordance with international law, sovereign rights for the purpose of exploration and exploitation of the natural resources of such areas;',
  0.9311114549636841),
 ('(d) The term "investor" means with regard to either Contracting Party:',
  '(b) The term "investor" means:',
  0.9142022132873535),
 ('Provided that such na

fair and equitable treatment


In [6]:
# Parse one document to inspect how its articles are stored.
doc_path = 'data/full data/t1989-9-canada-russian-federation-bit-1989.xml'
doc = ET.parse(doc_path)
root = doc.getroot()
# root[1][2] is the same position the functions in this notebook use as the document body.
body = root[1][2]

# Show the attributes of the first child of the body.
body[0].attrib

{'type': 'article', 'num': 'I', 'title': 'Definitions'}

In [68]:
def xml2sentences(article, break_sentence = True):
    '''
    Turn an article into a list of sentences.
    article: xml element whose direct children hold the text
    break_sentence: if True, the text of each child is split on "." and the
                    stripped, non-empty fragments are returned; if False, the
                    text of each child is returned unchanged as one sentence
    return: list of str; children without text (or with whitespace-only text)
            are skipped
    '''
    ret = []
    for child in article:
        text = child.text
        # child.text is None for empty elements; such children carry no sentence.
        if not text or not text.strip():
            continue
        if break_sentence:
            # Naive split: every "." ends a fragment. Drop the empty fragment
            # that follows a trailing "." and the padding around each fragment.
            fragments = (fragment.strip() for fragment in text.split("."))
            ret.extend(fragment for fragment in fragments if fragment)
        else:
            ret.append(text)
    return ret

def extract_similar_sentences_from_article(article1, article2, lower = 0.5, upper = 0.95, identical = 0.99):
    '''
    Pair up similar (but not identical) sentences of two articles.
    article1, article2: xml element
    lower, upper: a pair is a candidate when lower < similarity < upper
    identical: sentences taking part in a pair with similarity > identical are
               treated as perfect matches and excluded from the result
    return: list of similar sentences: (sentence1, sentence2, similarity),
            best pairs first; each sentence appears in at most one pair and
            similarity is a 0-dim tensor
    '''
    ret = []
    # Entries without text cannot be embedded, so leave them out.
    article1_sents = [sent for sent in xml2sentences(article1, break_sentence = False) if sent]
    article2_sents = [sent for sent in xml2sentences(article2, break_sentence = False) if sent]
    if not article1_sents or not article2_sents:
        return ret

    # scores[i][j] is the similarity between article1_sents[i] and article2_sents[j].
    scores = util.cos_sim(model.encode(article1_sents), model.encode(article2_sents))

    # Rows / columns that must not be used (again): sentences that have a
    # perfect match, and later the sentences that were already paired.
    used_rows, used_cols = set(), set()
    for i, j in (scores > identical).nonzero().tolist():
        used_rows.add(i)
        used_cols.add(j)

    # Candidate pairs, ordered by increasing similarity.
    mask = (scores > lower) & (scores < upper)
    sim_pairs = [(scores[i][j].item(), i, j) for i, j in mask.nonzero().tolist()]
    sim_pairs.sort(key = lambda pair: pair[0])

    # Greedy matching: walk the candidates from the most similar one down and
    # keep a pair only when both of its sentences are still free.
    for _, i, j in reversed(sim_pairs):
        if i in used_rows or j in used_cols:
            continue
        ret.append((article1_sents[i], article2_sents[j], scores[i][j]))
        used_rows.add(i)
        used_cols.add(j)

    return ret

In [69]:
def extract_similar2(doc1_path, doc2_path):
    '''
    Extract similar sentences from every pair of aligned articles of two documents.
    doc1_path, doc2_path: paths of the two xml documents
    return: list of (sentence1, sentence2, similarity) collected over all
            aligned articles; None when either document does not have the
            expected layout
    '''
    ret = []

    # First get the article alignment between the two documents.
    alignment_match = article_alignment(doc1_path, doc2_path)
    root1, root2 = ET.parse(doc1_path).getroot(), ET.parse(doc2_path).getroot()
    try:
        body1, body2 = root1[1][2], root2[1][2]
    except IndexError:
        return
    if not alignment_match:
        return ret

    for num1, num2 in alignment_match.items():
        if num1 is None or num2 is None:
            # An article without a "num" attribute cannot be looked up.
            continue
        article1 = body1.find(".//div[@num='" + num1 + "']")
        article2 = body2.find(".//div[@num='" + num2 + "']")
        if article1 is None or article2 is None:
            # The aligned article is not reachable as a <div>; skip the pair.
            continue
        ret.extend(extract_similar_sentences_from_article(article1, article2))

    return ret

    

In [70]:
# Run the article-level pairing on the Canada - Russian Federation (1989) and Canada - Czech Republic (1990) documents.
extract_similar2('data/full data/t1989-9-canada-russian-federation-bit-1989.xml', 'data/full data/t1990-14-canada-czech-republic-bit-1990.xml')

[('Provided that such natural person, corporation, partnership, trust, joint venture, organization, association or enterprise has the legal right, in accordance with the laws of that Contracting Party, to make investments in the territory of the other Contracting Party.',
  'Provided that such investor has the right, in accordance with the laws of the Contracting Party, to invest in the territory of the other Contracting Party.',
  tensor(0.9357)),
 ('(a) The term "territory" means the territory of Canada or the territory of the Union of Soviet Socialist Republics respectively, as well as those maritime areas, including the seabed and subsoil adjacent to the outer limit of the territorial sea of either of the above territories, over which the State concerned exercises, in accordance with international law, sovereign rights for the purpose of exploration and exploitation of the natural resources of such areas;',
  '(i) In respect of Canada, the territory of Canada, as well as those mari