In [5]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util
import xml.etree.ElementTree as ET
import numpy as np
import time

Load the sentence-embedding model and the title-to-category mapping used for article alignment

In [6]:
# Sentence-embedding model; the functions in this notebook read it as a module-level global.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
# Sheet that pairs every article title with a category label.
df = pd.read_excel('data/category_configuration_09-08-2022_08-08-01.xlsx', sheet_name = 'article_names_matching')
# Lookup "Article Title" -> "Category 2"; two articles are candidates for alignment when their categories are equal.
# NOTE(review): the alignment functions lower-case titles before indexing this dict -- confirm the sheet's titles are already lower-case.
title2category = dict(zip(df["Article Title"], df["Category 2"]))

Helper functions for computing article alignments

In [25]:
def get_article_text(article):
    '''
    Concatenate the text of every direct child of an article.
    article: xml element whose children hold the article's paragraphs
    return: one string in which each child's text is preceded by a single space;
            children without text (child.text is None, e.g. <p/>) are skipped
    '''
    # A single join avoids repeated string concatenation, and the None check
    # keeps empty child elements from raising TypeError.
    return "".join(" " + child.text for child in article if child.text is not None)

def article_alignment(doc1_path, doc2_path):
    '''
    Align the articles of two XML documents.
    doc1_path, doc2_path: paths to the XML files to compare
    return: dictionary mapping the "num" attribute of an article in document 1
            to the "num" of its counterpart in document 2 (empty when either
            body has no articles), or None when either document lacks the
            expected root[1][2] body element
    '''
    root1, root2 = ET.parse(doc1_path).getroot(), ET.parse(doc2_path).getroot()
    try:
        body1, body2 = root1[1][2], root2[1][2]
    except IndexError:
        # Unexpected document layout: keep the historical "no result" signal
        return None

    article_dict = {}

    # Nothing to align; also protects the body[0] look-ups below
    if len(body1) == 0 or len(body2) == 0:
        return article_dict

    def category_of(article):
        # Category of an article's title, or None when the title is missing or unknown
        title = article.get("title")
        if title is None:
            return None
        return title2category.get(title.lower())

    #if both articles attribute includes title
    if "title" in body1[0].attrib and "title" in body2[0].attrib:
        # Resolve document-2 categories once instead of once per article pair
        categories2 = [category_of(article2) for article2 in body2]
        embeddings2 = {}  # index in body2 -> embedding, filled on first use

        for article1 in body1:
            category1 = category_of(article1)
            if category1 is None:
                # Title absent from the mapping: cannot be matched by category
                continue

            embedding1 = None
            for index2, article2 in enumerate(body2):
                if categories2[index2] != category1:
                    continue

                # Encode each article at most once, and only when it is actually compared
                if embedding1 is None:
                    embedding1 = model.encode(get_article_text(article1))
                if index2 not in embeddings2:
                    embeddings2[index2] = model.encode(get_article_text(article2))

                #to make sure that they have a high similarity
                score = util.cos_sim(embedding1, embeddings2[index2]).item()
                if score > 0.7:
                    # No break on purpose: as before, the last sufficiently
                    # similar article of the same category wins.
                    article_dict[article1.get("num")] = article2.get("num")

    else:
    #title not included in article attributes, use text similarity instead
        # Document-2 embeddings do not depend on article1, so compute them once
        text2_list_embd = model.encode([get_article_text(article2) for article2 in body2])

        for article1 in body1:
            text1_embd = model.encode(get_article_text(article1))
            scores = util.cos_sim(text1_embd, text2_list_embd)[0]
            best = int(scores.argmax())

            if scores[best].item() > 0.7:
                article_dict[article1.get("num")] = body2[best].get("num")

    return article_dict

Extract similar sentences from two documents

In [39]:
def extract_similar(doc1_path, doc2_path):
    '''
    Extract similar (but not near-identical) sentences from two XML documents.
    doc1_path, doc2_path: paths to the XML files to compare
    return: list of (sentence1, sentence2, similarity) tuples, one per sentence
            of document 1 whose best match inside the aligned article of
            document 2 scores strictly between 0.5 and 0.95; None when either
            document lacks the expected root[1][2] body element
    '''
    #First get article alignments inside the two documents
    article_dict = article_alignment(doc1_path, doc2_path)
    if article_dict is None:
        return None

    root1, root2 = ET.parse(doc1_path).getroot(), ET.parse(doc2_path).getroot()
    try:
        body1, body2 = root1[1][2], root2[1][2]
    except IndexError:
        # Unexpected document layout: keep the historical "no result" signal
        return None

    ret = []
    for article1 in body1:
        article2_num = article_dict.get(article1.get("num"))
        if not article2_num:
            #current article has no alignment in the other document
            continue

        article2 = body2.find(".//div[@num='" + article2_num + "']")
        if article2 is None:
            # Aligned article is not a <div> reachable from the body
            continue

        # Children without text cannot be embedded; skip them
        article2_sents = [child.text for child in article2 if child.text]
        if not article2_sents:
            continue

        sentences2_embd = model.encode(article2_sents)

        for sentence1 in article1:
            if not sentence1.text:
                continue
            sentence1_embd = model.encode(sentence1.text)
            scores = util.cos_sim(sentence1_embd, sentences2_embd)[0]
            best = int(scores.argmax())
            maxi = scores[best].item()

            # Above 0.95 the sentences are treated as unchanged, below 0.5 as unrelated
            if 0.5 < maxi < 0.95:
                ret.append((sentence1.text, article2_sents[best], maxi))
    return ret


In [40]:

# Time extract_similar on one pair of documents.
start = time.time() 
diff = extract_similar('data/full data/t1989-9-canada-russian-federation-bit-1989.xml', 'data/full data/t1990-14-canada-czech-republic-bit-1990.xml')
end = time.time()
# Elapsed wall-clock time in seconds
print(end - start)

0.8434820175170898


In [87]:
# Scratch check of the sentence-splitting expression: filter(None, ...) drops empty pieces,
# but each remaining piece keeps its surrounding whitespace.
list(filter(None, '   a . b . c . d   '.strip().split('.')))

['a ', ' b ', ' c ', ' d']

# Refactored version (a few small changes)

In [94]:
def elem2sent(article, break_sentence = True):
    '''
    Collect the text of an article's direct children as a list of strings.
    article: xml element
    break_sentence: if True, split every child's (stripped) text on "." and
                    return the pieces; if False, return each child's text as is
    return: list of strings; children whose text is missing or blank are
            skipped, and so are pieces that contain only whitespace
    Pieces keep their inner leading/trailing whitespace, e.g. ' a . b ' -> ['a ', ' b'].
    '''
    ret = []
    for child in article:
        text = child.text
        # Elements such as <p/> have text None; blank text carries no sentence either
        if text is None or not text.strip():
            continue
        if break_sentence:
            # "x. . y" would otherwise yield a whitespace-only "sentence"
            ret.extend(piece for piece in text.strip().split('.') if piece.strip())
        else:
            ret.append(text)
    return ret


def get_article_alignment(article_body1, article_body2, title2category, sanity_check = True, min_similarity = 0.7):
    '''
    Get article alignment between two documents
    article_body1, article_body2: xml elements whose children are the articles
    title2category: dictionary of (lower-case) title to category
    sanity_check: use similarity score to check if a category match is correct
    min_similarity: cosine-similarity threshold used by the sanity check and by
                    the similarity-only fallback
    return a dictionary {num of article in body1: num of article in body2};
    empty when either body has no articles

    When the first article of both bodies has a "title" attribute, articles are
    paired by category (first acceptable match wins). Articles whose title is
    missing or absent from title2category are left unaligned. Otherwise every
    article of body1 is paired with its most similar article of body2.
    '''
    alignment_match = {}

    # Nothing to align; also protects the article_body[0] look-ups below
    if len(article_body1) == 0 or len(article_body2) == 0:
        return alignment_match

    def article_text(article):
        # Join with a space so words of adjacent sentences are not glued together
        return ' '.join(elem2sent(article))

    def category_of(article):
        # Category of the article's title, or None when it cannot be resolved
        title = article.get('title')
        return None if title is None else title2category.get(title.lower())

    #if both articles attribute includes title
    if 'title' in article_body1[0].attrib and 'title' in article_body2[0].attrib:
        # Resolve body2 categories once instead of once per article pair
        categories2 = [category_of(article2) for article2 in article_body2]
        embeddings2 = {}  # index in article_body2 -> embedding, filled on first use

        for article1 in article_body1:
            category1 = category_of(article1)
            if category1 is None:
                continue

            embedding1 = None
            for index2, article2 in enumerate(article_body2):
                if categories2[index2] != category1:
                    continue

                #to make sure that they have a high similarity
                if sanity_check:
                    # Encode each article at most once, and only when it is compared
                    if embedding1 is None:
                        embedding1 = model.encode(article_text(article1))
                    if index2 not in embeddings2:
                        embeddings2[index2] = model.encode(article_text(article2))
                    if util.cos_sim(embedding1, embeddings2[index2]).item() < min_similarity:
                        continue

                alignment_match[article1.get('num')] = article2.get('num')
                break

    #title not included in article attributes, use text similarity instead
    else:
        # body2 embeddings do not depend on article1, so compute them once
        text2_list_embd = model.encode([article_text(article2) for article2 in article_body2])
        for article1 in article_body1:
            scores = util.cos_sim(model.encode(article_text(article1)), text2_list_embd)[0]
            best = int(scores.argmax())
            if scores[best].item() > min_similarity:
                alignment_match[article1.get('num')] = article_body2[best].get('num')

    return alignment_match
    

def extract_similar_sentences_from_article(article1, article2, min_similarity = 0.5, max_similarity = 0.98):
    '''
    Pair up the sentences of two aligned articles that are similar but not identical.
    article1, article2: xml element
    min_similarity: pairs scoring at or below this value are ignored
    max_similarity: pairs scoring above this value are treated as identical;
                    they are not reported and both sentences are excluded
                    from further pairing
    return: list of similar sentences: (sentence1, sentence2, similarity);
            every sentence appears in at most one pair, and pairs are chosen
            greedily from the highest similarity downwards
    '''
    article1_sents, article2_sents = elem2sent(article1), elem2sent(article2)

    # An empty batch cannot be embedded/compared; there is nothing to pair anyway
    if not article1_sents or not article2_sents:
        return []

    #Similarity matrix: rows are sentences of article1, columns sentences of article2
    scores = util.cos_sim(model.encode(article1_sents), model.encode(article2_sents))

    #rows/columns that are already matched, so no sentence is used twice
    used_rows, used_cols = set(), set()

    #near-identical sentences need no comparison; just mark them as used
    for i, j in (scores > max_similarity).nonzero().tolist():
        used_rows.add(i)
        used_cols.add(j)

    #candidate pairs: similar enough to be related, different enough to be interesting
    in_range = (scores > min_similarity) & (scores < max_similarity)
    candidates = [(scores[i, j].item(), i, j) for i, j in in_range.nonzero().tolist()]
    candidates.sort(key = lambda x: x[0]) #ascending; consumed from the end below

    ret = []
    for score, i, j in reversed(candidates):
        if i in used_rows or j in used_cols:
            continue
        ret.append((article1_sents[i], article2_sents[j], score))
        used_rows.add(i)
        used_cols.add(j)

    return ret

def extract_similar_from_doc(doc1_path, doc2_path, title2category, min_length = 5):
    '''
    Extract similar sentence pairs from two XML documents.
    doc1_path, doc2_path: paths to the two documents
    title2category: dictionary of title to category
    min_length: both sentences of a pair must have more than min_length words,
                and their word counts must differ by less than 4 * min_length
    return: list of (sentence1, sentence2, similarity) sorted by decreasing
            similarity; [] when a document cannot be read or does not have the
            expected root[1][2] body element
    '''

    try:
        doc_body1 = ET.parse(doc1_path).getroot()[1][2]
        doc_body2 = ET.parse(doc2_path).getroot()[1][2]
    except Exception as e:
        # Deliberately best-effort so one bad file does not abort a batch run;
        # name the files so the failure can be traced.
        print(f'Skipping {doc1_path} / {doc2_path}: {e}')
        return []

    #get article alignment between two documents
    alignment_match = get_article_alignment(doc_body1, doc_body2, title2category)

    def divs_by_num(body):
        # First descendant <div> for every "num" value, in document order.
        # Replaces a string-built XPath, which breaks on quotes or a missing num.
        lookup = {}
        for div in body.iter('div'):
            num = div.get('num')
            if div is not body and num is not None:
                lookup.setdefault(num, div)
        return lookup

    divs1, divs2 = divs_by_num(doc_body1), divs_by_num(doc_body2)

    ret = []
    for page1, page2 in alignment_match.items():
        article1, article2 = divs1.get(page1), divs2.get(page2)
        if article1 is None or article2 is None:
            # Aligned article is not a <div> with that num; nothing to compare
            continue
        ret.extend(extract_similar_sentences_from_article(article1, article2))

    #sort by similarity score
    ret.sort(key = lambda x: x[2], reverse = True)

    def comparable_length(pair):
        # Count the words of each sentence once
        words1, words2 = len(pair[0].split()), len(pair[1].split())
        return words1 > min_length and words2 > min_length and abs(words1 - words2) < 4 * min_length

    #keep the pairs in which both sentences are long enough and of comparable length
    return [pair for pair in ret if comparable_length(pair)]

In [95]:
# Time extract_similar_from_doc on one pair of documents.
# `model` and `title2category` must already exist in the kernel; the commented-out
# lines below show how they are built if this cell is run on its own.
# model = SentenceTransformer('distilbert-base-nli-mean-tokens')
# meta_data_df = pd.read_excel('data/category_configuration_09-08-2022_08-08-01.xlsx', sheet_name = 'article_names_matching')
# title2category = dict(zip(meta_data_df["Article Title"], meta_data_df["Category 2"]))
start = time.time()
diff1 = extract_similar_from_doc(
    'data/full data/t1989-9-canada-russian-federation-bit-1989.xml', 
    'data/full data/t1990-14-canada-czech-republic-bit-1990.xml',
    title2category
)
# Elapsed wall-clock time in seconds
print(time.time() - start)

0.5321807861328125


In [96]:
# Show the (sentence1, sentence2, similarity) tuples found for the first document pair
diff1

[('(b) The term "investment" means any kind of asset invested either directly, or indirectly through an investor of a third State, by an investor of one Contracting Party in the territory of the other Contracting Party and in particular, though not exclusively, shall include:',
  '(a) The term "investment" means any kind of asset held or invested either directly, or indirectly through an investor of a third State, by an investor of one Contracting Party in the territory of the other Contracting Party in accordance with the latter\'s laws and, in particular, though not exclusively, includes:',
  0.9771180748939514),
 ('(i) Any movable and immovable property and any related property rights, such as mortgages;',
  '(i) Movable and immovable property and any related property rights, such as mortgages, liens or pledges;',
  0.9756933450698853),
 ('Investors of one Contracting Party whose investments or returns in the territory of the other Contracting Party suffer losses owing to war, other

In [75]:
# Same extraction on a second pair of documents
diff2 = extract_similar_from_doc(
    'data/full data/t1995-139-hong-kong-china-sar-italy-bit-1995.xml',
    'data/full data/t1995-140-hong-kong-china-sar-new-zealand-bit-1995.xml',
    title2category
)

In [77]:
# Show the (sentence1, sentence2, similarity) tuples found for the second document pair
diff2

[('(a) In respect of Hong Kong, the armed forces of the sovereign government which is responsible for its foreign affairs;',
  '(i) In respect of Hong Kong, the armed forces of the Government of the sovereign State which is responsible for foreign affairs relating to Hong Kong;',
  0.9781162738800049),
 ('(7) "returns" means the amounts yielded by an investment and in particular, though not exclusively, includes profit, interest, capital gains, dividends, royalties and fees or payment for assistance and technical services.',
  '6. "returns" means the amounts yielded by an investment and in particular, though not exclusively, includes profit, earnings, interest, capital gains, dividends, royalties, proceeds of liquidation, loan repayments and fees.',
  0.9767873287200928),
 ('Shall be accorded restitution or reasonable compensation. Resulting payments shall be freely convertible.',
  'Shall be accorded restitution or reasonable compensation. Resulting payments shall be in a freely conve

In [61]:
# Experiment: compare two whole paragraphs instead of single sentences.
# s2 contains a reordered subset of the sentences of s1 (s1 has extra sentences about costs).
s1 = '(5) The arbitral tribunal shall reach its decision by a majority of votes. Such decision shall be binding on both Contracting Parties. Unless otherwise agreed, the decision of the arbitral tribunal shall be rendered within six months of the appointment of the Chairman in accordance with paragraph (3) or (4) of this Article. The arbitral tribunal shall determine its own procedure. Each Contracting Party shall bear the costs of its own member of the tribunal and of its representation in the arbitral proceedings; the costs related to the Chairman and any remaining costs shall be borne equally by the Contracting Parties. The arbitral tribunal may, however, in its decision direct that a higher proportion of costs shall be borne by one of the two Contracting Parties, and this award shall be binding on both Contracting Parties.'
s2 = '(5) The arbitral tribunal shall determine its own procedure. The arbitral tribunal shall reach its decision by a majority of votes. Such decision shall be binding on both Contracting Parties. Unless otherwise agreed, the decision of the arbitral tribunal shall be rendered within six months of the appointment of the Chairman in accordance with paragraph (3) or (4) of this Article.'

# Cosine similarity of the two paragraph embeddings (a 1x1 tensor)
print(util.cos_sim(model.encode([s1]), model.encode([s2])))

tensor([[0.9706]])
