In [None]:
import pandas as pd
from bs4 import BeautifulSoup, Comment, NavigableString, Tag
import re
import urllib
import difflib
from nltk import sent_tokenize
import math
from tqdm import tqdm
import re

In [None]:
# Read the link-versions parquet file and preview its first five rows.
versions_path = '/scratch/tsoares/wikidumps/simplewiki-NS0-20230901/val_data/link_versions.parquet'
df_versions = pd.read_parquet(versions_path)
df_versions.head(5)

In [None]:
# Draw one random row and pull out the scalar fields the following cells rely on.
sample = df_versions.sample(1)
source_title, source_ID, first_version, second_version = (
    sample[column].iloc[0]
    for column in ('source_title', 'source_ID', 'first_version', 'second_version')
)

In [None]:
# Directory holding the stored HTML snapshots; files are named "<source_ID>_<version>.html".
pages_dir = '/scratch/tsoares/wikidumps/simplewiki-NS0-20230901/val_data/pages'
file_1 = f'{pages_dir}/{source_ID}_{first_version}.html'
file_2 = f'{pages_dir}/{source_ID}_{second_version}.html'

# Decode explicitly instead of relying on the platform/locale default encoding.
# NOTE(review): assumes the snapshots were written as UTF-8 - confirm against the
# script that produced them.
with open(file_1, 'r', encoding='utf-8') as f:
    text_1 = f.read()
with open(file_2, 'r', encoding='utf-8') as f:
    text_2 = f.read()

In [None]:
# Parse both raw HTML strings with the 'html.parser' backend.
html_1, html_2 = (
    BeautifulSoup(raw_html, 'html.parser')
    for raw_html in (text_1, text_2)
)

In [None]:
def simplify_html(html):
    """Reduce a parsed page to its main content container, stripped of non-prose markup.

    The first ``<div class="mw-parser-output">`` inside ``html`` is looked up and
    pruned in place. Removed, in this order: ``figure``, ``table``, ``caption``,
    ``sup`` and ``style`` elements; comment nodes; ``span.mw-editsection``,
    ``div.metadata``, ``div.reflist`` and ``a`` elements whose class is
    ``external text``; and finally ``map`` elements.

    Args:
        html: parsed document offering ``find`` (e.g. a BeautifulSoup object).

    Returns:
        The pruned ``mw-parser-output`` element. It is the same object that lives
        inside ``html``, so the input tree is modified as a side effect.
    """
    content = html.find('div', {'class': 'mw-parser-output'})

    def drop_all(tag_name, css_class=None):
        # Destroy every element under ``content`` with this tag name (and class, when given).
        if css_class is None:
            matches = content.find_all(tag_name)
        else:
            matches = content.find_all(tag_name, {'class': css_class})
        for node in matches:
            node.decompose()

    # Elements removed purely by tag name.
    for tag_name in ('figure', 'table', 'caption', 'sup', 'style'):
        drop_all(tag_name)

    # Comments are string nodes, so they are detached with extract() rather than decompose().
    for comment in content.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Elements removed by tag name + class attribute.
    classed_targets = (
        ('span', 'mw-editsection'),
        ('div', 'metadata'),
        ('div', 'reflist'),
        ('a', 'external text'),
    )
    for tag_name, css_class in classed_targets:
        drop_all(tag_name, css_class)

    drop_all('map')
    return content

In [None]:
def _html_to_sentences(container):
    """Serialise an element with prettify(), drop blank lines and sentence-tokenize the markup.

    Args:
        container: element exposing ``prettify()`` (the value returned by simplify_html).

    Returns:
        list of str produced by ``sent_tokenize``; entries still contain HTML tags.
    """
    pretty_lines = container.prettify().split('\n')
    compact_markup = "\n".join(line for line in pretty_lines if line.strip() != '')
    return sent_tokenize(compact_markup)


html_1_clean = simplify_html(html_1)
html_2_clean = simplify_html(html_2)
# text_1 / text_2 are deliberately NOT rebound here: they keep holding the raw
# file contents, so earlier cells can be re-run without reloading the files.
sentences_1 = _html_to_sentences(html_1_clean)
sentences_2 = _html_to_sentences(html_2_clean)

In [None]:
def _merge_short_sentences(sentences, min_length=50):
    """Fuse entries shorter than ``min_length`` characters with their neighbours, in place.

    A short entry in the middle of the list is joined (space-separated) with both
    the previous and the next entry; a short first entry is joined with the next
    one; a short last entry is joined with the previous one. A list holding a
    single entry is left unchanged, since there is nothing to merge it with.

    Args:
        sentences: list of str, modified in place.
        min_length: entries with fewer characters than this are merged.
    """
    i = 0
    while i < len(sentences):
        if len(sentences[i]) >= min_length or len(sentences) == 1:
            i += 1
            continue
        if 0 < i < len(sentences) - 1:
            # previous + current + next collapse into the previous slot; i now
            # points at the entry that followed "next".
            sentences[i - 1:i + 2] = [' '.join(sentences[i - 1:i + 2])]
        elif i == 0:
            # i stays at 0 so the merged entry is re-checked against min_length.
            sentences[0:2] = [sentences[0] + ' ' + sentences[1]]
        else:
            sentences[i - 1:i + 1] = [sentences[i - 1] + ' ' + sentences[i]]


# for each sentence, if it is less than 50 characters, merge it with the next and previous sentences
_merge_short_sentences(sentences_1)
_merge_short_sentences(sentences_2)
        

In [None]:
def _rejoin_split_links(sentences):
    """Append following entries to any entry with more '<a ' than '</a>', in place.

    Merging stops once the counts balance or when no following entry is left, so
    an unclosed link in the final entry no longer indexes past the end of the list.

    Args:
        sentences: list of str, modified in place.
    """
    i = 0
    while i < len(sentences):
        while (i + 1 < len(sentences)
               and sentences[i].count('<a ') > sentences[i].count('</a>')):
            sentences[i] = sentences[i] + ' ' + sentences[i + 1]
            del sentences[i + 1]
        i += 1


# dont allow for links to be separated across sentences
_rejoin_split_links(sentences_1)
_rejoin_split_links(sentences_2)


In [None]:
def _boundary_splits_parenthesis(current, following):
    """Tell whether a parenthesis opened in ``current`` is closed in ``following``.

    True when the right-most '(' of ``current`` comes after its right-most ')'
    (or ``current`` has no ')' at all), and the left-most ')' of ``following``
    comes before its left-most '(' (or ``following`` has no '(' at all).
    """
    # rfind() yields -1 for "absent", which already orders correctly against any
    # real index, so the raw values can be compared directly.
    opened_and_not_closed = current.rfind('(') > current.rfind(')')

    first_close = following.find(')')
    first_open = following.find('(')
    if first_close == -1:
        closes_first = False
    else:
        closes_first = first_open == -1 or first_close < first_open

    return opened_and_not_closed and closes_first


def _rejoin_split_parentheses(sentences):
    """Merge adjacent entries whose boundary falls inside a parenthesis, in place.

    Args:
        sentences: list of str, modified in place.
    """
    i = 0
    while i < len(sentences) - 1:
        if _boundary_splits_parenthesis(sentences[i], sentences[i + 1]):
            # Stay on i: the merged entry may itself end inside another parenthesis.
            sentences[i] = sentences[i] + ' ' + sentences[i + 1]
            del sentences[i + 1]
        else:
            i += 1


# dont allow parenthesis to be separated across sentences
_rejoin_split_parentheses(sentences_1)
_rejoin_split_parentheses(sentences_2)

In [None]:
def _rejoin_split_list_items(sentences):
    """Append following entries to any entry with more '<li>' than '</li>', in place.

    Merging stops once the counts balance or when no following entry is left, so
    a list item that is never closed cannot index past the end of the list.

    Args:
        sentences: list of str, modified in place.
    """
    i = 0
    while i < len(sentences) - 1:
        while (i + 1 < len(sentences)
               and sentences[i].count('<li>') > sentences[i].count('</li>')):
            sentences[i] = sentences[i] + ' ' + sentences[i + 1]
            del sentences[i + 1]
        i += 1


# dont allow list items to be separated across sentences
_rejoin_split_list_items(sentences_1)
_rejoin_split_list_items(sentences_2)

In [None]:
def _split_after_closing_li(sentences):
    """Return a new list in which every '</li>' ends its own entry.

    Entries without '</li>' are copied unchanged. Other entries are cut after
    each '</li>'; every piece is stripped and empty pieces are dropped. Text
    that follows the last '</li>' is kept as its own entry WITHOUT a closing
    tag being appended, because it was never followed by one.

    Args:
        sentences: list of str (not modified).

    Returns:
        list of str.
    """
    result = []
    for sentence in sentences:
        if '</li>' not in sentence:
            result.append(sentence)
            continue
        *closed_pieces, tail = sentence.split('</li>')
        for piece in closed_pieces:
            piece = piece.strip()
            if piece != '':
                result.append(piece + '</li>')
        tail = tail.strip()
        if tail != '':
            result.append(tail)
    return result


# force </li> to act as a separator
sentences_1 = _split_after_closing_li(sentences_1)
sentences_2 = _split_after_closing_li(sentences_2)

In [None]:
def _rejoin_split_h2(sentences):
    """Append following entries to any entry whose '<h2>' and '</h2>' counts differ, in place.

    Merging stops once the counts are equal or when no following entry is left,
    so an unbalanced final entry cannot index past the end of the list.

    Args:
        sentences: list of str, modified in place.
    """
    i = 0
    while i < len(sentences):
        while (i + 1 < len(sentences)
               and sentences[i].count('<h2>') != sentences[i].count('</h2>')):
            sentences[i] = sentences[i] + ' ' + sentences[i + 1]
            del sentences[i + 1]
        i += 1


# dont allow h2 tags to be separated across sentences
_rejoin_split_h2(sentences_1)
_rejoin_split_h2(sentences_2)

In [None]:
def _split_before_h2(sentences):
    """Return a list in which every '<h2>' begins its own entry.

    Entries without '<h2>' are copied unchanged. Other entries are cut at each
    '<h2>'; pieces are stripped, the text before the first heading is kept only
    when non-empty, and each later piece gets its '<h2>' prefix back.
    """
    result = []
    for sentence in sentences:
        if '<h2>' not in sentence:
            result.append(sentence)
            continue
        leading, *heading_parts = (part.strip() for part in sentence.split('<h2>'))
        if leading != '':
            result.append(leading)
        result.extend('<h2>' + part for part in heading_parts)
    return result


# force <h2> to act as a separator
sentences_1 = _split_before_h2(sentences_1)
sentences_2 = _split_before_h2(sentences_2)

In [None]:
# Differ instance; its compare() method is what later diffs the two sentence lists.
# Nothing is printed by this cell.
d = difflib.Differ()

In [None]:
# Show which sampled row is being inspected: source title, target title, then both versions.
print(
    source_title,
    sample['target_title'].iloc[0],
    first_version,
    second_version,
    sep='\n',
)

Match original and modified sentences

In [None]:
# Minimum cosine similarity for an added sentence to be linked to a removed one.
MATCH_THRESHOLD = 0.5


def _term_frequencies(text):
    """Return a dict mapping each whitespace-separated token of ``text`` to its count."""
    freqs = {}
    for word in text.split():
        freqs[word] = freqs.get(word, 0) + 1
    return freqs


def _cosine_similarity(freqs_a, freqs_b):
    """Cosine similarity of two token-count dicts; 0.0 when either one is empty."""
    norm_a = math.sqrt(sum(count ** 2 for count in freqs_a.values()))
    norm_b = math.sqrt(sum(count ** 2 for count in freqs_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    dot = sum(count * freqs_b.get(word, 0) for word, count in freqs_a.items())
    return dot / (norm_a * norm_b)


# One record per sentence of each version. 'removed' / 'added' flag sentences
# present in only one version; 'match' holds the index of the linked record in
# the other list (or None); 'section' is the text of the most recent <h2>.
original_sentences = []
new_sentences = []
# Removed sentences not linked to an added sentence yet:
# index in original_sentences -> token counts. Entries are dropped once matched,
# so each removed sentence is linked to at most one added sentence.
unmatched_removed = {}
diff = d.compare(sentences_1, sentences_2)
section_original = 'Lead'
section_new = 'Lead'
for line in diff:
    if line.startswith('?'):
        # Differ's intra-line hint rows carry no sentence of their own.
        continue

    # Every remaining row starts with a two-character code ('+ ', '- ' or '  ').
    raw_sentence = line[2:].strip()
    soup = BeautifulSoup(raw_sentence, 'html.parser')
    clean_text = soup.text.strip()
    if clean_text == '':
        continue
    heading = soup.find('h2') if '<h2>' in raw_sentence else None

    if line.startswith('+'):
        if heading is not None:
            section_new = heading.text.strip()
        added = {'added': True, 'index': len(new_sentences), 'match': None, 'section': section_new, 'clean_sentence': clean_text, 'raw_sentence': raw_sentence}
        new_sentences.append(added)

        # Pick the most similar still-unmatched removed sentence. Only removed
        # sentences already encountered in the diff are candidates; ties keep
        # the earliest one.
        added_freqs = _term_frequencies(clean_text)
        best_index, best_score = None, 0
        for candidate_index, candidate_freqs in unmatched_removed.items():
            score = _cosine_similarity(added_freqs, candidate_freqs)
            if score > best_score:
                best_index, best_score = candidate_index, score
        if best_score > MATCH_THRESHOLD:
            original_sentences[best_index]['match'] = added['index']
            added['match'] = best_index
            del unmatched_removed[best_index]
    elif line.startswith('-'):
        if heading is not None:
            section_original = heading.text.strip()
        removed = {'removed': True, 'index': len(original_sentences), 'match': None, 'section': section_original, 'clean_sentence': clean_text, 'raw_sentence': raw_sentence}
        original_sentences.append(removed)
        unmatched_removed[removed['index']] = _term_frequencies(clean_text)
    else:
        # Unchanged row: the sentence belongs to both versions.
        if heading is not None:
            section_original = heading.text.strip()
            section_new = section_original
        original_sentences.append({'removed': False, 'index': len(original_sentences), 'match': None, 'section': section_original, 'clean_sentence': clean_text, 'raw_sentence': raw_sentence})
        new_sentences.append({'added': False, 'index': len(new_sentences), 'match': None, 'section': section_new, 'clean_sentence': clean_text, 'raw_sentence': raw_sentence})

In [None]:
# Print the records flagged as removed (first version only), then those flagged
# as added (second version only).
removed_records = [record for record in original_sentences if record['removed']]
added_records = [record for record in new_sentences if record['added']]
for record in removed_records + added_records:
    print(record)

In [None]:
# For each new-version sentence that was linked to an original one, print the
# new raw text, the original raw text, and a separator line.
matched_pairs = [
    (record['raw_sentence'], original_sentences[record['match']]['raw_sentence'])
    for record in new_sentences
    if record['match'] is not None
]
for new_raw, original_raw in matched_pairs:
    print(new_raw)
    print(original_raw)
    print('##########')