In [1]:
import random
import re
import urllib
import urllib.parse
from time import perf_counter

import pandas as pd
from mwedittypes import StructuredEditTypes

In [2]:
def clean_xml(text):
    """Return `text` with some `{{...}}` templates removed and outer whitespace stripped.

    Steps, in order:
      1. A '*' that directly follows a newline is dropped (presumably wikitext
         list bullets).
      2. Every non-nested `{{...}}` span is collected; spans containing exactly
         one or two '|' characters are kept, all others (zero, or more than
         two) are removed wherever they occur in the text.
      3. Leading and trailing whitespace is stripped.
    """
    text = text.replace('\n*', '\n')
    # The candidate list is computed once, before any removal takes place.
    for template in re.findall(r'{{[^}]*}}', text):
        pipe_count = template.count('|')
        if 1 <= pipe_count <= 2:
            continue
        text = text.replace(template, '')
    return text.strip()

In [3]:
# Location of the parquet file holding one row per added link
# (source page, target page, the two revision IDs and both revision texts).
LINK_VERSIONS_PATH = '/scratch/tsoares/wikidumps/simplewiki-NS0-20230901/val_data/link_versions.parquet'

df = pd.read_parquet(LINK_VERSIONS_PATH)
df

Unnamed: 0,source_title,target_title,source_ID,target_ID,first_version,second_version,first_text,second_text
0,Andouille,Cane,18,26429,7592354,9074938,[[File:Andouille.jpg|thumb]]\n[[File:Andouille...,[[File:Andouille.jpg|thumb]]\n[[File:Andouille...
1,Andouille,Pecan,18,168446,7592354,9074938,[[File:Andouille.jpg|thumb]]\n[[File:Andouille...,[[File:Andouille.jpg|thumb]]\n[[File:Andouille...
2,Andouille,Taste,18,13407,7592354,9074938,[[File:Andouille.jpg|thumb]]\n[[File:Andouille...,[[File:Andouille.jpg|thumb]]\n[[File:Andouille...
3,Astronomy,Black_hole,48,3506,9035453,9035459,[[File:The Galactic Centre and Bulge above the...,[[File:The Galactic Centre and Bulge above the...
4,Astronomy,Cosmic_ray,48,241923,9035250,9035453,[[File:The Galactic Centre and Bulge above the...,[[File:The Galactic Centre and Bulge above the...
...,...,...,...,...,...,...,...,...
6688,Aosta_Valley_%28Chamber_of_Deputies_constituen...,Valle_d%27Aosta,1042752,35255,9036987,9036989,</ref>\n|towns = [[Aosta]]\n|area ...,</ref>\n|towns = [[Aosta]]\n|area ...
6689,Justin_I,Justinian_I,1042831,68685,9037540,9037557,iustinus {{Abbreviation|p·p·|PERPETUUS}} {{Abb...,iustinus {{Abbreviation|p·p·|PERPETUUS}} {{Abb...
6690,Justin_I,Latin,1042831,6592,9037540,9037557,iustinus {{Abbreviation|p·p·|PERPETUUS}} {{Abb...,iustinus {{Abbreviation|p·p·|PERPETUUS}} {{Abb...
6691,Justin_I,List_of_Byzantine_emperors,1042831,56110,9037540,9037557,iustinus {{Abbreviation|p·p·|PERPETUUS}} {{Abb...,iustinus {{Abbreviation|p·p·|PERPETUUS}} {{Abb...


In [4]:
# Re-structure the data: collapse the per-link rows into one entry per
# "<first_version>-><second_version>" key. Each entry keeps the texts and source
# page of the first row seen for that key and accumulates every added link.
revisions = {}
temp = df.to_dict('records')
for record in temp:
    revision_key = f"{record['first_version']}->{record['second_version']}"
    added_link = {'target_title': record['target_title'], 'target_ID': record['target_ID']}
    if revision_key in revisions:
        revisions[revision_key]['added_links'].append(added_link)
        continue
    revisions[revision_key] = {
        'first_text': record['first_text'],
        'second_text': record['second_text'],
        'source_title': record['source_title'],
        'source_ID': record['source_ID'],
        'added_links': [added_link],
    }
len(revisions)

2823

In [30]:
# Draw a reproducible sample of revision keys to inspect.
# A dedicated seeded generator keeps the sample stable across kernel restarts
# without touching the global `random` state, and the size is capped so that
# fewer than SAMPLE_SIZE revisions does not raise ValueError.
SAMPLE_SIZE = 100
SAMPLE_SEED = 42
rand_revisions = random.Random(SAMPLE_SEED).sample(list(revisions), min(SAMPLE_SIZE, len(revisions)))

In [31]:
# For every sampled revision, print its metadata followed by each structured
# edit, grouped by the diff category it was reported under.
for revision_key in rand_revisions:
    entry = revisions[revision_key]
    version_ids = revision_key.split('->')
    print(f"Source Title: {entry['source_title']}")
    print(f"First Version: {version_ids[0]}")
    print(f"Second Version: {version_ids[1]}")
    added_targets = [link['target_title'] for link in entry['added_links']]
    print(f"Target Titles: {added_targets}")
    differ = StructuredEditTypes(entry['first_text'], entry['second_text'], lang='simple', timeout=True)
    edit_diffs = differ.get_diff()
    for category in edit_diffs:
        print(category)
        category_edits = edit_diffs[category]
        for elem in category_edits:
            print(f"\t{elem}")

Source Title: 2023_Atlantic_hurricane_season
First Version: 9065843
Second Version: 9072989
Target Titles: ['Barahona%2C_Dominican_Republic', 'Extratropical_cyclone', 'Eyewall_replacement_cycle', 'Wind_shear']
node-edits
	NodeEdit(type='Heading', edittype='insert', section='11: === Hurricane Franklin ===', name='Hurricane Franklin', changes=[('level', None, 3), ('title', None, 'Hurricane Franklin')])
	NodeEdit(type='Reference', edittype='insert', section='11: === Hurricane Franklin ===', name=None, changes=[])
	NodeEdit(type='Reference', edittype='insert', section='11: === Hurricane Franklin ===', name=None, changes=[])
	NodeEdit(type='Wikilink', edittype='insert', section='11: === Hurricane Franklin ===', name='wind shear', changes=[('title', None, 'wind shear')])
	NodeEdit(type='Reference', edittype='insert', section='11: === Hurricane Franklin ===', name=None, changes=[])
	NodeEdit(type='Wikilink', edittype='insert', section='11: === Hurricane Franklin ===', name='Barahona, Dominica

In [35]:
def _split_sections(wikitext):
    """Split wikitext into a ``{section title: cleaned section text}`` dict.

    The text before the first heading is stored under ``'Lead'``. Titles and
    bodies are passed through ``clean_xml``; if two sections end up with the
    same title, the later one overwrites the earlier one.
    """
    # Because the pattern has a capture group, re.split returns
    # [lead, heading, body, heading, body, ...]: always an odd number of
    # items, with the lead text (possibly empty) at index 0.
    pieces = re.split(r'(==.+==)', wikitext)
    sections = {'Lead': clean_xml(pieces[0])}
    for i in range(1, len(pieces), 2):
        section_title = clean_xml(pieces[i].replace('=', ''))
        sections[section_title] = clean_xml(pieces[i + 1])
    return sections


# For every sampled revision: diff the two versions, keep the wikilink edits
# that point at one of the known added targets, and record where each link was
# added and whether its mention text occurs equally often in both versions.
for revision in rand_revisions:
    start = perf_counter()
    first_text = revisions[revision]['first_text']
    second_text = revisions[revision]['second_text']
    edit_diffs = StructuredEditTypes(first_text, second_text, lang='simple', timeout=True).get_diff()
    # Normalise the URL-encoded, underscore-separated titles so they can be
    # compared with the lower-cased link titles reported by the diff.
    target_pages = [urllib.parse.unquote(link['target_title']).replace('_', ' ').lower() for link in revisions[revision]['added_links']]
    print(target_pages)
    print(revisions[revision]['source_title'])
    print(revision)

    # split both texts into sections
    first_sections = _split_sections(first_text)
    second_sections = _split_sections(second_text)

    nodes = []
    for edit in edit_diffs['node-edits']:
        node_type, edit_type, section_label, name, changes = edit
        # only wikilinks that were inserted or changed are of interest
        if node_type != 'Wikilink':
            continue
        if edit_type not in ('insert', 'change'):
            continue
        # Defensive: the first change is read as (attribute, previous, new) and
        # its new value as the link target; skip edits that do not provide one.
        if not changes or changes[0][2] is None:
            continue
        target = changes[0][2].lower()
        if target not in target_pages:
            continue
        # The section label looks like '11: === Hurricane Franklin ==='; keep
        # what follows the first ': ' and drop the '=' markers.
        section = clean_xml(section_label.partition(': ')[2].replace('=', ''))
        if section == 'Lede':
            section = 'Lead'
        print(edit)
        # A second change entry, when present, is read as the link's display text.
        new_text = changes[1][2] if len(changes) > 1 else None
        if edit_type == 'change':
            prev_text = changes[1][1] if len(changes) > 1 else None
            nodes.append({'section': section,
                          'target': target,
                          'mention': new_text or changes[0][2],
                          'prev_target': changes[0][1].lower() if changes[0][1] else None,
                          'prev_mention': prev_text or name})
        else:
            # Same keys as the 'change' case so every node has one schema; fall
            # back to the node name when there is no usable display text.
            nodes.append({'section': section,
                          'target': target,
                          'mention': new_text or name,
                          'prev_target': None,
                          'prev_mention': None})

    # A node is flagged 'present' when its mention string occurs the same
    # number of times in the first and in the second text.
    for node in nodes:
        first_count = first_text.count(node['mention'])
        second_count = second_text.count(node['mention'])
        print(node['mention'], first_count, second_count)
        node['present'] = first_count == second_count

    print(nodes)

    # TODO: also inspect the 'Sentence' entries of edit_diffs['text-edits'].
    print(f"Time: {perf_counter() - start} seconds")

['barahona, dominican republic', 'extratropical cyclone', 'eyewall replacement cycle', 'wind shear']
2023_Atlantic_hurricane_season
9065843->9072989
NodeEdit(type='Wikilink', edittype='insert', section='11: === Hurricane Franklin ===', name='wind shear', changes=[('title', None, 'wind shear')])
NodeEdit(type='Wikilink', edittype='insert', section='11: === Hurricane Franklin ===', name='Barahona, Dominican Republic', changes=[('title', None, 'Barahona, Dominican Republic')])
NodeEdit(type='Wikilink', edittype='insert', section='11: === Hurricane Franklin ===', name='eyewall replacement cycle', changes=[('title', None, 'eyewall replacement cycle')])
NodeEdit(type='Wikilink', edittype='insert', section='11: === Hurricane Franklin ===', name='extratropical cyclone', changes=[('title', None, 'extratropical cyclone')])
wind shear 0 1
Barahona, Dominican Republic 0 1
eyewall replacement cycle 0 1
extratropical cyclone 0 1
[{'section': 'Hurricane Franklin', 'target': 'wind shear', 'mention': '