In [15]:
import pandas as pd
import macrogen as m
positions = pd.read_csv('positions.csv', index_col=0)



In [2]:
pos = positions.fillna(0).astype('int16')

In [3]:
single_none_order = pos.sort_values(by='single-none').index

In [4]:
single_copy_order = pos.sort_values(by='single-copy').index

In [5]:
import difflib

In [6]:
sm = difflib.SequenceMatcher(a=list(single_none_order), b=list(single_copy_order), autojunk=False)

In [7]:
class PositionMatcher(difflib.SequenceMatcher):
    def __init__(self, df, a_name, b_name):
        self.a_name = a_name
        self.b_name = b_name
        a = list(df.sort_values(by=a_name).index)
        b = list(df.sort_values(by=b_name).index)
        super().__init__(a=a, b=b, autojunk=False)
        
    def _repr_html_(self):
        result = f"<table><thead><tr><th>{self.a_name}</th><th>{self.b_name}</th></thead><tbody>\n"
        for op, a_start, a_end, b_start, b_end in self.get_opcodes():
            if op == 'equal':
                result += f'<tr style="color:gray"><td colspan="2">{"<br/>".join(self.a[a_start:a_end])}</td></tr>\n'
            elif op == 'delete':
                result += f'<tr style="color:red"><td>{"<br/>".join(self.a[a_start:a_end])}</td><td /></tr>\n'
            elif op == 'insert':
                result += f'<tr style="color:green"><td /><td>{"<br/>".join(self.b[b_start:b_end])}</td></tr>\n'
            elif op == 'replace':
                result += f'<tr style="color:blue"><td>{"<br/>".join(self.a[a_start:a_end])}</td><td>{"<br/>".join(self.b[b_start:b_end])}</td></tr>\n'                
        result += "</tbody></table>\n"
        return result
    
    def moved_groups(self):
        for op, a_start, a_end, b_start, b_end in self.get_opcodes():
            if op == 'insert' or op == 'replace':
                yield self.b[b_start:b_end]

In [8]:
pm = PositionMatcher(pos, 'single-none', 'single-copy')

In [9]:
mg = list(pm.moved_groups())

In [10]:
len(mg), sum(map(len, mg))

(11, 17)

In [11]:
def differentiate(pos: pd.DataFrame, reference_column: str):
    stats = pd.DataFrame(columns=['ngroups', 'nitems', 'moved'])
    stats.name = reference_column
    for column in pos.columns.difference(set(reference_column)):
        pm = PositionMatcher(pos, reference_column, column)
        mg = list(pm.moved_groups())
        stats.loc[column, 'ngroups'] = len(mg)
        stats.loc[column, 'nitems'] = sum(map(len, mg))
        stats.loc[column,  'moved'] = ";    ".join(", ".join(group) for group in mg)
    return stats.sort_values(by='nitems')

In [85]:
differentiate(pos, 'single-copy')

Unnamed: 0,ngroups,nitems,moved
single-copy,0,0,
single-none,11,17,"1 H.10; 2 I H.19, 2 I H.20; 2 III H.5, 2..."
single-orphan,11,17,"1 H.10; 2 I H.19, 2 I H.20; 2 III H.5, 2..."
single-copy-nearest,6,27,"1 H.4, H P20, H P82; 2 V H.32, 2 III H.50b,..."
single-copy-farthest,12,53,"1 H.4, H P20, H P82; 2 V H.32, 2 V H.31 i_r..."
single-copy-copy,13,67,"1 H.4, H P20, H P82; 2 V H.32, 2 V H.31 i_r..."
split-none,52,187,"1 H.10, 1 H.15, 1 H.8; 2 III H.12a, 2 III H..."
split-inline,51,189,"1 H.10, 1 H.15, 1 H.8; 2 III H.12a, 2 III H..."
split-inline-low,52,189,"1 H.10, 1 H.15, 1 H.8; 2 III H.12a, 2 III H..."
reverse-inline,95,379,"1 H.8, wa: 182 i_r_o; 2 III H.12a, 2 III H...."


In [13]:
pm  = PositionMatcher(pos, 'single-copy', 'split-inline')

In [14]:
pm

single-copy,split-inline
,1 H.10 1 H.15 1 H.8
wa: 182 wa: 182 i_r_o,wa: 182 wa: 182 i_r_o
1 H.8 1 H.15 2 III H.27 alpha 2 III H.30 alpha 2 III H.70 alpha 2 III H.71 alpha 2 II H.69 alpha 2 II H.71 alpha 2 I H.21 alpha 2 I H.32 alpha 2 V H.3 alpha 2 V H.17 i_r 2 IV H.20 i_uebrige 2 IV H.12 i_v_o fischer_lamberg: 2_III_H.15 wa: 2_H_P167 wa: 2_III_H.2_alpha wa: 2_II_H.31_alpha wa: 2_H_P167 i_uebrige 2 I H.0 2 I H.0a 2 I H.0b 2 I H.0c 2 I H.0d 2 I H.2 2 I H.6a 2 I H.10 2 I H.14 2 I H.15b 2 I H.15c 2 I H.16 2 I H.18 2 I H.21 2 I H.23 2 I H.24a 2 I H.49 2 I H.51 2 I H.52 2 I H.53 2 I H.54 2 I H.55 2 II H.2:2 2 II H.5a 2 II H.6a 2 II H.6b,2 III H.12a 2 III H.2:1 2 III H.2:2 2 III H.3a:1 2 III H.3a:2 2 III H.4 2 III H.45a 2 III H.45b 2 III H.46a 2 III H.46b 2 III H.48 2 III H.49 2 III H.5 2 III H.50 2 III H.60 2 III H.61 2 III H.64e 2 III H.66 2 III H.68 2 III H.69 2 III H.76 2 III H.8 2 III H.81 2 III H.81a 2 III H.81b*
2 II H.11a 2 II H.13a 2 II H.16 2 II H.21 2 II H.26,2 II H.11a 2 II H.13a 2 II H.16 2 II H.21 2 II H.26
,2 II H.27:1
2 II H.29,2 II H.29
,2 II H.2:2
2 II H.30 2 II H.31 2 II H.32 2 II H.37 2 II H.40 2 II H.43 2 II H.44 2 II H.50 2 II H.59,2 II H.30 2 II H.31 2 II H.32 2 II H.37 2 II H.40 2 II H.43 2 II H.44 2 II H.50 2 II H.59
,2 II H.5a
2 II H.68,2 II H.68


In [16]:
mi = m.MacrogenesisInfo('target/split-inline/macrogen-info.zip')

[32mINFO     macrogen.graph Loading macrogenesis graphs from target/split-inline/macrogen-info.zip[0m
[32mINFO     macrogen.graph Removing edges to ignore[0m
[32mINFO     macrogen.graph Adding 161 otherwise unmentioned references to the working graph[0m
[32mINFO     macrogen.graph Creating sort order from DAG[0m
[32mINFO     macrogen.graph Preparing transitive closure …[0m
[32mINFO     macrogen.graph Inferring witness detail table ..[0m
[32mINFO     macrogen.graph Preparing details on references[0m
[32mINFO     macrogen.graph MacrogenesisInfo loaded.[0m


In [23]:
from itertools import chain
ws = [w for w in chain.from_iterable(pm.moved_groups())]
nodes = mi.nodes(", ".join(ws))

In [25]:
nodes.count(None)

36

In [31]:
refs = [w.reference for w in nodes if w is not None]

In [37]:
in_degrees = [mi.base.in_degree[m.splitgraph.SplitReference(ref, m.splitgraph.Side.START)] for ref in refs]
out_degrees = [mi.base.out_degree[m.splitgraph.SplitReference(ref, m.splitgraph.Side.END)] for ref in refs]

In [42]:
def degrees(graph, ref):
    return {'in': mi.base.in_degree[m.splitgraph.SplitReference(ref, m.splitgraph.Side.START)],
           'out': mi.base.out_degree[m.splitgraph.SplitReference(ref, m.splitgraph.Side.END)]}

In [46]:
degs = pd.DataFrame({ref: degrees(mi.base, ref) for ref in refs}).T

In [50]:
degs['total'] = degs.sum(axis=1)

In [67]:
mi.details.loc[refs,:]

Unnamed: 0,uri,label,kind,inscription_of,position,start_pos,end_pos,rank,max_before_date,max_abs_before_date,min_after_date,min_abs_after_date,avg,avg_year,yearlabel,baseline_position
1 H.10,faust://document/faustedition/1_H.10,1 H.10,Witness,,1,1,2,2,,,1800-11-09,,1800-11-09,1800.0,1800,12
1 H.15,faust://document/faustedition/1_H.15,1 H.15,Witness,,2,3,4,2,,,1776-01-02,1776-01-02,1776-01-02,1776.0,1776,17
2 III H.2:1,faust://document/faustedition/2_III_H.2_1,2 III H.2:1,Witness,,5,9,10,2,,,1826-05-26,,1826-05-26,1826.0,1826,290
2 III H.4,faust://document/faustedition/2_III_H.4,2 III H.4,Witness,,6,11,12,2,,,1826-06-10,1826-12-11,1826-06-10,1826.0,1826,299
2 III H.46b,faust://document/faustedition/2_III_H.46b,2 III H.46b,Witness,,7,13,14,2,,,1826-04-01,1826-10-11,1826-04-01,1826.0,1826,363
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2 II H.7,faust://document/faustedition/2_II_H.7,2 II H.7,Witness,,597,1193,1194,1364,1830-06-11,,1832-06-06,,1831-06-09,1831.0,1830 … 1832,213
2 V H.33,faust://document/faustedition/2_V_H.33,2 V H.33,Witness,,609,1217,1218,1397,1830-11-30,,1830-12-18,,1830-12-09,1830.0,1830,502
2 V H.38a,faust://document/faustedition/2_V_H.38a,2 V H.38a,Witness,,611,1221,1222,1403,1830-11-30,1830-06-19,1830-12-18,1830-12-18,1830-12-09,1830.0,1830,515
2 V H.0f,faust://document/faustedition/2_V_H.0f,2 V H.0f,Witness,,625,1249,1250,1458,1831-04-05,1831-04-05,1831-05-05,1831-05-05,1831-04-20,1831.0,1831,461


In [63]:
mi.base[m.splitgraph.SplitReference(refs[0], m.splitgraph.Side.END)]

AdjacencyView({SplitReference(Witness('faust://document/faustedition/H_P27'), Side.START): {0: {'kind': 'temp-pre', 'source': <macrogen.bibliography.BiblSource object at 0x7f94cc69ca90>, 'comments': ('"vermutlich frühere Fassung auf dem Sammelblatt H P10", "Die auf\n         H P27 vorliegende Niederschrift ist sehr wahrscheinlich später entstanden."',), 'dating': <macrogen.datings.RelativeDating object at 0x7f94cc69cb50>, 'xml': (PosixPath('macrogenesis/handschriftendatierung_pre1800.xml'), 312), 'ignore': False, 'weight': 250, 'iweight': 0.004}}})

In [68]:
split = mi
single = m.MacrogenesisInfo('target/single-copy/macrogen-info.zip')

[32mINFO     macrogen.graph Loading macrogenesis graphs from target/single-copy/macrogen-info.zip[0m
[32mINFO     macrogen.graph Removing edges to ignore[0m
[32mINFO     macrogen.graph Adding 144 otherwise unmentioned references to the working graph[0m
[32mINFO     macrogen.graph Could not remove 2 II H.8:1→2 II H.8:1 (0): The edge 2 II H.8:1-2 II H.8:1 is not in the graph.[0m
[32mINFO     macrogen.graph Could not remove 2 II H.22→2 II H.22 (0): The edge 2 II H.22-2 II H.22 is not in the graph.[0m
[32mINFO     macrogen.graph Could not remove 2 III H.2:1→2 III H.2:1 (0): The edge 2 III H.2:1-2 III H.2:1 is not in the graph.[0m
[32mINFO     macrogen.graph Could not remove 2 III H.6:2→2 III H.6:2 (1): The edge 2 III H.6:2-2 III H.6:2 is not in the graph.[0m
[32mINFO     macrogen.graph Could not remove 2 III H.30→2 III H.30 (0): The edge 2 III H.30-2 III H.30 is not in the graph.[0m
[32mINFO     macrogen.graph Could not remove 2 III H.70→2 III H.70 (0): The edge 2 III H.70

In [71]:
split.conflicts[0]

(SplitReference(Witness('faust://document/faustedition/2_IV_H.2'), Side.END),
 datetime.date(1831, 2, 28),
 0,
 {'kind': 'not_after',
  'source': <macrogen.bibliography.BiblSource at 0x7f94cce94e90>,
  'dating': <macrogen.datings.AbsoluteDating at 0x7f94cce94ed0>,
  'xml': (PosixPath('macrogenesis/handschriftendatierung_iv.xml'), 34),
  'ignore': False,
  'comments': ('Brief an Ch. P. W. F. Beuth, 22.2.1831',),
  'weight': 75,
  'iweight': 0.013333333333333334,
  'delete': True})

In [75]:
def cn(node):
    return node.reference if isinstance(node, m.splitgraph.SplitReference) else node

In [76]:
clean_split_conflicts = [(cn(u), cn(v), k, attr) for u,v,k,attr in split.conflicts]

In [78]:
def simple_conflicts(conflicts):
    return {(cn(u), cn(v)) for u, v, k, attr in conflicts}

In [79]:
split_c = simple_conflicts(split.conflicts)
single_c = simple_conflicts(single.conflicts)

In [80]:
split_c - single_c

{(AmbiguousRef('faust://document/wa/2_III_H.2'),
  AmbiguousRef('faust://document/wa/2_III_H.2')),
 (AmbiguousRef('faust://document/wa/2_III_H.2'),
  UnknownRef('faust://document/fischer_lamberg/2_III_H.22_2')),
 (AmbiguousRef('faust://document/wa/2_III_H.2'),
  UnknownRef('faust://document/fischer_lamberg/2_III_H.22_3')),
 (AmbiguousRef('faust://document/wa/2_III_H.2'),
  UnknownRef('faust://document/fischer_lamberg/2_III_H.22_4')),
 (AmbiguousRef('faust://document/wa/2_III_H.2'),
  UnknownRef('faust://document/fischer_lamberg/2_III_H.22_5')),
 (AmbiguousRef('faust://document/wa/2_III_H.2'),
  UnknownRef('faust://document/wa/2_III_H.2alpha')),
 (AmbiguousRef('faust://document/wa/2_III_H.2'),
  Witness('faust://document/faustedition/2_III_H.33a')),
 (AmbiguousRef('faust://document/wa/2_III_H.2'),
  Witness('faust://document/faustedition/2_III_H.60')),
 (AmbiguousRef('faust://document/wa/2_III_H.2'),
  Witness('faust://document/faustedition/2_III_H.69')),
 (AmbiguousRef('faust://documen

In [82]:
len(single_c - split_c)

55

In [83]:
len(split_c - single_c)

23