In [6]:
import csv
from collections import defaultdict
from itertools import combinations

In [7]:
# Step 1: Parse the CSV to create a mapping of manuscripts to the words they contain
def parse_csv(file_path):
    """Read a two-column CSV and index its words by manuscript.

    The first row is treated as a header and skipped. Every following
    non-blank row must hold exactly two fields: a word, and a
    comma-separated list of manuscripts.

    Args:
        file_path: Path of the UTF-8 encoded CSV file to read.

    Returns:
        A ``defaultdict(list)`` keyed by manuscript; each value is the list
        of words read for that manuscript, in file order. Note that, despite
        the ``word_to_manuscripts`` name, the mapping runs from manuscript
        to words.

    Raises:
        ValueError: If a non-blank row does not have exactly two fields.
    """
    word_to_manuscripts = defaultdict(list)
    # newline='' is what the csv module expects, so that line endings inside
    # quoted fields are handed to the reader untouched.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header; an empty file yields an empty mapping
        for row in reader:
            if not row:
                continue  # csv.reader yields [] for blank lines
            if len(row) != 2:
                raise ValueError(
                    f'{file_path}, line {reader.line_num}: '
                    f'expected 2 fields (word, manuscripts), got {len(row)}'
                )
            word, manuscripts = row
            # Split on the bare comma and strip, so "A, B", "A,B" and "A , B"
            # all name the same two manuscripts; empty entries are ignored.
            for manuscript in manuscripts.split(','):
                manuscript = manuscript.strip()
                if manuscript:
                    word_to_manuscripts[manuscript].append(word)
    return word_to_manuscripts

In [8]:
# Step 2: Find every pair of manuscripts that shares 2 or more words
def find_shared_words_groups(word_to_manuscripts, min_shared=2):
    """Collect the words shared by each pair of manuscripts.

    Args:
        word_to_manuscripts: Mapping whose keys are manuscripts and whose
            values are iterables of the words found in that manuscript.
        min_shared: Minimum number of distinct shared words a pair needs
            in order to be reported. Defaults to 2.

    Returns:
        A ``defaultdict(list)`` keyed by ``(manuscript_a, manuscript_b)``
        tuples, in the order ``itertools.combinations`` produces them from
        the mapping's keys. Each value lists the distinct shared words in
        the order they first occur for ``manuscript_a``, so the result is
        identical from run to run.
    """
    shared_words = defaultdict(list)

    manuscripts = list(word_to_manuscripts)
    # De-duplicate each word list once, keeping first-occurrence order, and
    # build the lookup sets once instead of once per pair.
    ordered_words = {m: list(dict.fromkeys(word_to_manuscripts[m])) for m in manuscripts}
    word_sets = {m: set(words) for m, words in ordered_words.items()}

    for first, second in combinations(manuscripts, 2):
        other_words = word_sets[second]
        # Walking the ordered list (rather than intersecting two sets) keeps
        # the output independent of string hash randomisation.
        shared = [word for word in ordered_words[first] if word in other_words]
        if len(shared) >= min_shared:
            shared_words[(first, second)].extend(shared)

    return shared_words

In [9]:
# Step 3: Filter out subsets
def filter_subsets(shared_words):
    """Drop every group that is made redundant by a larger group.

    A group is redundant when some other group in ``shared_words`` contains
    all of its members plus at least one more (strict superset) and that
    other group's words include all of this group's words.

    Each group is checked against every input group, not only against the
    groups kept so far, so the result does not depend on iteration order.

    Args:
        shared_words: Mapping from a group (an iterable of hashable members,
            e.g. a tuple of manuscripts) to the iterable of words it shares.

    Returns:
        A plain ``dict`` holding the non-redundant entries of
        ``shared_words``, in their original order and with their original
        value objects.
    """
    # Convert every group and word list to a set once, up front.
    candidates = [
        (group, words, frozenset(group), frozenset(words))
        for group, words in shared_words.items()
    ]

    filtered_groups = {}
    for group, words, members, word_set in candidates:
        # The strict "<" on members also keeps a group from matching itself.
        redundant = any(
            members < other_members and word_set <= other_word_set
            for _, _, other_members, other_word_set in candidates
        )
        if not redundant:
            filtered_groups[group] = words
    return filtered_groups

In [10]:
# Step 4: Run the pipeline on the input file and print one report per group
file_path = 'data.csv'
word_to_manuscripts = parse_csv(file_path)
shared_words_groups = find_shared_words_groups(word_to_manuscripts)
filtered_groups = filter_subsets(shared_words_groups)

for group, words in filtered_groups.items():
    # Three report lines, then an empty line that separates the groups.
    print(
        f'Group: "{", ".join(group)}"',
        f'Common words: "{", ".join(words)}"',
        f'Count of shared words: {len(words)}',
        '',
        sep='\n',
    )

Group: "B, A"
Common words: "ueteru[m], co[n]tinet[or], regimina, exordium, occasum 1, augendo"
Count of shared words: 6

Group: "B, C"
Common words: "cum 1, et fide et dextra, ueteru[m], post "producta" in, regimina, exordium, limite, omitted 1, augendo"
Count of shared words: 9

Group: "B, Ch"
Common words: "cum 1, ueteru[m], regimina, maximo[rum], exordium, augendo"
Count of shared words: 6

Group: "B, G"
Common words: "cum 1, ueteru[m], maximo[rum], exordium, augendo"
Count of shared words: 5

Group: "B, Li"
Common words: "ueteru[m], regimina, maximo[rum], exordium, augendo"
Count of shared words: 5

Group: "B, Lo"
Common words: "cum 1, ueteru[m], maximo[rum], exordium, augendo"
Count of shared words: 5

Group: "B, Pa1"
Common words: "cum 1, et fide et dextra, post "producta" in, regimina, occasum 1, augendo"
Count of shared words: 6

Group: "B, Pv"
Common words: "ueteru[m], augendo, exordium"
Count of shared words: 3

Group: "B, R"
Common words: "cum 1, ueteru[m], regimina, maximo