In [7]:
import csv
from tqdm import tqdm
from collections import defaultdict
from itertools import combinations

def parse_csv(file_path):
    """Read a two-column CSV of words and the manuscripts that contain them.

    The first row is treated as a header and skipped. Every other non-blank
    row must hold exactly two fields: a word, and a comma-separated list of
    manuscript names. Rows for the same word are merged.

    Args:
        file_path: Path of the UTF-8 encoded CSV file to read.

    Returns:
        A ``defaultdict(list)`` mapping each word to its manuscript names,
        in order of first appearance, stripped of surrounding whitespace and
        without duplicates.

    Raises:
        ValueError: If a non-blank row does not have exactly two fields.
    """
    print("Parsing the CSV file")
    word_to_manuscripts = defaultdict(list)
    # newline='' leaves newline handling to the csv module, as its docs require.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header; the default tolerates an empty file
        for row in reader:
            # Blank lines (e.g. a trailing newline) carry no data.
            if not any(field.strip() for field in row):
                continue
            if len(row) != 2:
                raise ValueError(
                    f"line {reader.line_num}: expected 2 fields, got {len(row)}"
                )
            word, manuscripts = row
            known = word_to_manuscripts[word]
            # Split on the bare comma and strip each name so that both
            # "A, B" and "A,B" are accepted.
            for manuscript in manuscripts.strip().strip('"').split(','):
                manuscript = manuscript.strip()
                # A repeated manuscript would later yield groups like (A, A).
                if manuscript and manuscript not in known:
                    known.append(manuscript)
    return word_to_manuscripts

def build_combinations_from_words(word_to_manuscripts):
    """Map every group of two or more manuscripts to the words they share.

    For each word, every combination (size 2 up to all of them) of the
    manuscripts containing that word is generated, and the word is recorded
    under that combination.

    Args:
        word_to_manuscripts: Mapping of word -> iterable of manuscript names.

    Returns:
        A ``defaultdict(set)`` mapping a tuple of manuscript names (sorted,
        no repeats) to the set of words recorded for that group.

    Note:
        A word found in k distinct manuscripts contributes 2**k - k - 1
        combinations, so run time and memory grow exponentially with k.
    """
    print("Building combinations from words")
    combinations_to_words = defaultdict(set)
    for word, manuscripts in tqdm(word_to_manuscripts.items()):
        # combinations() emits tuples in input order, so without a canonical
        # order the same group could appear as both (A, B) and (B, A) and its
        # words would be split across two keys. Deduplicating also prevents
        # degenerate groups such as (A, A).
        unique_manuscripts = sorted(set(manuscripts))
        for n in range(2, len(unique_manuscripts) + 1):
            for combo in combinations(unique_manuscripts, n):
                combinations_to_words[combo].add(word)
    return combinations_to_words

def filter_subsets_within_n_word_group(groups_with_n_words):
    """Keep only the groups that are not contained in an already-kept group.

    Groups are visited from largest to smallest. A group is dropped when all
    of its members also belong to a group kept earlier; this includes an
    equal-sized group with the same members in a different order.

    Args:
        groups_with_n_words: Mapping of group (tuple of manuscript names) to
            the words recorded for that group.

    Returns:
        A dict of the surviving groups and their words, ordered from the
        largest group to the smallest.
    """
    kept = {}
    # Member sets of the kept groups, built once each instead of being
    # reconstructed for every pairwise comparison.
    kept_member_sets = []
    # sorted() is stable, so equal-sized groups keep the mapping's own order.
    for group in sorted(groups_with_n_words, key=len, reverse=True):
        members = frozenset(group)
        if any(members <= other for other in kept_member_sets):
            continue
        kept_member_sets.append(members)
        kept[group] = groups_with_n_words[group]
    return kept

def process_by_shared_word_count(combinations_to_words, max_significant_words, output_file_path):
    """Write the maximal manuscript groups to a CSV, ordered by shared-word count.

    For each count n from 1 to ``max_significant_words``, the groups with
    exactly n recorded words are passed through
    ``filter_subsets_within_n_word_group`` and the survivors are written as
    rows of (group, words, count). Groups with more words than the limit are
    not written.

    Args:
        combinations_to_words: Mapping of group (tuple of manuscript names)
            to the set of words recorded for that group.
        max_significant_words: Largest shared-word count to include.
        output_file_path: Path of the CSV file to create or overwrite.

    Returns:
        None. The result is the file written at ``output_file_path``.
    """
    print("Processing by shared word count")
    # Bucket the groups by shared-word count in a single pass rather than
    # rescanning the whole mapping once per count. Insertion order within a
    # bucket matches the order of combinations_to_words.
    groups_by_count = defaultdict(dict)
    for group, words in combinations_to_words.items():
        if 1 <= len(words) <= max_significant_words:
            groups_by_count[len(words)][group] = words

    with open(output_file_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Group", "Common Words", "Count of Shared Words"])
        for n in tqdm(range(1, max_significant_words + 1)):
            # .get() avoids inserting empty buckets into the defaultdict.
            filtered_sets = filter_subsets_within_n_word_group(groups_by_count.get(n, {}))
            for group, words in filtered_sets.items():
                # Sets have no stable iteration order; sorting keeps the
                # output identical from one run to the next.
                writer.writerow([', '.join(group), ', '.join(sorted(words)), len(words)])

In [8]:
# Input and output locations, relative to the notebook's working directory.
file_path = 'data_filtered.csv'
output_file_path = 'filtered_sets.csv'
# Only groups sharing between 1 and this many words are written out.
max_significant_words = 20

# Stage 1: word -> manuscripts. Stage 2: manuscript group -> shared words.
word_to_manuscripts = parse_csv(file_path=file_path)
combinations_to_words = build_combinations_from_words(
    word_to_manuscripts=word_to_manuscripts,
)

# Stage 3: filter the groups per shared-word count and write the CSV.
process_by_shared_word_count(
    combinations_to_words=combinations_to_words,
    max_significant_words=max_significant_words,
    output_file_path=output_file_path,
)

print("Output has been written to", output_file_path)

Parsing the CSV file
Building combinations from words


100%|██████████| 40/40 [01:38<00:00,  2.46s/it]


Processing by shared word count


100%|██████████| 20/20 [00:52<00:00,  2.63s/it]

Output has been written to filtered_sets.csv



