In [None]:
# DELETE FILE BEFORE SUBMITTING

In [3]:
# The input is a set of n elements and a collection of subsets of these elements. 
# The goal is to find the smallest number of subsets such that their union covers all elements in the set.

## implementation

In [4]:
'''
approximation guarantees of O(log n)
OPT is size of the optimal solution
k is size of the greedy solution
need to show: GREEDY <= OPT * log n
    1. at each step, we pick the set that covers the most uncovered elements => GREEDY covers at least as many as any OPT set would
        - let m be the size of the set we picked
        - m <= size of the set OPT picked
    2. GREEDY covers at least >= remaining elements / size of the set we picked
    3. using recursion, let r_i be the number of remaining elements after i iterations
        5. r_i <= r_{i-1} - r_{i-1}/m (r_i = 0 is the goal => all elements are covered)
        6. r_i <= n(1 - 1/m)^i
        7. n(1 - 1/m)^k < 1     (take log of both sides)
        8. k >= m*log(n)         
        9. k = O(m*log(n))
        10. k = O(OPT*log n)    (since m <= OPT)
    11. k = O(log n)
'''

'\napproximation guarantees of O(log n)\nOPT is size of the optimal solution\nk is size of the greedy solution\nneed to show: GREEDY <= OPT * log n\n    1. at each step, we pick the set that covers the most uncovered elements => GREEDY covers at least as many as any OPT set would\n        - let m be the size of the set we picked\n        - m <= size of the set OPT picked\n    2. GREEDY covers at least >= remaining elements / size of the set we picked\n    3. using recursion, let r_i be the number of remaining elements after i iterations\n        5. r_i <= r_{i-1} - r_{i-1}/m (r_i = 0 is the goal => all elements are covered)\n        6. r_i <= n(1 - 1/m)^i\n        7. n(1 - 1/m)^k < 1     (take log of both sides)\n        8. k >= m*log(n)         \n        9. k = O(m*log(n))\n        10. k = O(OPT*log n)    (since m <= OPT)\n    11. k = O(log n)\n'

In [5]:
# idea 1: select the subset that covers the most uncovered elements i.e. when added to the cover, increases the number of covered elements the most

def approx_msc(U, S):
    '''
    input: U = {x_1, x_2, ..., x_n}: set of n elements
           S = {S_1, S_2, ..., S_m} where S_i is a subset of U: list of sets
    output: C is a subset of S such that C covers all elements in U: list of sets
    '''
    C = []                # initialize the cover as empty
    uncovered = set(U)    # all elements in U are initially uncovered
    while uncovered:      # while there are still uncovered elements
        best_subset = max(S, key=lambda s: len(uncovered & s))  # find the subset that covers the most uncovered elements
        C.append(best_subset)     # add it to the cover
        uncovered -= best_subset  # remove the covered elements from the uncovered set

    # below is optional
    def needs_pruning(s, C):
        # check if the set s is necessary in the cover
        # a set is necessary if removing it would leave some elements uncovered
        remaining = set().union(*(c for c in C if c != s))
        return s <= remaining  # if s is a subset of the remaining sets, it is not necessary
    
    # prune the cover to remove any redundant sets
    pruned = []
    for s in C:
        temp_cover = pruned + [x for x in C if x != s]
        if not needs_pruning(s, temp_cover):
            pruned.append(s)
    C = pruned  # update the cover with the pruned sets

    return C               # return the cover

## testing

In [6]:
def read_input(filename):
    with open(filename, 'r') as f:
        lines = f.readlines()
    
    n, m = map(int, lines[0].split())
    S = []

    for line in lines[1:]:
        parts = list(map(int, line.split()))
        S_i = set(parts[1:])
        S.append(S_i)
    U = set(range(1, n + 1))
    return U, S

In [7]:
# test1.in
filename = '../data/test1.in'
U, S = read_input(filename)
print('U:', U)
print('S:', S)
C = approx_msc(U, S)
print('C:', C)

U: {1, 2, 3}
S: [{3}, {1, 3}, {2, 3}]
C: [{1, 3}, {2, 3}]


In [8]:
filename = '../data/test2.in'
U, S = read_input(filename)
print('U:', U)
print('S:', S)
C = approx_msc(U, S)
print('C:', C)

U: {1, 2, 3, 4, 5}
S: [{5}, {1}, {4, 5}, {5}, {4}, {1, 2, 4}, {1, 2, 3}]
C: [{1, 2, 4}, {5}, {1, 2, 3}]


In [9]:
filename = '../data/test3.in'
U, S = read_input(filename)
print('U:', U)
print('S:', S)
C = approx_msc(U, S)
print('C:', C)

U: {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
S: [{1, 6, 7}, {8, 5}, {8, 4}, {3}, {8, 2}, {10, 5}, {1}, {9, 5, 6}, {7}, {3}]
C: [{1, 6, 7}, {8, 4}, {3}, {8, 2}, {10, 5}, {9, 5, 6}]


In [10]:
filename = '../data/test4.in'
U, S = read_input(filename)
print('U:', U)
print('S:', S)
C = approx_msc(U, S)
print('C:', C)

U: {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
S: [{1, 3, 9}, {5, 6}, {10, 6}, {8, 2, 4}, {5, 7}]
C: [{1, 3, 9}, {8, 2, 4}, {10, 6}, {5, 7}]


In [11]:
filename = '../data/test5.in'
U, S = read_input(filename)
print('U:', U)
print('S:', S)
C = approx_msc(U, S)
print('C:', C)

U: {1, 2, 3, 4, 5, 6, 7}
S: [{4}, {4}, {6, 7}, {5}, {1}, {1, 5}, {4}, {2, 5}, {1, 3}, {1}]
C: [{6, 7}, {4}, {2, 5}, {1, 3}]


## comprehensive evaluation table

In [12]:
import time
import pandas as pd
import os

In [13]:
def read_output(filename):
    with open(filename, 'r') as f:
        lines = f.readlines()
    opt_val = int(lines[0].strip())
    return opt_val

In [14]:
def rel_error(approx_val, opt_val):
    return abs(approx_val - opt_val) / opt_val

In [17]:
def run_and_measure(input, output):
    U, S = read_input(input)
    start = time.time()
    C = approx_msc(U, S)
    elapsed = time.time() - start
    alg_val = len(C)
    opt_val = read_output(output)
    rel_err = rel_error(alg_val, opt_val)
    return round(elapsed, 2), alg_val, round(rel_err, 2)

In [18]:
test_dataset = []
for i in range(1, 6):
    input = f'../data/test{i}.in'
    output = f'../data/test{i}.out'
    elapsed, alg_val, rel_err = run_and_measure(input, output)
    data = os.path.splitext(os.path.basename(input))[0]
    test_dataset.append((data, elapsed, alg_val, rel_err))

test_df = pd.DataFrame(test_dataset, columns=['Dataset', 'Time (s)', 'size', 'RelErr'])
print(test_df)

  Dataset  Time (s)  size  RelErr
0   test1       0.0     2     0.0
1   test2       0.0     3     0.5
2   test3       0.0     6     0.0
3   test4       0.0     4     0.0
4   test5       0.0     4     0.0


In [19]:
small_dataset = []
for i in range(1, 19):
    input = f'../data/small{i}.in'
    output = f'../data/small{i}.out'
    elapsed, alg_val, rel_err = run_and_measure(input, output)
    data = os.path.splitext(os.path.basename(input))[0]
    small_dataset.append((data, elapsed, alg_val, rel_err))

small_df = pd.DataFrame(small_dataset, columns=['Dataset', 'Time (s)', 'size', 'RelErr'])
print(small_df)

    Dataset  Time (s)  size  RelErr
0    small1       0.0     5    0.00
1    small2       0.0     4    0.33
2    small3       0.0     6    0.20
3    small4       0.0     5    0.25
4    small5       0.0     6    0.20
5    small6       0.0     4    0.33
6    small7       0.0     4    0.33
7    small8       0.0     3    0.50
8    small9       0.0     4    0.33
9   small10       0.0     3    0.50
10  small11       0.0     5    0.25
11  small12       0.0     4    0.33
12  small13       0.0     3    0.50
13  small14       0.0     3    0.50
14  small15       0.0     3    0.50
15  small16       0.0     3    0.50
16  small17       0.0     3    0.50
17  small18       0.0     3    0.50


In [20]:
large_dataset = []
for i in range(1, 13):
    input = f'../data/large{i}.in'
    output = f'../data/large{i}.out'
    elapsed, alg_val, rel_err = run_and_measure(input, output)
    data = os.path.splitext(os.path.basename(input))[0]
    large_dataset.append((data, elapsed, alg_val, rel_err))

large_df = pd.DataFrame(large_dataset, columns=['Dataset', 'Time (s)', 'size', 'RelErr'])
print(large_df)

    Dataset  Time (s)  size  RelErr
0    large1      0.35    50    0.00
1    large2      0.00    20    0.05
2    large3      0.00    17    0.13
3    large4      0.14   152    0.67
4    large5      0.00     8    0.33
5    large6      0.01     7    0.17
6    large7      0.34   172    0.81
7    large8      0.00     6    0.20
8    large9      0.00    16    0.14
9   large10      0.28   318    0.44
10  large11      0.06    56    0.40
11  large12      0.00    18    0.20


In [23]:
comprehensive_df = pd.concat([test_df, small_df, large_df], ignore_index=True)
print(comprehensive_df)
comprehensive_df.to_csv('comprehensive_approx.csv', index=False)

    Dataset  Time (s)  size  RelErr
0     test1      0.00     2    0.00
1     test2      0.00     3    0.50
2     test3      0.00     6    0.00
3     test4      0.00     4    0.00
4     test5      0.00     4    0.00
5    small1      0.00     5    0.00
6    small2      0.00     4    0.33
7    small3      0.00     6    0.20
8    small4      0.00     5    0.25
9    small5      0.00     6    0.20
10   small6      0.00     4    0.33
11   small7      0.00     4    0.33
12   small8      0.00     3    0.50
13   small9      0.00     4    0.33
14  small10      0.00     3    0.50
15  small11      0.00     5    0.25
16  small12      0.00     4    0.33
17  small13      0.00     3    0.50
18  small14      0.00     3    0.50
19  small15      0.00     3    0.50
20  small16      0.00     3    0.50
21  small17      0.00     3    0.50
22  small18      0.00     3    0.50
23   large1      0.35    50    0.00
24   large2      0.00    20    0.05
25   large3      0.00    17    0.13
26   large4      0.14   152 