In [88]:
import networkx as nx
from random import sample

## Datasets

In [89]:
# Select the dataset. There are seven datasets as follows:
# congress-bills
# contact-high-school
# contact-primary-school
# email-Enron
# email-Eu
# threads-ask-ubuntu
# threads-math-sx
hyperdata = 'congress-bills'

## Read the data

In [90]:
# Read the vertex data.
f1 = open(f"./RowData/{hyperdata}/{hyperdata}-nverts.txt", "r")
file01 = f1.readlines()
vertexes = [int(i.strip('\n')) for i in file01]


# Read the hyperedge data.
f2 = open(f"./RowData/{hyperdata}/{hyperdata}-simplices.txt", "r")
file02 = f2.readlines()
hyperedges = [int(i.strip('\n')) for i in file02]


# Obtain the row data and delete hyperedges with size more than one.
E_row = []
j = 0
for i in vertexes:
    j += i
    E_row.append(hyperedges[j : j + i])

E_row = [list(set(l)) for l in E_row]
E_row = [sorted(i) for i in E_row if len(i) >= 2]


V_row = []
for l in E_row:
    V_row += l
V_row = list(set(V_row))

print('V_row: ', len(V_row))
print('E_row: ', len(E_row))

V_row:  1024507
E_row:  1044630


## Setting

In [94]:
# Weight ('yes') or not ('no')
weight = 'yes'

# Set the number of the selected vertices, which should be smaller than V_row.
Set_N = 20000

## Select a part of the hypergraph

In [95]:
Set_V = sample(V_row, Set_N)

E_select = [l for l in E_row if all(value in Set_V for value in l)]


# Vertex list
V_select = []
for l in E_select:
    V_select += l
V_select = list(set(V_select))


Nei_vertex = {i: [] for i in V_select}
for i in V_select:
    i_list = []
    for l in E_select:
        if i in l:
            i_list += l
    i_nodup = list(set(i_list))
    i_nodup.remove(i)
    Nei_vertex[i] = i_nodup

print('V_select: ', len(V_select))
print('E_select: ', len(E_select))

V_select:  239
E_select:  127


## Max-compoment

In [96]:
# Generate a graph
HG = nx.Graph()

# Add vertexes
HG.add_nodes_from(V_select)

# Add edges
edge = []
for i in V_select:
    edge += list(zip([i] * len(Nei_vertex), Nei_vertex[i]))
HG.add_edges_from(edge)

# Obtain all the connected subgraphs.
components = nx.connected_components(HG)
 
# Obtain the max compoment of the connected subgraphs.
max_component = max(components, key=len)

V_compo = list(max_component)
E_compo = [l for l in E_select if set(l).issubset(set(V_compo))]

print('V_compo: ', len(V_compo))
print('E_compo: ', len(E_compo))

V_compo:  5
E_compo:  5


## Reorder and nodup

In [97]:
E_reorder = [[V_compo.index(i) for i in e] for e in E_compo]

E_sort = [tuple(sorted(i)) for i in E_reorder]

E_nodup = list(set(E_sort))

E = sorted(E_nodup, key=lambda x: min(x))

V = list(range(len(V_compo)))


if weight == 'no':
    W = [1] * int(len(E))

elif weight == 'yes':
    W = [E_sort.count(i) for i in E]

    
print('V: ', len(V))
print('E: ', len(E))

V:  5
E:  4


## Save the vertices

In [98]:
file = open(f"./ProcessedData/{hyperdata}/{hyperdata}-vertices.txt",'w')
for i in range(len(V)):
    s = str(V[i]) + '\n'
    file.write(s)
file.close()

## Save the weights

In [99]:
file = open(f"./ProcessedData/{hyperdata}/{hyperdata}-weights.txt",'w')
for i in range(len(W)):
    s = str(W[i]) + '\n'
    file.write(s)
file.close()

## Save the hyperedges

In [100]:
file = open(f"./ProcessedData/{hyperdata}/{hyperdata}-hyperedges.txt",'w')
for i in range(len(E)):
    s = str(E[i]).replace('(','').replace(')','') + '\n'
    file.write(s)
file.close()