In [0]:
import os
import sys
import pandas as pd
pd.options.display.float_format = '{:,}'.format
from tqdm.autonotebook import tqdm

# Project root; every other location below is derived from it.
# NOTE(review): this looks like a Colab Google Drive mount point -- adjust it
# when running the notebook anywhere else.
home_path = '/content/drive/My Drive/projects/textual_analysis_email/catsetmat'
src_path = os.path.join(home_path, 'src')
data_path = os.path.join(home_path, 'data')
raw_data_path = os.path.join(data_path, 'raw')
processed_data_path = os.path.join(data_path, 'processed')

# Make the project importable. The membership check keeps the cell idempotent:
# re-running it no longer stacks duplicate entries onto sys.path.
for _import_root in (home_path, src_path):
    if _import_root not in sys.path:
        sys.path.append(_import_root)
from src import data_reader, our_utils

In [0]:
# Baseline loader settings; a copy is specialised for each dataset in the
# loading loop.
default_data_params = our_utils.get_default_data_params()

# sorted() already returns a fresh list, so no extra list() wrapper is needed.
raw_data_names = sorted(os.listdir(raw_data_path))

# Show which datasets exist in the raw and in the processed folder.
print('Raw:\t', raw_data_names)
print('Processed:\t', sorted(os.listdir(processed_data_path)))

Raw:	 ['cast-crew-2011-inf', 'cast-kws-2011-inf', 'mag_acm', 'sample_mag_acm']
Processed:	 ['cast-crew-2011-inf', 'cast-kws-2011-inf', 'sample_mag_acm']


In [0]:
def get_data_properties(U, V):
    """Compute size statistics of a bipartite hypergraph.

    The hypergraph is given as two parallel collections: the i-th hyperedge
    joins the node group ``U[i]`` with the node group ``V[i]``.

    Args:
        U: iterable of iterables of hashable node ids (one side of each edge).
        V: iterable of iterables of hashable node ids (the other side).

    Returns:
        dict with integer values, in this key order:
            '|V|'      -- number of distinct nodes appearing in U
            "|V'|"     -- number of distinct nodes appearing in V
            '|F|'      -- number of distinct (U-side, V-side) pairs
            '|F_{V}|'  -- number of distinct U-side node sets
            "|F_{V'}|" -- number of distinct V-side node sets

    Empty inputs yield all-zero counts. If U and V differ in length, pairs
    are only formed up to the shorter of the two (``zip`` semantics).
    """
    # Materialise both sides once: U and V are traversed several times below,
    # which would silently produce wrong counts for one-shot iterators.
    left_sides = [frozenset(members) for members in U]
    right_sides = [frozenset(members) for members in V]

    # set().union(*xs) returns an empty set when xs is empty, whereas the
    # unbound form set.union(*xs) raises TypeError in that case.
    left_nodes = set().union(*left_sides)
    right_nodes = set().union(*right_sides)

    return {
        '|V|': len(left_nodes),
        "|V'|": len(right_nodes),
        '|F|': len(set(zip(left_sides, right_sides))),
        '|F_{V}|': len(set(left_sides)),
        "|F_{V'}|": len(set(right_sides)),
    }

# Summary statistics of every raw dataset, keyed by dataset name.
data_properties_map = {}
for data_name in tqdm(raw_data_names):
    # Copy the defaults and point both paths at this dataset's folders.
    data_params = dict(
        default_data_params,
        raw_data_path=os.path.join(raw_data_path, data_name),
        processed_data_path=os.path.join(processed_data_path, data_name),
    )
    U, V = data_reader.load_bipartite_hypergraph(data_params)
    properties = data_properties_map[data_name] = get_data_properties(U, V)

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




In [21]:
# LaTeX macro used as the row label of each dataset.
data_display_name_map = {
    'cast-crew-2011-inf': '\\dataicc',
    'cast-kws-2011-inf': '\\dataick',
    'mag_acm': '\\datamacm',
    'sample_mag_acm': '\\datasmacm',
}
# Left-to-right column order of the table.
prop_display_seq = ['|V|', "|V'|", '|F|', '|F_{V}|', "|F_{V'}|"]

# Rows are datasets (relabelled with their macros), columns are the properties
# in display order; headers are wrapped in $...$ with ' turned into ^\prime.
prop_df = (
    pd.DataFrame(data_properties_map)
    .T
    .rename(index=data_display_name_map)
    .loc[:, prop_display_seq]
    .rename(columns=lambda name: '${}$'.format(name).replace("'", '^\\prime'))
)

# escape=False keeps the macros and math headers verbatim; each column gets a
# thousands-separator formatter.
print(prop_df.to_latex(escape=False,
                       formatters=['{:,}'.format for _ in prop_df.columns]))

\begin{tabular}{lrrrrr}
\toprule
{} & $|V|$ & $|V^\prime|$ & $|F|$ & $|F_{V}|$ & $|F_{V^\prime}|$ \\
\midrule
\dataicc   & 4,556 &        3,802 & 2,825 &     2,824 &            2,744 \\
\dataick   & 3,156 &        1,256 & 2,669 &     2,656 &            2,621 \\
\datamacm  & 1,059 &        2,338 & 1,388 &       847 &            1,379 \\
\datasmacm &     5 &            6 &     8 &         7 &                6 \\
\bottomrule
\end{tabular}

