In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all modules (except those
# excluded via %aimport) before executing code, so edits to imported modules take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
import pandas as pd
import scipy.sparse as sp

#### Definitions

In [3]:
# Root folder of the dataset collection, given relative to the current working directory.
base_dataset_dir = os.path.join(".", "..", "..", "..", "FairnessDatasets")

# Maps each dataset label to the relative sub-folder that holds its files.
# Built from ordered pairs so the iteration order of the labels is explicit.
dataset_dir_dict = dict(
    [
        ("ml-100k", os.path.join("ml-100k", "user_gte_5_item_gte_5")),
        ("ml-1m", os.path.join("ml-1m", "user_gte_5_item_gte_5")),
        ("lfm-small", os.path.join("lfm", "user_gte_10_item_gte_10_gender_age_loc_10000")),
        ("lfm-big", os.path.join("lfm", "user_gte_10_item_gte_10_gender_age_loc_100000")),
    ]
)

In [4]:
def _summarize_interactions(dataset, group, matrix, n_items=""):
    """Build one summary row for a (dataset, user group) pair.

    Parameters
    ----------
    dataset : str
        Dataset label, copied into the row unchanged.
    group : str
        User group label ("all", "m" or "f"), copied into the row unchanged.
    matrix : scipy sparse matrix
        Interaction rows of the users in the group (one row per user, one column per item).
    n_items : int or str, default ""
        Value reported in the item column. Per-gender rows keep the blank default
        because the item count is only reported once per dataset.

    Returns
    -------
    dict
        Row with the keys "dataset", "type", "n_users", "n_items", "n_interactions",
        "n_avrg_interactions" and "density". The two ratios are NaN for an empty group
        instead of dividing by zero.
    """
    n_users, n_cols = matrix.shape
    # Sum once and reuse it for every derived figure.
    # NOTE(review): this equals the number of interactions only if all stored entries are 1 - confirm.
    total = matrix.sum()
    n_interactions = int(total)
    n_cells = n_users * n_cols
    return {
        "dataset": dataset,
        "type": group,
        "n_users": n_users,
        "n_items": n_items,
        "n_interactions": n_interactions,
        "n_avrg_interactions": round(n_interactions / n_users, 2) if n_users else float("nan"),
        "density": round(total / n_cells, 4) if n_cells else float("nan"),
    }


records = []
for dataset, dataset_dir in dataset_dir_dict.items():
    dataset_dir = os.path.join(base_dataset_dir, dataset_dir)
    # CSR guarantees that row selection with a boolean mask is supported, whatever
    # sparse format the file was saved in.
    interactions = sp.load_npz(os.path.join(dataset_dir, "interactions.npz")).tocsr()
    user_info = pd.read_csv(os.path.join(dataset_dir, "user_info.csv"))

    # The gender masks below index matrix rows by position in user_info, so both must
    # describe the same number of users.
    if len(user_info) != interactions.shape[0]:
        raise ValueError(
            f"{dataset}: user_info has {len(user_info)} rows but the interaction "
            f"matrix has {interactions.shape[0]} rows"
        )

    records.append(
        _summarize_interactions(dataset, "all", interactions, n_items=interactions.shape[1])
    )

    for gdr in ("m", "f"):
        # A plain NumPy boolean mask is the portable row indexer for sparse matrices.
        # A gender missing from the dataset yields an empty selection, not a KeyError.
        mask = (user_info["gender"] == gdr).to_numpy()
        records.append(_summarize_interactions(dataset, gdr, interactions[mask, :]))

# Keep the original int-keyed layout: running row number -> row dict.
dataset_stats = dict(enumerate(records))

df = pd.DataFrame.from_dict(dataset_stats, orient="index").rename(
    columns={
        "dataset": "Dataset",
        "type": "User Group",
        "n_users": "Users",
        "n_items": "Items",
        "n_interactions": "Interactions",
        "n_avrg_interactions": "Avrg. Interactions",
        "density": "Density",
    }
)
df

Unnamed: 0,Dataset,User Group,Users,Items,Interactions,Avrg. Interactions,Density
0,ml-100k,all,943,1349.0,99287,105.29,0.078
1,ml-100k,m,670,,73824,110.19,0.0817
2,ml-100k,f,273,,25463,93.27,0.0691
3,ml-1m,all,6034,3125.0,574376,95.19,0.0305
4,ml-1m,m,4326,,429039,99.18,0.0317
5,ml-1m,f,1708,,145337,85.09,0.0272
6,lfm-small,all,5130,5677.0,165003,32.16,0.0057
7,lfm-small,m,4222,,138618,32.83,0.0058
8,lfm-small,f,908,,26385,29.06,0.0051
9,lfm-big,all,7603,62617.0,1845963,242.79,0.0039


In [5]:
# Export the summary table as LaTeX source next to the notebook.
with open("dataset-stats.tex", "w") as tex_file:
    df.style.to_latex(buf=tex_file)