In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
# Processed datasets this notebook can describe.
datasets = [
    'ml-1m',
    'steam',
    'goodreads',
]

# Dataset described by this run; change the index to pick another entry.
DATASET = datasets[0]

In [3]:
# Resolve the folder holding the processed splits of the selected dataset.
# `parents[2]` is the third ancestor of the current working directory, so the
# result depends on where the notebook kernel was started.
base_artifacts = Path.cwd().resolve().parents[2].joinpath('CausalI2I_artifacts')
path = base_artifacts.joinpath('Datasets', 'Processed', DATASET)

# Read the train split, then the test split.
train, test = (pd.read_csv(path / csv_name) for csv_name in ('train.csv', 'test.csv'))

# Stack both splits (train rows first) under a fresh 0..n-1 index.
full_data = pd.concat(objs=[train, test], ignore_index=True)

In [4]:
def _count_pos_zero(frame: pd.DataFrame) -> tuple:
    """Count rows of ``frame`` with a positive and with a zero ``interaction``.

    The boolean masks are summed directly rather than used to slice the frame,
    so no filtered copy of the (potentially very large) frame is materialised
    just to measure its length.

    Parameters
    ----------
    frame : pd.DataFrame
        Table that has an ``interaction`` column.

    Returns
    -------
    tuple
        ``(n_positive, n_zero)`` as plain Python ints. Rows whose interaction
        is neither ``> 0`` nor ``== 0`` are counted in neither bucket.
    """
    interaction = frame['interaction']
    return int((interaction > 0).sum()), int((interaction == 0).sum())


# --- Full dataset (train and test stacked together) ---
n_users = full_data['user_id'].nunique()
n_items = full_data['item_id'].nunique()
n_interactions = len(full_data)
n_pos, n_neg = _count_pos_zero(full_data)

# --- Test split ---
n_test_users = test['user_id'].nunique()
n_test_items = test['item_id'].nunique()
n_test_interactions = len(test)
n_test_pos, n_test_neg = _count_pos_zero(test)

# --- Train split ---
# NOTE: n_train_users / n_train_items are NOT the number of distinct ids in
# `train`. They count ids that occur in train but not in test. Because
# `full_data` is the concatenation of train and test, every test id is included
# in the full-dataset count, so the difference is exactly the train-only ids.
n_train_users = n_users - n_test_users
n_train_items = n_items - n_test_items
n_train_interactions = len(train)
n_train_pos, n_train_neg = _count_pos_zero(train)

In [5]:
# The report is assembled from three sections (full dataset, test split, train
# split). Each section carries its own leading/trailing blank lines so that the
# plain concatenation below yields the final text that is printed here and
# reused later in the notebook.

# Section 1: dataset name and statistics over train + test combined.
full_section = f"""
Dataset: {DATASET}

    Information about the full dataset:
-------------------------------------------
Number of unique users in the dataset: {n_users:,}
Number of unique items in the dataset: {n_items:,}
Number of interactions in the dataset: {n_interactions:,} (= {n_users:,} x {n_items:,})
Number of positive interactions in the dataset: {n_pos:,} ({n_pos / n_interactions:.2%} of total)
Number of negative interactions in the dataset: {n_neg:,} ({n_neg / n_interactions:.2%} of total)

"""

# Section 2: statistics of the test split, with shares relative to the full dataset.
test_section = f"""
   Information about the test dataset:
-------------------------------------------
Number of unique users in the test dataset: {n_test_users:,} ({n_test_users/n_users:.2%} of total)
Number of unique items in the test dataset: {n_test_items:,} ({n_test_items/n_items:.2%} of total)
Number of interactions in the test dataset: {n_test_interactions:,} (= {n_test_users:,} x {n_test_items:,})
Number of positive interactions in the test dataset: {n_test_pos:,} ({n_test_pos / n_test_interactions:.2%} of total)
Number of negative interactions in the test dataset: {n_test_neg:,} ({n_test_neg / n_test_interactions:.2%} of total)

"""

# Section 3: statistics of the train split (user/item counts are train-only ids).
train_section = f"""
   Information about the train dataset:
-------------------------------------------
Number of unique users that appear in train but not in test: {n_train_users:,} (= {n_users:,} - {n_test_users:,})
Number of unique items that appear in train but not in test: {n_train_items:,} (= {n_items:,} - {n_test_items:,})
Number of interactions in the train dataset: {n_train_interactions:,} (= {n_interactions:,} - {n_test_interactions:,})
Number of positive interactions in the train dataset: {n_train_pos:,} ({n_train_pos / n_train_interactions:.2%} of total)
Number of negative interactions in the train dataset: {n_train_neg:,} ({n_train_neg / n_train_interactions:.2%} of total)
"""

output = full_section + test_section + train_section
print(output)


Dataset: ml-1m

    Information about the full dataset:
-------------------------------------------
Number of unique users in the dataset: 6,040
Number of unique items in the dataset: 3,706
Number of interactions in the dataset: 22,384,240 (= 6,040 x 3,706)
Number of positive interactions in the dataset: 1,000,209 (4.47% of total)
Number of negative interactions in the dataset: 21,384,031 (95.53% of total)


   Information about the test dataset:
-------------------------------------------
Number of unique users in the test dataset: 3,020 (50.00% of total)
Number of unique items in the test dataset: 749 (20.21% of total)
Number of interactions in the test dataset: 2,261,980 (= 3,020 x 749)
Number of positive interactions in the test dataset: 99,153 (4.38% of total)
Number of negative interactions in the test dataset: 2,162,827 (95.62% of total)


   Information about the train dataset:
-------------------------------------------
Number of unique users that appear in train but not in t

In [6]:
# Persist the printed report next to the notebook, one text file per dataset
# (an existing file of the same name is overwritten).
description_filename = f"{DATASET}_description.txt"
with open(description_filename, mode="w") as report_handle:
    report_handle.write(output)