In [2]:
# Recursively copy everything under ../input/updated-code-3-7 into /kaggle/working.
# NOTE(review): presumably this is what makes the relative "Data/..." paths and the
# Data_manager package used in later cells resolvable - confirm against the dataset layout.
!cp -r ../input/updated-code-3-7/* /kaggle/working/

In [3]:
import pandas as pd
import os

In [6]:
import scipy.sparse as sps
import numpy as np

In [7]:
def split_urm_in_k_folds(URM_all, k, seed=None):
    """Randomly partition the stored entries of ``URM_all`` into ``k`` folds.

    The positions of the stored entries are shuffled, divided into ``k``
    groups with ``np.array_split`` (so group sizes differ by at most one),
    and each group is turned into its own sparse matrix with the same shape
    as ``URM_all``. The order of the groups is shuffled as well.

    Parameters
    ----------
    URM_all : scipy sparse matrix
        Matrix to split. It must expose ``shape`` and ``nnz`` and be
        convertible with ``sps.coo_matrix``.
    k : int
        Number of folds; must be at least 1.
    seed : int, optional
        When given, a private ``np.random.RandomState(seed)`` drives both
        shuffles, making the split reproducible without touching the global
        random state. When ``None`` (default) the global ``np.random`` state
        is used.

    Returns
    -------
    list of scipy.sparse.csr_matrix
        The ``k`` folds.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    AssertionError
        If the folds do not add up to ``URM_all``.
    """
    from Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix

    if k < 1:
        raise ValueError("split_urm_in_k_folds: k must be at least 1, got {}".format(k))

    num_users, num_items = URM_all.shape

    # COO exposes parallel row/col/data arrays that can be fancy-indexed per fold.
    URM_coo = sps.coo_matrix(URM_all)

    # Same shuffle API in both cases; only the source of randomness differs.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Sized from the COO copy because these indices address its arrays.
    indices_for_sampling = np.arange(0, URM_coo.nnz, dtype=int)
    rng.shuffle(indices_for_sampling)

    indices_for_sampling = np.array_split(indices_for_sampling, k)
    rng.shuffle(indices_for_sampling)

    k_URM = []

    for fold_indices in indices_for_sampling:
        URM_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items,
                                              auto_create_col_mapper=False, auto_create_row_mapper=False)
        URM_builder.add_data_lists(URM_coo.row[fold_indices],
                                   URM_coo.col[fold_indices],
                                   URM_coo.data[fold_indices])

        k_URM.append(sps.csr_matrix(URM_builder.get_SparseMatrix()))

    # Verify that the folds add up to the original URM: first the number of
    # stored entries, then the values themselves.
    k_URM_sum = sps.csr_matrix((num_users, num_items))
    for URM_fold in k_URM:
        k_URM_sum += URM_fold

    assert k_URM_sum.nnz == URM_all.nnz, "split_urm_in_k_folds: the sum of the folds doesn't match URM_all"

    # `!=` between sparse matrices yields a sparse boolean matrix of the differing cells.
    assert (k_URM_sum != sps.csr_matrix(URM_all)).nnz == 0, \
        "split_urm_in_k_folds: the values of the summed folds differ from URM_all"

    return k_URM

In [8]:
# Locations of the two input CSV files
URM_path = "Data/data_train.csv"
target_path = "Data/data_target_users_test.csv"

# Training interactions: the first file line is treated as a header and the
# three columns are then renamed to user_id / item_id / interaction.
URM_all_dataframe = pd.read_csv(
    URM_path,
    header=0,
    dtype={0: int, 1: int, 2: int},
    engine='python',
)
URM_all_dataframe.columns = ["user_id", "item_id", "interaction"]

# Target users: single column, renamed to user_id.
target_dataframe = pd.read_csv(
    target_path,
    header=0,
    dtype={0: int},
    engine='python',
)
target_dataframe.columns = ["user_id"]

In [9]:
def preprocess_data(ratings: pd.DataFrame):
    """Attach zero-based, contiguous ids for users and items to ``ratings``.

    Distinct ``user_id`` / ``item_id`` values are numbered in order of first
    appearance and joined back onto the frame as ``mapped_user_id`` and
    ``mapped_item_id``. For users and then for items, one line is printed
    with the number of distinct ids, the smallest id and the largest id.

    Parameters
    ----------
    ratings : pd.DataFrame
        Frame with at least the columns ``user_id`` and ``item_id``.

    Returns
    -------
    pd.DataFrame
        A new frame holding the input columns plus the two mapped-id columns.
    """
    user_ids = ratings.user_id.unique()
    item_ids = ratings.item_id.unique()

    # Gather both summaries before printing anything.
    user_stats = (user_ids.size, user_ids.min(), user_ids.max())
    item_stats = (item_ids.size, item_ids.min(), item_ids.max())

    print(*user_stats)
    print(*item_stats)

    # Lookup tables: original id -> position among the distinct ids.
    user_lookup = pd.DataFrame({"mapped_user_id": np.arange(user_stats[0]), "user_id": user_ids})
    item_lookup = pd.DataFrame({"mapped_item_id": np.arange(item_stats[0]), "item_id": item_ids})

    return (
        ratings
        .merge(user_lookup, how="inner", on="user_id")
        .merge(item_lookup, how="inner", on="item_id")
    )

In [10]:
# Add the contiguous, zero-based mapped_user_id / mapped_item_id columns to the
# training interactions (also prints count, min and max of the original ids).
ratings = preprocess_data(URM_all_dataframe)

12638 1 13024
22222 1 22347


In [11]:
# User-item matrix in COO format: rows are mapped user ids, columns are mapped
# item ids, values come from the interaction column. No shape is passed, so
# scipy infers it from the largest row and column index.
URM = sps.coo_matrix((ratings.interaction.values, (ratings.mapped_user_id.values, ratings.mapped_item_id.values)))

In [12]:
# Number of folds the URM is divided into
k = 30

# split_urm_in_k_folds shuffles through the global NumPy random state; seeding
# it here makes the folds identical on every Restart & Run All.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

splits = split_urm_in_k_folds(URM, k)

In [13]:
# Show the number of stored entries of every fold ...
for i in range(k):
    print(splits[i].nnz)

# ... and display whether they add up to the entry count of the full URM.
sums = sum(splits[i].nnz for i in range(k))
sums == URM.nnz


15957
15957
15958
15957
15958
15958
15958
15958
15958
15958
15957
15958
15958
15957
15958
15957
15958
15957
15958
15958
15957
15958
15958
15957
15958
15958
15958
15958
15957
15958


True

In [14]:
# Subtract the first fold from the full matrix, then display the stored-entry
# counts of the full URM, of that fold, and of the remainder.
p = URM - splits[0]
URM.nnz, splits[0].nnz, p.nnz

(478730, 15957, 462773)