In [1]:
import pickle
import os
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix

In [2]:
def create_path(path):
    """Make sure the directory ``path`` exists, creating it when it is missing.

    Missing intermediate directories are created as well, so a nested
    destination can be requested in a single call.

    Args:
        path: Directory path to create (str or path-like).

    Prints:
        "Folder <path> created!" when the directory had to be created, or
        "Folder <path> already exists" when something already exists at ``path``.

    Raises:
        OSError: If the directory cannot be created (e.g. missing permissions).
    """
    if os.path.exists(path):
        print("Folder %s already exists" % path)
        return
    # makedirs (unlike mkdir) also creates missing parents; exist_ok avoids a
    # spurious failure if the directory appears between the check and the call.
    os.makedirs(path, exist_ok=True)
    print("Folder %s created!" % path)

In [3]:
# Locate the repository root: start at the working directory and walk upward
# until a directory containing a ".git" entry is found.
current_path = os.getcwd()
while not os.path.exists(os.path.join(current_path, '.git')):
    parent_path = os.path.dirname(current_path)
    if parent_path == current_path:
        # dirname() of the filesystem root is the root itself, so without this
        # guard the search would loop forever when run outside a git checkout.
        raise FileNotFoundError(
            "No '.git' directory found in %s or any of its parents" % os.getcwd()
        )
    current_path = parent_path

In [4]:
# Name of the dataset folder under <current_path>/datasets, and the name of the
# sensitive attribute whose values are stored in "<SENSITIVE>.npy" in that folder.
# NOTE(review): the outputs recorded in this notebook show different matrix sizes
# (705x10152 here, row index 6039 in the sparse print, 13233x1850 further down),
# so they appear to come from runs with different DATASET values — re-run all
# cells from a fresh kernel before relying on them.
DATASET = "amazon"
SENSITIVE = "gender"

dataset_path = f"{current_path}/datasets/{DATASET}"

# V: matrix read from matrix.npy, cast to float.
V = np.load(dataset_path + "/matrix.npy").astype(float)
# Sx: sensitive-attribute array; later cells index it with row indices of V,
# so presumably it holds one value per row of V — TODO confirm.
Sx = np.load(dataset_path + f"/{SENSITIVE}.npy")
V.shape

(705, 10152)

In [12]:
# Base directory used later to build the output folder (save_path).
root = os.getcwd()

### file: f"{DATASET}_blocks_topk_user_{SENSITIVE}.pkl"
# NOTE(review): the cell that writes the blocks pickle uses
# f"{DATASET}_blocks_topk_{SENSITIVE}.pkl" (no "user_" part) — confirm which
# file name the consumer expects.

In [6]:
# Show V in sparse coordinate form: one "(row, col) value" line per stored entry.
print(coo_matrix(V))

  (0, 0)	5.0
  (0, 47)	5.0
  (0, 144)	5.0
  (0, 253)	4.0
  (0, 513)	5.0
  (0, 517)	4.0
  (0, 574)	4.0
  (0, 580)	4.0
  (0, 581)	5.0
  (0, 593)	4.0
  (0, 639)	3.0
  (0, 689)	3.0
  (0, 708)	3.0
  (0, 740)	4.0
  (0, 853)	3.0
  (0, 858)	4.0
  (0, 877)	4.0
  (0, 957)	5.0
  (0, 963)	5.0
  (0, 964)	5.0
  (0, 970)	5.0
  (0, 1025)	4.0
  (0, 1104)	5.0
  (0, 1107)	3.0
  (0, 1117)	4.0
  :	:
  (6039, 3107)	5.0
  (6039, 3115)	3.0
  (6039, 3130)	4.0
  (6039, 3132)	2.0
  (6039, 3133)	4.0
  (6039, 3157)	1.0
  (6039, 3186)	3.0
  (6039, 3189)	3.0
  (6039, 3190)	3.0
  (6039, 3192)	2.0
  (6039, 3217)	3.0
  (6039, 3238)	4.0
  (6039, 3271)	4.0
  (6039, 3272)	4.0
  (6039, 3288)	5.0
  (6039, 3291)	1.0
  (6039, 3309)	4.0
  (6039, 3313)	4.0
  (6039, 3318)	2.0
  (6039, 3429)	4.0
  (6039, 3441)	4.0
  (6039, 3461)	4.0
  (6039, 3493)	4.0
  (6039, 3508)	4.0
  (6039, 3575)	5.0


In [7]:
# Parallel row / column index arrays addressing every non-zero entry of V.
row_id_tuple, col_id_tuple = np.nonzero(V)
# Values stored at those coordinates.
items = V[row_id_tuple, col_id_tuple]
# One boolean per non-zero entry: True wherever the row's Sx value is not 1.
sensitive_row_tuple = Sx[row_id_tuple] != 1

In [8]:
# Per-element string label for Sx: "M" where the value equals 0, "F" otherwise.
sensitive_str = np.where(Sx == 0, "M", "F")
# Consecutive integer ids 0..n_rows-1, one per row of V.
row_id = np.arange(V.shape[0])
row_id

array([    0,     1,     2, ..., 13230, 13231, 13232])

In [9]:
# Consecutive integer ids 0..n_cols-1, one per column of V.
col_id = np.arange(V.shape[1])
col_id

array([   0,    1,    2, ..., 1847, 1848, 1849])

In [10]:
# Bundle that is pickled further down:
#   data        - long-format table with one row per non-zero entry of V
#   data_users  - one row per row of V with its "M"/"F" label
#   data_movies - left empty (None)
#   blocks      - a single block whose train and test matrices are both the full V
data = dict(
    data=pd.DataFrame(
        dict(
            user_id=row_id_tuple,
            movie_id=col_id_tuple,
            # boolean flag per entry (True / False)
            user_gender=sensitive_row_tuple,
            user_rating=V[row_id_tuple, col_id_tuple].astype(float),
        )
    ),
    data_users=pd.DataFrame(dict(userid=row_id, gender=sensitive_str)),
    data_movies=None,
    blocks=[
        dict(
            X_train=coo_matrix(V),
            X_test=coo_matrix(V),
        )
    ],
)

In [11]:
# Preview the long-format table (one row per non-zero entry of V).
data["data"]

Unnamed: 0,user_id,movie_id,user_gender,user_rating
0,0,0,True,34.333332
1,0,1,True,26.000000
2,0,2,True,22.000000
3,0,3,True,26.666666
4,0,4,True,31.333334
...,...,...,...,...
24479659,13232,1845,True,98.666664
24479660,13232,1846,True,110.333336
24479661,13232,1847,True,117.000000
24479662,13232,1848,True,108.333336


In [12]:
# Preview the per-row table of ids and "M"/"F" labels.
data["data_users"]

Unnamed: 0,userid,gender
0,0,M
1,1,M
2,2,M
3,3,M
4,4,F
...,...,...
13228,13228,M
13229,13229,F
13230,13230,M
13231,13231,M


In [13]:
# Show the block list: a single dict holding the X_train / X_test sparse matrices.
data["blocks"]

[{'X_train': <13233x1850 sparse matrix of type '<class 'numpy.float64'>'
  	with 24479664 stored elements in COOrdinate format>,
  'X_test': <13233x1850 sparse matrix of type '<class 'numpy.float64'>'
  	with 24479664 stored elements in COOrdinate format>}]

In [14]:
# Output folder for the converted dataset, built relative to `root`.
# NOTE(review): `root` is the working directory (os.getcwd()), not the repository
# root stored in `current_path`, so the destination depends on where the notebook
# is launched — confirm this is intended.
save_path = f"{root}/reproducibility_study/Frisch_et_al/data/{DATASET}"
create_path(save_path)

Folder ./fair_taucc/algorithms/C-Fairness-RecSys/reproducibility_study/Frisch_et_al/data/lfw already exists


In [15]:
# Serialize the `data` bundle to <save_path>/<DATASET>_blocks_topk_<SENSITIVE>.pkl
# using the highest pickle protocol available in the running interpreter.
with open(f"{save_path}/{DATASET}_blocks_topk_{SENSITIVE}.pkl", "wb") as data_file:
    pickle.dump(data, data_file, protocol=pickle.HIGHEST_PROTOCOL)

In [16]:
# Re-display the row ids that are used as values of the id map built next.
row_id

array([    0,     1,     2, ..., 13230, 13231, 13232])

In [17]:
# Lookup tables from the byte-string form of an id (e.g. b"12") to the id itself,
# built for every row id (users_map) and every column id (items_map).
data_extra = dict(
    users_map={str(user).encode(): user for user in row_id},
    items_map={str(item).encode(): item for item in col_id},
)

In [18]:
# Serialize the id maps to <save_path>/<DATASET>_extra_data_<SENSITIVE>.pkl
# using the highest pickle protocol available in the running interpreter.
with open(f"{save_path}/{DATASET}_extra_data_{SENSITIVE}.pkl", "wb") as extra_file:
    pickle.dump(data_extra, extra_file, protocol=pickle.HIGHEST_PROTOCOL)

In [19]:
# Summarize the long-format table: row count, column dtypes and memory footprint.
data["data"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24479664 entries, 0 to 24479663
Data columns (total 4 columns):
 #   Column       Dtype  
---  ------       -----  
 0   user_id      int64  
 1   movie_id     int64  
 2   user_gender  bool   
 3   user_rating  float64
dtypes: bool(1), float64(1), int64(2)
memory usage: 583.6 MB
