In [1]:
import pickle
import os
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix

In [2]:
def create_path(path):
    """Ensure that the folder ``path`` exists and report what happened.

    Missing parent folders are created as well, so a nested output
    location can be prepared with a single call.

    Parameters
    ----------
    path : str
        Folder to create.

    Notes
    -----
    Prints "Folder ... created!" when the folder had to be created and
    "Folder ... already exists" when something already lives at ``path``.
    """
    if os.path.exists(path):
        print("Folder %s already exists" % path)
        return

    # makedirs (unlike mkdir) also builds intermediate folders; exist_ok
    # guards against another process creating the folder between the
    # existence check above and this call.
    os.makedirs(path, exist_ok=True)
    print("Folder %s created!" % path)

In [3]:
# --- Configuration -------------------------------------------------------
# Dataset to convert and the sensitive attribute whose per-user codes are
# stored next to it as "<SENSITIVE>.npy".
DATASET = "movielens-1m"
SENSITIVE = "age"

# MovieLens variants sit one level deeper, under a shared "movielens" folder.
dataset_path = (
    f"./fair_taucc/datasets/movielens/{DATASET}"
    if "movielens" in DATASET
    else f"./fair_taucc/datasets/{DATASET}"
)

# V: dense matrix, cast to float (rows are later used as user ids and
#    columns as movie ids).
# Sx: array indexed by row of V holding the sensitive-group code of each row.
V = np.load(f"{dataset_path}/matrix.npy").astype(float)
Sx = np.load(f"{dataset_path}/{SENSITIVE}.npy")
V.shape

(6040, 3706)

In [4]:
# Peek at the dense matrix; the cells below treat every non-zero entry as an
# observed rating.
V

array([[5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [3., 0., 0., ..., 0., 0., 0.]])

In [5]:
# The current working directory is used as the base of the output folder
# built further down, so the notebook must be started from the intended
# project folder.
root = os.getcwd()
root

'./fair_taucc/algorithms/C-Fairness-RecSys'

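### file: f"{DATASET}_blocks_topk_user_{SENSITIVE}.pkl"

> **Note (review):** the save cell further down writes `{DATASET}_blocks_topk_{SENSITIVE}.pkl` (without the `user_` infix). Confirm which file name the downstream loader expects.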

In [6]:
# Print V in coordinate form: one "(row, col) value" line per stored entry.
print(coo_matrix(V))

  (0, 0)	5.0
  (0, 47)	5.0
  (0, 144)	5.0
  (0, 253)	4.0
  (0, 513)	5.0
  (0, 517)	4.0
  (0, 574)	4.0
  (0, 580)	4.0
  (0, 581)	5.0
  (0, 593)	4.0
  (0, 639)	3.0
  (0, 689)	3.0
  (0, 708)	3.0
  (0, 740)	4.0
  (0, 853)	3.0
  (0, 858)	4.0
  (0, 877)	4.0
  (0, 957)	5.0
  (0, 963)	5.0
  (0, 964)	5.0
  (0, 970)	5.0
  (0, 1025)	4.0
  (0, 1104)	5.0
  (0, 1107)	3.0
  (0, 1117)	4.0
  :	:
  (6039, 3107)	5.0
  (6039, 3115)	3.0
  (6039, 3130)	4.0
  (6039, 3132)	2.0
  (6039, 3133)	4.0
  (6039, 3157)	1.0
  (6039, 3186)	3.0
  (6039, 3189)	3.0
  (6039, 3190)	3.0
  (6039, 3192)	2.0
  (6039, 3217)	3.0
  (6039, 3238)	4.0
  (6039, 3271)	4.0
  (6039, 3272)	4.0
  (6039, 3288)	5.0
  (6039, 3291)	1.0
  (6039, 3309)	4.0
  (6039, 3313)	4.0
  (6039, 3318)	2.0
  (6039, 3429)	4.0
  (6039, 3441)	4.0
  (6039, 3461)	4.0
  (6039, 3493)	4.0
  (6039, 3508)	4.0
  (6039, 3575)	5.0


In [7]:
# Coordinates of every non-zero entry of V (row index, column index), in
# row-major order, followed by the values found at those positions.
row_id_tuple, col_id_tuple = np.nonzero(V)
items = V[(row_id_tuple, col_id_tuple)]

In [8]:
# Sensitive-group code of the row (user) behind each non-zero rating; this
# array is aligned element-wise with row_id_tuple / col_id_tuple.
rating_group_code = Sx[row_id_tuple]

# One boolean indicator per group code. The comparison already yields a
# boolean array, so no np.where(..., True, False) wrapper is needed.
sensitive_row_tuple_0 = rating_group_code == 0
sensitive_row_tuple_1 = rating_group_code == 1
sensitive_row_tuple_2 = rating_group_code == 2

In [14]:
# Human-readable label per row of Sx: code 0 -> "<35", code 1 -> "<50",
# anything else -> ">=50".
# NOTE(review): "<50" presumably means the 35-49 bracket; confirm against
# the encoding used when the .npy file was produced.
not_youngest_label = np.where(Sx == 1, "<50", ">=50")
sensitive_str = np.where(Sx == 0, "<35", not_youngest_label)
sensitive_str

array(['<35', '>=50', '<35', ..., '>=50', '<50', '<35'], dtype='<U4')

In [7]:
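# Binary (M/F) variant of the two cells above, kept for reference only; it is
# not used when SENSITIVE == "age".
#sensitive_row_tuple = np.where(Sx[row_id_tuple] == 1, False, True)
#sensitive_str = np.where(Sx == 0, "M", "F")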

In [10]:
# Consecutive ids 0..N-1, one per row of V.
row_id = np.arange(len(V))
row_id

array([   0,    1,    2, ..., 6037, 6038, 6039])

In [11]:
# Consecutive ids 0..M-1, one per column of the 2-D matrix V.
col_id = np.arange(V.shape[-1])
col_id

array([   0,    1,    2, ..., 3703, 3704, 3705])

In [16]:
# Assemble the bundle key by key, in the order
# "data" -> "data_users" -> "data_movies" -> "blocks".
data = {}

# Long-format ratings: one row per non-zero entry of V, with one boolean
# indicator column per sensitive-group code of the rating's user.
data["data"] = pd.DataFrame(dict(
    user_id=row_id_tuple,
    movie_id=col_id_tuple,
    user_age_0=sensitive_row_tuple_0,
    user_age_1=sensitive_row_tuple_1,
    user_age_2=sensitive_row_tuple_2,
    user_rating=V[row_id_tuple, col_id_tuple].astype(float),
))

# One row per user with the readable sensitive-group label.
data["data_users"] = pd.DataFrame(dict(userid=row_id, age=sensitive_str))

# No per-movie metadata is supplied.
data["data_movies"] = None

# A single block in which train and test are both the full matrix
# (two independent sparse copies of V).
data["blocks"] = [dict(X_train=coo_matrix(V), X_test=coo_matrix(V))]

In [17]:
# Long-format ratings table: one row per non-zero entry of V.
data["data"]

Unnamed: 0,user_id,movie_id,user_age_0,user_age_1,user_age_2,user_rating
0,0,0,True,False,False,5.0
1,0,47,True,False,False,5.0
2,0,144,True,False,False,5.0
3,0,253,True,False,False,4.0
4,0,513,True,False,False,5.0
...,...,...,...,...,...,...
1000204,6039,3441,True,False,False,4.0
1000205,6039,3461,True,False,False,4.0
1000206,6039,3493,True,False,False,4.0
1000207,6039,3508,True,False,False,4.0


In [18]:
# Per-user table: id plus readable sensitive-group label.
data["data_users"]

Unnamed: 0,userid,age
0,0,<35
1,1,>=50
2,2,<35
3,3,<50
4,4,<35
...,...,...
6035,6035,<35
6036,6036,<50
6037,6037,>=50
6038,6038,<50


In [19]:
# The single block; X_train and X_test are both built from the full matrix V.
data["blocks"]

[{'X_train': <6040x3706 sparse matrix of type '<class 'numpy.float64'>'
  	with 1000209 stored elements in COOrdinate format>,
  'X_test': <6040x3706 sparse matrix of type '<class 'numpy.float64'>'
  	with 1000209 stored elements in COOrdinate format>}]

In [20]:
# Output folder for the converted dataset. The last path component is derived
# from the configuration ("movielens-1m" + "age" -> "movielens_1m_age") so
# that changing DATASET or SENSITIVE cannot silently write into the folder of
# another configuration.
# NOTE(review): assumes the "<dataset with '-' -> '_'>_<sensitive>" naming also
# holds for other datasets; confirm against the loader.
output_folder_name = f"{DATASET.replace('-', '_')}_{SENSITIVE}"
save_path = f"{root}/reproducibility_study/Frisch_et_al/data/{output_folder_name}"
create_path(save_path)

Folder ./fair_taucc/algorithms/C-Fairness-RecSys/reproducibility_study/Frisch_et_al/data/movielens_1m_age created!


In [21]:
# Serialise the assembled bundle next to the other converted files.
blocks_file_name = f"{DATASET}_blocks_topk_{SENSITIVE}.pkl"
with open(f"{save_path}/{blocks_file_name}", "wb") as data_file:
    pickle.dump(data, data_file, pickle.HIGHEST_PROTOCOL)

In [22]:
# Re-display the user ids that feed the id maps built in the next cell.
row_id

array([   0,    1,    2, ..., 6037, 6038, 6039])

In [23]:
data_extra = {
    "users_map": {str(value).encode(): value for value in row_id},
    "items_map": {str(value).encode(): value for value in col_id}
}

In [24]:
# Serialise the id lookup tables into the same output folder.
extra_file_name = f"{DATASET}_extra_data_{SENSITIVE}.pkl"
with open(f"{save_path}/{extra_file_name}", "wb") as extra_file:
    pickle.dump(data_extra, extra_file, pickle.HIGHEST_PROTOCOL)

In [25]:
# Sanity check: column dtypes, non-null counts and memory footprint of the
# long-format ratings table.
data["data"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 6 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   user_id      1000209 non-null  int64  
 1   movie_id     1000209 non-null  int64  
 2   user_age_0   1000209 non-null  bool   
 3   user_age_1   1000209 non-null  bool   
 4   user_age_2   1000209 non-null  bool   
 5   user_rating  1000209 non-null  float64
dtypes: bool(3), float64(1), int64(2)
memory usage: 25.8 MB
