# CatGCN - JD dataset process

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
def show_df_info(df):
    print(df.info())
    print('####### Repeat ####### \n', df.duplicated().any())
    print('####### Count ####### \n', df.nunique())
    print('####### Example ####### \n',df.head())

def label_statics(label_df, label_list):
    print("####### nCount #######")
    for label in label_list:
        print(label_df[label].value_counts())
    print("####### nPercent #######")
    for label in label_list:
        print(label_df[label].value_counts()/label_df.shape[0])

## Base paths

In [3]:
raw_data_path = '/home/purificato/papers_code/TKDE21_CatGCN/data/jd_data/_raw_data'

## USER analysis

In [4]:
df_user = pd.read_csv(os.path.join(raw_data_path, "user"))
df_user.dropna(inplace=True)
age_dic = {'11~15':0, '16~20':0, '21~25':0, '26~30':1, '31~35':1, '36~40':2, '41~45':2, '46~50':3, '51~55':3, '56~60':4, '61~65':4, '66~70':4, '71~':4}
df_user[["age_range"]] = df_user[["age_range"]].applymap(lambda x:age_dic[x])
df_user.rename(columns={"user_id":"uid", "age_range":"age"}, inplace=True)

show_df_info(df_user)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   uid     100000 non-null  int64
 1   gender  100000 non-null  int64
 2   age     100000 non-null  int64
dtypes: int64(3)
memory usage: 3.1 MB
None
####### Repeat ####### 
 False
####### Count ####### 
 uid       100000
gender         2
age            5
dtype: int64
####### Example ####### 
    uid  gender  age
0    0       0    0
1    1       0    1
2    2       0    1
3    3       1    3
4    4       1    1


In [5]:
label_statics(df_user, df_user.columns[1:])

####### nCount #######
0    59500
1    40500
Name: gender, dtype: int64
1    50824
2    20847
0    19288
3     7100
4     1941
Name: age, dtype: int64
####### nPercent #######
0    0.595
1    0.405
Name: gender, dtype: float64
1    0.50824
2    0.20847
0    0.19288
3    0.07100
4    0.01941
Name: age, dtype: float64


### bin_age

In [6]:
df_user["bin_age"] = df_user["age"]
df_user["bin_age"] = df_user["bin_age"].replace(1,0)
df_user["bin_age"] = df_user["bin_age"].replace(2,1)
df_user["bin_age"] = df_user["bin_age"].replace(3,1)
df_user["bin_age"] = df_user["bin_age"].replace(4,1)

label_statics(df_user, df_user.columns[1:])

####### nCount #######
0    59500
1    40500
Name: gender, dtype: int64
1    50824
2    20847
0    19288
3     7100
4     1941
Name: age, dtype: int64
0    70112
1    29888
Name: bin_age, dtype: int64
####### nPercent #######
0    0.595
1    0.405
Name: gender, dtype: float64
1    0.50824
2    0.20847
0    0.19288
3    0.07100
4    0.01941
Name: age, dtype: float64
0    0.70112
1    0.29888
Name: bin_age, dtype: float64


## ITEM analysis

In [7]:
df_item = pd.read_csv(os.path.join(raw_data_path, "item_info"))
df_item.dropna(inplace=True)
df_item.rename(columns={"item_id":"pid", "cid3":"cid"}, inplace=True)
df_item.drop(columns=["cid1", "cid2", "cid1_name", "cid2_name", "cid3_name", "brand_code", "price", "item_name", "seg_name"], inplace=True)
df_item.reset_index(drop=True, inplace=True)

show_df_info(df_item)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4730503 entries, 0 to 4730502
Data columns (total 2 columns):
 #   Column  Dtype
---  ------  -----
 0   pid     int64
 1   cid     int64
dtypes: int64(2)
memory usage: 72.2 MB
None
####### Repeat ####### 
 False
####### Count ####### 
 pid    4730503
cid       4098
dtype: int64
####### Example ####### 
             pid    cid
0  100000002008  12015
1  100000002009   6232
2  100000002011   6232
3  100000002012   9744
4  100000002013  11153


In [8]:
df_item = df_item.sample(frac=0.15, random_state=11)
df_item.reset_index(drop=True, inplace=True)

show_df_info(df_item)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 709575 entries, 0 to 709574
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   pid     709575 non-null  int64
 1   cid     709575 non-null  int64
dtypes: int64(2)
memory usage: 10.8 MB
None
####### Repeat ####### 
 False
####### Count ####### 
 pid    709575
cid      3614
dtype: int64
####### Example ####### 
            pid    cid
0  16756787137   1695
1  28012078546  12039
2  14673037900   1509
3     11902157   9286
4     12475079   3304


## CLICK analysis

In [9]:
df_click = pd.read_csv(os.path.join(raw_data_path, "user_click"), usecols=[0,1])
df_click.dropna(inplace=True)
df_click.rename(columns={"user_id":"uid", "item_id":"pid"}, inplace=True)
df_click.reset_index(drop=True, inplace=True)

show_df_info(df_click)

  exec(code_obj, self.user_global_ns, self.user_ns)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52983323 entries, 0 to 52983322
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   uid     int64 
 1   pid     object
dtypes: int64(1), object(1)
memory usage: 808.5+ MB
None
####### Repeat ####### 
 True
####### Count ####### 
 uid      85177
pid    8335109
dtype: int64
####### Example ####### 
    uid       pid
0    0   1150551
1    0  11133236
2    0   6888601
3    0   5812383
4    0   4803330


In [10]:
df_click = df_click.sample(frac=0.15, random_state=11)
df_click.reset_index(drop=True, inplace=True)

show_df_info(df_click)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7947498 entries, 0 to 7947497
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   uid     int64 
 1   pid     object
dtypes: int64(1), object(1)
memory usage: 121.3+ MB
None
####### Repeat ####### 
 True
####### Count ####### 
 uid      81173
pid    2739082
dtype: int64
####### Example ####### 
      uid          pid
0  91559       851954
1  43488      5089253
2  44568     12367746
3  18466  26220866219
4  80782      8591502


In [11]:
df_click = df_click[df_click["uid"].isin(df_user["uid"])]
df_click = df_click[df_click["pid"].isin(df_item["pid"])]

df_click.drop_duplicates(inplace=True)
df_click.reset_index(drop=True, inplace=True)

show_df_info(df_click)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 454909 entries, 0 to 454908
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   uid     454909 non-null  int64 
 1   pid     454909 non-null  object
dtypes: int64(1), object(1)
memory usage: 6.9+ MB
None
####### Repeat ####### 
 False
####### Count ####### 
 uid     41145
pid    188573
dtype: int64
####### Example ####### 
      uid          pid
0  29190  16237834468
1  56519      4596108
2  73751  21845916575
3  30253     11798319
4  88255      1228007


# Filter & Process

In [12]:
def get_count(tp, id):
    playcount_groupbyid = tp[[id]].groupby(id, as_index=True)
    count = playcount_groupbyid.size()
    return count

def filter_triplets(tp, user, item, min_uc=0, min_sc=0):
    # Only keep the triplets for users who clicked on at least min_uc items
    if min_uc > 0:
        usercount = get_count(tp, user)
        tp = tp[tp[user].isin(usercount.index[usercount >= min_uc])]
    
    # Only keep the triplets for items which were clicked on by at least min_sc users. 
    if min_sc > 0:
        itemcount = get_count(tp, item)
        tp = tp[tp[item].isin(itemcount.index[itemcount >= min_sc])]
    
    # Update both usercount and itemcount after filtering
    usercount, itemcount = get_count(tp, user), get_count(tp, item) 
    return tp, usercount, itemcount

### filter df_click (item interactions >= 2)

In [13]:
# Before filtering
users = set(df_click.uid.tolist())
items = set(df_click.pid.tolist())

print(len(users), len(items))

41145 188573


In [14]:
df_click, uid_activity, pid_popularity = filter_triplets(df_click, 'uid', 'pid', min_uc=0, min_sc=2)

sparsity = 1. * df_click.shape[0] / (uid_activity.shape[0] * pid_popularity.shape[0])

print("After filtering, there are %d interaction events from %d users and %d items (sparsity: %.4f%%)" % 
      (df_click.shape[0], uid_activity.shape[0], pid_popularity.shape[0], sparsity * 100))

After filtering, there are 315970 interaction events from 38322 users and 49634 items (sparsity: 0.0166%)


In [15]:
# After filtering
users = set(df_click.uid.tolist())
items = set(df_click.pid.tolist())

print(len(users), len(items))

38322 49634


## CLICK-ITEM merge

In [16]:
df_click_item = pd.merge(df_click, df_item, how="inner", on="pid")
raw_click_item = df_click_item.drop("pid", axis=1, inplace=False)
raw_click_item.drop_duplicates(inplace=True)

show_df_info(raw_click_item)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244318 entries, 0 to 315966
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   uid     244318 non-null  int64
 1   cid     244318 non-null  int64
dtypes: int64(2)
memory usage: 5.6 MB
None
####### Repeat ####### 
 False
####### Count ####### 
 uid    38322
cid     2056
dtype: int64
####### Example ####### 
      uid    cid
0  56519  13691
1  60437  13691
2  56461  13691
3  56724  13691
4  76507  13691


### filter df_click_item (cid interactions >= 2)

In [17]:
df_click_item, uid_activity, cid_popularity = filter_triplets(raw_click_item, 'uid', 'cid', min_uc=0, min_sc=2)

sparsity = 1. * df_click_item.shape[0] / (uid_activity.shape[0] * cid_popularity.shape[0])

print("After filtering, there are %d interacton events from %d users and %d items (sparsity: %.4f%%)" % 
      (df_click_item.shape[0], uid_activity.shape[0], cid_popularity.shape[0], sparsity * 100))

After filtering, there are 244318 interacton events from 38322 users and 2056 items (sparsity: 0.3101%)


In [18]:
show_df_info(df_click_item)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244318 entries, 0 to 315966
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   uid     244318 non-null  int64
 1   cid     244318 non-null  int64
dtypes: int64(2)
memory usage: 5.6 MB
None
####### Repeat ####### 
 False
####### Count ####### 
 uid    38322
cid     2056
dtype: int64
####### Example ####### 
      uid    cid
0  56519  13691
1  60437  13691
2  56461  13691
3  56724  13691
4  76507  13691


## UID-UID analysis

In [19]:
df_click = df_click[df_click["uid"].isin(df_click_item["uid"])]

In [20]:
df_click_1 = df_click[["uid", "pid"]].copy()
df_click_1.rename(columns={"uid":"uid1"}, inplace=True)

df_click_2 = df_click[["uid", "pid"]].copy()
df_click_2.rename(columns={"uid":"uid2"}, inplace=True)

In [21]:
df_click1_click2 = pd.merge(df_click_1, df_click_2, how="inner", on="pid")
df_uid_uid = df_click1_click2.drop("pid", axis=1, inplace=False)
df_uid_uid.drop_duplicates(inplace=True)
# df_uid_uid.reset_index(inplace=True)

show_df_info(df_uid_uid)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14559244 entries, 0 to 15696670
Data columns (total 2 columns):
 #   Column  Dtype
---  ------  -----
 0   uid1    int64
 1   uid2    int64
dtypes: int64(2)
memory usage: 333.2 MB
None
####### Repeat ####### 
 False
####### Count ####### 
 uid1    38322
uid2    38322
dtype: int64
####### Example ####### 
     uid1   uid2
0  56519  56519
1  56519  60437
2  56519  56461
3  56519  56724
4  56519  76507


In [22]:
del df_click_1, df_click_2, df_click1_click2

## Map

In [23]:
df_label = df_user[df_user["uid"].isin(df_click_item["uid"])]

In [24]:
label_statics(df_label, df_label.columns[1:])

####### nCount #######
0    24587
1    13735
Name: gender, dtype: int64
1    18944
2     8919
0     6773
3     2954
4      732
Name: age, dtype: int64
0    25717
1    12605
Name: bin_age, dtype: int64
####### nPercent #######
0    0.64159
1    0.35841
Name: gender, dtype: float64
1    0.494337
2    0.232738
0    0.176739
3    0.077084
4    0.019101
Name: age, dtype: float64
0    0.671077
1    0.328923
Name: bin_age, dtype: float64


In [25]:
uid2id = {num: i for i, num in enumerate(df_label['uid'])}
cid2id = {num: i for i, num in enumerate(pd.unique(df_click_item['cid']))}

In [26]:
def col_map(df, col, num2id):
    df[[col]] = df[[col]].applymap(lambda x: num2id[x])
    return df


def label_map(label_df, label_list):
    for label in label_list:
        label2id = {num: i for i, num in enumerate(pd.unique(label_df[label]))}
        label_df = col_map(label_df, label, label2id)
    return label_df

In [27]:
df_label = col_map(df_label, 'uid', uid2id)
df_label = label_map(df_label, df_label.columns[1:])

show_df_info(df_label)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38322 entries, 8 to 99998
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   uid      38322 non-null  int64
 1   gender   38322 non-null  int64
 2   age      38322 non-null  int64
 3   bin_age  38322 non-null  int64
dtypes: int64(4)
memory usage: 1.5 MB
None
####### Repeat ####### 
 False
####### Count ####### 
 uid        38322
gender         2
age            5
bin_age        2
dtype: int64
####### Example ####### 
     uid  gender  age  bin_age
8     0       0    0        0
15    1       1    1        0
18    2       0    2        1
19    3       0    2        1
21    4       1    0        0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [28]:
user_edge = df_uid_uid[df_uid_uid['uid1'].isin(df_click_item['uid'])]
user_edge = user_edge[user_edge['uid2'].isin(df_click_item['uid'])]

user_edge = col_map(user_edge, 'uid1', uid2id)
user_edge = col_map(user_edge, 'uid2', uid2id)

show_df_info(user_edge)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14559244 entries, 0 to 15696670
Data columns (total 2 columns):
 #   Column  Dtype
---  ------  -----
 0   uid1    int64
 1   uid2    int64
dtypes: int64(2)
memory usage: 333.2 MB
None
####### Repeat ####### 
 False
####### Count ####### 
 uid1    38322
uid2    38322
dtype: int64
####### Example ####### 
     uid1   uid2
0  21984  21984
1  21984  24513
2  21984  21946
3  21984  22106
4  21984  34596


In [29]:
user_field = col_map(df_click_item, 'uid', uid2id)
user_field = col_map(user_field, 'cid', cid2id)

show_df_info(user_field)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244318 entries, 0 to 315966
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   uid     244318 non-null  int64
 1   cid     244318 non-null  int64
dtypes: int64(2)
memory usage: 5.6 MB
None
####### Repeat ####### 
 False
####### Count ####### 
 uid    38322
cid     2056
dtype: int64
####### Example ####### 
      uid  cid
0  21984    0
1  24513    0
2  21946    0
3  22106    0
4  34596    0


In [30]:
label_statics(df_label, df_label.columns[1:])

####### nCount #######
1    24587
0    13735
Name: gender, dtype: int64
2    18944
0     8919
3     6773
1     2954
4      732
Name: age, dtype: int64
1    25717
0    12605
Name: bin_age, dtype: int64
####### nPercent #######
1    0.64159
0    0.35841
Name: gender, dtype: float64
2    0.494337
0    0.232738
3    0.176739
1    0.077084
4    0.019101
Name: age, dtype: float64
1    0.671077
0    0.328923
Name: bin_age, dtype: float64


## Save

In [31]:
save_path = './input_jd_data/orig'

In [32]:
user_edge.to_csv(os.path.join(save_path, "user_edge.csv"), index=False)
user_field.to_csv(os.path.join(save_path, "user_field.csv"), index=False)

In [33]:
df_label.to_csv(os.path.join(save_path, "user_labels.csv"), index=False)

df_label[["uid", "age"]].to_csv(os.path.join(save_path, "user_age.csv"), index=False)
df_label[["uid", "bin_age"]].to_csv(os.path.join(save_path, "user_bin_age.csv"), index=False)
df_label[["uid", "gender"]].to_csv(os.path.join(save_path, "user_gender.csv"), index=False)

## Reprocess

In [34]:
import numpy as np
import pandas as pd
import scipy.sparse as sp

import time

NUM_FIELD = 10

np.random.seed(42)

def field_reader(path):
    """
    Reading the sparse field matrix stored as csv from the disk.
    :param path: Path to the csv file.
    :return field: csr matrix of field.
    """
    user_field = pd.read_csv(path)
    user_index = user_field["uid"].values.tolist()
    field_index = user_field["cid"].values.tolist()
    user_count = max(user_index)+1
    field_count = max(field_index)+1
    field_index = sp.csr_matrix((np.ones_like(user_index), (user_index, field_index)), shape=(user_count, field_count))
    return field_index

user_field = field_reader(os.path.join(save_path, "user_field.csv"))

print("Shapes of user with field:", user_field.shape)
print("Number of user with field:", np.count_nonzero(np.sum(user_field, axis=1)))

def get_neighs(csr):
    neighs = []
#     t = time.time()
    idx = np.arange(csr.shape[1])
    for i in range(csr.shape[0]):
        x = csr[i, :].toarray()[0] > 0
        neighs.append(idx[x])
#         if i % (10*1000) == 0:
#             print('sec/10k:', time.time()-t)
    return neighs

def sample_neigh(neigh, num_sample):
    if len(neigh) >= num_sample:
        sample_neigh = np.random.choice(neigh, num_sample, replace=False)
    elif len(neigh) < num_sample:
        sample_neigh = np.random.choice(neigh, num_sample, replace=True)
    return sample_neigh

neighs = get_neighs(user_field)

sample_neighs = []
for i in range(len(neighs)):
    sample_neighs.append(list(sample_neigh(neighs[i], NUM_FIELD)))
sample_neighs = np.array(sample_neighs)
np.save(os.path.join(save_path, "user_field.npy"), sample_neighs)
print('Shape of sampled user_field:', sample_neighs.shape)

Shapes of user with field: (38322, 2056)
Number of user with field: 38322
Shape of sampled user_field: (38322, 10)
