In [1]:
#cudf for linux
# %conda install -c rapidsai -c conda-forge -c nvidia cudf=23.04 python=3.10 cudatoolkit=11.8
#https://github.com/rapidsai/cudf
#https://docs.rapids.ai/install#WSL2

^C

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
# import cudf

# Load data to df
## Load data file

In [3]:
import os
from os.path import join
import numpy as np
np.random.seed(2023)
#Gowalla setting
#================================================
DATA_NAME = "Gowalla"
FILE_NAME = "Gowalla_totalCheckins.txt"
COL_NAMES = ['user','time','lat','long','item']
BASE_DATA_RATIO = 7# N/10
INC_RATIO = 3
INC_STEP = 5
#================================================

# config
ROOT_PATH = os.path.abspath("")
DATA_PATH = join(ROOT_PATH+"\\data", DATA_NAME)
SAVE_PATH = join(ROOT_PATH,"data")
FILE_PATH = join(DATA_PATH, FILE_NAME)
FILT_THRESHOLD = 10




In [4]:
# Load File
origin_df = pd.read_table(FILE_PATH, names=COL_NAMES)
#origin_df = cudf.from_pandas(origin_df)
def print_info(df):
        print(f"Total Edges : {len(df)}\nTotal User : {len(df['user'].unique())}\nTotal item : {len(df['item'].unique())} \
                \nSmallest user id : {df['user'].unique().min()} \
                \nbiggest user id : {df['user'].unique().max()} \
                \nSmallest item id : {df['item'].unique().min()} \
                \nbiggest item id : {df['item'].unique().max()} \
                \nMin Interaction Per user : {df.user.value_counts().min()} \
                \nMax Interaction Per user : {df.user.value_counts().max()} \
                \nAvg Interaction Per user : {df.user.value_counts().mean()}\
                \nMin Interaction Per item : {df.item.value_counts().min()} \
                \nMax Interaction Per item : {df.item.value_counts().max()} \
                \nAvg Interaction Per item : {df.item.value_counts().mean()}")
print_info(origin_df)

Total Edges : 6442892
Total User : 107092
Total item : 1280969                 
Smallest user id : 0                 
biggest user id : 196585                 
Smallest item id : 8904                 
biggest item id : 5977757                 
Min Interaction Per user : 1                 
Max Interaction Per user : 2175                 
Avg Interaction Per user : 60.16221566503567                
Min Interaction Per item : 1                 
Max Interaction Per item : 5811                 
Avg Interaction Per item : 5.029701733609478


다음번 부터는 필터링 하고 리팩토링 할것 필요 없는 것들 까지들어가있어서 시간이 훨씬 오래걸린다......

반복문이 아닌 판다스 기본 기능을 활용하니까 훨씬 빠르다</br>
mapping = {index: i for i, index in enumerate(df.index.unique())}</br>
이 방법으로 훨씬 간단하게 매핑 가능

In [5]:
from tqdm import tqdm
from multiprocessing import Pool
import numpy as np




def refactoring_from_0(df):
    out_df = pd.DataFrame() 
    
    original_uid = np.sort(df['user'].unique())
    original_iid = np.sort(df['item'].unique())

    u_range = range(len(original_uid))
    i_range = range(len(original_iid))

    uid_mapping = { o_id: n_id for o_id, n_id in zip(original_uid, u_range)} # 원래 유저 아이디 (중간중간 비어있음) : 순서대로 유저 아이디
    iid_mapping = { o_id: n_id for o_id, n_id in zip(original_iid,i_range)} # 원래 아이템 아이디 : 순서대로 아이템 아이디

    uid_map = pd.DataFrame({'o_id' : list(uid_mapping.keys()), 'n_id' : list(uid_mapping.values())})

    iid_map = pd.DataFrame({'o_id' : list(iid_mapping.keys()),'n_id':list(iid_mapping.values())})


    out_df['user'] = df['user'].map(uid_mapping)
    out_df['item'] = df['item'].map(iid_mapping)
    out_df['time'] = df['time']
    return out_df, uid_map, iid_map


# NUM_CORES = 8

# def user_replace(df):
#     return df.replace({'user':{user_id:id}})

# def item_replace(df):
#     return df.replace({'item':{item_id:id}})


# def parallelize_dataframe(df, func):
#     df_split = np.array_split(df, NUM_CORES)
#     pool = Pool(NUM_CORES)
#     df = pd.concat(pool.map(func, df_split))
#     pool.close()
#     pool.join()
#     return df

# Change each user and item index start from 0
# origin_df.sort_values(by='user',ascending=True)

# # Change user id to start from 0
# id = 0


# for user_id in tqdm(origin_df['user'].unique()):
#     parameter = {'user':{user_id:id}}
#     # origin_df = parallelize_dataframe(origin_df, user_replace)
#     id += 1

# Change item id to start from 0
# origin_df.sort_values(by='item',ascending=True)
# id = 0

# for item_id in tqdm(origin_df['item'].unique()):
#     parameter = {'item':{item_id:id}}
#     # origin_df = parallelize_dataframe(origin_df, item_replace)
#     id += 1



In [6]:
# Test refactoring function 
temp_df, a, b = refactoring_from_0(origin_df)
print_info(temp_df)
temp_df

Total Edges : 6442892
Total User : 107092
Total item : 1280969                 
Smallest user id : 0                 
biggest user id : 107091                 
Smallest item id : 0                 
biggest item id : 1280968                 
Min Interaction Per user : 1                 
Max Interaction Per user : 2175                 
Avg Interaction Per user : 60.16221566503567                
Min Interaction Per item : 1                 
Max Interaction Per item : 5811                 
Avg Interaction Per item : 5.029701733609478


Unnamed: 0,user,item,time
0,0,11319,2010-10-19T23:55:27Z
1,0,316537,2010-10-18T22:17:43Z
2,0,240915,2010-10-17T23:42:03Z
3,0,6334,2010-10-17T19:26:05Z
4,0,1267091,2010-10-16T18:50:42Z
...,...,...,...
6442887,107090,641922,2010-06-11T13:32:26Z
6442888,107090,679059,2010-06-11T13:26:45Z
6442889,107090,808233,2010-06-11T13:26:34Z
6442890,107091,353112,2010-10-08T21:01:49Z


In [8]:
# Save refactored dataframe
temp_df.to_csv(DATA_PATH+"\\refactored.intr",sep=" ",index=False,header=None)

In [102]:
# Load refactored dataframe
origin_df = pd.read_table(DATA_PATH+"\\refactored.intr", names=COL_NAMES)

FileNotFoundError: [Errno 2] No such file or directory: 'c:\\Users\\PC\\Desktop\\MGCCF\\data\\Gowalla_refactored.txt'

## Preprocessing the data
1. remove duplcaite
2. filter data which have under 10 interaction
3. refactor the id of user and item start from 0

In [7]:
df = origin_df
# Just interaction and drop duplicate
df = df[['user','time','item']]
df = df.drop_duplicates(subset=['user','item'])

In [10]:
# Save Interaction information without filtering
WO_filtering = df
# Sort with ascending order with user first and item
WO_filtering = WO_filtering.sort_values(by=['user','item','time'], ascending=[True,True,True])
WO_filtering, a, b = refactoring_from_0(WO_filtering)

In [11]:
WO_filtering_filename = DATA_PATH + "\\WO_Filtering.intr"
if not os.path.exists(WO_filtering_filename):
    WO_filtering.to_csv(WO_filtering_filename,sep=" ", index=False, header=None)

### Filter the data that has less than 10 interaction

In [8]:
filtering_filename = DATA_PATH + "\\filtering_refactored.intr"
fdf = df

while fdf.user.value_counts().min() < FILT_THRESHOLD or fdf.item.value_counts().min() < FILT_THRESHOLD:
    df_item = fdf.groupby('item').count()
    df_item = df_item[df_item.user < FILT_THRESHOLD]
    li = df_item.index.to_list()
    fdf = fdf.drop(fdf.loc[fdf.item.isin(li)].index)
    # print_info(fdf)
    df_usr = fdf.groupby('user').count()
    df_usr = df_usr[df_usr.item < FILT_THRESHOLD]
    li = df_usr.index.to_list()
    fdf = fdf.drop(fdf.loc[fdf.user.isin(li)].index)
    # print_info(fdf)
    # print(f"Total Edges : {len(fdf)}\nTotal User : {len(fdf['user'].unique())}\nTotal item : {len(fdf['item'].unique())} \
    #             \nMin Interaction Per user : {fdf.user.value_counts().min()} \
    #             \nMax Interaction Per user : {fdf.user.value_counts().max()} \
    #             \nAvg Interaction Per user : {fdf.user.value_counts().mean()}\
    #             \nMin Interaction Per item : {fdf.item.value_counts().min()} \
    #             \nMax Interaction Per item : {fdf.item.value_counts().max()} \
    #             \nAvg Interaction Per item : {fdf.item.value_counts().mean()}")
print_info(fdf)
fdf = fdf.reset_index().drop(columns = ['index'])
fdf


Total Edges : 1027464
Total User : 29858
Total item : 40988                 
Smallest user id : 0                 
biggest user id : 196183                 
Smallest item id : 8932                 
biggest item id : 5838873                 
Min Interaction Per user : 10                 
Max Interaction Per user : 1014                 
Avg Interaction Per user : 34.411681961283406                
Min Interaction Per item : 10                 
Max Interaction Per item : 2310                 
Avg Interaction Per item : 25.067434371035425


Unnamed: 0,user,time,item
0,0,2010-10-19T23:55:27Z,22847
1,0,2010-10-18T22:17:43Z,420315
2,0,2010-10-17T23:42:03Z,316637
3,0,2010-10-17T19:26:05Z,16516
4,0,2010-10-16T18:50:42Z,5535878
...,...,...,...
1027459,196183,2010-04-27T04:32:19Z,73521
1027460,196183,2010-03-14T18:13:45Z,9724
1027461,196183,2010-03-13T03:29:55Z,388127
1027462,196183,2010-02-15T03:33:42Z,55858


### Refactor to make each user and item number start from 0

In [9]:
fdf, filtered_uid_map, filtered_iid_map = refactoring_from_0(fdf)
print_info(fdf)
fdf

Total Edges : 1027464
Total User : 29858
Total item : 40988                 
Smallest user id : 0                 
biggest user id : 29857                 
Smallest item id : 0                 
biggest item id : 40987                 
Min Interaction Per user : 10                 
Max Interaction Per user : 1014                 
Avg Interaction Per user : 34.411681961283406                
Min Interaction Per item : 10                 
Max Interaction Per item : 2310                 
Avg Interaction Per item : 25.067434371035425


Unnamed: 0,user,item,time
0,0,4169,2010-10-19T23:55:27Z
1,0,31533,2010-10-18T22:17:43Z
2,0,29074,2010-10-17T23:42:03Z
3,0,2572,2010-10-17T19:26:05Z
4,0,40979,2010-10-16T18:50:42Z
...,...,...,...
1027459,29857,13173,2010-04-27T04:32:19Z
1027460,29857,392,2010-03-14T18:13:45Z
1027461,29857,30837,2010-03-13T03:29:55Z
1027462,29857,10578,2010-02-15T03:33:42Z


In [11]:
fdf.to_csv(filtering_filename,sep=" ", index=False, header=None)
filtered_uid_map.to_csv(DATA_PATH+"\\filtered_uid_map.uid",sep=" ", index=False, header=None)
filtered_iid_map.to_csv(DATA_PATH+"\\filtered_iid_map.iid",sep=" ", index=False, header=None)
shuffled = pd.DataFrame(np.random.permutation(fdf))
train = shuffled[:len(fdf)*8//10]
test = shuffled[len(fdf)*8//10:]
train.to_csv(DATA_PATH+"\\train.data",sep=" ", index=False, header=None)
test.to_csv(DATA_PATH+"\\test.data",sep=" ", index=False, header=None)

### Split in to train, test set

## User-Item Time Sequential Incremental Split

In [23]:
INC_TIME_FILE_PATH = os.path.join(SAVE_PATH,f"Incremental\\{DATA_NAME}\\Time\\")
fdf = fdf.sort_values(by="user", ascending=True)
print(len(fdf))
pivot = len(fdf)*BASE_DATA_RATIO//10
fdf[:pivot].to_csv(INC_TIME_FILE_PATH + "base.data", sep=" ", header=None, index=False)
print(pivot)
remain = fdf[pivot:]
start = 0
length = (len(fdf) - pivot)//INC_STEP
inc_block = []
for i in range(INC_STEP):
    if i != INC_STEP-1:
        inc_block.append(remain[start:start+length])
    else:
        inc_block.append(remain[start:])
    start += length
    print(f"Inc{i} Block Size : {len(inc_block[i])}")
    inc_block[i].to_csv(INC_TIME_FILE_PATH+f"inc{i+1}.data", sep = " ", header=None, index=False)

1027464
821971
Inc0 Block Size : 41098
Inc1 Block Size : 41098
Inc2 Block Size : 41098
Inc3 Block Size : 41098
Inc4 Block Size : 41101


# User Incremental

In [59]:
INC_USER_FILE_PATH = os.path.join(SAVE_PATH,f"Incremental\\{DATA_NAME}\\User\\")
print(f"Total User = {len(fdf['user'].unique())}")
# Make df containing number of user interaction
ucdf = fdf.groupby('user').count()
# Shuffle the df
ucdf = ucdf.iloc[np.random.permutation(ucdf.index)].reset_index()

is_base = True

base_user = []
base_size = len(fdf)*BASE_DATA_RATIO//10
inc_user = [[] for i in range(INC_STEP)]
inc_size = len(fdf)*INC_RATIO

i = 0
cur = 0

while i < INC_STEP:
    count = 0
    if is_base:
        # Base block
        while count < base_size:
            row = ucdf.iloc[cur]
            base_user.append(row['user'])
            count += row['item']
            cur += 1
        print(f"Base Block Size : {count}")
        with open(INC_USER_FILE_PATH+"base_user.data", 'w') as fp:
            for user in base_user:
                # write each item on a new line
                fp.write("%s\n" % user)
        print('Base Done')
        is_base = False
    elif i == INC_STEP-1:
        # Last incremental block
        inc_user[i]= ucdf[cur:]['user'].values.tolist()
        count = ucdf[cur:]['item'].values.sum()
        with open(INC_USER_FILE_PATH+f"inc_user_{i+1}.data", 'w') as fp:
            for user in inc_user[i]:
                # write each item on a new line
                fp.write("%s\n" % user)
        print(f'Inc{i+1} Done')
        print(f"Inc Block{i+1} Size : {count}")
        i += 1
    else:
        # Incremental block
        while count <= inc_size:
            row = ucdf.iloc[cur]
            inc_user[i].append(row['user'])
            count += row['item']
            cur += 1
        with open(INC_USER_FILE_PATH+f"inc_user_{i+1}.data", 'w') as fp:
            for user in inc_user[i]:
                # write each item on a new line
                fp.write("%s\n" % user)
        print(f'Inc{i+1} Done')
        print(f"Inc Block{i+1} Size : {count}")
        i += 1
        


Total User = 29858
Base Block Size : 822003
Base Done
Inc1 Done
Inc Block1 Size : 41111
Inc2 Done
Inc Block2 Size : 41119
Inc3 Done
Inc Block3 Size : 41122
Inc4 Done
Inc Block4 Size : 41131
Inc5 Done
Inc Block5 Size : 40978


# Item Incremental

In [58]:
INC_ITEM_FILE_PATH = os.path.join(SAVE_PATH,f"Incremental\\{DATA_NAME}\\Item\\")
print(f"Total Item = {len(fdf['item'].unique())}")
# Make df containing number of user interaction
icdf = fdf.groupby('item').count()
# Shuffle the df
icdf = icdf.iloc[np.random.permutation(icdf.index)].reset_index()

is_base = True

base_item = []
inc_item = [[] for i in range(INC_STEP)]

i = 0
cur = 0

while i < INC_STEP:
    count = 0
    if is_base:
        # Base block
        while count < base_size:
            row = icdf.iloc[cur]
            base_item.append(row['item'])
            count += row['user']
            cur += 1
        print(f"Base Block Size : {count}")
        with open(INC_ITEM_FILE_PATH+"base_item.data", 'w') as fp:
            for item in base_user:
                # write each item on a new line
                fp.write("%s\n" % item)
        print('Base Done')
        is_base = False
    elif i == INC_STEP-1:
        # Last incremental block
        inc_item[i]= icdf[cur:]['item'].values.tolist()
        count = icdf[cur:]['user'].values.sum()
        with open(INC_ITEM_FILE_PATH+f"inc_item_{i+1}.data", 'w') as fp:
            for item in inc_item[i]:
                # write each item on a new line
                fp.write("%s\n" % item)
        print(f'Inc{i+1} Done')
        print(f"Inc Block{i+1} Size : {count}")
        i += 1
    else:
        # Incremental block
        while count <= inc_size:
            row = icdf.iloc[cur]
            inc_item[i].append(row['item'])
            count += row['user']
            cur += 1
        with open(INC_ITEM_FILE_PATH+f"inc_item_{i+1}.data", 'w') as fp:
            for item in inc_item[i]:
                # write each item on a new line
                fp.write("%s\n" % item)
        print(f'Inc{i+1} Done')
        print(f"Inc Block{i+1} Size : {count}")
        i += 1

Total Item = 40988
Base Block Size : 821989
Base Done
Inc1 Done
Inc Block1 Size : 41273
Inc2 Done
Inc Block2 Size : 41161
Inc3 Done
Inc Block3 Size : 41112
Inc4 Done
Inc Block4 Size : 41107
Inc5 Done
Inc Block5 Size : 40822
