In [26]:
import pandas as pd
import numpy as np
import os
from os.path import join

## 데이터 로딩

In [52]:
# Dataset identity and the column layout of the raw (header-less) check-in file.
DATA_NAME = "Gowalla"
FILE_NAME = "Gowalla_totalCheckins.txt"
COL_NAMES = ['user','time','lat','long','item']
# Split parameters; not consumed by the cells visible here.
# NOTE(review): presumably BASE_DATA_RATIO is expressed in tenths (original note: "N/10") — confirm.
BASE_DATA_RATIO = 7  # N/10
INC_RATIO = 3
INC_STEP = 5

# config
# abspath("") resolves to the current working directory of the kernel.
ROOT_PATH = os.path.abspath("")
# Pass path components separately so the OS-specific separator is used
# (a hard-coded "\\dataset" only works on Windows).
DATA_PATH = join(ROOT_PATH, "dataset", DATA_NAME)
SAVE_PATH = DATA_PATH
FILE_PATH = join(DATA_PATH, FILE_NAME)
# Interaction-count threshold used by the user/item filtering step.
FILT_THRESHOLD = 10

print(f"""
ROOT_PATH: {ROOT_PATH}
DATA_PATH: {DATA_PATH}
SAVE_PATH: {SAVE_PATH}
FILE_PATH: {FILE_PATH}
""")


ROOT_PATH: C:\Users\PC\Desktop\Study\KCC-지원-dataset
DATA_PATH: C:\Users\PC\Desktop\Study\KCC-지원-dataset\dataset\Gowalla
SAVE_PATH: C:\Users\PC\Desktop\Study\KCC-지원-dataset\dataset\Gowalla
FILE_PATH: C:\Users\PC\Desktop\Study\KCC-지원-dataset\dataset\Gowalla\Gowalla_totalCheckins.txt



In [53]:
def print_info(df):
    """
    Print summary statistics of an interaction graph.

    :param df: DataFrame with at least 'user' and 'item' columns; every row is one edge.
    :return: None. The report is written to stdout, one statistic per line.
    """
    # Compute each expensive aggregate once instead of once per printed line.
    user_ids = df['user'].unique()
    item_ids = df['item'].unique()
    # Bracket indexing always selects the column; attribute access (df.item)
    # can be shadowed by a DataFrame attribute of the same name.
    per_user = df['user'].value_counts()
    per_item = df['item'].value_counts()

    # Build the report line by line so that no source indentation leaks into
    # the output as trailing whitespace.
    report = [
        f"Total Edges : {len(df)}",
        f"Total User : {len(user_ids)}",
        f"Total item : {len(item_ids)}",
        f"Smallest user id : {user_ids.min()}",
        f"biggest user id : {user_ids.max()}",
        f"Smallest item id : {item_ids.min()}",
        f"biggest item id : {item_ids.max()}",
        f"Min Interaction Per user : {per_user.min()}",
        f"Max Interaction Per user : {per_user.max()}",
        f"Avg Interaction Per user : {per_user.mean()}",
        f"Min Interaction Per item : {per_item.min()}",
        f"Max Interaction Per item : {per_item.max()}",
        f"Avg Interaction Per item : {per_item.mean()}",
    ]
    print("\n".join(report))

In [54]:
# Read the raw check-in log: tab-separated, no header row, columns named by COL_NAMES.
origin_df = pd.read_csv(FILE_PATH, sep="\t", names=COL_NAMES)
print_info(origin_df)

Total Edges : 6442892
Total User : 107092
Total item : 1280969                 
Smallest user id : 0                 
biggest user id : 196585                 
Smallest item id : 8904                 
biggest item id : 5977757                 
Min Interaction Per user : 1                 
Max Interaction Per user : 2175                 
Avg Interaction Per user : 60.16221566503567                
Min Interaction Per item : 1                 
Max Interaction Per item : 5811                 
Avg Interaction Per item : 5.029701733609478


In [55]:
def refactoring_from_0(df):
    """
    Re-index user and item ids to consecutive integers starting at 0.

    The original ids are sorted and the i-th smallest id becomes i.

    :param df: interaction data with 'user', 'item' and 'time' columns
    :return: tuple (out_df, uid_map, iid_map) where out_df holds the re-indexed
             'user' and 'item' columns plus the untouched 'time' column, and
             each *_map is a DataFrame with columns 'o_id' (original id) and
             'n_id' (new id)
    """
    def _dense_ids(column):
        # Sorted distinct original ids (may have gaps) -> 0..n-1 in that order.
        sorted_ids = np.sort(column.unique())
        lookup = dict(zip(sorted_ids, range(len(sorted_ids))))
        table = pd.DataFrame({
            'o_id': list(lookup.keys()),
            'n_id': list(lookup.values()),
        })
        return lookup, table

    user_lookup, uid_map = _dense_ids(df['user'])
    item_lookup, iid_map = _dense_ids(df['item'])

    # Only user, item and time are carried over; other input columns are dropped.
    out_df = pd.DataFrame({
        'user': df['user'].map(user_lookup),
        'item': df['item'].map(item_lookup),
        'time': df['time'],
    })
    return out_df, uid_map, iid_map

In [56]:
# Re-index users and items to dense 0-based ids; keep both original->new lookup tables.
refactored_df, uid_map, iid_map = refactoring_from_0(origin_df)
print_info(refactored_df)
# Bare trailing expression so the notebook renders the frame.
refactored_df

Total Edges : 6442892
Total User : 107092
Total item : 1280969                 
Smallest user id : 0                 
biggest user id : 107091                 
Smallest item id : 0                 
biggest item id : 1280968                 
Min Interaction Per user : 1                 
Max Interaction Per user : 2175                 
Avg Interaction Per user : 60.16221566503567                
Min Interaction Per item : 1                 
Max Interaction Per item : 5811                 
Avg Interaction Per item : 5.029701733609478


Unnamed: 0,user,item,time
0,0,11319,2010-10-19T23:55:27Z
1,0,316537,2010-10-18T22:17:43Z
2,0,240915,2010-10-17T23:42:03Z
3,0,6334,2010-10-17T19:26:05Z
4,0,1267091,2010-10-16T18:50:42Z
...,...,...,...
6442887,107090,641922,2010-06-11T13:32:26Z
6442888,107090,679059,2010-06-11T13:26:45Z
6442889,107090,808233,2010-06-11T13:26:34Z
6442890,107091,353112,2010-10-08T21:01:49Z


In [57]:
# Save refactored dataframe
# to_csv is called with its defaults, so the row index is written as an extra leading column.
refactored_df.to_csv(join(SAVE_PATH, "refactored.csv"))

## 데이터 전처리

1. 중복 제거

In [85]:
# Baseline statistics before any filtering; the printed label "원본" means "original".
print("원본")
print_info(refactored_df)

원본
Total Edges : 6442892
Total User : 107092
Total item : 1280969                 
Smallest user id : 0                 
biggest user id : 107091                 
Smallest item id : 0                 
biggest item id : 1280968                 
Min Interaction Per user : 1                 
Max Interaction Per user : 2175                 
Avg Interaction Per user : 60.16221566503567                
Min Interaction Per item : 1                 
Max Interaction Per item : 5811                 
Avg Interaction Per item : 5.029701733609478


In [86]:
# Collapse repeated (user, item) pairs to a single edge, keeping the first row of each pair.
drop_dupliacted_df = refactored_df.drop_duplicates(subset=['user', 'item'], keep='first')
print("중복 제거")
print_info(drop_dupliacted_df)

중복 제거
Total Edges : 3981334
Total User : 107092
Total item : 1280969                 
Smallest user id : 0                 
biggest user id : 107091                 
Smallest item id : 0                 
biggest item id : 1280968                 
Min Interaction Per user : 1                 
Max Interaction Per user : 2064                 
Avg Interaction Per user : 37.176763903933065                
Min Interaction Per item : 1                 
Max Interaction Per item : 2631                 
Avg Interaction Per item : 3.1080642857087097


2. interaction 10개 이하인 user·item 삭제 (10개 초과만 유지)

In [103]:
# Count the de-duplicated interactions of every user.
user_count = drop_dupliacted_df.groupby("user").size().reset_index(name="count")
# Keep users with strictly more than FILT_THRESHOLD interactions
# (a user with exactly FILT_THRESHOLD interactions is dropped as well).
filtered_user = user_count.loc[user_count['count'] > FILT_THRESHOLD, 'user']
# Restrict the interaction table to the surviving users.
filtered_user_df = drop_dupliacted_df[drop_dupliacted_df['user'].isin(filtered_user)]
filtered_user_df

Unnamed: 0,user,item,time
0,0,11319,2010-10-19T23:55:27Z
1,0,316537,2010-10-18T22:17:43Z
2,0,240915,2010-10-17T23:42:03Z
3,0,6334,2010-10-17T19:26:05Z
4,0,1267091,2010-10-16T18:50:42Z
...,...,...,...
6442884,107090,571537,2010-06-11T13:33:27Z
6442885,107090,452539,2010-06-11T13:32:50Z
6442886,107090,679015,2010-06-11T13:32:40Z
6442887,107090,641922,2010-06-11T13:32:26Z


In [109]:
# Count interactions per item among the rows that survived the user filter.
item_count = filtered_user_df.groupby("item").size().reset_index(name="count")
# Keep items with strictly more than FILT_THRESHOLD interactions
# (an item with exactly FILT_THRESHOLD interactions is dropped as well).
filtered_item = item_count.loc[item_count['count'] > FILT_THRESHOLD, 'item']
# Restrict the interaction table to the surviving items. Items are filtered
# after users and no second pass is made, so a user can end up with fewer
# interactions than the threshold.
filtered_df = filtered_user_df[filtered_user_df['item'].isin(filtered_item)]
print_info(filtered_df)

Total Edges : 1224790
Total User : 62523
Total item : 46370                 
Smallest user id : 0                 
biggest user id : 107089                 
Smallest item id : 1                 
biggest item id : 1279995                 
Min Interaction Per user : 1                 
Max Interaction Per user : 993                 
Avg Interaction Per user : 19.589431089359117                
Min Interaction Per item : 11                 
Max Interaction Per item : 2524                 
Avg Interaction Per item : 26.41341384515851


3. 유저 아이템 인덱스 0부터 설정