# ml-100k는 u.data
# ml-latest-small는 ratings.csv

In [65]:
import pandas as pd
import numpy as np
import os
from os.path import join

## 데이터 로딩

In [66]:
DATA_NAME = "Movielens"
FILE_NAME1 = "ml-100k\\u.data"
FILE_NAME2 = "ml-latest-small\\ratings.csv"
COL_NAMES = ['user', 'item', 'rating', 'time']  # 초기 컬럼  user id | item id | rating | timestamp
BASE_DATA_RATIO = 7  # N/10
INC_RATIO = 3
INC_STEP = 5

# config
ROOT_PATH = os.path.abspath("")
DATA_PATH = join(ROOT_PATH + "\\dataset", DATA_NAME)
SAVE_PATH = DATA_PATH
FILE_PATH1 = join(DATA_PATH, FILE_NAME1)
FILE_PATH2 = join(DATA_PATH, FILE_NAME2)
FILT_THRESHOLD = 10  # 최소 데이터 갯수
cols = ["user", "item", "time"]  # 사용할 컬럼

print(f"""
ROOT_PATH: {ROOT_PATH}
DATA_PATH: {DATA_PATH}
SAVE_PATH: {SAVE_PATH}
FILE_PATH1: {FILE_PATH1}
FILE_PATH2: {FILE_PATH2}
""")


ROOT_PATH: C:\tech\Study\KCC
DATA_PATH: C:\tech\Study\KCC\dataset\Movielens
SAVE_PATH: C:\tech\Study\KCC\dataset\Movielens
FILE_PATH1: C:\tech\Study\KCC\dataset\Movielens\ml-100k\u.data
FILE_PATH2: C:\tech\Study\KCC\dataset\Movielens\ml-latest-small\ratings.csv



In [67]:
def print_info(df):
    """
    그래프 정보 출력
    """
    print(f"Total Edges : {len(df)}\nTotal User : {len(df['user'].unique())}\nTotal item : {len(df['item'].unique())} \
                \nSmallest user id : {df['user'].unique().min()} \
                \nbiggest user id : {df['user'].unique().max()} \
                \nSmallest item id : {df['item'].unique().min()} \
                \nbiggest item id : {df['item'].unique().max()} \
                \nMin Interaction Per user : {df.user.value_counts().min()} \
                \nMax Interaction Per user : {df.user.value_counts().max()} \
                \nAvg Interaction Per user : {df.user.value_counts().mean()}\
                \nMin Interaction Per item : {df.item.value_counts().min()} \
                \nMax Interaction Per item : {df.item.value_counts().max()} \
                \nAvg Interaction Per item : {df.item.value_counts().mean()}")

In [68]:
# Load File
ml_100k_df = pd.read_table(FILE_PATH1, names=COL_NAMES)
ml_small_df = pd.read_csv(FILE_PATH2, names=COL_NAMES, skiprows=[0])

In [69]:
print_info(ml_100k_df)

Total Edges : 100000
Total User : 943
Total item : 1682                 
Smallest user id : 1                 
biggest user id : 943                 
Smallest item id : 1                 
biggest item id : 1682                 
Min Interaction Per user : 20                 
Max Interaction Per user : 737                 
Avg Interaction Per user : 106.04453870625663                
Min Interaction Per item : 1                 
Max Interaction Per item : 583                 
Avg Interaction Per item : 59.45303210463734


In [70]:
print_info(ml_small_df)

Total Edges : 100836
Total User : 610
Total item : 9724                 
Smallest user id : 1                 
biggest user id : 610                 
Smallest item id : 1                 
biggest item id : 193609                 
Min Interaction Per user : 20                 
Max Interaction Per user : 2698                 
Avg Interaction Per user : 165.30491803278687                
Min Interaction Per item : 1                 
Max Interaction Per item : 329                 
Avg Interaction Per item : 10.369806663924312


## 데이터 전처리

* 중복 제거: 동일 유저, 동일 아이템에 대한 평점은 가장 최근 것만
* 긍정 데이터: rating 3점 이상 데이터만
* 유효 데이터: interaction 10개 이상

1. 중복 제거 (중복이 없다.)

In [71]:
ml_small_df[ml_small_df.duplicated(subset=['user', 'item'])]

Unnamed: 0,user,item,rating,time


2. 평점 3점 이상 데이터만 남기기

In [72]:
def GetPositiveDf(df):
    return df[df['rating'] >= 3]


ml_100k_positive_df = GetPositiveDf(ml_100k_df)
ml_small_positive_df = GetPositiveDf(ml_small_df)

In [73]:
print_info(ml_100k_positive_df)

Total Edges : 82520
Total User : 943
Total item : 1574                 
Smallest user id : 1                 
biggest user id : 943                 
Smallest item id : 1                 
biggest item id : 1682                 
Min Interaction Per user : 6                 
Max Interaction Per user : 509                 
Avg Interaction Per user : 87.50795334040296                
Min Interaction Per item : 1                 
Max Interaction Per item : 558                 
Avg Interaction Per item : 52.42693773824651


In [74]:
print_info(ml_small_positive_df)

Total Edges : 81763
Total User : 609
Total item : 8452                 
Smallest user id : 1                 
biggest user id : 610                 
Smallest item id : 1                 
biggest item id : 193609                 
Min Interaction Per user : 4                 
Max Interaction Per user : 2117                 
Avg Interaction Per user : 134.25779967159278                
Min Interaction Per item : 1                 
Max Interaction Per item : 315                 
Avg Interaction Per item : 9.673805016564128


3. 10개 미만 interaction 삭제

In [75]:
def GetVailidDf(fdf):
    while fdf.user.value_counts().min() < FILT_THRESHOLD or fdf.item.value_counts().min() < FILT_THRESHOLD:
        df_item = fdf.groupby('item').count()
        df_item = df_item[df_item.user < FILT_THRESHOLD]
        li = df_item.index.to_list()
        fdf = fdf.drop(fdf.loc[fdf.item.isin(li)].index)
        # print_info(fdf)
        df_usr = fdf.groupby('user').count()
        df_usr = df_usr[df_usr.item < FILT_THRESHOLD]
        li = df_usr.index.to_list()
        fdf = fdf.drop(fdf.loc[fdf.user.isin(li)].index)
    fdf = fdf.reset_index().drop(columns=['index'])
    return fdf


ml_100k_vailid_df = GetVailidDf(ml_100k_positive_df)
ml_small_vailid_df = GetVailidDf(ml_small_positive_df)

4. 시간 오름차순 정렬

In [76]:
def GetSortedTimestampDf(df):
    df['time'] = pd.to_datetime(df['time'])
    df = df.sort_values('time')
    return df


pre_ml_100k_df = GetSortedTimestampDf(ml_100k_vailid_df)
pre_ml_small_df = GetSortedTimestampDf(ml_small_vailid_df)

In [77]:
pre_ml_small_df

Unnamed: 0,user,item,rating,time
44203,429,434,4.0,1970-01-01 00:00:00.828124615
44198,429,351,4.0,1970-01-01 00:00:00.828124615
44197,429,349,3.0,1970-01-01 00:00:00.828124615
44179,429,227,3.0,1970-01-01 00:00:00.828124615
44201,429,421,4.0,1970-01-01 00:00:00.828124615
...,...,...,...,...
23563,233,81845,3.5,1970-01-01 00:00:01.537470521
26713,272,148626,4.5,1970-01-01 00:00:01.537470612
26714,272,158238,4.0,1970-01-01 00:00:01.537475893
20509,210,122916,4.5,1970-01-01 00:00:01.537632280


In [78]:
SAVE_PATH

'C:\\tech\\Study\\KCC\\dataset\\Movielens'

In [79]:
pre_ml_100k_df.to_csv(SAVE_PATH + "\\preprocessed_ml_100k.csv", index=False)
pre_ml_small_df.to_csv(SAVE_PATH + "\\preprocessed_ml_small.csv", index=False)

## 시나리오1, 유저와 아이템 모두 증가. (increase)

In [80]:
pre_ml_100k_df = pd.read_csv(os.path.join(SAVE_PATH, "preprocessed_ml_100k.csv"))
pre_ml_small_df = pd.read_csv(os.path.join(SAVE_PATH, "preprocessed_ml_small.csv"))

In [81]:
# 경로 설정
INCREASE_PATH = os.path.join(SAVE_PATH, "increase\\")
INCREASE_PATH

'C:\\tech\\Study\\KCC\\dataset\\Movielens\\increase\\'

In [82]:
# base block 설정
def SetBaseBlock(df, path, category):
    pivot = len(df) * BASE_DATA_RATIO // 10
    print(f"base block size: {pivot}")
    df[:pivot].to_csv(path + category + "_inc0.csv", index=False)


SetBaseBlock(pre_ml_100k_df, INCREASE_PATH, "ml_100k")
SetBaseBlock(pre_ml_small_df, INCREASE_PATH, "ml_small")

base block size: 56275
base block size: 45332


In [83]:
def SetIncreaseBlocks(df, category):
    pivot = len(df) * BASE_DATA_RATIO // 10
    remain = df[pivot:]
    len_per_block = remain.shape[0] // INC_STEP
    start = 0
    inc_block = []
    for i in range(INC_STEP):
        if i != INC_STEP - 1:
            inc_block.append(remain[start:start + len_per_block])
        else:
            inc_block.append(remain[start:])
        start += len_per_block
        print(f"Inc{i + 1} Block Size : {len(inc_block[i])}")
        inc_block[i].to_csv(INCREASE_PATH + category + f"_inc{i + 1}.csv", index=False)


SetIncreaseBlocks(pre_ml_100k_df, "ml_100k")
SetIncreaseBlocks(pre_ml_small_df, "ml_small")

Inc1 Block Size : 4823
Inc2 Block Size : 4823
Inc3 Block Size : 4823
Inc4 Block Size : 4823
Inc5 Block Size : 4826
Inc1 Block Size : 3885
Inc2 Block Size : 3885
Inc3 Block Size : 3885
Inc4 Block Size : 3885
Inc5 Block Size : 3888


## 시나리오2, 유저와 아이템 모두 고정. (fixed)

In [84]:
pre_ml_100k_df = pd.read_csv(os.path.join(SAVE_PATH, "preprocessed_ml_100k.csv"))
pre_ml_small_df = pd.read_csv(os.path.join(SAVE_PATH, "preprocessed_ml_small.csv"))

In [85]:
FIXED_PATH = os.path.join(SAVE_PATH, "fixed\\")
FIXED_PATH

'C:\\tech\\Study\\KCC\\dataset\\Movielens\\fixed\\'

In [86]:
# base block 설정
SetBaseBlock(pre_ml_100k_df, FIXED_PATH, "ml_100k")
SetBaseBlock(pre_ml_small_df, FIXED_PATH, "ml_small")

base block size: 56275
base block size: 45332


In [87]:

def SetFixedBlocks(df, category):
    pivot = len(df) * BASE_DATA_RATIO // 10
    # base block에 있는 unique한 user, item index 추출
    unique_user = df[:pivot]['user'].unique()
    unique_item = df[:pivot]['item'].unique()
    # remain block에서 unique user/item에 해당하는 데이터만 추출
    remain = df[pivot:]
    fixed_df = remain[remain['user'].isin(unique_user) & remain['item'].isin(unique_item)]
    len_per_block = fixed_df.shape[0] // INC_STEP
    start = 0
    inc_block = []
    for i in range(INC_STEP):
        if i != INC_STEP - 1:
            inc_block.append(fixed_df[start:start + len_per_block])
        else:
            inc_block.append(fixed_df[start:])
        start += len_per_block
        print(f"Inc{i + 1} Block Size : {len(inc_block[i])}")
        inc_block[i].to_csv(FIXED_PATH + category + f"_inc{i + 1}.csv", index=False)


SetFixedBlocks(pre_ml_100k_df, "ml_100k")
SetFixedBlocks(pre_ml_small_df, "ml_small")

Inc1 Block Size : 626
Inc2 Block Size : 626
Inc3 Block Size : 626
Inc4 Block Size : 626
Inc5 Block Size : 627
Inc1 Block Size : 190
Inc2 Block Size : 190
Inc3 Block Size : 190
Inc4 Block Size : 190
Inc5 Block Size : 193


## 시나리오3, 유저만 증가 (user)

In [88]:
pre_ml_100k_df = pd.read_csv(os.path.join(SAVE_PATH, "preprocessed_ml_100k.csv"))
pre_ml_small_df = pd.read_csv(os.path.join(SAVE_PATH, "preprocessed_ml_small.csv"))

In [89]:
USER_PATH = os.path.join(SAVE_PATH, "user\\")
USER_PATH

'C:\\tech\\Study\\KCC\\dataset\\Movielens\\user\\'

In [90]:
# base block 설정
SetBaseBlock(pre_ml_100k_df, USER_PATH, "ml_100k")
SetBaseBlock(pre_ml_small_df, USER_PATH, "ml_small")

base block size: 56275
base block size: 45332


In [91]:
def SetUserBlocks(df, category):
    pivot = len(df) * BASE_DATA_RATIO // 10
    # base block에 있는 unique한 user index 추출
    unique_user = df[:pivot]['user'].unique()
    # base block에 존재하는 unique한 user index를 제외하여 remain을 형성하자.
    remain = df[pivot:]
    new_user_remain_df = remain[~remain['user'].isin(unique_user)]
    len_per_block = new_user_remain_df.shape[0] // INC_STEP
    start = 0
    inc_block = []
    for i in range(INC_STEP):
        if i != INC_STEP - 1:
            inc_block.append(new_user_remain_df[start:start + len_per_block])
        else:
            inc_block.append(new_user_remain_df[start:])
        start += len_per_block
        print(f"Inc{i + 1} Block Size : {len(inc_block[i])}")
        inc_block[i].to_csv(USER_PATH + category + f"_inc{i + 1}.csv")


SetUserBlocks(pre_ml_100k_df, "ml_100k")
SetUserBlocks(pre_ml_small_df, "ml_small")

Inc1 Block Size : 4173
Inc2 Block Size : 4173
Inc3 Block Size : 4173
Inc4 Block Size : 4173
Inc5 Block Size : 4174
Inc1 Block Size : 3613
Inc2 Block Size : 3613
Inc3 Block Size : 3613
Inc4 Block Size : 3613
Inc5 Block Size : 3615


## 시나리오4, 아이템만 증가. (item)

In [92]:
pre_ml_100k_df = pd.read_csv(os.path.join(SAVE_PATH, "preprocessed_ml_100k.csv"))
pre_ml_small_df = pd.read_csv(os.path.join(SAVE_PATH, "preprocessed_ml_small.csv"))

In [93]:
ITEM_PATH = os.path.join(SAVE_PATH, "item\\")
ITEM_PATH

'C:\\tech\\Study\\KCC\\dataset\\Movielens\\item\\'

In [94]:
# base block 설정
SetBaseBlock(pre_ml_100k_df, ITEM_PATH, "ml_100k")
SetBaseBlock(pre_ml_small_df, ITEM_PATH, "ml_small")

base block size: 56275
base block size: 45332


In [95]:
def SetItemBlocks(df, category):
    pivot = len(df) * BASE_DATA_RATIO // 10
    # base block에 있는 unique한 item index 추출
    unique_item = df[:pivot]['item'].unique()
    # base block에 존재하는 unique한 user index를 제외하여 remain을 형성하자.
    remain = df[pivot:]
    new_item_remain_df = remain[~remain['item'].isin(unique_item)]
    len_per_block = new_item_remain_df.shape[0] // INC_STEP
    start = 0
    inc_block = []
    for i in range(INC_STEP):
        if i != INC_STEP - 1:
            inc_block.append(new_item_remain_df[start:start + len_per_block])
        else:
            inc_block.append(new_item_remain_df[start:])
        start += len_per_block
        print(f"Inc{i + 1} Block Size : {len(inc_block[i])}")
        inc_block[i].to_csv(ITEM_PATH + category + f"_inc{i + 1}.csv")

SetItemBlocks(pre_ml_100k_df, "ml_100k")
SetItemBlocks(pre_ml_small_df, "ml_small")

Inc1 Block Size : 68
Inc2 Block Size : 68
Inc3 Block Size : 68
Inc4 Block Size : 68
Inc5 Block Size : 69
Inc1 Block Size : 512
Inc2 Block Size : 512
Inc3 Block Size : 512
Inc4 Block Size : 512
Inc5 Block Size : 516
