In [1]:
import pandas as pd
import numpy as np

# Column names applied to the user-interaction CSV; they replace whatever is
# on the file's first line, which is skipped below (presumably a header row).
u_cols = ['user_id', 'game_name', 'rated', 'playtime', 'player_achievement']

# Compact dtypes for the numeric user columns to keep memory usage down.
dtype_map = dict(
    rated='int8',
    playtime='float32',
    player_achievement='float32',
)

# Game catalogue, read with pandas defaults.
games = pd.read_csv('data/dataset.csv')

# Per-user interaction records.
users = pd.read_csv(
    'data/user_data.csv',
    names=u_cols,
    dtype=dtype_map,
    skiprows=1,
    sep=',',
    encoding='utf-8',
)


In [2]:
def score(rated, playtime, average_playtime, median_playtime, player_achievement, total_achievement):
    """Compute a heuristic preference score for one (user, game) pair.

    The score is a weighted sum of up to three components:

    * rating      -- full weight when ``rated == 1``, otherwise nothing;
    * playtime    -- the user's playtime relative to the game's average and
                     median playtime;
    * achievement -- the share of the game's achievements the user unlocked.

    When the user has unlocked at least one achievement the weights are
    0.45 / 0.30 / 0.25; otherwise the achievement component is dropped and
    the weights become 0.60 / 0.40 / 0.

    Args:
        rated: 1 if the user rated the game positively, anything else otherwise.
        playtime: the user's playtime for the game.
        average_playtime: average playtime of the game across players.
        median_playtime: median playtime of the game across players.
        player_achievement: number of achievements the user unlocked.
        total_achievement: number of achievements the game offers.

    Returns:
        The preference score as a number. Zero denominators (a game with no
        achievements, or with no recorded playtime statistics) contribute 0
        instead of raising ``ZeroDivisionError``.
    """

    def _ratio(numerator, denominator):
        # Only an exact zero is special-cased so that NaN inputs keep
        # propagating exactly as a plain division would.
        return numerator / denominator if denominator != 0 else 0.0

    if player_achievement > 0:
        rated_ratio = 0.45
        playtime_ratio = 0.3
        player_achievement_ratio = 0.25
    else:
        rated_ratio = 0.6
        playtime_ratio = 0.4
        player_achievement_ratio = 0

    preference = 0
    if rated == 1:
        preference += rated_ratio

    # Playtime component: a factor that scales the playtime weight.
    if playtime > median_playtime:
        if playtime > average_playtime:
            playtime_factor = 1.0
        else:
            playtime_factor = _ratio(playtime, average_playtime)
    else:
        combined_playtime = average_playtime + median_playtime
        if playtime > average_playtime:
            playtime_factor = 1.8 * _ratio(playtime, combined_playtime)
        else:
            playtime_factor = _ratio(playtime, combined_playtime)
    preference += playtime_factor * playtime_ratio

    # Achievement component: the thresholds apply to the completion ratio,
    # not to the raw achievement count.
    achievement_percent = _ratio(player_achievement, total_achievement)
    if achievement_percent >= 0.7:
        preference += achievement_percent * player_achievement_ratio
    elif achievement_percent >= 0.4:
        preference += 0.7 * achievement_percent * player_achievement_ratio

    return preference

In [3]:
# Attach game metadata to every user record; a left join keeps user rows
# whose game_name has no match in games['name'].
df = pd.merge(users, games, how='left', left_on='game_name', right_on='name')
def min_max_scale(series):
    """Rescale a numeric Series linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        series: numeric pandas Series.

    Returns:
        A Series with the same index holding ``(x - min) / (max - min)``.
        A constant Series (max == min) is mapped to zeros rather than to NaN
        from a 0/0 division.
    """
    lowest = series.min()
    value_range = series.max() - lowest
    if value_range == 0:
        # Constant column: dividing by 1 leaves the already-zero numerators
        # untouched and avoids producing NaN for every row.
        value_range = 1
    return (series - lowest) / value_range

# Columns that feed the preference score; missing values are treated as 0
# before scaling.
feature_cols = [
    'rated', 'playtime', 'player_achievement',
    'average_playtime', 'median_playtime', 'achievements',
]
for col in feature_cols:
    df['norm_' + col] = min_max_scale(df[col].fillna(0))

# The score is the unweighted row-wise mean of the normalised features.
# NOTE(review): the hand-written score() function defined in an earlier cell
# is not used here -- confirm which scoring scheme is intended.
norm_cols = ['norm_' + col for col in feature_cols]
df['score'] = df[norm_cols].mean(axis=1)

# Bin the score into five quantile buckets labelled 1 (lowest) to 5 (highest).
df['preference'] = pd.qcut(df['score'], q=5, labels=[1, 2, 3, 4, 5])


In [11]:
from CF import CF
from sklearn.model_selection import train_test_split

# Keep only the identifiers and the target label for the recommender.
ratings = df[['user_id', 'game_name', 'preference']].copy()

# Replace the raw identifiers with integer category codes.
for id_col in ('user_id', 'game_name'):
    ratings[id_col + '_code'] = ratings[id_col].astype('category').cat.codes

# Rows of (user code, game code, preference) as a NumPy array.
Y_data = ratings.loc[:, ['user_id_code', 'game_name_code', 'preference']].to_numpy()

# Hold out 20% of the rows for evaluation; the seed is fixed for reproducibility.
Y_train, Y_test = train_test_split(Y_data, random_state=42, test_size=0.2)

# NOTE(review): uuCF=0 presumably selects item-item filtering (the evaluation
# cell prints "Item-Item CF") -- confirm against the CF module.
rs = CF(Y_train, k=20, uuCF=0)
rs.fit()


In [10]:
# Evaluate on at most the first 10000 held-out rows.
n_tests = min(10000, Y_test.shape[0])

# Accumulate the squared error between predicted and actual preference.
SE = 0
for user_code, item_code, actual in Y_test[:n_tests]:
    # Cast the codes to int before handing them to the recommender.
    predicted = rs.pred(int(user_code), int(item_code), normalized=0)
    SE += (predicted - actual) ** 2

RMSE = np.sqrt(SE / n_tests)
print("Item-Item CF, RMSE =", RMSE)

Item-Item CF, RMSE = 1.6965264018213724


In [7]:
# Number of rows in the held-out test split.
print(len(Y_test))

350521
