In [16]:
import pandas as pd
import numpy as np

# Column names imposed on the user file; its own first line is skipped when reading.
u_cols = ['user_id', 'game_name', 'rated', 'playtime', 'player_achievement']

# Narrow numeric dtypes requested for the three numeric user columns.
dtype_map = dict(rated='int8', playtime='float32', player_achievement='float32')

# Game table, read with pandas defaults.
games = pd.read_csv('data/dataset.csv')

# User table: comma-separated, latin-1 encoded, with the column names and dtypes declared above.
user_csv_options = dict(
    sep=',',
    names=u_cols,
    skiprows=1,
    encoding='latin-1',
    dtype=dtype_map,
)
users = pd.read_csv('data/user_data.csv', **user_csv_options)


In [17]:
# Positional 80/20 split: the first 80% of rows (in existing row order, no shuffling)
# become the training partition, the remainder the test partition.
split_index = int(len(users) * 0.8)
train_df, test_df = users.iloc[:split_index], users.iloc[split_index:]

# NumPy array versions of both partitions.
train, test = train_df.to_numpy(), test_df.to_numpy()

# Report the number of rows in each partition, one per line.
print(len(train), len(test), sep='\n')

1399885
349972


In [20]:
def score(rated, playtime, average_playtime, median_playtime, player_achievement, total_achievement):
    """Compute a preference score for a single user/game interaction.

    The score is a weighted sum of three components: a rating component, a
    playtime component (the user's playtime compared with the game's average
    and median playtime) and an achievement component (the fraction of the
    game's achievements the user has unlocked).

    Args:
        rated: Rating flag; only the value 1 earns the rating weight.
        playtime: The user's playtime for the game.
        average_playtime: Average playtime of the game.
        median_playtime: Median playtime of the game.
        player_achievement: Number of achievements the user unlocked.
        total_achievement: Total number of achievements in the game.

    Returns:
        The accumulated preference value. Components whose denominator is
        not strictly positive contribute nothing instead of raising
        ZeroDivisionError.
    """
    # Without any unlocked achievement the achievement weight is dropped and
    # redistributed over the rating and playtime components.
    if player_achievement > 0:
        rated_ratio, playtime_ratio, player_achievement_ratio = 0.45, 0.3, 0.25
    else:
        rated_ratio, playtime_ratio, player_achievement_ratio = 0.6, 0.4, 0

    preference = 0

    # Rating component: all or nothing.
    if rated == 1:
        preference += rated_ratio

    # Playtime component.
    if playtime > median_playtime:
        if playtime > average_playtime:
            # Above both reference values: full playtime weight.
            preference += playtime_ratio
        elif average_playtime > 0:
            preference += (playtime / average_playtime) * playtime_ratio
    else:
        combined_playtime = average_playtime + median_playtime
        # Guard the division: a game with no recorded playtime adds nothing.
        if combined_playtime > 0:
            if playtime > average_playtime:
                # At or below the median but above the average: boosted by 1.8.
                preference += (1.8 * playtime / combined_playtime) * playtime_ratio
            else:
                preference += (playtime / combined_playtime) * playtime_ratio

    # Achievement component: the 0.7 / 0.4 thresholds apply to the unlocked
    # fraction, not to the raw achievement count.
    if total_achievement > 0:
        achievement_percent = player_achievement / total_achievement
    else:
        achievement_percent = 0

    if achievement_percent >= 0.7:
        preference += achievement_percent * player_achievement_ratio
    elif achievement_percent >= 0.4:
        # Mid-range completion is discounted by a factor of 0.7.
        preference += 0.7 * achievement_percent * player_achievement_ratio

    return preference

In [26]:
# Left join: keep every user row and attach the game columns whose `name` equals the row's `game_name`.
df = pd.merge(users, games, how='left', left_on='game_name', right_on='name')
def min_max_scale(series):
    """Rescale a numeric Series linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        series: Numeric pandas Series.

    Returns:
        A Series with the same index holding ``(x - min) / (max - min)``.
        When every value is identical (``max == min``) the result is all
        zeros rather than the NaN produced by dividing 0 by 0.
    """
    low = series.min()
    span = series.max() - low
    if span == 0:
        # Constant column: multiplying the zero offsets by 0.0 yields a
        # floating-point zero column on the original index.
        return (series - low) * 0.0
    return (series - low) / span

# Raw columns to normalise, and the names of their normalised counterparts.
feature_cols = ['rated', 'playtime', 'player_achievement',
                'average_playtime', 'median_playtime', 'achievements']
norm_cols = ['norm_' + name for name in feature_cols]

# Missing values are treated as 0 before each column is min-max scaled.
for col in feature_cols:
    filled = df[col].fillna(0)
    df[f'norm_{col}'] = min_max_scale(filled)

# Row-wise mean of the six normalised columns.
df['score'] = df[norm_cols].mean(axis=1)

# Quantile-based binning of the score into five bins labelled 1 to 5.
df['reference'] = pd.qcut(df['score'], q=5, labels=list(range(1, 6)))

df.head()


Unnamed: 0,user_id,game_name,rated,playtime,player_achievement,appid,name,release_date,english,developer,...,owners,price,norm_rated,norm_playtime,norm_player_achievement,norm_average_playtime,norm_median_playtime,norm_achievements,score,reference
0,user_0,Schein,1,164.070007,20.0,321920.0,Schein,2014-10-09,1.0,Zeppelin Studio,...,100000-200000,5.49,1.0,0.00051,0.004001,0.00289,0.00289,0.0042,0.169082,5
1,user_0,Deepworld,1,375.029999,0.0,340810.0,Deepworld,2015-04-22,1.0,Bytebin,...,200000-500000,0.0,1.0,0.001165,0.0,0.001962,0.003436,0.0,0.167761,4
2,user_0,The Expendabros,0,125.050003,0.0,312990.0,The Expendabros,2014-08-05,1.0,Free Lives,...,2000000-5000000,0.0,0.0,0.000388,0.0,0.000446,0.000294,0.0,0.000188,1
3,user_0,Tallowmere,0,705.390015,21.0,340520.0,Tallowmere,2015-03-03,1.0,Chris McFarland,...,20000-50000,2.89,0.0,0.002192,0.004201,0.002203,0.002555,0.0056,0.002792,3
4,user_0,eversion,0,531.76001,1.0,33680.0,eversion,2010-06-07,1.0,Zaratustra Productions,...,50000-100000,3.99,0.0,0.001652,0.0002,0.002445,0.002445,0.0028,0.00159,2
