In [115]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import math
from scipy import stats
import statistics as st

import operator

In [116]:
# Load the raw interaction log. The file has no header row, so column names are supplied here.
steam_data = pd.read_csv(
    "data_final/steam-200k.csv",
    header=None,
    names=["User_ID", "Game", "Interaction", "Hours", "Ignore"],
)

# Drop the unused trailing column, then split the log by interaction type.
steam_raw = steam_data.drop(columns="Ignore")
steam_purchase = steam_raw.loc[steam_raw['Interaction'].eq("purchase")]
steam_play = steam_raw.loc[steam_raw['Interaction'].eq("play")]

# Join purchase rows to play rows on (user, game). Overlapping columns get the
# default _x (purchase side) / _y (play side) suffixes, so Hours_y is the play value.
steam = steam_purchase.merge(steam_play, on=['User_ID', 'Game'])
# NOTE(review): the merge uses the default inner join, so Hours_y can only be missing
# if the source file itself has a blank Hours value - confirm whether a left join
# (keeping purchased-but-never-played games at 0 hours) was intended.
steam['Hours_y'] = steam['Hours_y'].fillna(0)

# Keep only the columns used downstream: user id, game, and hours played.
steam_clean = steam.drop(columns=['Interaction_x', 'Interaction_y', 'Hours_x'])
steam_clean.head()

Unnamed: 0,User_ID,Game,Hours_y
0,151603712,The Elder Scrolls V Skyrim,273.0
1,151603712,Fallout 4,87.0
2,151603712,Spore,14.9
3,151603712,Fallout New Vegas,12.1
4,151603712,Left 4 Dead 2,8.9


In [127]:
# Sanity check: (rows, columns) of the cleaned user/game/hours frame.
steam_clean.shape

(70785, 3)

In [117]:
# algorithm based on https://www.researchgate.net/publication/330249306_Estimated_Rating_Based_on_Hours_Played_for_Video_Game_Recommendation
# Total hours played per game across all users, most-played game first.
# (original note: about 3600 distinct games are played in the dataframe)
game_time = (
    steam_clean
    .groupby('Game')['Hours_y']
    .sum()
    .reset_index()
    .sort_values('Hours_y', ascending=False)
)
game_time.head()

Unnamed: 0,Game,Hours_y
922,Dota 2,981684.6
673,Counter-Strike Global Offensive,322771.6
2994,Team Fortress 2,173673.3
670,Counter-Strike,134261.1
2691,Sid Meier's Civilization V,99821.3


In [118]:
# Turn the per-game totals into a lookup: game title -> total hours (rounded to 1 decimal).
# Built column-wise rather than via game_time.iloc[i][k]: integer keys on a
# label-indexed row Series are deprecated positional access in recent pandas, and
# materialising one Series per row is needlessly slow.
total_time_dic = {
    game: round(hours, 1)
    for game, hours in zip(game_time['Game'], game_time['Hours_y'].tolist())
}


In [119]:
# Create the (still empty) frequency and user-score columns on a copy of the clean frame.
# NaN is used as the placeholder instead of the string 'None' so both columns keep a
# numeric dtype and unfilled cells are recognised as missing by pandas.
steam_new = steam_clean.copy()
steam_new['frequency'] = np.nan
steam_new['score'] = np.nan
steam_new.head()

Unnamed: 0,User_ID,Game,Hours_y,frequency,score
0,151603712,The Elder Scrolls V Skyrim,273.0,,
1,151603712,Fallout 4,87.0,,
2,151603712,Spore,14.9,,
3,151603712,Fallout New Vegas,12.1,,
4,151603712,Left 4 Dead 2,8.9,,


In [120]:
# Frequency = this player's hours on a game divided by that game's total hours
# across all players (looked up in total_time_dic). A player who accounts for a
# larger share of a game's play time gets a higher frequency.
# Computed column-wise instead of looping over steam_clean.iloc[i]: same values,
# no deprecated positional Series indexing, and the column ends up as float64.
game_total_hours = steam_clean['Game'].map(total_time_dic)
steam_new['frequency'] = steam_clean['Hours_y'] / game_total_hours
    




In [121]:
# Preview the first rows now that the frequency column has been filled in.
steam_new.head()

Unnamed: 0,User_ID,Game,Hours_y,frequency,score
0,151603712,The Elder Scrolls V Skyrim,273.0,0.003851,
1,151603712,Fallout 4,87.0,0.008083,
2,151603712,Spore,14.9,0.008731,
3,151603712,Fallout New Vegas,12.1,0.000816,
4,151603712,Left 4 Dead 2,8.9,0.000265,


In [122]:
# Order rows by game title and, within each game, by frequency - both descending -
# so each game's rows are contiguous and start with its highest-frequency player.
# The index is renumbered 0..n-1 to match the new row order.
steam_new = steam_new.sort_values(
    by=['Game', 'frequency'],
    ascending=[False, False],
    ignore_index=True,
)
# Independent snapshot of the sorted frame, read from while scores are written to steam_new.
steam_read = steam_new.copy()
steam_new.head(10)

Unnamed: 0,User_ID,Game,Hours_y,frequency,score
0,159800136,theHunter Primal,71.0,0.826542,
1,62878249,theHunter Primal,9.4,0.10943,
2,207424334,theHunter Primal,4.6,0.053551,
3,157080495,theHunter Primal,0.9,0.010477,
4,43913966,theHunter,95.0,0.307245,
5,135879753,theHunter,31.0,0.100259,
6,137610845,theHunter,15.4,0.049806,
7,176929122,theHunter,10.1,0.032665,
8,163968268,theHunter,8.6,0.027814,
9,163930591,theHunter,8.5,0.02749,


In [123]:
def cal_score(sum_freq):
    """Convert an accumulated frequency into a score rounded to one decimal.

    The mapping is linear: an accumulated frequency of 0 gives 5.0 and an
    accumulated frequency of 1 gives 1.0, so inputs in [0, 1] yield scores
    in the range [1, 5].
    """
    remaining_share = 1 - sum_freq
    return round(remaining_share * 4 + 1, 1)

# Score every row from the frequency accumulated by the higher-ranked players of the same game.
# Relies on steam_read being sorted so that each game's rows are contiguous and in
# descending frequency order, with a 0..n-1 index shared with steam_new.
#
# Iterates over plain column lists instead of steam_read.iloc[idx][k]: the arithmetic
# and its order are unchanged, but no row Series is built per access and no deprecated
# positional Series indexing is used.
scores = []
stored_name = None   # game of the previous row; None means "no row seen yet"
prev_freq = None     # frequency of the previous row
sum_freq = 0         # running total of frequencies seen so far within the current game
for game, freq in zip(steam_read['Game'].tolist(), steam_read['frequency'].tolist()):
    if game != stored_name:
        # First row of a new game: restart the running total and assume the
        # highest-frequency player gives the game 5 stars.
        sum_freq = freq
        score = 5.0
        stored_name = game
    else:
        if freq != prev_freq:
            # Score from the share already taken by higher-ranked players
            # (the current player's own frequency is added afterwards).
            score = cal_score(sum_freq)
        else:
            # Same frequency as the previous player: assume they give the same score.
            score = scores[-1]
        sum_freq += freq
    scores.append(score)
    prev_freq = freq

# Write all scores in one positional assignment (row order matches steam_read).
steam_new['score'] = scores



In [124]:
# Preview the first 15 rows with the score column filled in.
steam_new.head(15)

Unnamed: 0,User_ID,Game,Hours_y,frequency,score
0,159800136,theHunter Primal,71.0,0.826542,5.0
1,62878249,theHunter Primal,9.4,0.10943,1.7
2,207424334,theHunter Primal,4.6,0.053551,1.3
3,157080495,theHunter Primal,0.9,0.010477,1.0
4,43913966,theHunter,95.0,0.307245,5.0
5,135879753,theHunter,31.0,0.100259,3.8
6,137610845,theHunter,15.4,0.049806,3.4
7,176929122,theHunter,10.1,0.032665,3.2
8,163968268,theHunter,8.6,0.027814,3.0
9,163930591,theHunter,8.5,0.02749,2.9


In [125]:
# Persist the scored table without the row index. The 'utf-8-sig' codec prefixes the
# file with a UTF-8 byte-order mark - presumably so spreadsheet tools detect the
# encoding; confirm downstream readers expect it.
steam_new.to_csv( "data_final/user_based_score.csv", index=False, encoding='utf-8-sig')