In [10]:
# import related library 
import numpy as np
import pandas as pd 
import heapq
from IPython.display import display, clear_output
import os

In [4]:
# csv file contains user based game score
data_user = pd.read_csv('data_final/user_based_score.csv')
data_user.head(5)


Unnamed: 0,User_ID,Game,Hours_y,frequency,score
0,159800136,theHunter Primal,71.0,0.826542,5.0
1,62878249,theHunter Primal,9.4,0.10943,1.7
2,207424334,theHunter Primal,4.6,0.053551,1.3
3,157080495,theHunter Primal,0.9,0.010477,1.0
4,43913966,theHunter,95.0,0.307245,5.0


In [5]:
# csv file contains content based information
data_content = pd.read_csv('data_final/combined_csv.csv')
data_content.head(5)

Unnamed: 0.1,difficulty,length,name,rating,platform,sales,type1,type2,Unnamed: 0
0,2.93,44.84,Mass Effect 2,4.33,pc,1000,role-playing,,
1,3.15,17.52,The Curse of Monkey Island,4.32,pc,200,adventure,,
2,3.12,35.22,Half-Life 1: Anthology,4.32,pc,100,miscellaneous,,
3,3.24,60.04,Warcraft III: Battlechest,4.32,pc,200,strategy,,
4,2.46,46.86,Steins;Gate,4.32,pc,200,adventure,,


In [6]:
# group by games and user_id
data_user.sort_values(by = ['Game','User_ID'], inplace = True, ignore_index = True)
data_user.head()

Unnamed: 0,User_ID,Game,Hours_y,frequency,score
0,46055854,007 Legends,0.7,1.0,5.0
1,11940338,0RBITALIS,0.6,0.5,5.0
2,86055705,0RBITALIS,0.3,0.25,3.0
3,93030550,0RBITALIS,0.3,0.25,3.0
4,49893565,1... 2... 3... KICK IT! (Drop That Beat Like a...,2.4,0.12,1.8


In [7]:
# This function creates a dictionary to find users for each game
# The key is game name, the value is a list contains all user id
def create_user_dic(df):
    dic = {}
    for i in range(len(df)):
        if df['Game'][i] in dic:
            dic[df['Game'][i]].append(df['User_ID'][i])
        else:
            dic[df['Game'][i]] = [df['User_ID'][i]]
    return dic
user_dic = create_user_dic(data_user)
data_user.head()

Unnamed: 0,User_ID,Game,Hours_y,frequency,score
0,46055854,007 Legends,0.7,1.0,5.0
1,11940338,0RBITALIS,0.6,0.5,5.0
2,86055705,0RBITALIS,0.3,0.25,3.0
3,93030550,0RBITALIS,0.3,0.25,3.0
4,49893565,1... 2... 3... KICK IT! (Drop That Beat Like a...,2.4,0.12,1.8


In [8]:
vc = data_user.User_ID.value_counts()
# create a list to only keep users have more than three games. 
# This will prevent recommendation algorithms recommend games based on users that less than two games
li = vc[vc > 3].index
user_game_dic = {}

for i in range(len(data_user)):
    temp_id = data_user['User_ID'][i]
    if (temp_id in li):
        if (temp_id in user_game_dic):
            user_game_dic[temp_id].append(data_user['Game'][i])
        else:
            user_game_dic[temp_id] = [data_user['Game'][i]]

print('number of users who have more than 3 games: ',len(li))

game_list = []
for i in range(len(data_content)):
    game_list.append(data_content['name'][i])
print('game names that we want to find recommendations ',len(game_list))


number of users who have more than 3 games:  2840
game names that we want to find recommendations  3092


In [9]:
# This function gives us a list of same games for two users
def intersection(lst1, lst2):
    lst3 = [value for value in lst1 if value in lst2]
    return lst3

intersection(user_game_dic[11940338],user_game_dic[46055854])


['Grand Theft Auto V', 'Left 4 Dead 2']

In [10]:
# This function prints the progress for training KNN
def print_info(cnt,total):
    if (cnt == 0):
        print('start training kNN')
    elif (cnt % 200 == 0):
        print(round(cnt/total*100,2), '(%) done for KNN')

# This function calculates square of distance between two points
def cal_dist(a,b):
    sum = 0
    for i in range(len(a)):
        sum += (a[i]-b[i])**2
    return np.sqrt(sum)/len(a)

# This function finds the Kth similar players with the current id. 
# Also, the number of games same between two players should > min_games_same
def find_neibor(cur_id,li,k,min_games_same):
    # create heap to find first kth distance
    dist_hp = []
    heapq.heapify(dist_hp)
    for i in range(len(li)):
        print_info(i,len(li))
        if (li[i] != cur_id):
            # get same games between two users
            repeated_games = intersection(user_game_dic[cur_id],user_game_dic[li[i]])
            if (len(repeated_games) < min_games_same):
                continue
            l1 = []
            l2 = []

            # append distance to list
            for game in repeated_games:
                temp_a = data_user[(data_user['Game'] == game) & (data_user['User_ID'] == cur_id)]['score'].values[0]
                temp_b = data_user[(data_user['Game'] == game) & (data_user['User_ID'] == li[i])]['score'].values[0]
                l1.append(temp_a)
                l2.append(temp_b)
            
            dist = -1*cal_dist(l1,l2) # make distance be negative to pop element has largest distance
            id_ = li[i]

            # add into heap if heap size is smaller than k elements
            if (len(dist_hp) < k):
                heapq.heappush(dist_hp,(dist,id_))
            else:
                temp = heapq.heappop(dist_hp)
                if (temp[0] > dist):
                    # pop back if the element with greatest distance in heap has less distance than current
                    heapq.heappush(dist_hp,temp)
                else:
                    # push current element
                    heapq.heappush(dist_hp,(dist,id_))
            
    return dist_hp

# sort the game score by descending order and return user id table
def find_max_score_id(df,game_name):
    small_df = df[df['Game'] == game_name]
    small_df.sort_values(by = ['score'],ascending=False)
    return small_df['User_ID']

# make recommendation and return the list
def make_recommendation(df,table,num_games_aimed,game_list,out_list):
    # create a heap to hold names of recommendation
    game_hp = []
    heapq.heapify(game_hp)
    # for loop to looping through different users for the same game
    for i in range(len(table)):
        user_id = table[i][1]
        small_df = df[df['User_ID'] == user_id]
        for i in range(len(small_df)):
            # push (-1*user_score, game_name) as a tuple into heap
            heapq.heappush(game_hp,(-1*small_df['score'].iloc[i],small_df['Game'].iloc[i]))
    
    while(len(out_list) < num_games_aimed and game_hp):
        # pop the game with maximum score from heap
        temp = heapq.heappop(game_hp)

        # check if game is in our content based game list and not already in out list
        if (temp[1] in game_list and temp[1] not in out_list):
            out_list.append(temp[1])
    return out_list
            
                    
    

In [11]:
# put it all together
def get_rmd(name,data_user,num_games_aimed,li,k,min_games_same,game_list):
    id_list = find_max_score_id(data_user,name)
    # print('Finish find similar user!')
    out = []
    for i in range(len(id_list)):
        if (len(out) >= num_games_aimed):
            return out
        temp_id = id_list.iloc[i]
        dist_table = find_neibor(temp_id,li,k,min_games_same)
        # print('Finish KNN for current user!')
        temp_recmd = make_recommendation(data_user,dist_table,num_games_aimed-len(out),game_list,out)
        # print('Finish recommendation!')
    return out
        


In [12]:
# find common games between two csv files (user_based csv and content_based csv)
common_games = []
for i in range(len(data_user)):
    if (data_user['Game'][i] in game_list and data_user['Game'][i] not in common_games):
        common_games.append(data_user['Game'][i])

print(user_game_dic[li[128]])

['A Virus Named TOM', 'Agarest Generations of War', 'Age of Empires II HD Edition', 'Alan Wake', 'Alien Swarm', 'Anomaly Warzone Earth', 'Antichamber', 'BIT.TRIP Presents... Runner2 Future Legend of Rhythm Alien', 'Bastion', 'Blood Bowl Legendary Edition', 'Borderlands', 'Borderlands 2', 'Borderlands The Pre-Sequel', 'Brtal Legend', 'Chivalry Medieval Warfare', 'Company of Heroes 2', 'Counter-Strike Global Offensive', 'DARK SOULS II Scholar of the First Sin', 'Dead Space', 'Defense Grid The Awakening', 'DiRT 2', 'Divinity Original Sin', 'Divinity Original Sin Enhanced Edition', 'Dota 2', 'Dust An Elysian Tail', 'Evolve', 'FTL Faster Than Light', 'Fallout New Vegas', 'Gauntlet ', 'Giana Sisters Twisted Dreams', 'Guacamelee! Gold Edition', 'Guardians of Middle-earth', 'Hammerwatch', 'Injustice Gods Among Us Ultimate Edition', 'Joe Danger 2 The Movie', "King's Bounty Armored Princess", "King's Bounty The Legend", 'Krater', 'Left 4 Dead 2', 'Mark of the Ninja', 'Monaco', 'Mortal Kombat Kol

In [23]:
# run and train knn to find recommandation for every game and record them into txt file
for i in range(len(common_games)):
    file_object = open('drive/MyDrive/Colab/project/rmd_output.txt','a')
    clear_output(wait=True)
    print('start find the ', i+1, ' game in 473')
    temp_game  = common_games[i]
    # load number
    file_object.write(str(i+1))
    file_object.write('\n')
    # load temp game name
    file_object.write(temp_game)
    file_object.write('\n')
    rmd_list = get_rmd(temp_game,data_user,10,li,10,3,common_games)
    for game in rmd_list:
        # write each game on a new row
        file_object.write(game)
        file_object.write('\n')
    file_object.close()

start find the  473  game in 473
start training kNN
7.04 (%) done for KNN
14.08 (%) done for KNN
21.13 (%) done for KNN
28.17 (%) done for KNN
35.21 (%) done for KNN
42.25 (%) done for KNN
49.3 (%) done for KNN
56.34 (%) done for KNN
63.38 (%) done for KNN
70.42 (%) done for KNN
77.46 (%) done for KNN
84.51 (%) done for KNN
91.55 (%) done for KNN
98.59 (%) done for KNN


In [8]:
# read txt file and convert to csv
f = open('recommendation_output.txt','r')

temp_list = []
out = []
while(True):
    line = f.readline()
    line = line[:-1]
    if (line == ''):
        break
    if (line[0].isdigit()):
        if (len(temp_list) == 11):
            out.append(temp_list)
        temp_list = []
    else:
        temp_list.append(line)
        
f.close()
print(len(out))
print(out[0])

458
['A Bird Story', 'State of Decay', 'Ys Origin', 'Saints Row IV', 'BioShock 2', 'Game Dev Tycoon', 'Terraria', 'Alpha Protocol', 'Emily is Away', 'Hotline Miami', 'Rogue Legacy']


In [13]:
df_recommend = pd.DataFrame (out, columns = ['game_name'] + ['recommend'+ str(i) for i in range(1,11)])
df_recommend.head()

Unnamed: 0,game_name,recommend1,recommend2,recommend3,recommend4,recommend5,recommend6,recommend7,recommend8,recommend9,recommend10
0,A Bird Story,State of Decay,Ys Origin,Saints Row IV,BioShock 2,Game Dev Tycoon,Terraria,Alpha Protocol,Emily is Away,Hotline Miami,Rogue Legacy
1,A Story About My Uncle,Unturned,Grand Theft Auto V,Tomb Raider,Garry's Mod,Warframe,Thief Gold,BioShock Infinite,Metro 2033,Quake Live,Rocket League
2,AdVenture Capitalist,Dark Void,Gothic,Gothic 3,MDK,PlanetSide 2,Singularity,The Room,Tomb Raider II,Enclave,Thief Gold
3,Age of Empires Online,And Yet It Moves,Braid,Grand Theft Auto,Street Fighter IV,Team Fortress Classic,Unturned,Arma 3,Fable Anniversary,Team Fortress 2,Grand Theft Auto IV
4,Age of Wonders,Gone Home,Portal,Machinarium,Max Payne,Portal 2,Borderlands 2,Crusader Kings II,Garry's Mod,Grand Theft Auto V,Half-Life 2


In [16]:
df_recommend.to_csv('user_based_rmd.csv',encoding='utf-8', index=False)