In [3]:
import pandas as pd
import numpy as np
import heapq

In [4]:
rmd_user = pd.read_csv('user_based_rmd.csv')
content_based_data = pd.read_csv('data_final/combined_csv.csv')

In [5]:
rmd_user.head()

Unnamed: 0,game_name,recommend1,recommend2,recommend3,recommend4,recommend5,recommend6,recommend7,recommend8,recommend9,recommend10
0,A Bird Story,State of Decay,Ys Origin,Saints Row IV,BioShock 2,Game Dev Tycoon,Terraria,Alpha Protocol,Emily is Away,Hotline Miami,Rogue Legacy
1,A Story About My Uncle,Unturned,Grand Theft Auto V,Tomb Raider,Garry's Mod,Warframe,Thief Gold,BioShock Infinite,Metro 2033,Quake Live,Rocket League
2,AdVenture Capitalist,Dark Void,Gothic,Gothic 3,MDK,PlanetSide 2,Singularity,The Room,Tomb Raider II,Enclave,Thief Gold
3,Age of Empires Online,And Yet It Moves,Braid,Grand Theft Auto,Street Fighter IV,Team Fortress Classic,Unturned,Arma 3,Fable Anniversary,Team Fortress 2,Grand Theft Auto IV
4,Age of Wonders,Gone Home,Portal,Machinarium,Max Payne,Portal 2,Borderlands 2,Crusader Kings II,Garry's Mod,Grand Theft Auto V,Half-Life 2


In [6]:
content_based_data.head()

Unnamed: 0.1,difficulty,length,name,rating,platform,sales,type1,type2,Unnamed: 0
0,2.93,44.84,Mass Effect 2,4.33,pc,1000,role-playing,,
1,3.15,17.52,The Curse of Monkey Island,4.32,pc,200,adventure,,
2,3.12,35.22,Half-Life 1: Anthology,4.32,pc,100,miscellaneous,,
3,3.24,60.04,Warcraft III: Battlechest,4.32,pc,200,strategy,,
4,2.46,46.86,Steins;Gate,4.32,pc,200,adventure,,


In [7]:
# choose all games in content table
big_game_list = content_based_data['name'].to_list()
print(len(big_game_list))

# remove duplicates
big_game_list = list(set(big_game_list))
print(len(big_game_list))

3092
2378


In [8]:
small_game_list = rmd_user['game_name'].to_list()
len(small_game_list)

458

In [9]:
# content based recommendation algorithm only works on items that not included in user based algorithm
aimed_list = []
for i in range(len(big_game_list)):
    if (big_game_list[i] not in small_game_list):
        aimed_list.append(big_game_list[i])
print(len(aimed_list))

1921


In [10]:
# find range of different parameters
print(content_based_data['difficulty'].min(),content_based_data['difficulty'].max())
print(content_based_data['length'].min(),content_based_data['length'].max())
print(content_based_data['rating'].min(),content_based_data['rating'].max())

1.16 4.79
0.76 80.0
1.21 4.56


In [11]:
# This function calculates square of distance between two points
def cal_dist(a,b):
    sum = 0
    for i in range(len(a)):
        sum += (a[i]-b[i])**2
    return np.sqrt(sum)

# we create new normalized difficulty and length in range [0,1]
# formula to normalize norm_val = (X-Xmin)/(Xmax-Xmin)
# append list contains [name,norm_difficulty,norm_length]
norm_list = []
content_based_data.head()
for i in range(len(content_based_data)):
    name = content_based_data.iloc[i][2]
    difficulty = content_based_data.iloc[i][0]
    length = content_based_data.iloc[i][1]
    rating = content_based_data.iloc[i][3]
    norm_difficulty = (difficulty-content_based_data['difficulty'].min())/(content_based_data['difficulty'].max()-content_based_data['difficulty'].min())
    norm_length = (length-content_based_data['length'].min())/(content_based_data['length'].max()-content_based_data['length'].min())
    norm_rating = (rating-content_based_data['rating'].min())/(content_based_data['rating'].max()-content_based_data['rating'].min())
    temp = [name,round(norm_difficulty,2),round(norm_length,2),round(norm_rating,2),content_based_data.iloc[i][5],content_based_data.iloc[i][4]]
    norm_list.append(temp)

norm_list[:5]


[['Mass Effect 2', 0.49, 0.56, 0.93, 1000, 'pc'],
 ['The Curse of Monkey Island', 0.55, 0.21, 0.93, 200, 'pc'],
 ['Half-Life 1: Anthology', 0.54, 0.43, 0.93, 100, 'pc'],
 ['Warcraft III: Battlechest', 0.57, 0.75, 0.93, 200, 'pc'],
 ['Steins;Gate', 0.36, 0.58, 0.93, 200, 'pc']]

In [15]:
dist_dic = {}
sale_volume_bonus_dic = { 100:0, 200:0.02, 500:0.05, 1000:0.1 }

for i in range(len(norm_list)):
    key = norm_list[i][0]
    dist_list = []
    for j in range(len(norm_list)):
        # check only recommends games in the same platform and not the same name
        if (norm_list[i][0] == norm_list[j][0] or norm_list[i][5] != norm_list[j][5]):
            continue
        temp_dist = cal_dist([norm_list[i][1],norm_list[i][2]],[norm_list[j][1],norm_list[j][2]])
        # give some bonus if the game has high rating or high sale volume
        temp_dist -= sale_volume_bonus_dic[norm_list[i][4]]
        temp_dist -= norm_list[i][3]*0.1  # bonus range for rating is [0,0.1]
        dist_list.append((norm_list[j][0],temp_dist))
    dist_list.sort(key = lambda x:x[1])
    dist_dic[key] = dist_list



In [16]:
# example output
dist_dic['The Legend of Zelda: Breath of the Wild']

[('The Witcher 3: Wild Hunt - Complete Edition', -0.16999999999999998),
 ("Dragon's Dogma: Dark Arisen", -0.14999999999999997),
 ('Octopath Traveler', -0.14169048105154697),
 ('Super Smash Bros. Ultimate', -0.11455996254682473),
 ('Disgaea 5 Complete', -0.11000000000000004),
 ('Xenoblade Chronicles 2', -0.11000000000000004),
 ('Hyrule Warriors: Definitive Edition', -0.10566018867943398),
 ('Shin Megami Tensei V', -0.10100505063388343),
 ('The Elder Scrolls V: Skyrim', -0.10000000000000003),
 ('Xenoblade Chronicles: Definitive Edition', -0.09000000000000002),
 ('Final Fantasy XII: The Zodiac Age', -0.08819660112501053),
 ('Final Fantasy X / X-2 HD Remaster', -0.08819660112501053),
 ('Bravely Default II', -0.08819660112501053),
 ('Yu-Gi-Oh! Legacy of the Duelist: Link Evolution', -0.08819660112501053),
 ('Fire Emblem: Three Houses', -0.08000000000000002),
 ('Digimon Story Cyber Sleuth: Complete Edition', -0.08000000000000002),
 ('Monster Hunter Generations Ultimate', -0.08000000000000002

In [14]:
out = []
for key,item in dist_dic.items():
    li = [key]

    # append first 20th games
    for i in range(len(item)):
        if (len(li) == 11):
            break
        if (item[i][0] != key):
            li.append(item[i][0])

    out.append(li)
df_recommend = pd.DataFrame(out, columns = ['game_name']+ ['recommend'+ str(i) for i in range(1,11)])
df_recommend.to_csv('content_based_rmd.csv',encoding='utf-8', index=False)