In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from scipy import stats
import statistics as st
import operator

In [None]:
data = pd.read_csv('data/steam-200k.csv', header=None, names=["User_ID", "Game", "Interaction", "Hours", "Ignore"])
data = data.drop("Ignore", axis =1)
steam1 = data

steam1 = data[data['Interaction'] == "purchase"]
steam2 = data[data['Interaction'] == "play"]
steam3 = pd.merge(steam1, steam2, how = 'left', left_on = ['User_ID', 'Game'], right_on = ['User_ID', 'Game'])
steam3['Hours_y'] = steam3['Hours_y'].fillna(0)

steam_clean = steam3.drop(['Interaction_x', 'Interaction_y', 'Hours_x'], axis = 1)
steam_clean.head()
steam_clean.info()

In [None]:
print("Max " + str(steam_clean['Hours_y'].max())
    + "\nMean " + str(steam_clean['Hours_y'].mean()) 
      + "\nMedian " + str(steam_clean['Hours_y'].median()) 
      + "\nMode " + str(steam_clean['Hours_y'].mode()))

## Data Visualization

In [None]:
out = pd.cut(steam_clean['Hours_y'], bins = [0, 25, 50, 100, 250, 500, 750, 1000, 1250])
ax = out.value_counts(sort=False).plot.bar(rot=0, color="green", figsize = (13, 8))

In [None]:
out = pd.cut(steam_clean['Hours_y'], bins = [25, 50, 100, 250, 500, 750, 1000, 1250])
ax = out.value_counts(sort=False).plot.bar(rot=0, color="green", figsize = (13, 8))

In [None]:
top_hours = steam_clean.groupby('Game')['Hours_y'].max().reset_index()
top_hours = top_hours.sort_values(by=['Hours_y'], ascending = False)
top_games = top_hours.head(20)

top_games_graph = top_games[['Game', 'Hours_y']].plot.bar(x = 'Game', y = 'Hours_y', figsize = (13, 8), title = "Top 20 Games", color = "green")

## Frequency and Rating

In [None]:
def frequency(hours_i, user_hours):
    if user_hours == 0:
        return 0
    elif hours_i == user_hours:
        return 1
    return hours_i/(user_hours - hours_i)

def rating(frequency_sum):
    return 4*(1-frequency_sum)+1

In [None]:
b = 0
for b in range(0, 2):
    print(b)

In [None]:
game_hours = steam_clean.groupby(['Game'])['Hours_y'].sum().reset_index()

steam_clean2 = steam_clean
steam_length = int(len(steam_clean2.index))
i = 0
steam_clean2['Frequency'] = np.nan
for i in range(0, steam_length):
    hours_i = steam_clean2.iloc[i][2]
    user_hours = game_hours[game_hours['Game'] == steam_clean2.iloc[i][1]].iloc[0][1]

    steam_clean2.at[i, 'Frequency'] = frequency(hours_i, user_hours)

In [None]:
steam_clean3 = steam_clean2

steam_clean3['Rating'] = np.nan
steam_clean3.sort_values(by = ['Game', 'Frequency'], inplace = True, ascending = False, ignore_index = True)

print(steam_clean3)

In [None]:
store = "" 
sum_f = 0 
last_f = 0 
last_r = 0 

for i in range(0, steam_length): 
    temp = steam_clean3.iloc[i][1] 
    f_i = steam_clean3.iloc[i][3] 
    
    if temp != store: 
        store = temp 
        if f_i == 0.0:
            sum_f = 0 
            last_f = 0
            last_r = 1 
            steam_clean3.at[i, 'Rating'] = 1
        else: 
            sum_f = f_i
            last_f = f_i
            last_r = 5 
            steam_clean3.at[i, 'Rating'] = 5 
        
    else: 
        if f_i == 0: 
            last_r = 1 
            steam_clean3.at[i, 'Rating'] = 1
        elif last_f == f_i: 
            sum_f += f_i 
            steam_clean3.at[i, 'Rating'] = last_r
        else:
            rating_f = rating(sum_f) 
            sum_f += f_i 
            last_f = f_i 
            last_r = rating_f
            steam_clean3.at[i, 'Rating'] = rating_f

## KNN 

In [None]:
steam_clean4 = steam_clean3
steam_clean4.sort_values(by = ['User_ID', 'Game'], inplace = True, ignore_index = True)
#remove the games with no hours played
steam_clean4 = steam_clean4[steam_clean4['Rating'] != 1]
print(steam_clean4)

In [None]:
def distance(q, p):
    total = 0
    for i in range(0, len(q)):
        total += (q[i]-p[i])**2
    return math.sqrt(total)/len(q)

def neighbors(df, k_neighbors, user):
    distances = []
    user_games = df[df['User_ID'] == user]
    df_subset = df[df['User_ID'] != user]
    user_temp = []
    temp = []
    temp_id = 0
    
    for index, row in df_subset.iterrows():
        if row['Game'] in set(user_games['Game']):
            if row['User_ID'] == temp_id:
                temp.append(row['Rating'])
                user_temp.append(user_games.loc[user_games['Game'] == row['Game'], 'Rating'].iloc[0])
            elif temp_id == 0:
                temp_id = row['User_ID']
                temp.append(row['Rating'])
                user_temp.append(user_games.loc[user_games['Game'] == row['Game'], 'Rating'].iloc[0])
            else:
                dist = distance(user_temp, temp)
                distances.append((temp_id, dist))
                temp_id = row['User_ID']
                temp = []
                temp.append(row['Rating'])
                user_temp = []
                user_temp.append(user_games.loc[user_games['Game'] == row['Game'], 'Rating'].iloc[0])
    
    distances.sort(key=operator.itemgetter(1))
    neighbor_list =[]
    
    for i in range(k_neighbors):
        neighbor_list.append(distances[i])
    return neighbor_list
        
def recommend(user, neighbor_list, df):
    user_games = df[df['User_ID'] == user]
    dissim_games = []

    for neighbor in neighbor_list:
        temp = df[(df['User_ID'] == neighbor[0]) & (~df['Game'].isin(user_games['Game']))]
        
        for index, game in temp.iterrows():
            dissim_games.append((game['Game'], game['Rating']))

    dissim_games.sort(key=operator.itemgetter(0))
    flag = ""
    running_sum = 0
    rec_list = []
    count = 0

    for dis in dissim_games:
        if flag != dis[0]:
            if flag != "":
                rec_list.append((flag, running_sum/count))
            flag = dis[0]
            running_sum = dis[1]
            count = 1
        else:
            running_sum += dis[1]
            count += 1

    sort_list = sorted(rec_list, key=operator.itemgetter(1), reverse = True)
    return(sort_list)
        
def rec_games(rec_tuple):
    games = []
    for pair in rec_tuple:
        games.append(pair[0])
    return games

In [None]:
test_neighbors = neighbors(steam_clean4, 10, 159800136)
print(test_neighbors)

In [None]:
recs = recommend(53875128, test_neighbors, steam_clean4)
recommended_games = rec_games(recs)
print(recommended_games)

In [41]:
def knn(user, k_neighbors, df):
    knearest = neighbors(df, k_neighbors, user)
    rec_list = recommend(user, knearest, df)
    games = rec_games(rec_list)
    return games

In [36]:
steam_clean4.to_pickle("data.pkl")

In [31]:
output = pd.read_pickle("data.pkl")

In [47]:
knn(5250, 10, steam_clean4)
#53875128

['Borderlands 2 RU',
 'Dust An Elysian Tail',
 'Hotline Miami',
 'METAL SLUG 3',
 'METAL SLUG X',
 'Septerra Core',
 'Steel Ocean',
 'Mafia II',
 'Borderlands 2',
 'Mark of the Ninja',
 'Tomb Raider',
 'Counter-Strike Source',
 'Half-Life Opposing Force',
 'South Park The Stick of Truth',
 'Burnout Paradise The Ultimate Box',
 'Resident Evil 6 / Biohazard 6',
 'Half-Life Blue Shift',
 'Left 4 Dead 2',
 'Monaco',
 'Borderlands The Pre-Sequel',
 'Counter-Strike Condition Zero Deleted Scenes',
 'Darksiders',
 'METAL GEAR RISING REVENGEANCE',
 'Castle Crashers',
 'Dead Space',
 'Borderlands',
 'Bastion',
 'Counter-Strike Condition Zero',
 'Fallout 3 - Game of the Year Edition',
 'Skullgirls',
 'SpeedRunners',
 'Devil May Cry 3 Special Edition',
 'Half-Life',
 "Mirror's Edge",
 'Counter-Strike Global Offensive',
 'DmC Devil May Cry',
 'Robocraft',
 'FEZ',
 'Counter-Strike',
 'Terraria',
 'Saints Row The Third',
 'Unturned',
 'Dead Island',
 'The Walking Dead',
 'BioShock',
 'Devil May Cry 4