# Assumptions:
- Each account is used by exactly one person (i.e. accounts are not shared between players)

# Importing required libraries

In [7]:
#importing required libraries

import requests
import pandas
from tqdm.notebook import tqdm
from datetime import date, timedelta
import numpy as np

from surprise import NormalPredictor, Dataset, Reader, SVD, KNNBasic
from surprise.model_selection import cross_validate

In [8]:
#noticed that many account_ids in match_info are None when matches are randomly sampled

# Part 1: leaderboard for a given list of account_ids, ranked in descending order by win-loss ratio (by default all historical matches are considered; if a starting date is specified, only matches from that point onwards are considered)

In [9]:
# Match whose players are used as the sample input for the Part 1 leaderboard
MATCH_ID = 6898280972

In [10]:

def accounts_from_match(match_id: int) -> list:
    """
        Returns the list of player account_ids from specified match

        Players whose account_id is None in the API response are skipped, so the
        returned list may hold fewer entries than the match had players.

        Params:
        match_id (int) - match_id to check

        Returns:
        A list of ints, containing the account_ids of players

        Raises:
        requests.HTTPError - if the API answers with an error status code
    """

    response = requests.get(f"https://api.opendota.com/api/matches/{match_id}", timeout=30)
    # Fail loudly on an HTTP error instead of hitting a confusing KeyError on "players" below
    response.raise_for_status()
    match_info = response.json()

    # A None account_id cannot be used to query per-player endpoints, so leave those out
    return [
        player["account_id"]
        for player in match_info["players"]
        if player["account_id"] is not None
    ]

def check_player_wl(account_id: int, num_days_back: int=None) -> tuple:
    """
        Calculates win-loss ratio for a player given player's account_id,
        using the following formula - number of wins divided by number of losses

        Params:
        account_id (int) - player's account_id to check, passed in as int
        num_days_back (int) - optional; when given (and non-zero) it is sent as the
        endpoint's "date" query parameter to restrict which matches are counted.
        When omitted, no restriction is sent.

        Returns:
        A tuple with player's account_id and float representing win loss ratio of
        specified player, rounded to 3 decimal places. If the player has no losses,
        the ratio is float("inf") when there is at least one win and 0.0 otherwise.

        Raises:
        requests.HTTPError - if the API answers with an error status code
    """
    # Let requests build the query string instead of concatenating it by hand
    params = {"date": num_days_back} if num_days_back else None
    response = requests.get(
        f"https://api.opendota.com/api/players/{account_id}/wl",
        params=params,
        timeout=30,
    )
    response.raise_for_status()
    record = response.json()

    wins, losses = record["win"], record["lose"]
    if losses == 0:
        # Avoid ZeroDivisionError: an unbeaten player ranks first, a player with no games last
        ratio = float("inf") if wins > 0 else 0.0
    else:
        ratio = round(wins / losses, 3)

    return (account_id, ratio)

def rank_players_by_wl(accounts_list: list, num_days_back: int=None) -> list:
    """
        Builds a leaderboard for the given accounts, ordered from the highest
        win-loss ratio (wins divided by losses) to the lowest

        Params:
        accounts_list (list) - account_ids of the players to rank
        num_days_back (int) - optional; forwarded unchanged to check_player_wl
        for every account

        Returns:
        List of (account_id, win-loss ratio) tuples as produced by check_player_wl,
        sorted by ratio in descending order
    """

    leaderboard = [check_player_wl(account_id, num_days_back) for account_id in accounts_list]

    # Stable in-place sort on the ratio (second tuple element), best first
    leaderboard.sort(key=lambda entry: entry[1], reverse=True)
    return leaderboard

In [11]:
# Demo: rank the players of MATCH_ID by win-loss ratio (no num_days_back passed)
accounts_list = accounts_from_match(MATCH_ID)
ranked = rank_players_by_wl(accounts_list)
# Bare expression so the notebook displays the ranking
ranked

[(250544263, 1.666),
 (190258756, 1.539),
 (237578577, 1.397),
 (302429528, 1.318),
 (240727522, 1.289),
 (394257871, 1.287),
 (460481806, 1.15),
 (236322966, 1.134),
 (1062400248, 1.097),
 (133049595, 1.068)]

In [None]:
[250544263, 190258756, 237578577, 302429528]
 (240727522, 1.289),
 (394257871, 1.287),
 (460481806, 1.15),
 (236322966, 1.134),
 (1062400248, 1.097),
 (133049595, 1.068)]

# Part 2

In [16]:
#testing account_id: 191312823

def list_heroes_game() -> dict:
    """
        checks the heroes available in game

        Returns:
        Dictionary mapping each hero's id to its localized_name, as reported by the
        heroes endpoint. Iterating over it yields the hero ids.

        Raises:
        requests.HTTPError - if the API answers with an error status code
    """

    response = requests.get("https://api.opendota.com/api/heroes", timeout=30)
    response.raise_for_status()

    return {hero["id"]: hero["localized_name"] for hero in response.json()}

def prepare_new_player_dict(list_heroes_game: list):
    """
        Build a fresh per-hero counter for an account, with every count at zero

        Params:
        list_heroes_game (list) - iterable of hero_ids available in game
        (note: this parameter shadows the module-level function of the same name)

        Returns:
        Dictionary with one key per distinct hero_id, each mapped to 0
    """

    return dict.fromkeys(list_heroes_game, 0)

def get_hero_info(hero_id: int) -> list:
    """
        Look up which accounts have played a given hero and how often

        Params:
        hero_id (int) - specific hero_id

        Returns:
        A list of two-element lists, [account_id, games_played], one per entry
        returned by the hero's players endpoint
    """

    endpoint = f"https://api.opendota.com/api/heroes/{hero_id}/players"
    hero_players = requests.get(endpoint).json()

    play_counts = []
    for entry in hero_players:
        play_counts.append([entry["account_id"], entry["games_played"]])

    return play_counts

# def prepare_data_dict():
#     """
#         Prepare dataset for recommendation system
        
#         Returns:
#         Dictionary with account_id as key, and for each account_id is an inner dictionary with hero_ids as keys
#         , this keeps track of the number of times that account_id has played that hero
#     """
    
#     dataset_dict = {}
#     # all_matches = get_public_matches_samples(num_samples)
    
#     all_heroes_list = list_heroes_game()
    
#     for hero_id in tqdm(all_heroes_list):
#         for account_id, games_played in get_hero_info(hero_id):
#             if account_id not in dataset_dict:
#                 dataset_dict[account_id] = prepare_new_player_dict(all_heroes_list)
#             dataset_dict[account_id][hero_id] += games_played
        
#     return(dataset_dict)

def get_public_matches_samples(query_end_date: date, query_period: int) -> list:
    """
        Counts, per account and hero, the distinct matches that started within the
        'query_period' days leading up to 'query_end_date'

        Params:
        query_end_date (datetime.date) - end of the window; matches starting up to
        00:00 of this date are included
        query_period (int) - length of the window in days, counted back from query_end_date

        Returns:
        The "rows" list of the explorer response: one dictionary per
        (account_id, hero_id) pair with the number of distinct matches played

        Raises:
        requests.HTTPError - if the API answers with an error status code
    """

    query_start_date = query_end_date - timedelta(days=query_period)

    # Dates are rendered zero-padded (YYYY-MM-DD) so the timestamp literals are well-formed
    query_str = f"""
    SELECT
    player_matches.account_id,
    player_matches.hero_id,
    COUNT(DISTINCT player_matches.match_id)
    FROM matches
    JOIN player_matches using(match_id)
    WHERE matches.start_time > extract(epoch from timestamp '{query_start_date:%Y-%m-%d}T00:00:00.000Z')
    AND matches.start_time <= extract(epoch from timestamp '{query_end_date:%Y-%m-%d}T00:00:00.000Z')
    GROUP BY player_matches.account_id, player_matches.hero_id
    """

    # Pass the SQL through `params` so requests URL-encodes it, instead of
    # pasting raw SQL (spaces, quotes, '<', '=') into the URL
    response = requests.get(
        "https://api.opendota.com/api/explorer",
        params={"sql": query_str},
        timeout=300,
    )
    response.raise_for_status()

    return response.json()["rows"]

def prepare_accounts_hero_data(query_period: int=90):
    """
        Fetches the hero lookup and the per-account hero play counts for matches
        that started within the last 'query_period' days, up to today 0000hrs

        Params:
        query_period (int) - number of days back to start collecting match data from

        Returns:
        A tuple of (DataFrame built from the rows returned by
        get_public_matches_samples, result of list_heroes_game)
    """
    heroes = list_heroes_game()
    play_count_rows = get_public_matches_samples(date.today(), query_period)
    play_counts_df = pandas.DataFrame(play_count_rows)

    return play_counts_df, heroes

def get_account_hero_hist(account_id: int, num_days_back: int=90) -> pandas.DataFrame:
    """
        Retrieves 'num_days_back' days worth of match history for given 'account_id',
        defaults to 90 days

        Params:
        account_id (int) - player's account_id to check, passed in as int
        num_days_back (int) - number of days of match data to retrieve

        Returns:
        DataFrame with columns account_id, hero_id and prop_of_matches, one row per
        hero found in the retrieved matches; prop_of_matches is the share of those
        matches played on that hero. If no matches are returned, the DataFrame has
        these columns and no rows.

        Raises:
        requests.HTTPError - if the API answers with an error status code
    """
    from collections import Counter

    response = requests.get(
        f"https://api.opendota.com/api/players/{account_id}/matches",
        params={"date": num_days_back},
        timeout=30,
    )
    response.raise_for_status()
    account_hist_matches = response.json()

    output_columns = ["account_id", "hero_id", "prop_of_matches"]

    # No matches in the window: return an empty, correctly-shaped frame instead of
    # failing with KeyError on the missing "count" column
    if not account_hist_matches:
        return pandas.DataFrame(columns=output_columns)

    # Counter keeps heroes in order of first appearance, like the manual dict did
    hero_counts = Counter(match["hero_id"] for match in account_hist_matches)
    total_matches = sum(hero_counts.values())

    account_df = pandas.DataFrame(
        [
            {"account_id": account_id, "hero_id": hero_id, "prop_of_matches": count / total_matches}
            for hero_id, count in hero_counts.items()
        ]
    )

    return account_df[output_columns]

def recommendation_wrapper(account_id: int, query_period: int=90, min_num_matches: int=30, num_reccs: int=3):
    """
        main wrapper method for hero recommendations

        Params:
        account_id (int) - player's account_id to check for historical match data and subsequently recommend heroes for
        query_period (int) - number of days of match data to retrieve, both for the other
        accounts and for the account being recommended for
        min_num_matches (int) - will only consider profiles with this minimum number of matches in recommendation matrix
        num_reccs (int) - number of hero recommendations

        Returns:
        A list of recommended heroes (names looked up in the hero dictionary) based on player
        match history and other player's match history. Will only recommend heroes that have
        not been played by the account in the query period.

        Raises:
        ValueError - if no match history is found for account_id in the query period
    """

    matches_dataset, hero_list = prepare_accounts_hero_data(query_period)

    #removes the account_id to recommend for if it already exists
    matches_dataset = matches_dataset[matches_dataset["account_id"] != account_id]

    #removes other account with less than min_num_matches
    #after the merge, count_x is the per-hero count and count_y the per-account total
    removal_set = matches_dataset.groupby("account_id")["count"].sum().reset_index()
    matches_dataset = matches_dataset.merge(removal_set, on='account_id', how='left')
    #.copy() so the column assignment below works on an independent frame, not a filtered slice
    matches_dataset = matches_dataset[matches_dataset["count_y"] >= min_num_matches].copy()

    #calculate proportion of matches that a player selects that hero
    matches_dataset["prop_of_matches"] = matches_dataset["count_x"] / matches_dataset["count_y"]
    matches_dataset = matches_dataset.drop(["count_x", "count_y"], axis=1)

    #gets match history of account to predict for, over the same period as the other accounts
    one_account_hist = get_account_hero_hist(account_id, query_period)
    if one_account_hist.empty:
        raise ValueError(f"no match history found for account_id {account_id} in the last {query_period} days")

    matches_dataset = pandas.concat([matches_dataset, one_account_hist]).reset_index(drop=True)

    #Surprise reads the columns positionally as (user, item, rating), so fix the order explicitly
    rating_columns = ["account_id", "hero_id", "prop_of_matches"]
    reader = Reader(rating_scale=(0, 1))
    data = Dataset.load_from_df(matches_dataset[rating_columns], reader)
    final_set = data.build_full_trainset()

    sim_options = {'name': 'cosine',
               'user_based': True  # compute  similarities between users
               }

    algo = KNNBasic(sim_options=sim_options)
    algo.fit(final_set)

    #look the target account up by its raw id instead of assuming it has the largest inner id
    target_user = final_set.to_inner_uid(account_id)
    target_raw_uid = final_set.to_raw_uid(target_user)
    #placeholder "true rating" for the prediction tuples: global mean of the matrix
    fill_value = final_set.global_mean

    #checks which heroes already played, we only want to recommend heroes that have not been played in that period
    played_items = {item for (item, _) in final_set.ur[target_user]}

    anti_testset_user = [
        (target_raw_uid, final_set.to_raw_iid(iid), fill_value)
        for iid in final_set.all_items()
        if iid not in played_items
    ]

    #get predictions
    predictions = algo.test(anti_testset_user)
    recommendations = sorted([[pred.iid, pred.est] for pred in predictions], key=lambda x:x[1], reverse=True)
    final_reccs = [hero_list[hero[0]] for hero in recommendations[:num_reccs]]

    return final_reccs

In [17]:
# Demo: hero recommendations for the test account, using the wrapper's default arguments
reccs = recommendation_wrapper(191312823)


Computing the cosine similarity matrix...
Done computing similarity matrix.


In [378]:
#preparing template for experiments

QUERY_PERIOD = 90
min_num_matches = 0

#retrieves match data from matches which started after 'query_period' days ago, but started before today 0000hrs
testing_dataset, hero_list = prepare_accounts_hero_data(query_period=QUERY_PERIOD)

#checks the number of matches played by each account in specified query period
removal_set = testing_dataset.groupby("account_id")["count"].sum().reset_index()

#performs a left merge; count_x is the per-hero count, count_y the per-account total
testing_dataset = testing_dataset.merge(removal_set, on='account_id', how='left')

#removes account which have not played a minimum number of matches in past XX days
#.copy() so the column assignment below works on an independent frame, not a filtered slice
testing_dataset = testing_dataset[testing_dataset["count_y"] >= min_num_matches].copy()

#calculates the proportion of matches that an account plays a hero, 
#out of all the matches played by that account in the past XX days
testing_dataset["prop_of_matches"] = testing_dataset["count_x"] / testing_dataset["count_y"]

#taking a look
testing_dataset.head()


Unnamed: 0,account_id,hero_id,count_x,count_y,prop_of_matches
0,88470,9,1,47,0.021277
1,88470,29,2,47,0.042553
2,88470,36,2,47,0.042553
3,88470,38,5,47,0.106383
4,88470,40,1,47,0.021277


## starting experiments to determine arguments

In [379]:
#experiment 1, 90 days worth of matches, without removing anything, using SVD

QUERY_PERIOD = 90
min_num_matches = 0

testing_dataset, hero_list = prepare_accounts_hero_data(query_period=QUERY_PERIOD)

#per-account match totals; after the merge count_x is per hero, count_y per account
removal_set = testing_dataset.groupby("account_id")["count"].sum().reset_index()
testing_dataset = testing_dataset.merge(removal_set, on='account_id', how='left')

#.copy() so the column assignment below works on an independent frame, not a filtered slice
testing_dataset = testing_dataset[testing_dataset["count_y"] >= min_num_matches].copy()
testing_dataset["prop_of_matches"] = testing_dataset["count_x"] / testing_dataset["count_y"]

#ratings are proportions, hence the (0, 1) scale
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(testing_dataset[["account_id", "hero_id", "prop_of_matches"]], reader)

algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.1730  0.1715  0.1755  0.1709  0.1692  0.1720  0.0021  
MAE (testset)     0.1071  0.1059  0.1073  0.1059  0.1069  0.1066  0.0006  
Fit time          0.06    0.06    0.06    0.06    0.06    0.06    0.00    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    


{'test_rmse': array([0.17302411, 0.17147419, 0.17548656, 0.17088758, 0.16915575]),
 'test_mae': array([0.10705875, 0.10589196, 0.10728106, 0.10593227, 0.10689463]),
 'fit_time': (0.06307125091552734,
  0.059430837631225586,
  0.05923008918762207,
  0.05994296073913574,
  0.059877872467041016),
 'test_time': (0.007136106491088867,
  0.007169008255004883,
  0.007038116455078125,
  0.007091999053955078,
  0.006957054138183594)}

In [380]:
#experiment 2, 90 days worth of matches, removing accounts with less than 30 matches, using SVD

QUERY_PERIOD = 90
min_num_matches = 30

testing_dataset, hero_list = prepare_accounts_hero_data(query_period=QUERY_PERIOD)

#per-account match totals; after the merge count_x is per hero, count_y per account
removal_set = testing_dataset.groupby("account_id")["count"].sum().reset_index()
testing_dataset = testing_dataset.merge(removal_set, on='account_id', how='left')

#.copy() so the column assignment below works on an independent frame, not a filtered slice
testing_dataset = testing_dataset[testing_dataset["count_y"] >= min_num_matches].copy()
testing_dataset["prop_of_matches"] = testing_dataset["count_x"] / testing_dataset["count_y"]

#ratings are proportions, hence the (0, 1) scale
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(testing_dataset[["account_id", "hero_id", "prop_of_matches"]], reader)

algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.0729  0.0736  0.0723  0.0734  0.0730  0.0730  0.0005  
MAE (testset)     0.0543  0.0544  0.0536  0.0542  0.0540  0.0541  0.0003  
Fit time          0.04    0.03    0.03    0.03    0.03    0.03    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


{'test_rmse': array([0.0728786 , 0.07356606, 0.0722594 , 0.07341729, 0.07300651]),
 'test_mae': array([0.05425384, 0.05438524, 0.0536349 , 0.05420983, 0.05401668]),
 'fit_time': (0.03887200355529785,
  0.0338749885559082,
  0.0334320068359375,
  0.033193111419677734,
  0.03377676010131836),
 'test_time': (0.00437617301940918,
  0.004194021224975586,
  0.004163026809692383,
  0.00407099723815918,
  0.00409698486328125)}

In [381]:
#experiment 3, 90 days worth of matches, removing accounts with less than 50 matches, using SVD

QUERY_PERIOD = 90
min_num_matches = 50

testing_dataset, hero_list = prepare_accounts_hero_data(query_period=QUERY_PERIOD)

#per-account match totals; after the merge count_x is per hero, count_y per account
removal_set = testing_dataset.groupby("account_id")["count"].sum().reset_index()
testing_dataset = testing_dataset.merge(removal_set, on='account_id', how='left')

#.copy() so the column assignment below works on an independent frame, not a filtered slice
testing_dataset = testing_dataset[testing_dataset["count_y"] >= min_num_matches].copy()
testing_dataset["prop_of_matches"] = testing_dataset["count_x"] / testing_dataset["count_y"]

#ratings are proportions, hence the (0, 1) scale
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(testing_dataset[["account_id", "hero_id", "prop_of_matches"]], reader)

algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.0718  0.0745  0.0731  0.0724  0.0741  0.0732  0.0010  
MAE (testset)     0.0536  0.0542  0.0543  0.0530  0.0541  0.0538  0.0005  
Fit time          0.03    0.02    0.02    0.03    0.02    0.03    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


{'test_rmse': array([0.0718463 , 0.0745211 , 0.07309424, 0.07235271, 0.07408453]),
 'test_mae': array([0.05363811, 0.05421172, 0.0542618 , 0.05295164, 0.05405767]),
 'fit_time': (0.031355857849121094,
  0.024631023406982422,
  0.024729013442993164,
  0.025238990783691406,
  0.024960994720458984),
 'test_time': (0.0032401084899902344,
  0.0030651092529296875,
  0.0030527114868164062,
  0.0031359195709228516,
  0.0031189918518066406)}

In [382]:
#experiment 4, 90 days worth of matches, removing accounts with less than 70 matches, using SVD

QUERY_PERIOD = 90
min_num_matches = 70

testing_dataset, hero_list = prepare_accounts_hero_data(query_period=QUERY_PERIOD)

#per-account match totals; after the merge count_x is per hero, count_y per account
removal_set = testing_dataset.groupby("account_id")["count"].sum().reset_index()
testing_dataset = testing_dataset.merge(removal_set, on='account_id', how='left')

#.copy() so the column assignment below works on an independent frame, not a filtered slice
testing_dataset = testing_dataset[testing_dataset["count_y"] >= min_num_matches].copy()
testing_dataset["prop_of_matches"] = testing_dataset["count_x"] / testing_dataset["count_y"]

#ratings are proportions, hence the (0, 1) scale
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(testing_dataset[["account_id", "hero_id", "prop_of_matches"]], reader)

algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.0756  0.0742  0.0764  0.0793  0.0774  0.0766  0.0017  
MAE (testset)     0.0546  0.0533  0.0560  0.0576  0.0561  0.0555  0.0015  
Fit time          0.02    0.02    0.02    0.02    0.02    0.02    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


{'test_rmse': array([0.07559876, 0.07419589, 0.07642082, 0.07927195, 0.07738745]),
 'test_mae': array([0.05462852, 0.05333192, 0.05604093, 0.05760917, 0.05607604]),
 'fit_time': (0.02202916145324707,
  0.016865968704223633,
  0.01613306999206543,
  0.01612710952758789,
  0.01611804962158203),
 'test_time': (0.0023827552795410156,
  0.0020067691802978516,
  0.0019969940185546875,
  0.002034902572631836,
  0.00208282470703125)}

In [383]:
#experiment 5, 90 days worth of matches, removing accounts with less than 40 matches, using SVD

QUERY_PERIOD = 90
min_num_matches = 40

testing_dataset, hero_list = prepare_accounts_hero_data(query_period=QUERY_PERIOD)

#per-account match totals; after the merge count_x is per hero, count_y per account
removal_set = testing_dataset.groupby("account_id")["count"].sum().reset_index()
testing_dataset = testing_dataset.merge(removal_set, on='account_id', how='left')

#.copy() so the column assignment below works on an independent frame, not a filtered slice
testing_dataset = testing_dataset[testing_dataset["count_y"] >= min_num_matches].copy()
testing_dataset["prop_of_matches"] = testing_dataset["count_x"] / testing_dataset["count_y"]

#ratings are proportions, hence the (0, 1) scale
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(testing_dataset[["account_id", "hero_id", "prop_of_matches"]], reader)

algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.0753  0.0706  0.0758  0.0703  0.0718  0.0727  0.0023  
MAE (testset)     0.0557  0.0522  0.0554  0.0516  0.0528  0.0535  0.0017  
Fit time          0.04    0.03    0.03    0.03    0.03    0.03    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


{'test_rmse': array([0.0752563 , 0.07061243, 0.07578992, 0.07026797, 0.07180428]),
 'test_mae': array([0.05566211, 0.05220058, 0.05536435, 0.05161284, 0.05283402]),
 'fit_time': (0.03564310073852539,
  0.028768062591552734,
  0.028650999069213867,
  0.028474092483520508,
  0.02888798713684082),
 'test_time': (0.003643035888671875,
  0.0035331249237060547,
  0.003596067428588867,
  0.0035300254821777344,
  0.0035822391510009766)}

In [384]:
#experiment 6, 30 days worth of matches, removing accounts with less than 30 matches, using SVD

QUERY_PERIOD = 30
min_num_matches = 30

testing_dataset, hero_list = prepare_accounts_hero_data(query_period=QUERY_PERIOD)

#per-account match totals; after the merge count_x is per hero, count_y per account
removal_set = testing_dataset.groupby("account_id")["count"].sum().reset_index()
testing_dataset = testing_dataset.merge(removal_set, on='account_id', how='left')

#.copy() so the column assignment below works on an independent frame, not a filtered slice
testing_dataset = testing_dataset[testing_dataset["count_y"] >= min_num_matches].copy()
testing_dataset["prop_of_matches"] = testing_dataset["count_x"] / testing_dataset["count_y"]

#ratings are proportions, hence the (0, 1) scale
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(testing_dataset[["account_id", "hero_id", "prop_of_matches"]], reader)

algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.0883  0.0895  0.0869  0.0857  0.0920  0.0885  0.0022  
MAE (testset)     0.0654  0.0657  0.0640  0.0635  0.0669  0.0651  0.0012  
Fit time          0.02    0.02    0.01    0.01    0.01    0.01    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


{'test_rmse': array([0.08829555, 0.08950326, 0.08686597, 0.08570559, 0.09198775]),
 'test_mae': array([0.06542861, 0.0657178 , 0.06402847, 0.06352132, 0.0669485 ]),
 'fit_time': (0.021724939346313477,
  0.01572704315185547,
  0.01287698745727539,
  0.012186288833618164,
  0.012111186981201172),
 'test_time': (0.002454996109008789,
  0.0018129348754882812,
  0.001489877700805664,
  0.0014998912811279297,
  0.0014810562133789062)}

In [385]:
#experiment 7, 60 days worth of matches, removing accounts with less than 30 matches, using SVD

QUERY_PERIOD = 60
min_num_matches = 30

testing_dataset, hero_list = prepare_accounts_hero_data(query_period=QUERY_PERIOD)

#per-account match totals; after the merge count_x is per hero, count_y per account
removal_set = testing_dataset.groupby("account_id")["count"].sum().reset_index()
testing_dataset = testing_dataset.merge(removal_set, on='account_id', how='left')

#.copy() so the column assignment below works on an independent frame, not a filtered slice
testing_dataset = testing_dataset[testing_dataset["count_y"] >= min_num_matches].copy()
testing_dataset["prop_of_matches"] = testing_dataset["count_x"] / testing_dataset["count_y"]

#ratings are proportions, hence the (0, 1) scale
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(testing_dataset[["account_id", "hero_id", "prop_of_matches"]], reader)

algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.0852  0.0799  0.0812  0.0821  0.0808  0.0818  0.0018  
MAE (testset)     0.0625  0.0593  0.0593  0.0604  0.0600  0.0603  0.0012  
Fit time          0.03    0.02    0.02    0.02    0.02    0.02    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


{'test_rmse': array([0.08517045, 0.07987274, 0.08124019, 0.08205054, 0.0807569 ]),
 'test_mae': array([0.06253268, 0.05930301, 0.05931496, 0.0603534 , 0.05996569]),
 'fit_time': (0.02925586700439453,
  0.02188706398010254,
  0.020620107650756836,
  0.0203092098236084,
  0.020586013793945312),
 'test_time': (0.003036975860595703,
  0.0026679039001464844,
  0.0025370121002197266,
  0.002479076385498047,
  0.002541065216064453)}

In [386]:
#experiment 8, 90 days worth of matches, removing accounts with less than 30 matches, using SVD
#(same settings as experiment 2)

QUERY_PERIOD = 90
min_num_matches = 30

testing_dataset, hero_list = prepare_accounts_hero_data(query_period=QUERY_PERIOD)

#per-account match totals; after the merge count_x is per hero, count_y per account
removal_set = testing_dataset.groupby("account_id")["count"].sum().reset_index()
testing_dataset = testing_dataset.merge(removal_set, on='account_id', how='left')

#.copy() so the column assignment below works on an independent frame, not a filtered slice
testing_dataset = testing_dataset[testing_dataset["count_y"] >= min_num_matches].copy()
testing_dataset["prop_of_matches"] = testing_dataset["count_x"] / testing_dataset["count_y"]

#ratings are proportions, hence the (0, 1) scale
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(testing_dataset[["account_id", "hero_id", "prop_of_matches"]], reader)

algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.0734  0.0706  0.0739  0.0722  0.0744  0.0729  0.0014  
MAE (testset)     0.0549  0.0532  0.0555  0.0528  0.0545  0.0542  0.0010  
Fit time          0.04    0.03    0.03    0.03    0.03    0.03    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


{'test_rmse': array([0.07338813, 0.07061515, 0.07394346, 0.07217432, 0.07440729]),
 'test_mae': array([0.05491757, 0.05315664, 0.05545326, 0.0527895 , 0.05448349]),
 'fit_time': (0.038659095764160156,
  0.03312206268310547,
  0.03301811218261719,
  0.03300189971923828,
  0.03309798240661621),
 'test_time': (0.004099845886230469,
  0.00406193733215332,
  0.0040760040283203125,
  0.0040662288665771484,
  0.004050016403198242)}

In [393]:
#experiment 9, 120 days worth of matches, removing accounts with less than 30 matches, using SVD

QUERY_PERIOD = 120
min_num_matches = 30

testing_dataset, hero_list = prepare_accounts_hero_data(query_period=QUERY_PERIOD)

#per-account match totals; after the merge count_x is per hero, count_y per account
removal_set = testing_dataset.groupby("account_id")["count"].sum().reset_index()
testing_dataset = testing_dataset.merge(removal_set, on='account_id', how='left')

#.copy() so the column assignment below works on an independent frame, not a filtered slice
testing_dataset = testing_dataset[testing_dataset["count_y"] >= min_num_matches].copy()
testing_dataset["prop_of_matches"] = testing_dataset["count_x"] / testing_dataset["count_y"]

#ratings are proportions, hence the (0, 1) scale
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(testing_dataset[["account_id", "hero_id", "prop_of_matches"]], reader)

algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.0678  0.0664  0.0678  0.0686  0.0660  0.0673  0.0010  
MAE (testset)     0.0503  0.0491  0.0506  0.0501  0.0485  0.0497  0.0008  
Fit time          0.05    0.04    0.04    0.04    0.04    0.04    0.00    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    


{'test_rmse': array([0.0678343 , 0.06641413, 0.06777189, 0.06858286, 0.06601311]),
 'test_mae': array([0.05030895, 0.04913995, 0.05060456, 0.05014525, 0.04848993]),
 'fit_time': (0.0469660758972168,
  0.04396796226501465,
  0.043845176696777344,
  0.04379582405090332,
  0.04386305809020996),
 'test_time': (0.005434751510620117,
  0.005507946014404297,
  0.005357027053833008,
  0.005385160446166992,
  0.0055119991302490234)}

In [394]:
#experiment 10, 150 days worth of matches, removing accounts with less than 30 matches, using SVD

QUERY_PERIOD = 150
min_num_matches = 30

testing_dataset, hero_list = prepare_accounts_hero_data(query_period=QUERY_PERIOD)

#per-account match totals; after the merge count_x is per hero, count_y per account
removal_set = testing_dataset.groupby("account_id")["count"].sum().reset_index()
testing_dataset = testing_dataset.merge(removal_set, on='account_id', how='left')

#.copy() so the column assignment below works on an independent frame, not a filtered slice
testing_dataset = testing_dataset[testing_dataset["count_y"] >= min_num_matches].copy()
testing_dataset["prop_of_matches"] = testing_dataset["count_x"] / testing_dataset["count_y"]

#ratings are proportions, hence the (0, 1) scale
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(testing_dataset[["account_id", "hero_id", "prop_of_matches"]], reader)

algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.0616  0.0632  0.0613  0.0633  0.0629  0.0625  0.0009  
MAE (testset)     0.0459  0.0466  0.0453  0.0469  0.0466  0.0463  0.0006  
Fit time          0.06    0.06    0.06    0.06    0.06    0.06    0.00    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    


{'test_rmse': array([0.06156475, 0.063237  , 0.06131998, 0.0633304 , 0.06286434]),
 'test_mae': array([0.04585284, 0.04658316, 0.04534866, 0.04690428, 0.04662134]),
 'fit_time': (0.05696296691894531,
  0.056520938873291016,
  0.056720733642578125,
  0.05803203582763672,
  0.05813908576965332),
 'test_time': (0.0071680545806884766,
  0.007092952728271484,
  0.0072100162506103516,
  0.007173061370849609,
  0.00727391242980957)}

In [395]:
#experiment 11, 180 days worth of matches, removing accounts with less than 30 matches, using SVD

QUERY_PERIOD = 180
min_num_matches = 30

testing_dataset, hero_list = prepare_accounts_hero_data(query_period=QUERY_PERIOD)

#per-account match totals; after the merge count_x is per hero, count_y per account
removal_set = testing_dataset.groupby("account_id")["count"].sum().reset_index()
testing_dataset = testing_dataset.merge(removal_set, on='account_id', how='left')

#.copy() so the column assignment below works on an independent frame, not a filtered slice
testing_dataset = testing_dataset[testing_dataset["count_y"] >= min_num_matches].copy()
testing_dataset["prop_of_matches"] = testing_dataset["count_x"] / testing_dataset["count_y"]

#ratings are proportions, hence the (0, 1) scale
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(testing_dataset[["account_id", "hero_id", "prop_of_matches"]], reader)

algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.0594  0.0586  0.0591  0.0594  0.0580  0.0589  0.0005  
MAE (testset)     0.0435  0.0432  0.0439  0.0438  0.0430  0.0435  0.0003  
Fit time          0.07    0.07    0.07    0.07    0.07    0.07    0.00    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    


{'test_rmse': array([0.05936766, 0.05862909, 0.05908878, 0.05938492, 0.05803612]),
 'test_mae': array([0.04352257, 0.04324701, 0.04387633, 0.04384825, 0.0429947 ]),
 'fit_time': (0.0684809684753418,
  0.0740506649017334,
  0.06983304023742676,
  0.06885004043579102,
  0.06918978691101074),
 'test_time': (0.008687019348144531,
  0.009164094924926758,
  0.008887052536010742,
  0.008808135986328125,
  0.008867025375366211)}

In [13]:
# Experiment 12: 60 days of matches, dropping accounts with fewer than 30 matches, using KNN.

QUERY_PERIOD = 60
min_num_matches = 30

testing_dataset, hero_list = prepare_accounts_hero_data(query_period=QUERY_PERIOD)

# Summed "count" per account. Selecting the column before aggregating avoids
# summing every other column only to throw the result away.
removal_set = testing_dataset.groupby("account_id")["count"].sum().reset_index()

# Both frames carry a "count" column, so the merge suffixes them:
# count_x is the row's original count, count_y is the account's summed count.
testing_dataset = testing_dataset.merge(removal_set, on="account_id", how="left")

# .copy() makes the filtered frame independent, so the column assignment below
# does not write onto a slice of the merged frame (SettingWithCopyWarning).
testing_dataset = testing_dataset[testing_dataset["count_y"] >= min_num_matches].copy()
testing_dataset["prop_of_matches"] = testing_dataset["count_x"] / testing_dataset["count_y"]

# prop_of_matches is used as the rating, declared on a 0-1 scale.
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(testing_dataset[["account_id", "hero_id", "prop_of_matches"]], reader)

sim_options = {
    "name": "cosine",
    "user_based": True,  # compute similarities between users (accounts), not items
}

algo = KNNBasic(sim_options=sim_options)
# Last expression of the cell: the returned metrics dict is displayed by the notebook.
cross_validate(algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.0520  0.0497  0.0514  0.0476  0.0504  0.0502  0.0015  
MAE (testset)     0.0353  0.0339  0.0346  0.0335  0.0340  0.0342  0.0006  
Fit time          0.01    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.04    0.04    0.04    0.04    0.04    0.04    0.00    


{'test_rmse': array([0.05198584, 0.04967989, 0.05143455, 0.04761632, 0.05039511]),
 'test_mae': array([0.0352525 , 0.03388113, 0.03456327, 0.0334638 , 0.03400024]),
 'fit_time': (0.005181074142456055,
  0.0029239654541015625,
  0.0029230117797851562,
  0.0028748512268066406,
  0.0028619766235351562),
 'test_time': (0.043627023696899414,
  0.03659200668334961,
  0.036835670471191406,
  0.03629112243652344,
  0.03632473945617676)}

In [402]:
# Experiment 13: 60 days of matches, dropping accounts with fewer than 30 matches,
# using NormalPredictor.

QUERY_PERIOD = 60
min_num_matches = 30

# NOTE(review): this cell calls prepare_account_hero_data while experiments 11 and 12
# call prepare_accounts_hero_data -- confirm which spelling the notebook defines.
testing_dataset, hero_list = prepare_account_hero_data(query_period=QUERY_PERIOD)

# Summed "count" per account. Selecting the column before aggregating avoids
# summing every other column only to throw the result away.
removal_set = testing_dataset.groupby("account_id")["count"].sum().reset_index()

# Both frames carry a "count" column, so the merge suffixes them:
# count_x is the row's original count, count_y is the account's summed count.
testing_dataset = testing_dataset.merge(removal_set, on="account_id", how="left")

# .copy() makes the filtered frame independent, so the column assignment below
# does not write onto a slice of the merged frame (SettingWithCopyWarning).
testing_dataset = testing_dataset[testing_dataset["count_y"] >= min_num_matches].copy()
testing_dataset["prop_of_matches"] = testing_dataset["count_x"] / testing_dataset["count_y"]

# prop_of_matches is used as the rating, declared on a 0-1 scale.
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(testing_dataset[["account_id", "hero_id", "prop_of_matches"]], reader)

algo = NormalPredictor()

# Last expression of the cell: the returned metrics dict is displayed by the notebook.
cross_validate(algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.0692  0.0674  0.0682  0.0682  0.0673  0.0681  0.0007  
MAE (testset)     0.0506  0.0496  0.0496  0.0497  0.0503  0.0500  0.0004  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


{'test_rmse': array([0.06917514, 0.06739308, 0.06820463, 0.06815011, 0.06734511]),
 'test_mae': array([0.05058685, 0.04956867, 0.04961651, 0.04970694, 0.05028406]),
 'fit_time': (0.0030558109283447266,
  0.0027959346771240234,
  0.002401113510131836,
  0.002157926559448242,
  0.002009868621826172),
 'test_time': (0.002613067626953125,
  0.002309083938598633,
  0.0020749568939208984,
  0.001920938491821289,
  0.0018339157104492188)}

In [398]:
# Final selection: 30 days of matches, dropping accounts with fewer than 30 matches,
# using KNN fitted on the full dataset (no cross-validation split).

QUERY_PERIOD = 30
min_num_matches = 30

# NOTE(review): this cell calls prepare_account_hero_data while experiments 11 and 12
# call prepare_accounts_hero_data -- confirm which spelling the notebook defines.
testing_dataset, hero_list = prepare_account_hero_data(query_period=QUERY_PERIOD)

# Summed "count" per account. Selecting the column before aggregating avoids
# summing every other column only to throw the result away.
removal_set = testing_dataset.groupby("account_id")["count"].sum().reset_index()

# Both frames carry a "count" column, so the merge suffixes them:
# count_x is the row's original count, count_y is the account's summed count.
testing_dataset = testing_dataset.merge(removal_set, on="account_id", how="left")

# .copy() makes the filtered frame independent, so the column assignment below
# does not write onto a slice of the merged frame (SettingWithCopyWarning).
testing_dataset = testing_dataset[testing_dataset["count_y"] >= min_num_matches].copy()
testing_dataset["prop_of_matches"] = testing_dataset["count_x"] / testing_dataset["count_y"]

# prop_of_matches is used as the rating, declared on a 0-1 scale.
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(testing_dataset[["account_id", "hero_id", "prop_of_matches"]], reader)
# Train on every available rating; final_set is read again by the prediction cell.
final_set = data.build_full_trainset()

sim_options = {
    "name": "cosine",
    "user_based": True,  # compute similarities between users (accounts), not items
}

algo = KNNBasic(sim_options=sim_options)
# Last expression of the cell: fit() returns the algorithm, which the notebook displays.
algo.fit(final_set)



Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1777e2be0>

358

In [409]:
# Rank every item the target user has no rating for by the fitted model's estimate.

# NOTE(review): account_to_predict is computed but never used in this cell; the
# predictions below are made for the hard-coded inner id 0 instead. Confirm which
# user is intended.
account_to_predict = max(final_set.all_users())

targetUser = 0  # inner_id of the target user
# Value placed in the rating slot of each (user, item, rating) triple handed to
# algo.test(); the sorting below only uses pred.est, never this value.
fillValue = final_set.global_mean
user_item_ratings = final_set.ur[targetUser]
user_items = [item for (item, _) in user_item_ratings]
ratings = final_set.all_ratings()  # NOTE(review): never consumed in this cell

# Set lookup keeps the membership test O(1) per item instead of scanning a list.
rated_items = set(user_items)
# The raw user id is the same for every triple, so look it up once.
target_raw_uid = final_set.to_raw_uid(targetUser)
anti_testset_user = [
    (target_raw_uid, final_set.to_raw_iid(iid), fillValue)
    for iid in final_set.all_items()
    if iid not in rated_items
]

predictions = algo.test(anti_testset_user)
# [raw item id, estimated rating] pairs, highest estimate first.
recommendations = sorted(
    [[pred.iid, pred.est] for pred in predictions],
    key=lambda rec: rec[1],
    reverse=True,
)


In [411]:
# Display the [item id, estimated rating] pairs, highest estimate first.
recommendations

[[128, 0.17795044933634585],
 [21, 0.162765311768566],
 [17, 0.15291157241551329],
 [23, 0.13840233049502804],
 [62, 0.13836600593849632],
 [11, 0.13729040197852294],
 [86, 0.13588638930305746],
 [48, 0.12210444307853643],
 [105, 0.11429800309962543],
 [136, 0.10884163658724025],
 [83, 0.10787653203648712],
 [104, 0.10712133402585136],
 [18, 0.10429261982968505],
 [56, 0.10285099529813452],
 [87, 0.10211609089444121],
 [71, 0.10209613154414734],
 [61, 0.10190347936481257],
 [126, 0.09660477783489936],
 [120, 0.09655384575006487],
 [97, 0.09409662309950526],
 [79, 0.09387861175884868],
 [40, 0.08720957591523029],
 [95, 0.08691168589323389],
 [16, 0.08648977847107844],
 [33, 0.08444071221768071],
 [19, 0.0843352553779434],
 [68, 0.08267717237027691],
 [32, 0.08097023403060088],
 [113, 0.07972394111820244],
 [34, 0.07806498059455674],
 [25, 0.07325700340064323],
 [101, 0.0714063969982861],
 [31, 0.0690429905448874],
 [114, 0.06808842078999189],
 [67, 0.06706878024570101],
 [102, 0.0669178

In [None]:
## Workings: scratch SQL selecting match_id, start_time, hero_id and account_id per player-match.
# Nothing in this cell sends the query; it only builds the string.

# Fixed-date variant. No placeholders are interpolated, so the f prefix has no effect.
# The start_time conditions sit in the JOIN ... ON clause (there is no WHERE).
# NOTE(review): the lower and upper start_time bounds are the same timestamp, so only
# matches starting at exactly that instant satisfy both -- confirm the intended end date.
query_str = f"""
SELECT
matches.match_id,
matches.start_time,
player_matches.hero_id,
player_matches.account_id
FROM matches
JOIN player_matches using(match_id)
JOIN heroes on heroes.id = player_matches.hero_id
AND matches.start_time >= extract(epoch from timestamp '2022-11-07T00:00:00.000Z')
AND matches.start_time <= extract(epoch from timestamp '2022-11-07T00:00:00.000Z')
ORDER BY matches.match_id NULLS LAST
LIMIT 250
"""

# Parameterised variant; this assignment overwrites the query_str built above.
# Reads .year/.month/.day from query_start_date and query_end_date, which are not
# defined in this cell, and raises the row limit to 5000.
# NOTE(review): month and day are interpolated without zero padding (e.g. 2022-1-7) --
# confirm the database accepts that timestamp form.
query_str = f"""
SELECT
matches.match_id,
matches.start_time,
player_matches.hero_id,
player_matches.account_id
FROM matches
JOIN player_matches using(match_id)
JOIN heroes on heroes.id = player_matches.hero_id
AND matches.start_time >= extract(epoch from timestamp '{query_start_date.year}-{query_start_date.month}-{query_start_date.day}T00:00:00.000Z')
AND matches.start_time <= extract(epoch from timestamp '{query_end_date.year}-{query_end_date.month}-{query_end_date.day}T00:00:00.000Z')
ORDER BY matches.match_id NULLS LAST
LIMIT 5000
"""