In [20]:
import numpy as np

import json
import io
from tqdm import tqdm

In [21]:
# Location of the pokemon dataset JSON file. This is a relative path, so it is
# resolved against the current working directory when the file is opened.
dataset_path = "../data/pokemon_data.json"

In [22]:
def load_json(path):
    """Read the JSON document stored at ``path`` and return the decoded object.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # JSON text is UTF-8 by specification. Without an explicit encoding, open()
    # falls back to the locale default, which mis-decodes or rejects non-ASCII
    # content on platforms whose default is not UTF-8.
    with open(path, 'r', encoding='utf-8') as openfile:
        return json.load(openfile)
    
def save_json(object_, path):
    """Write ``object_`` to ``path`` as JSON indented by four spaces.

    The JSON text is built in full before the file is opened, so a
    serialization error is raised before an existing file at ``path`` is
    touched. Any existing file is overwritten.

    Args:
        object_: A JSON-serializable Python object.
        path: Destination file path.
    """
    serialized = json.dumps(object_, indent=4)

    with open(path, "w") as handle:
        handle.write(serialized)

In [4]:
# Load the whole dataset. Later cells index it by pokemon id and read each
# entry's "stats" list.
all_pokemon_data = load_json(dataset_path)

In [17]:
# compare pokemon stats
#
# For every pokemon, rank all other pokemon by the Euclidean distance between
# their base-stat vectors. Similarity is the negated distance, rescaled per
# pokemon so that the farthest candidate gets 0 and the closest gets max_p.
# Only the top_k most similar entries are kept, as (pid, similarity) pairs in
# descending order of similarity.
all_similarities = {}
max_p = .95 # max similarity probability
top_k = 50 # number of most similar pokemon kept per entry

# Build each base-stat vector once instead of once per (pid1, pid2) pair.
stat_vectors = {
    pid: np.array([stat["base_stat"] for stat in all_pokemon_data[pid]["stats"]])
    for pid in all_pokemon_data
}

for pid1 in tqdm(all_pokemon_data):
    pkmn1_stats = stat_vectors[pid1]

    similarities = []
    for pid2, pkmn2_stats in stat_vectors.items():
        if pid1 == pid2:
            continue

        # Compute stat distance
        cost = np.linalg.norm(pkmn1_stats - pkmn2_stats)
        # If stats identical, skip
        if cost == 0:
            continue

        similarities.append((pid2, -1 * cost))

    # No candidate left (single-entry dataset, or every other pokemon has
    # identical stats): min()/max() would raise on the empty list.
    if not similarities:
        all_similarities[pid1] = []
        continue

    min_sim = min(sim for _, sim in similarities)
    sim_range = max(sim for _, sim in similarities) - min_sim

    if sim_range == 0:
        # Every candidate is equally distant. Dividing by the zero range would
        # produce NaN, which json.dumps writes as the non-standard token NaN.
        # Treat all candidates as the closest match instead.
        similarities = [(pid, max_p) for pid, _ in similarities]
    else:
        similarities = [
            (pid, ((sim - min_sim) * max_p) / sim_range)
            for pid, sim in similarities
        ]

    # list.sort is stable, so ties keep dataset order.
    similarities.sort(key = lambda x: x[1], reverse=True)
    # truncate to the top_k most similar
    all_similarities[pid1] = similarities[:top_k]

100%|█████████████████████████████████████████████████████████████████████████████████████| 899/899 [00:06<00:00, 133.72it/s]


In [19]:
# Write the per-pokemon similarity lists to disk as JSON. The (pid, similarity)
# tuples are serialized as two-element JSON arrays.
save_json(all_similarities, "../data/client_data/stat_similarities.json")

In [18]:
# explore results: print the base stats of one pokemon, followed by its 20
# highest-ranked matches together with their own base stats
chosen_pid = "150"


def _base_stats(pid):
    """Return the list of base_stat values stored for pokemon ``pid``."""
    return [entry["base_stat"] for entry in all_pokemon_data[pid]["stats"]]


print(_base_stats(chosen_pid))
print("========")
for pid, similarity in all_similarities[chosen_pid][:20]:
    print(pid, similarity)
    print(_base_stats(pid))
    print()

[106, 110, 90, 154, 90, 130]
890 0.95
[140, 85, 95, 145, 95, 130]

491 0.9430794311492136
[70, 90, 90, 135, 90, 125]

635 0.9429200655164381
[92, 105, 90, 125, 90, 98]

484 0.9356712545791814
[90, 120, 100, 150, 120, 100]

716 0.9301130409787238
[126, 131, 95, 131, 98, 99]

717 0.9301130409787238
[126, 131, 95, 131, 98, 99]

145 0.9300391117632293
[90, 90, 85, 125, 90, 100]

641 0.928786953736677
[79, 115, 70, 125, 80, 111]

642 0.928786953736677
[79, 115, 70, 125, 80, 111]

381 0.9252974994810638
[80, 90, 80, 130, 110, 110]

646 0.9249378177903214
[125, 130, 90, 130, 90, 95]

792 0.922226864699057
[137, 113, 89, 137, 107, 97]

483 0.9182326331060566
[100, 120, 120, 150, 100, 90]

643 0.9182326331060566
[100, 120, 100, 150, 120, 90]

647 0.9173333209420951
[91, 72, 90, 129, 90, 108]

146 0.9158209424248759
[90, 100, 90, 125, 85, 90]

384 0.9151373967565394
[105, 150, 90, 150, 90, 95]

243 0.9114201491148431
[90, 85, 75, 115, 100, 115]

645 0.9106183667137948
[89, 125, 90, 115, 80, 101]