In [46]:
import json
import pandas as pd
from pathlib import Path

In [47]:
# Load the per-player statistics. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it open for the kernel's lifetime.
with open('./data/stat/player_all_stat.json') as stats_file:
    stats = json.load(stats_file)

In [48]:
# Tabulate the loaded player statistics so they can be filtered with pandas.
df = pd.DataFrame(data=stats)

In [49]:
# Baseline for all later reports: sum of the per-player `nr` column before any filtering.
total_recorded_games = df.loc[:, "nr"].sum()


def print_games_count(df: pd.DataFrame) -> None:
    """Print how much of the recorded total the given player frame still covers.

    Sums the ``nr`` column of ``df`` and prints it next to the module-level
    ``total_recorded_games`` together with the share in percent.
    """
    remaining = df.loc[:, "nr"].sum()
    share = 100 * remaining / total_recorded_games
    print(f"Remaining / total (%): {remaining} / {total_recorded_games} ({share:.2f})")

Remove anonymous players (id ≤ 0) and players whose mean score is less than half of the maximum points (157 / 2 = 78.5)

In [50]:
# Keep players with a positive id whose mean reaches half of 157
# (157 being the maximum points referred to in the note above this cell).
has_positive_id = df["id"] > 0
reaches_half_of_max = df["mean"] >= 157 / 2
df = df.loc[has_positive_id & reaches_half_of_max]
print_games_count(df)

Remaining / total (%): 2680927 / 7287512 (36.79)


Show statistics on the remaining players (non-anonymous, with a mean score of at least half of the maximum points)

In [51]:
# Summary statistics; the quartiles are requested explicitly (same as the default).
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,id,mean,std,nr
count,2023.0,2023.0,2023.0,2023.0
mean,45283.501236,82.616613,42.341373,1325.223431
std,27953.728597,5.464596,5.241301,2714.161758
min,1.0,78.5,0.0,1.0
25%,14723.0,79.341487,41.114312,14.0
50%,52068.0,80.535809,42.424945,91.0
75%,70963.5,83.833333,43.571734,1494.5
max,88321.0,140.0,92.630988,24413.0


Set the threshold for the minimum number of games played to the 75th percentile of games per player, so we only keep the 25% of players who have played the most games

In [52]:
# Derive the threshold from the data instead of copying it by hand from the
# describe() output: the 75% quantile of games played per player.
# NOTE: the quantile is taken from the current `df`, so re-running this cell on
# its own tightens the threshold again; re-run from the load cell instead.
min_games = df["nr"].quantile(0.75)
df = df[df["nr"] >= min_games]
print_games_count(df)

Remaining / total (%): 2382558 / 7287512 (32.69)


Show statistics for the remaining players

In [53]:
# Summary statistics of the frame after the minimum-games filter
# (quartiles requested explicitly; same as the default).
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,id,mean,std,nr
count,506.0,506.0,506.0,506.0
mean,42958.701581,79.632465,42.478889,4708.612648
std,25284.656762,0.904516,0.548904,3721.986917
min,421.0,78.503758,40.944051,1495.0
25%,15017.75,79.003743,42.178681,2148.0
50%,51846.0,79.443663,42.460073,3254.5
75%,59350.25,80.067166,42.767603,5845.0
max,87247.0,84.898984,45.1506,24413.0


In [54]:
# Peek at the first five remaining rows (5 is also the default for head()).
df.head(n=5)

Unnamed: 0,id,mean,std,nr
10,55302,80.104669,42.997449,9382
11,5148,79.449837,42.014048,2452
13,48103,78.964493,42.094382,6562
16,71080,78.71735,42.761549,7925
18,10776,78.690511,42.719069,12889


In [55]:
# Output file that receives the aggregated games as a single JSON document.
final_game_aggregation: Path = Path("./data/games.json")

Save every game that was played exclusively by players from the cleaned player data to a JSON file

In [56]:
# Build the lookup set once. Testing `p in df["id"].values` inside the loop
# scanned the whole id array for every player of every game.
kept_player_ids = set(df["id"].tolist())

games = []
for file in Path("./data").glob("**/*.txt"):
    print(file)
    with open(file, "r") as f:
        # Iterate lazily instead of materialising every line with readlines().
        for line in f:
            if not line.strip():
                # Blank (e.g. trailing) lines carry no game record.
                continue
            game = json.loads(line)
            # Keep the game only if at least four of its players are in the cleaned data.
            if sum(p in kept_player_ids for p in game["player_ids"]) >= 4:
                games.append(game["game"])
n_games = len(games)

# Stream the result straight into the file rather than building the full string first.
with open(final_game_aggregation, "w") as o:
    json.dump(games, o)
print(f"Number of games: {n_games}")

data\games\jass_game_0001.txt
data\games\jass_game_0002.txt
data\games\jass_game_0003.txt
data\games\jass_game_0004.txt
data\games\jass_game_0005.txt
data\games\jass_game_0006.txt
data\games\jass_game_0007.txt
data\games\jass_game_0008.txt
data\games\jass_game_0009.txt
data\games\jass_game_0010.txt
data\games\jass_game_0011.txt
data\games\jass_game_0012.txt
data\games\jass_game_0013.txt
data\games\jass_game_0014.txt
data\games\jass_game_0015.txt
data\games\jass_game_0016.txt
data\games\jass_game_0017.txt
data\games\jass_game_0018.txt
data\games\jass_game_0019.txt
Number of games: 58684
