In [98]:
import pandas as pd
import numpy as np
import json

In [99]:
def get_redundant_columns(df, excepted_columns=[]):
    redundant_columns = {}
    for col in df.columns:
        if col not in excepted_columns:
            val = df[col].unique()
            if len(df[col].unique()) == 1:
                redundant_columns[col] = val[0]
    return redundant_columns

def clean_redundant_columns(df : pd.DataFrame, dict_json, excepted_columns=[]):
    rc = get_redundant_columns(df, excepted_columns=excepted_columns)
    for column, value in rc.items():
        dict_json[column] = value

    df.drop(columns=rc.keys(), inplace=True)
    
# https://stackoverflow.com/questions/58408054/typeerror-object-of-type-bool-is-not-json-serializable
class CustomJSONizer(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.bool_):
            return super().encode(bool(obj))
        elif isinstance(obj, np.int64):
            return super().encode(int(obj))
        elif not pd.notna(obj):
            print(obj)
            return super().encode(None)
        else: 
            return super().default(obj)
            


In [100]:

winter_folder = "data/rlcs-202122/winter-major/"
main_filename = winter_folder + "main_wmajor.csv"
players_m_filename = winter_folder + "players_bo_wmajor.csv"
players_g_filename = winter_folder + "players_g_wmajor.csv"
teams_m_filename = winter_folder + "teams_bo_wmajor.csv"
teams_g_filename = winter_folder + "teams_g_wmajor.csv"



main_df = pd.read_csv(main_filename).drop(columns=["Unnamed: 0"])
players_m_df = pd.read_csv(players_m_filename).drop(columns=["Unnamed: 0"])
players_g_df = pd.read_csv(players_g_filename).drop(columns=["Unnamed: 0"])
teams_m_df = pd.read_csv(teams_m_filename).drop(columns=["Unnamed: 0"])
teams_g_df = pd.read_csv(teams_g_filename).drop(columns=["Unnamed: 0"])



In [101]:
# Files with errors
main_df[~main_df.game_date.notna()]


Unnamed: 0,event_id,event,event_split,event_region,event_slug,event_start_date,event_end_date,event_tier,event_phase,prize_money,...,reverse_sweep_attempt,reverse_sweep,game_id,game_number,game_date,game_duration,map_id,map_name,overtime,ballchasing_id
73,614b6649f8090ec745286427,Major,Winter,World,https://octane.gg/events/6427-rlcs-2021-22-win...,2022-03-23 00:00:00+00:00,2022-03-27 22:30:00+00:00,S,Main Event,300000.0,...,True,True,623c6335da9d7ca1c7bab21f,3.0,,300.0,,DFH Stadium,False,
129,614b6649f8090ec745286427,Major,Winter,World,https://octane.gg/events/6427-rlcs-2021-22-win...,2022-03-23 00:00:00+00:00,2022-03-27 22:30:00+00:00,S,Main Event,300000.0,...,False,False,623fab38c437fde7e02d2c70,1.0,,300.0,,Mannfield,False,


In [102]:
winter_major_json = {}

main_df_copy = main_df.copy()

clean_redundant_columns(main_df_copy, winter_major_json)

winter_major_json["players"] = players_g_df[["player_tag", "player_id", "team_id", "team_region"]].drop_duplicates().to_dict(orient="records")
winter_major_json["teams"] = teams_g_df[["team_name", "team_id", "team_slug", "team_region"]].drop_duplicates().to_dict(orient="records")

match_ids = main_df_copy.match_id.unique()
matches = []
for m_id in match_ids:
    match_json = {}
    main_match_df = main_df_copy[main_df_copy.match_id == m_id]
    clean_redundant_columns(main_match_df, match_json, ['reverse_sweep_attempt', 'reverse_sweep', 'game_duration', 'overtime'])
    
    teams_match = teams_m_df[teams_m_df.match_id == m_id].drop(columns="match_id").to_dict(orient="records")
    players_match = players_m_df[players_m_df.match_id == m_id].drop(columns="match_id").to_dict(orient="records")
    
    team_count = 1
    for tm in teams_match:
        player_count = 1
        for pm in players_match:
            if pm["team_id"] == tm["team_id"]:
                tm[f"player{player_count}"] = pm.copy()
                player_count += 1
        match_json[f"team{team_count}"] = tm.copy()
        team_count += 1
                
        
    game_ids = main_match_df.game_id.unique()
    games = []
    match_duration = 0.0
    for g_id in game_ids:
        game_json = main_match_df[main_match_df.game_id == g_id].to_dict(orient="records")[0]
        
        if pd.notna(game_json["game_date"]):
            teams_game = teams_g_df[teams_g_df.game_id == g_id].drop(columns="game_id").to_dict(orient="records")
            players_game = players_g_df[players_g_df.game_id == g_id].drop(columns="game_id").to_dict(orient="records")
            
            team_count = 1
            for tm in teams_game:
                player_count = 1
                for pm in players_game:
                    if pm["team_id"] == tm["team_id"]:
                        tm[f"player{player_count}"] = pm.copy()
                        player_count += 1
                game_json[f"team{team_count}"] = tm.copy()
                team_count += 1
            match_duration += game_json["game_duration"]
            game_json["technical_problems"] = False
        else:
            for k in game_json:
                if not pd.notna(game_json[k]):
                    game_json[k] = None                    
            game_json["technical_problems"] = True
        games.append(game_json)
        
    match_json["games"] = games
    match_json["game_count"] = len(games)
    match_json["match_duration"] = match_duration
    matches.append(match_json)
    
winter_major_json["matches"] = matches




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [103]:
# WINTER MAJORS TEAM STATS
for t in winter_major_json["teams"]:
    tid = t["team_id"]
    team_matches = []
    team_matches_stats = []
    team_games_stats = []
    team_games = []
    for m in winter_major_json["matches"]:
        team_match_stats = None
        team_match_games_stats = None 
        if m["team1"]["team_id"] == tid:
            team_match_stats = m["team1"]
            team_match_games_stats = [g["team1"] for g in m["games"] if not g["technical_problems"]]
        elif m["team2"]["team_id"] == tid:
            team_match_stats = m["team2"]
            team_match_games_stats = [g["team2"] for g in m["games"] if not g["technical_problems"]]
        
        if team_match_stats is not None:
            team_matches_stats.append(team_match_stats)
            team_matches.append(m)
            team_games.extend(m["games"])
            team_games_stats.extend(team_match_games_stats)
    
    team_match_stats_df = pd.DataFrame(team_matches_stats).drop(columns=["team_id", "color", "team_slug", "team_name", "team_region"])
    t["match_stats_average"] = team_match_stats_df.mean().to_dict()
    
    team_game_stats_df = pd.DataFrame(team_games_stats)
    t["game_stats_average"] = team_match_stats_df.mean().to_dict()
     
    t["matches"] = team_matches
    t["games"] = team_games
    
    
    

  t["match_stats_average"] = team_match_stats_df.mean().to_dict()
  t["game_stats_average"] = team_match_stats_df.mean().to_dict()


In [104]:
out_filename = winter_folder + "winter_major_data.json"

with open(out_filename, "w") as outfile:
    json.dump(winter_major_json, outfile, cls=CustomJSONizer, indent=4)