## Create ELO file
We are using this iPython notebook to create a raw data file for ELO ratings.

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime

In [3]:
# Load in DataFrames with every result pulled from the WIAA's 
# website since the 2021-2022 season
wiaa22 = pd.read_csv("wiaa2021.csv", header=None).rename(columns={
    0:"Team",
    1:"Enrollment", 
    2:"Website", 
    3:"Teams_Played",
    4:"Scores", 
    5:"Results", 
    6:"Dates"
}).drop(labels=["Scores"], axis=1).sort_values(by="Team").reset_index(drop=True)
wiaa23 = pd.read_csv("wiaa2022.csv", header=None).rename(columns={
    0:"Team",
    1:"Enrollment", 
    2:"Website", 
    3:"Teams_Played", 
    4:"Scores", 
    5:"Results", 
    6:"Dates"
}).drop(labels=["Scores"], axis=1).sort_values(by="Team").reset_index(drop=True)
wiaa24 = pd.read_csv("wiaa2023.csv", header=None).rename(columns={
    0:"Team",
    1:"Enrollment", 
    2:"Website", 
    3:"Teams_Played", 
    4:"Scores", 
    5:"Results", 
    6:"Dates"
}).drop(labels=["Scores"], axis=1).sort_values(by="Team").reset_index(drop=True)
# View wiaa22 DataFrame's first few rows
wiaa22.head()

Unnamed: 0,Team,Enrollment,Website,Teams_Played,Results,Dates
0,Abbotsford,222,https://schools.wiaawi.org/Directory/Schedule/...,"['Chequamegon', 'Colby', 'Athens', 'Greenwood'...","['W 70-19', 'W 52-32', 'L 40-53', 'W 52-46', '...","['11/18/2021', '11/22/2021', '11/23/2021', '11..."
1,Abundant Life Christian/St. Ambrose,142,https://schools.wiaawi.org/Directory/Schedule/...,"['Monticello', 'Pecatonica', 'Parkview', 'Juda...","['L 15-37', 'L 20-69', 'L 31-37', 'L 42-49', '...","['11/16/2021', '11/18/2021', '11/23/2021', '11..."
2,Adams-Friendship,435,https://schools.wiaawi.org/Directory/Schedule/...,"['Royall', 'Tri-County', 'Omro', 'Portage', 'W...","['L 41-55', 'W 61-18', 'L 44-48', 'W 40-34', '...","['11/16/2021', '11/18/2021', '11/23/2021', '11..."
3,Albany,96,https://schools.wiaawi.org/Directory/Schedule/...,"['Benton', 'Belleville', 'Belmont', 'Brodhead'...","['W 60-38', 'L 54-62', 'L 62-67', 'L 35-45', '...","['11/16/2021', '11/19/2021', '11/23/2021', '11..."
4,Algoma,196,https://schools.wiaawi.org/Directory/Schedule/...,"['Ashwaubenon', 'Two Rivers', 'Kewaunee', 'Gre...","['L 27-36', 'L 29-47', 'L 31-60', 'W 45-37', '...","['11/16/2021', '11/23/2021', '11/29/2021', '12..."


#### 538's NBA ELO
The FiveThirtyEight model makees three adjustments.
1. After every season, all teams get regressed to the mean.
2. The home team gets an extra 100 points when comparing winning probabilities, as there's home court advantage. Since the home team wins about 60% of the time, this should probably be +70.
3. $K$ is adjusted depending on the margin of victory and the ELO difference. You win by more points, you get more of $K$ with a small adjustment for team strength as well.

It may be useful to utilize some of [FiveThirtyEight's NBA ELO definitions](https://github.com/fivethirtyeight/data/tree/master/nba-elo) to build a simplified model for the WIAA Tournament:

| Header | Definition |
| :----- | :--------- |
| `gameorder` | Play order of every game HSGBB for WIAA teams |
| `game_id` | Unique ID for each game |
| `year_id` | Season ID based on the year which the season ended |
| `date_game` | Date of game played |
| `is_playoffs` | Flag for WIAA playoff games |
| `team_name` | Name of team involved in a game |
| `pts` | Points score by team in that game |
| `elo_i` | Team ELO entering the game |
| `elo_n` | Team ELO following the game |
| `win_equiv` | Equivalent number of wins in a 24-game season for a team of elo_n quality |
| `opp_name` | Team name of opponent played |
| `opp_pts` | Points scored by opponent |
| `opp_elo_i` | Opponent ELO entering the game |
| `opp_elo_n` | Opponent ELO following the game |
| `game_location` | Home (H) or Away (A) *will work on adding neutral (N) later* |
| `game_result` | Win or loss for the team in the team_id column, 0 for loss, 1 for win |
| `forecast` | ELO-based chances of winning for the team in `team_id` colun, based on ELO ratings and location |

The next issue we have to overcome is to recreate a file that is similar to [FiveThirtyEight's NBA ELO file](https://raw.githubusercontent.com/fivethirtyeight/data/master/nba-elo/nbaallelo.csv). We are not calculating the ELO steps and from the files given, we cannot figure out the game location (*fix in the future*), but otherwise we should be able to figure out:
- the game order
- game identifier
- year
- date of game
- whether the game was a playoff contest
- points scored by a team in a game
- points scored against that in a game
- teams that played in a game

In [84]:
# Create DataFrame of games
df = pd.DataFrame({
    "gameorder": [],
    "game_id": [],
    "year_id": [],
    "date_game": [],
    "is_playoffs": [],
    "team_name": [],
    "opponent_name": [],
    "pts": [],
    "opp_pts": []
})
# Cast datatypes... dumb
df["gameorder"] = df.gameorder.astype(int)
df["game_id"] = df.game_id.astype(str)
df["year_id"] = df.year_id.astype(int)
df["date_game"] = pd.to_datetime(df["date_game"])
df["is_playoffs"] = df.is_playoffs.astype(bool)
df["team_name"] = df.team_name.astype(str)
df["opponent_name"] = df.opponent_name.astype(str)
df["pts"] = df.pts.astype(float)
df["opp_pts"] = df.opp_pts.astype(float)
# Read in records for the DataFrame of games
# Record-by-record read in of a team's Data
df_num = 0
# Loop through each df
for res in [wiaa22, wiaa23, wiaa24]:
    df_num += 1
    for i in range(len(res)):
        # Grab each record individually
        record = res.iloc[i, :]
        # Set teams played
        teams_played_all = record.iloc[3].split(", ")
        # Clean the strings
        for j in range(len(teams_played_all)):
            teams_played_all[j] = str(teams_played_all[j]).replace("[", "")
            teams_played_all[j] = str(teams_played_all[j]).replace("]", "")
            teams_played_all[j] = str(teams_played_all[j]).replace("'", "")
            teams_played_all[j] = str(teams_played_all[j]).replace('"', '')
        # Get all the scores
        game_scores = record.iloc[4].split(", ")
        # Clean scores
        for j in range(len(game_scores)):
            game_scores[j] = str(game_scores[j]).replace("[", "")
            game_scores[j] = str(game_scores[j]).replace("]", "")
            game_scores[j] = str(game_scores[j]).replace("'", "")
        # Set dates of games
        date_of_games = record.iloc[5].split(", ")
        # Clean dates
        for j in range(len(date_of_games)):
            date_of_games[j] = str(date_of_games[j]).replace("[", "")
            date_of_games[j] = str(date_of_games[j]).replace("]", "")
            date_of_games[j] = str(date_of_games[j]).replace("'", "")
            try:
                date_of_games[j] = datetime.strptime(str(date_of_games[j]), '%m/%d/%Y')
            except ValueError:
                if df_num == 1:
                    date_of_games[j] = datetime.strptime("04/01/2022", '%m/%d/%Y')
                elif df_num == 2:
                    date_of_games[j] = datetime.strptime("04/01/2023", '%m/%d/%Y')
                else:
                    date_of_games[j] = datetime.strptime("04/01/2024", '%m/%d/%Y')
        # Get Team for each record
        team = str(record.iloc[0])
        # Initialize added team DataFrame
        add = pd.DataFrame({
            "gameorder": [],
            "game_id": [],
            "year_id": [],
            "date_game": [],
            "is_playoffs": [],
            "team_name": [],
            "opponent_name": [],
            "pts": [],
            "opp_pts": []
        })
        add["gameorder"] = add.gameorder.astype(int)
        add["game_id"] = add.game_id.astype(str)
        add["year_id"] = add.year_id.astype(int)
        add["date_game"] = pd.to_datetime(add["date_game"])
        add["is_playoffs"] = add.is_playoffs.astype(bool)
        add["team_name"] = add.team_name.astype(str)
        add["opponent_name"] = add.opponent_name.astype(str)
        add["pts"] = add.pts.astype(float)
        add["opp_pts"] = add.opp_pts.astype(float)
        # Loop through to get wins and losses prior to the seeding dat
        for j in range(len(teams_played_all)):
            # Go through and create the data frame
            if df_num == 1:
                try:
                    # Set the values of the dataframe
                    add.at[j, "team_name"] = str(team)
                    add.at[j, "opponent_name"] = str(teams_played_all[j])
                    add.at[j, "date_game"] = date_of_games[j]
                    add.at[j, "year_id"] = 2022
                    # Set playoff game flag to False, points scored, and opponent 
                    # points scored for regular season game
                    if datetime(2022, 2, 22, 0, 0) >= date_of_games[j]:
                        # Split the score
                        add.at[j, "is_playoffs"] = False
                        add.at[j, "pts"] = float(game_scores[j].split("-")[0])
                        add.at[j, "opp_pts"] = float(game_scores[j].split("-")[1])
                    # Set playoff game flag to True, points scored, and opponent
                    # points scored for playoff game
                    # Need to change once we can get notes or W-L values
                    else:
                        # Set flag
                        add.at[j, "is_playoffs"] = True
                        # Need to check because playoff game scores have different format
                        if ((j+1) == len(teams_played_all)) or (teams_played_all[j] not in [
                            "Kettle Moraine", "Notre Dame", "Waupun", "Mineral Point", "Randolph"
                        ]):
                            s0 = float(game_scores[j].split("-")[0])
                            s1 = float(game_scores[j].split("-")[1])
                            add.at[j, "pts"] = max([s0, s1])
                            add.at[j, "opp_pts"] = min([s0, s1])
                            # print(f"Playoff final score: {add.at[j, "pts"]}-{add.at[j, "opp_pts"]}")
                        else:
                            s0 = float(game_scores[j].split("-")[0])
                            s1 = float(game_scores[j].split("-")[1])
                            add.at[j, "pts"] = min([s0, s1])
                            add.at[j, "opp_pts"] = max([s0, s1])
                            # print(f"Playoff final score: {add.at[j, "pts"]}-{add.at[j, "opp_pts"]}")
                except Exception as err:
                    print(err)
            elif df_num == 2:
                try:
                    # Set the values of the dataframe
                    add.at[j, "team_name"] = str(team)
                    add.at[j, "opponent_name"] = str(teams_played_all[j])
                    add.at[j, "date_game"] = date_of_games[j]
                    add.at[j, "year_id"] = 2023
                    # Set playoff game flag to False, points scored, and opponent 
                    # points scored for regular season game
                    if datetime(2023, 2, 21, 0, 0) >= date_of_games[j]:
                        # Split the score
                        add.at[j, "is_playoffs"] = False
                        add.at[j, "pts"] = float(game_scores[j].split("-")[0])
                        add.at[j, "opp_pts"] = float(game_scores[j].split("-")[1])
                    # Set playoff game flag to True, points scored, and opponent
                    # points scored for playoff game
                    # Need to change once we can get notes or W-L values
                    else:
                        # Set flag
                        add.at[j, "is_playoffs"] = True
                        # Need to check because playoff game scores have different format
                        if ((j+1) == len(teams_played_all)) or (teams_played_all[j] not in [
                            "Kettle Moraine", "Notre Dame", "Kewauskum", "Laconia", "McDonnell Catholic"
                        ]):
                            s0 = float(game_scores[j].split("-")[0])
                            s1 = float(game_scores[j].split("-")[1])
                            add.at[j, "pts"] = max([s0, s1])
                            add.at[j, "opp_pts"] = min([s0, s1])
                            # print(f"Playoff final score: {add.at[j, "pts"]}-{add.at[j, "opp_pts"]}")
                        else:
                            s0 = float(game_scores[j].split("-")[0])
                            s1 = float(game_scores[j].split("-")[1])
                            add.at[j, "pts"] = min([s0, s1])
                            add.at[j, "opp_pts"] = max([s0, s1])
                            # print(f"Playoff final score: {add.at[j, "pts"]}-{add.at[j, "opp_pts"]}")
                except Exception as err:
                    print(err)
            elif df_num == 3:
                try:
                    # Set the values of the dataframe
                    add.at[j, "team_name"] = str(team)
                    add.at[j, "opponent_name"] = str(teams_played_all[j])
                    add.at[j, "date_game"] = date_of_games[j]
                    add.at[j, "year_id"] = 2024
                    # Set playoff game flag to False, points scored, and opponent 
                    # points scored for regular season game
                    if datetime(2024, 2, 20, 0, 0) >= date_of_games[j]:
                        # Split the score
                        add.at[j, "is_playoffs"] = False
                        add.at[j, "pts"] = float(game_scores[j].split("-")[0])
                        add.at[j, "opp_pts"] = float(game_scores[j].split("-")[1])
                    # Set playoff game flag to True, points scored, and opponent
                    # points scored for playoff game
                    # Need to change once we can get notes or W-L values
                    else:
                        # Set flag
                        add.at[j, "is_playoffs"] = True
                        # Need to check because playoff game scores have different format
                        if ((j+1) == len(teams_played_all)) or (teams_played_all[j] not in [
                            "Arrowhead", "Pewaukee", "Edgewood", "Laconia", "Albany/Monticello"
                        ]):
                            s0 = float(game_scores[j].split("-")[0])
                            s1 = float(game_scores[j].split("-")[1])
                            add.at[j, "pts"] = max([s0, s1])
                            add.at[j, "opp_pts"] = min([s0, s1])
                            # print(f"Playoff final score: {add.at[j, "pts"]}-{add.at[j, "opp_pts"]}")
                        else:
                            s0 = float(game_scores[j].split("-")[0])
                            s1 = float(game_scores[j].split("-")[1])
                            add.at[j, "pts"] = min([s0, s1])
                            add.at[j, "opp_pts"] = max([s0, s1])
                            # print(f"Playoff final score: {add.at[j, "pts"]}-{add.at[j, "opp_pts"]}")
                except Exception as err:
                    print(err)
        # Okay now that we have one DataFrame collected for a team's data
        # we need to concatenate the record to the DataFrame df
        df = pd.concat([df, add], axis=0).reset_index(drop=True)

list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
could not convert string to float: ''
list index out of range
list index out of range
list index out of range
list index out of range
could not convert string to float: ''
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
could not convert string to float: ''
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of rang

In [85]:
df

Unnamed: 0,gameorder,game_id,year_id,date_game,is_playoffs,team_name,opponent_name,pts,opp_pts
0,,,2022.0,2021-11-18,False,Abbotsford,Chequamegon,70.0,19.0
1,,,2022.0,2021-11-22,False,Abbotsford,Colby,52.0,32.0
2,,,2022.0,2021-11-23,False,Abbotsford,Athens,40.0,53.0
3,,,2022.0,2021-11-29,False,Abbotsford,Greenwood,52.0,46.0
4,,,2022.0,2021-12-03,False,Abbotsford,Auburndale,32.0,61.0
...,...,...,...,...,...,...,...,...,...
34966,,,2024.0,2024-02-23,True,Xavier,Denmark,65.0,38.0
34967,,,2024.0,2024-02-24,True,Xavier,Omro,74.0,33.0
34968,,,2024.0,2024-02-29,True,Xavier,Winneconne,57.0,41.0
34969,,,2024.0,2024-03-02,True,Xavier,Wittenberg-Birnamwood,55.0,39.0


In [86]:
df[(df.team_name == "Laconia") & (df.year_id == 2023)]

Unnamed: 0,gameorder,game_id,year_id,date_game,is_playoffs,team_name,opponent_name,pts,opp_pts
16382,,,2023.0,2022-11-15,False,Laconia,Oostburg,79.0,58.0
16383,,,2023.0,2022-11-17,False,Laconia,Xavier,39.0,38.0
16384,,,2023.0,2022-11-22,False,Laconia,Waupun,52.0,61.0
16385,,,2023.0,2022-11-25,False,Laconia,Mishicot,70.0,43.0
16386,,,2023.0,2022-11-26,False,Laconia,Martin Luther,54.0,53.0
16387,,,2023.0,2022-11-29,False,Laconia,North Fond du Lac,81.0,41.0
16388,,,2023.0,2022-12-01,False,Laconia,Grafton,54.0,53.0
16389,,,2023.0,2022-12-06,False,Laconia,Omro,74.0,48.0
16390,,,2023.0,2022-12-09,False,Laconia,Winnebago Lutheran,62.0,51.0
16391,,,2023.0,2022-12-10,False,Laconia,Randolph,55.0,50.0


In [62]:
record = wiaa22[wiaa22["Team"] == "Messmer"]
teams_played_all = str(record.iloc[:, 3]).split("', ")
# Clean the strings
for j in range(len(teams_played_all)):
    teams_played_all[j] = str(teams_played_all[j]).replace("[", "")
    teams_played_all[j] = str(teams_played_all[j]).replace("]", "")
    teams_played_all[j] = str(teams_played_all[j]).replace("'", "")
# Set dates of games
date_of_games = str(record.iloc[:, 5]).split(", ")
# Clean dates
for j in range(len(date_of_games)):
    date_of_games[j] = str(date_of_games[j]).replace("[", "")
    date_of_games[j] = str(date_of_games[j]).replace("]", "")
    date_of_games[j] = str(date_of_games[j]).replace("'", "")
    if len(str(date_of_games[j]).split("    ")) == 1:
        continue
    else:
        date_of_games[j] = str(date_of_games[j]).split("    ")[1]
    try:
        date_of_games[j] = datetime.strptime(str(date_of_games[j]), '%m/%d/%Y')
    except ValueError:
        date_of_games[j] = datetime.strptime(str(date_of_games[j]), "%m/%d/%Y")
for i in range(len(date_of_games)):
    try:
        print(f"Game: {i+1}")
        print(f"Date: {date_of_games[i]}")
        print(f"Opponent: {teams_played_all[i]}")
    except IndexError:
        print("Team out of index.")

Game: 1
Date: 2021-11-23 00:00:00
Opponent: 244    University School of Milwaukee
Game: 2
Date: 11/30/2021
Opponent: Wauwatosa ...
Name: Teams_Played, dtype: object
Game: 3
Date: 12/01/2021
Team out of index.
Game: 4
Date: 12...
Name: Dates
Team out of index.
Game: 5
Date: dtype: object
Team out of index.


1