In [None]:
# This is a test of the initial files Jason gave me.
# Mostly this will be for developing a preprocessing script to get the forms into the proper format

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Division this notebook processes: the existing registrations are filtered on
# their "Division" column with this value, and it is echoed in the summary printouts.
division = "Boys Grades 3-4 - Spring 2022 In-Town Soccer"

In [None]:
"""
Questions for Jason:
- Do we want to try and include goalie into the balancing?
- What fields do you want to keep for tracking purposes? (email?)
- When missing skill, what should we assume? (average? bad?)
- Do these players have locations (address or lat/long?)
"""

# Load every registration, then keep only the rows for the division of interest.
all_registrations = pd.read_csv("Spring2022-registrations.csv")
existing_players_raw = all_registrations[all_registrations["Division"] == division]

# Source column name -> name used by the rest of the notebook.
column_map = {
    "Player rating - Effectiveness": "coach_skill",
    "Lastname": "last_name",
    "Firstname": "first_name",
    "Grade": "grade",
    "Assigned Team": "team",
    "School Attending": "school",
}
# list(...) gives pandas a plain list of labels, and .copy() makes the result an
# independent frame so the column assignments below never touch a view of the raw data.
existing_players = existing_players_raw.rename(columns=column_map)[list(column_map.values())].copy()

# Extract skill: keep the first run of digits in the rating text as a float
# (rows without any digits become NaN). The pattern is a raw string so that
# "\d" is not treated as an (invalid) Python string escape.
existing_players["coach_skill"] = (
    existing_players["coach_skill"].str.extract(r"(\d+)", expand=False).astype(float)
)

# Freeze players to a team if they already have one
existing_players["frozen"] = existing_players["team"].notna()

# Set all other columns to be empty
# We may get info for these from another sheet?
for column in ["parent_skill", "longitude", "latitude", "preferred_days", "unavailable_days", "comment"]:
    existing_players[column] = np.nan

print(f"Found {len(existing_players)} existing players in division: {division}")
existing_players.head(15)

In [None]:
from tqdm import tqdm
tqdm.pandas()  # adds .progress_apply() to pandas objects

import logging
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
geolocator = Nominatim(user_agent="cyslf")
# The public Nominatim service allows at most one request per second, so wait
# a full second between lookups (the previous 0.5 s delay issued two per second).
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

In [None]:
def lookup_location(address):
    """Geocode one address into a ``(latitude, longitude)`` pair.

    Parameters
    ----------
    address : str
        Free-form address passed to the module-level ``geocode`` callable.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` of the match, or ``(np.nan, np.nan)`` when
        ``geocode`` returns nothing; in that case a message is printed.

    Notes
    -----
    Any exception raised by ``geocode`` propagates to the caller.
    """
    # If an address is poorly formed, geopy gives sad-looking warnings, so
    # silence them for the duration of the lookup. The previous level is
    # restored in ``finally`` so a failing lookup cannot leave the logger
    # muted, and a level configured elsewhere is not overwritten.
    geopy_logger = logging.getLogger("geopy")
    previous_level = geopy_logger.level
    geopy_logger.setLevel(logging.ERROR)
    try:
        location = geocode(address)
    finally:
        geopy_logger.setLevel(previous_level)

    if location:
        return location.latitude, location.longitude

    print(f"Failed to find address: {address}")
    return np.nan, np.nan

In [None]:
new_players_raw["Location"] = new_players_raw['Address'].progress_apply(lookup_location)

In [None]:
new_players_raw["latitude"] = new_players_raw["Location"].apply(lambda x: x[0])
new_players_raw["longitude"] = new_players_raw["Location"].apply(lambda x: x[1])

In [None]:
new_players_raw.head()

In [None]:
"""
- does Players Recent Team mean anything in this sheet?
- same question of what we do if skill is missing?
"""
new_players_raw = pd.read_csv("3-4boys-sample-22registration.csv")

# Join the two school columns
has_other_school = ~pd.isnull(new_players_raw["School Name other:"])
new_players_raw.loc[has_other_school, "School Name"] = new_players_raw[has_other_school]["School Name other:"]

# Look up player latitude / longitude
new_players_raw["Postal Code"] = new_players_raw["Postal Code"].astype(str)
new_players_raw["Address"] = new_players_raw[["Street", "City", "Region", "Postal Code"]].agg(", ".join, axis=1)


column_map = {
    "Player Last Name": "last_name",
    "Player First Name": "first_name",
    "Current Grade": "grade",
    "Parental assessment of player ability/athleticism:": "parent_skill",
    "School Name": "school",
    "Special Requests": "comment",
}
new_players = new_players_raw.rename(columns=column_map)[column_map.values()]

# Extract grade
new_players["grade"] = new_players["grade"].str.extract('(\d+)', expand=False).astype(int)

# Extract skill
new_players["parent_skill"] = new_players["parent_skill"].str.extract('(\d+)', expand=False).astype(float)

# Look up player latitude / longitude



# Set frozen to False
new_players["frozen"] = False

for column in ["team", "coach_skill", "longitude", "latitude", "preferred_days", "unavailable_days"]:
    new_players[column] = np.nan

print(f"Found {len(new_players)} new players in division: {division}")
existing_players.head(15)

In [None]:
# Stack existing and new players into one table. Both frames carry their own
# row labels, so renumber the rows to keep the combined index unique.
players = pd.concat([existing_players, new_players], ignore_index=True)

In [None]:
# Show the combined table of existing and new players.
players

In [None]:
# Random TODOs
# - someone requested to not be on a particular team
# - a quick fix is to say they can't practice on that day?