In [66]:
import pandas as pd
from rapidfuzz import fuzz, process

# Read the three raw CSV tables (coaches, players, schools) into DataFrames,
# in that order.
coaches, players, schools = map(
    pd.read_csv,
    (
        '../data/raw/coaches.csv',
        '../data/raw/players.csv',
        '../data/raw/schools.csv',
    ),
)


In [67]:
# Print the first rows of each loaded table: coaches, then players, then schools.
for frame in (coaches, players, schools):
    print(frame.head())

                  School              Name                    Title
0  University Of Florida  Kevin O'Sullivan               Head Coach
1  University Of Florida   Chuck Jeroloman     Associate Head Coach
2  University Of Florida      Taylor Black          Assistant Coach
3  University Of Florida        David Kopp           Pitching Coach
4  University Of Florida     Michael Byrne  Student Assistant Coach
                  School           Name  Jersey Position Class Year   Height  \
0  University Of Florida  Justin Nadeau       1   INF/OF        Jr.   6' 0''   
1  University Of Florida       Ty Evans       2       OF        Sr.   6' 2''   
2  University Of Florida     Kyle Jones       3       OF        So.   6' 3''   
3  University Of Florida   Cade Kurland       4      INF        Jr.  5' 11''   
4  University Of Florida      Blake Cyr       5   OF/INF        Jr.  5' 11''   

    Weight                   High School  
0  195 lbs  Bartram Trail (Jacksonville)  
1  215 lbs            Lak

In [68]:
# Tally how often each player name occurs, then keep only the names that
# appear more than once.
counts = players['Name'].value_counts()
multiple_player_names = counts.loc[counts.gt(1)]
multiple_player_names

Name
Joey McMannis      2
Cameron Johnson    2
Name: count, dtype: int64

In [69]:
# Distinct non-missing values of the players' 'High School' column.
hs_column = players['High School']
high_schools = hs_column.dropna().unique()
len(high_schools)

444

In [70]:
# Sanity check on a five-name sample: match every name against the sample itself.
temp_schools = high_schools[:5]
for school in temp_schools:
    if school != 'N/A':
        matches = process.extract(school, temp_schools, scorer=fuzz.ratio, score_cutoff=80)
        # Top-ranked candidate, printed as a (choice, score, index) tuple.
        print(matches[0])

('Bartram Trail (Jacksonville)', 100.0, 0)
('Lakeland Christian', 100.0, 1)
('North Oconee (Stetson)', 100.0, 2)
('Berkeley Prep', 100.0, 3)
('Windermere (Miami)', 100.0, 4)


In [79]:
def dedup_high_schools(high_schools, canonical_hs, score_cutoff):
    """Map each raw high school name to its best-matching canonical name.

    Args:
        high_schools: Iterable of raw high school name strings.
        canonical_hs: Collection of canonical names to match against.
        score_cutoff: Minimum ``fuzz.partial_ratio`` score a canonical name
            must reach to be accepted as a match.

    Returns:
        dict keyed by every input name except the literal ``'N/A'``. The value
        is the best-scoring canonical name, or the input name itself when no
        canonical name reaches ``score_cutoff``.

    NOTE(review): ``fuzz.partial_ratio`` rewards substring matches, so a short
    canonical name can capture a longer, unrelated one -- the notebook output
    shows 'Pearland HS' -> 'Pearl' even at a cutoff of 95. Inspect the mapping
    before applying it.
    """
    hs_mappings = {}
    for school in high_schools:
        # 'N/A' is presumably a missing-value placeholder; it gets no entry.
        if school == 'N/A':
            continue

        # Only the single best candidate is used, so request exactly one
        # rather than building a ranked list and indexing into it.
        best = process.extractOne(
            school,
            canonical_hs,
            scorer=fuzz.partial_ratio,
            score_cutoff=score_cutoff,
        )

        # extractOne returns None when nothing reaches the cutoff, otherwise
        # a (choice, score, index) tuple.
        hs_mappings[school] = best[0] if best is not None else school

    return hs_mappings

In [None]:
# The inputs do not depend on the cutoff, so build them once outside the loop.
canonical_hs = schools[schools['school type'] == 'high school']['name'].unique()
high_schools = players['High School'].dropna().unique()

for score_cutoff in [95]:
    # Label each section so results from different cutoffs can be told apart.
    print(f"score_cutoff = {score_cutoff}")
    hs_mappings = dedup_high_schools(high_schools, canonical_hs, score_cutoff=score_cutoff)

    # Show only the names that were actually rewritten.
    for key, vals in hs_mappings.items():
        if key != vals:
            print(f"{key!r} -> {vals!r}")

    print("\n")

{}
'Lakeland Christian (Virginia Tech)' -> 'Lakeland Christian'
'TNXL Academy (Charleston Southern)' -> 'TNXL Academy'
'Liberty HS' -> 'Liberty'
'John Carroll Catholic HS' -> 'Catholic HS'
'Brandon HS' -> 'Brandon'
'Franklin HS' -> 'Franklin'
'Pearland HS' -> 'Pearl'
'Columbia River' -> 'Columbia'
'Buchanan HS' -> 'Buchanan'
'Sherwood HS' -> 'Sherwood'
'Aquinas HS' -> 'Aquinas'
'Liberty-Wentzville' -> 'Liberty'
'East Ridge High School' -> 'East Ridge'
'Austin Prep' -> 'Austin'
'Wayzata HS' -> 'Wayzata'
'Montgomery Lake Creek' -> 'Lake Creek'
'Trussville' -> 'Hewitt-Trussville'
'TNXL Academy [Fla.]' -> 'TNXL Academy'




: 

In [75]:
# Same matching pass as dedup_high_schools, run with a looser cutoff of 70.
# Relies on `high_schools` and `canonical_hs` built in earlier cells.
hs_mappings = dedup_high_schools(high_schools, canonical_hs, score_cutoff=70)

# Show only the names that were actually rewritten.
for key, vals in hs_mappings.items():
    if key != vals:
        print(f"{key!r} -> {vals!r}")

'Lakeland Christian (Virginia Tech)' -> 'Lakeland Christian'
'TNXL Academy (Charleston Southern)' -> 'TNXL Academy'
'Liberty HS' -> 'Liberty'
'John Carroll Catholic HS' -> 'Catholic HS'
'Brandon HS' -> 'Brandon'
'De La Salle HS' -> 'De La Salle'
'Franklin HS' -> 'Franklin'
'Pearland HS' -> 'Pearl'
'Columbia River' -> 'Columbia'
'Santa Margarita Catholic HS' -> 'Catholic HS'
'Buchanan HS' -> 'Buchanan'
'Sherwood HS' -> 'Sherwood'
'Centennial HS' -> 'Centennial'
'Columbia River HS' -> 'Columbia'
'Cardinal Newman HS' -> 'Cardinal Newman'
'Aquinas HS' -> 'Aquinas'
'Liberty-Wentzville' -> 'Liberty'
'East Ridge High School' -> 'East Ridge'
'Austin Prep' -> 'Austin'
'Wayzata HS' -> 'Wayzata'
'Montgomery Lake Creek' -> 'Lake Creek'
'Trussville' -> 'Hewitt-Trussville'
'TNXL Academy [Fla.]' -> 'TNXL Academy'


In [77]:
def dedup_coaches(coaches_df):
    """Collapse repeated (Name, School) rows into one row with merged titles.

    Example:
        John Smith at ABC Univ: Head Coach
        John Smith at ABC Univ: Recruiting Coordinator
        => John Smith at ABC Univ: Head Coach | Recruiting Coordinator

    Args:
        coaches_df: DataFrame with 'Name', 'School' and 'Title' columns.

    Returns:
        A new DataFrame with columns Name, School, Title and one row per
        distinct (Name, School) pair. Title holds that pair's unique,
        non-missing titles, sorted and joined with ' | '; it is an empty
        string when the pair has no non-missing title.
    """
    def _join_titles(titles):
        # Missing titles are float NaN: they can neither be ordered against
        # strings nor passed to str.join, so drop them before merging.
        return ' | '.join(sorted(set(titles.dropna())))

    coaches_clean = coaches_df.groupby(['Name', 'School'], as_index=False).agg({
        'Title': _join_titles
    })
    return coaches_clean

# Collapse the coach rows, then print a before/after view: a sample of the
# raw rows followed by the deduplicated table.
coaches_clean = dedup_coaches(coaches)
print(coaches.head())
print(coaches_clean)

                  School              Name                    Title
0  University Of Florida  Kevin O'Sullivan               Head Coach
1  University Of Florida   Chuck Jeroloman     Associate Head Coach
2  University Of Florida      Taylor Black          Assistant Coach
3  University Of Florida        David Kopp           Pitching Coach
4  University Of Florida     Michael Byrne  Student Assistant Coach
                 Name                   School  \
0       Adam  Pallone       Rutgers University   
1         Adam Nelson   University of Maryland   
2       Alec Crawford  University Of Minnesota   
3         Alex Corbin   University of Maryland   
4      Amanda Branson    University Of Alabama   
..                ...                      ...   
150  Tyler Schwalbach   University Of Oklahoma   
151     Vince Rinaldi   University Of Oklahoma   
152          Will Fox     Texas A&M University   
153       Zach Cronin    University Of Florida   
154  Zach Weatherford    University Of Ind