## College Football Data Wrangling

#### The goal of this notebook is to pull in all the data from CollegeFootballData.com
##### This notebook will take in data beginning at the grain where one row is one game, and instead create two records from one game: one from the perspective of the home team and one for the away team. This makes it easier for analysis and offers easy analysis on more familiar metrics like points for, etc.


##### Chris McAllister
#### ----------------------------------

###### Helpful Tutorial
https://blog.collegefootballdata.com/introduction-to-cfb-analytics/

###### Actual Documentation
https://api.collegefootballdata.com/api/docs/?url=/api-docs.json

###### Get custom API key emailed here:
https://collegefootballdata.com/key

In [16]:
# Uncomment and run line below if cfbd library isn't already installed
#! pip install cfbd

import cfbd
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

In [17]:
# Running this code by itself won't work. You'll need your own API Key.
# See link above to have custom API link emailed and save that key as variable api_key.

import keys
api_key = keys.api_key

#### Set up api connection

In [18]:
def api_setup(api_key):

    """
    Configure the api. 
    Only input is the apikey which can be created from the link above.
    """
    import cfbd
    
    configuration = cfbd.Configuration()
    configuration.api_key['Authorization'] = api_key
    configuration.api_key_prefix['Authorization'] = 'Bearer'

    return cfbd.ApiClient(configuration)
    
api_config = api_setup(api_key)

#### Player Recruiting Rankings

In [30]:
def hs_recruits(start_year, end_year):
    
    """
    Two inputs: start_year and end_year (the ranges of years we want the recruiting data for - inclusive)
    
    1) Get each year as a json
    2) Convert to df
    3) Union each year's df together.
    """

    recruits_df_list = []

    for i in range(start_year, end_year + 1):

        # Connect to api for given year
        recr_api = cfbd.RecruitingApi(api_config)
        recruits = recr_api.get_recruiting_players(year = i)

        # Convert json to df
        df_recruits = pd.DataFrame.from_records([r.to_dict() for r in recruits])

        # Append dfs together to create list of dfs
        recruits_df_list.append(df_recruits)

    # Concatenate / union each year's df together
    df_recruits_final = pd.concat(recruits_df_list).reset_index()
    
    df_recruits_final['latitude'] = df_recruits_final.hometown_info.str['latitude']
    df_recruits_final['longitude'] = df_recruits_final.hometown_info.str['longitude']
    
    df_recruits_final.drop(columns = 'hometown_info', inplace = True)

    return df_recruits_final

df_recruits = hs_recruits(2018, 2023)

In [73]:
df_recruits.shape

(19854, 19)

In [31]:
df_recruits[df_recruits['name'] == 'Mike Sainristil']

Unnamed: 0,index,id,athlete_id,recruit_type,year,ranking,name,school,committed_to,position,height,weight,stars,rating,city,state_province,country,latitude,longitude
4560,610,47391,4428414.0,HighSchool,2019,595.0,Mike Sainristil,Everett,Michigan,CB,70.0,175.0,3,0.8743,Everett,MA,USA,42.40843,-71.053663


#### Script to get every college football team, some attributes, and their name

In [21]:
def team_dataset():

    teams_api = cfbd.TeamsApi(api_config)
    teams = teams_api.get_fbs_teams()

    df_teams = pd.DataFrame.from_records([t.to_dict() for t in teams])
    df_teams
    df_teams = df_teams[['id', 'school', 'conference', 'division', 'color', 'logos']]
    
    return df_teams

df_teams = team_dataset()

In [143]:
df_teams.head()

Unnamed: 0,id,school,conference,division,color,logos
0,2005,Air Force,Mountain West,Mountain,#004a7b,[http://a.espncdn.com/i/teamlogos/ncaa/500/200...
1,2006,Akron,Mid-American,East,#00285e,[http://a.espncdn.com/i/teamlogos/ncaa/500/200...
2,333,Alabama,SEC,West,#690014,[http://a.espncdn.com/i/teamlogos/ncaa/500/333...
3,2026,Appalachian State,Sun Belt,East,#000000,[http://a.espncdn.com/i/teamlogos/ncaa/500/202...
4,12,Arizona,Pac-12,,#002449,[http://a.espncdn.com/i/teamlogos/ncaa/500/12....


#### Function to get every college football game played over a timeframe and stored in a dataframe

In [285]:
def games_non_transformed(start_year, final_year):
    
    
    """
    Connect to the games api and get every post and regular season game over a given time frame
    
    1) Beginning with post season, iterate over every year in given range.
    2) Union each year together
    3) Repeat for regular season
    4) Union post and regular season dfs together
    """

    # Connect to games api
    games_api = cfbd.GamesApi(api_config)

    # Post Season Games
    postseason_games = []
    for i in range(start_year, final_year + 1):

        games = games_api.get_games(year=i, season_type = 'postseason')
        df_games_post_i = pd.DataFrame.from_records([g.to_dict() for g in games])
        postseason_games.append(df_games_post_i)

    postseason_games_df = pd.concat(postseason_games)

    # Regular Season Games
    regseason_games = []
    for i in range(start_year, final_year + 1):

        games = games_api.get_games(year=i, season_type = 'regular')
        df_games_reg_i = pd.DataFrame.from_records([g.to_dict() for g in games])
        regseason_games.append(df_games_reg_i)

    regseason_games_df = pd.concat(regseason_games)

    # Union post and regular season
    return pd.concat([regseason_games_df, postseason_games_df])

df_games = games_non_transformed(2014, 2024)

#### Maniuplate data so it's at the team-game grain, rather than game grain
###### -- There will be duplicate games, but we can filter for a team one one column now. 

###### -- Each game will have two records: one for the home team's perspective, one for the away team.

In [286]:
def games_manipulation(df_games):
    
    """
    The function takes in the output of the previous function games_non_transformed().
    It converts the grain of the data. Before, the grain was 1 row per game.
    Now, each game has two rows: one from the perspective of each team.
    For example, when Ohio lost to Michigan 42-27, Michigan will a win by 15 points, and OSU will show a loss by -15 points.
    
    1) First identifies every team that played at least 1 home game.
    2) Loops over every team.
    3) Converts a bunch of data points so that the numbers are referenced from the perspective of the team of interst (also called main_team)
    """


    df_seasons = []
    teams_list = list(df_games['home_team'].unique()[1:])

    # Loop over every team that played a game over the time frame specified in games_non_transformed()
    for team in teams_list:

        # find every home / away game for team of interst
        df_home = df_games[df_games['home_team'] == team]
        df_away = df_games[df_games['away_team'] == team]

        # Combine home and away games into 1 table. 
        df_season_i = pd.concat([df_home, df_away])

        # Add column specifying what team that row of data pertains to
        df_season_i['main_team'] = team

        ############
        # Adjust key columns so they represent our team of interest

        df_season_i['home_game_flag'] = np.where(df_season_i['home_team'] == team, 1, 0)

        df_season_i['team_id'] = np.where(df_season_i['home_team'] == team, df_season_i['home_id'], df_season_i['away_id'])
        df_season_i['opposing_team_id'] = np.where(df_season_i['home_team'] == team, df_season_i['away_id'], df_season_i['home_id'])

        df_season_i['team_conference'] = np.where(df_season_i['home_team'] == team, df_season_i['home_conference'], df_season_i['away_conference'])
        df_season_i['opposing_conference'] = np.where(df_season_i['home_team'] == team, df_season_i['away_conference'], df_season_i['home_conference'])

        df_season_i['points_for'] = np.where(df_season_i['home_team'] == team, df_season_i['home_points'], df_season_i['away_points'])
        df_season_i['points_against'] = np.where(df_season_i['home_team']== team, df_season_i['away_points'], df_season_i['home_points'])

        df_season_i['point_differential'] = df_season_i['points_for'] - df_season_i['points_against']

        df_season_i['team_line_scores']  = np.where(df_season_i['home_team'] == team, df_season_i['home_line_scores'], df_season_i['away_line_scores'])
        df_season_i['opposing_line_scores']  = np.where(df_season_i['home_team'] == team, df_season_i['away_line_scores'], df_season_i['home_line_scores'])

        df_season_i['team_pregame_elo']  = np.where(df_season_i['home_team'] == team, df_season_i['home_pregame_elo'], df_season_i['away_pregame_elo'])
        df_season_i['team_postgame_elo']  = np.where(df_season_i['home_team'] == team, df_season_i['home_postgame_elo'], df_season_i['away_postgame_elo'])

        df_season_i['opponent_pregame_elo'] = np.where(df_season_i['home_team'] != team, df_season_i['home_pregame_elo'], df_season_i['away_pregame_elo'])
        df_season_i['opponent_postgame_elo'] = np.where(df_season_i['home_team'] != team, df_season_i['home_postgame_elo'], df_season_i['away_postgame_elo'])

        df_season_i['win_flag'] = np.where(df_season_i['point_differential'] > 0, 1, 0)
        
#         a = df_season_i.sort_values('start_date', ascending=True) \
#                        .groupby(['main_team', 'season']) \
#                        .cumcount() + 1
        
#         df_season_i['game_that_season'] = list(a)
        
        
        ############

        df_seasons.append(df_season_i)
        
    data = pd.concat(df_seasons)
    data = data.drop(columns = ['home_id', 'home_team', 'home_conference', 'home_division', 'home_points', 'home_line_scores', 'home_post_win_prob',
                                'home_pregame_elo', 'home_postgame_elo', 'away_id', 'away_team', 'away_conference', 'away_division', 'away_points',
                                'away_line_scores', 'away_post_win_prob', 'away_pregame_elo', 'away_postgame_elo'], axis = 1)
    
    # Field that counts what game (ie the 15th game, 3rd game, etc)
    data = data.reset_index()
    data['game_that_season'] = data.sort_values(['season', 'start_date', 'team_id'], ascending=True) \
                                        .groupby(['team_id', 'season']) \
                                        .cumcount() + 1
    
    
    
    return data
    
df = games_manipulation(df_games)

df.head()

Unnamed: 0,index,id,season,week,season_type,start_date,start_time_tbd,completed,neutral_site,conference_game,attendance,venue_id,venue,excitement_index,highlights,notes,main_team,home_game_flag,team_id,opposing_team_id,team_conference,opposing_conference,points_for,points_against,point_differential,team_line_scores,opposing_line_scores,team_pregame_elo,team_postgame_elo,opponent_pregame_elo,opponent_postgame_elo,win_flag,game_that_season
0,1,400548403,2014,1,regular,2014-08-27T23:00:00.000Z,False,True,True,False,10140.0,3495.0,Georgia Dome,9.07878,,,Georgia State,1,2247,2000,Sun Belt,Southland,38.0,37.0,1.0,"[7, 14, 0, 17]","[3, 13, 14, 7]",,,,,1,1
1,148,400548409,2014,2,regular,2014-09-06T18:00:00.000Z,,True,False,True,10126.0,3495.0,Georgia Dome,,,,Georgia State,1,2247,166,Sun Belt,Sun Belt,31.0,34.0,-3.0,"[10, 7, 7, 7]","[0, 10, 7, 17]",1233.0,1217.0,1047.0,1063.0,0,2
2,276,400548189,2014,3,regular,2014-09-13T18:00:00.000Z,,True,False,False,16836.0,3495.0,Georgia Dome,2.555185,,,Georgia State,1,2247,2005,Sun Belt,Mountain West,38.0,48.0,-10.0,"[0, 10, 14, 14]","[14, 17, 7, 10]",1217.0,1196.0,1214.0,1235.0,0,3
3,724,400548424,2014,7,regular,2014-10-11T18:00:00.000Z,,True,False,True,10196.0,3495.0,Georgia Dome,1.26,,,Georgia State,1,2247,2032,Sun Belt,Sun Belt,10.0,52.0,-42.0,"[0, 3, 0, 7]","[24, 7, 14, 7]",1175.0,1086.0,1447.0,1536.0,0,6
4,947,400548432,2014,9,regular,2014-10-25T18:00:00.000Z,,True,False,True,28427.0,3495.0,Georgia Dome,2.076071,,,Georgia State,1,2247,290,Sun Belt,Sun Belt,31.0,69.0,-38.0,"[3, 14, 0, 14]","[14, 13, 21, 21]",1089.0,1036.0,1541.0,1594.0,0,8


#### Begin analysis

In [287]:
import altair as alt

power_5_conf = ['Pac-12', 'Big 12', 'ACC', 'SEC', 'Big Ten']


a = df[df['team_conference'].isin(power_5_conf)]
a = a[a['game_that_season'] == 12] # Final game of reg season

# Round elo to nearest 50 for binning
a['elo_rounded'] = round(a['team_postgame_elo'] / 1) * 1
a = a.groupby(by = ['season', 'team_conference', 'elo_rounded', 'main_team', 'team_id']).size().reset_index().rename({0: 'count'}, axis = 1)

a.head()

Unnamed: 0,season,team_conference,elo_rounded,main_team,team_id,count
0,2014,ACC,1237.0,Wake Forest,154,1
1,2014,ACC,1390.0,Syracuse,183,1
2,2014,ACC,1491.0,Virginia,258,1
3,2014,ACC,1517.0,North Carolina,153,1
4,2014,ACC,1545.0,NC State,152,1


In [288]:
a_final = pd.merge(left = a, right = df_teams, left_on = 'team_id', right_on = 'id')

a_final = a_final[['season', 'team_conference', 'elo_rounded', 'main_team', 'logos']]
a_final['url'] = a_final['logos'].str.get(0)
a_final.drop(columns = ['logos'], axis = 1, inplace = True)

a_final.head()

Unnamed: 0,season,team_conference,elo_rounded,main_team,url
0,2014,ACC,1237.0,Wake Forest,http://a.espncdn.com/i/teamlogos/ncaa/500/154.png
1,2015,ACC,1309.0,Wake Forest,http://a.espncdn.com/i/teamlogos/ncaa/500/154.png
2,2016,ACC,1414.0,Wake Forest,http://a.espncdn.com/i/teamlogos/ncaa/500/154.png
3,2017,ACC,1635.0,Wake Forest,http://a.espncdn.com/i/teamlogos/ncaa/500/154.png
4,2018,ACC,1608.0,Wake Forest,http://a.espncdn.com/i/teamlogos/ncaa/500/154.png


In [292]:
alt.Chart(a_final).mark_image(opacity = .90, width = 20, height = 20).encode(
    
    x = alt.X('elo_rounded', title = 'ELO Rating at Season End', scale=alt.Scale(domain=[800, 2500])), 
                                y = alt.Y('team_conference', title = 'Conference'),
                                tooltip = ['main_team', 'elo_rounded'],
                                url = 'url'
                               ).properties(height = 250, width = 300)\
.facet(facet = 'season:O', columns = 4).interactive().properties(title = 'End of Regular Season ELO Rating')

## Ignore everything below, we won't need this for the Capstone Project

In [14]:
#Add in columns for quarterly breakdown of scores (points for)

def q1_score(row):
    
    try:
        return list(row['team_line_scores'])[0]
    
    except:
        return 0

def q2_score(row):
    
    try:
        return list(row['team_line_scores'])[1]
    
    except:
        return 0

def q3_score(row):
    
    try:
        return list(row['team_line_scores'])[2]
    
    except:
        return 0


def q4_score(row):
    
    try:
        return list(row['team_line_scores'])[3]
    
    except:
        return 0

In [15]:
df_season_final['q1_points_for'] = df_season_final.apply (lambda row: q1_score(row), axis=1)
df_season_final['q2_points_for'] = df_season_final.apply (lambda row: q2_score(row), axis=1)
df_season_final['q3_points_for'] = df_season_final.apply (lambda row: q3_score(row), axis=1)
df_season_final['q4_points_for'] = df_season_final.apply (lambda row: q4_score(row), axis=1)

NameError: name 'df_season_final' is not defined

In [None]:
#Add in columns for quarterly breakdown of scores

def q1_score_opp(row):
    
    try:
        return list(row['opposing_line_scores'])[0]
    
    except:
        return 0

def q2_score_opp(row):
    
    try:
        return list(row['opposing_line_scores'])[1]
    
    except:
        return 0

def q3_score_opp(row):
    
    try:
        return list(row['opposing_line_scores'])[2]
    
    except:
        return 0


def q4_score_opp(row):
    
    try:
        return list(row['opposing_line_scores'])[3]
    
    except:
        return 0

In [None]:
df_season_final['q1_points_against'] = df_season_final.apply (lambda row: q1_score_opp(row), axis=1)
df_season_final['q2_points_against'] = df_season_final.apply (lambda row: q2_score_opp(row), axis=1)
df_season_final['q3_points_against'] = df_season_final.apply (lambda row: q3_score_opp(row), axis=1)
df_season_final['q4_points_against'] = df_season_final.apply (lambda row: q4_score_opp(row), axis=1)