## College Football Data Wrangling

#### The goal of this notebook is to pull in all the data from CollegeFootballData.com
##### This notebook starts with data at the game grain (one row per game) and creates two records for each game: one from the perspective of the home team and one from the perspective of the away team. This makes analysis easier and exposes more familiar metrics such as points for and points against.


##### Chris McAllister
#### ----------------------------------

###### Helpful Tutorial
https://blog.collegefootballdata.com/introduction-to-cfb-analytics/

###### Actual Documentation
https://api.collegefootballdata.com/api/docs/?url=/api-docs.json

###### Get custom API key emailed here:
https://collegefootballdata.com/key

In [1]:
import cfbd
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

In [2]:
# Uncomment and run line below if cfbd library isn't already installed
#! pip install cfbd

In [5]:
# configuration = cfbd.Configuration()
# configuration.api_key['Authorization'] = api_key
# configuration.api_key_prefix['Authorization'] = 'Bearer'

# api_config = cfbd.ApiClient(configuration)

In [7]:
# Running this cell by itself won't work: it imports a local `keys` module that you must provide.
# See the link above to have your own API key emailed, then expose it as `api_key` inside that
# module (presumably a keys.py next to the notebook -- keep it out of version control).

import keys
api_key = keys.api_key

#### Set up api connection

In [18]:
# See the link above to have a personal API key emailed to you.
# SECURITY: never paste the key into the notebook -- anything saved here ends up in version control.
# The key is read from the CFBD_API_KEY environment variable; when that is unset, whatever
# `api_key` value is already defined in the notebook namespace (e.g. loaded from the `keys`
# module) is kept, otherwise api_key is None.
# NOTE(review): the key that used to be hard-coded in this cell should be treated as leaked and rotated.
import os

api_key = os.environ.get('CFBD_API_KEY') or globals().get('api_key')

def api_setup(api_key):
    """
    Build a cfbd API client authenticated with the given key.

    Parameters
    ----------
    api_key : str
        Personal API key, which can be requested from the link above.

    Returns
    -------
    cfbd.ApiClient
        Client built from a configuration whose 'Authorization' entry is `api_key`
        with the 'Bearer' prefix.
    """
    import cfbd

    auth_field = 'Authorization'

    cfg = cfbd.Configuration()
    cfg.api_key[auth_field] = api_key
    cfg.api_key_prefix[auth_field] = 'Bearer'

    client = cfbd.ApiClient(cfg)
    return client
    
# Build the client once; the data-pull functions below read this module-level `api_config`
api_config = api_setup(api_key)

#### Player Recruiting Rankings

In [19]:
def hs_recruits(start_year, end_year):
    """
    Pull player recruiting rankings for every year from start_year to end_year (inclusive).

    Uses the module-level `api_config` client.

    1) Get each year's recruits from the recruiting API
    2) Convert each year to a df
    3) Union each year's df together
    4) Flatten `hometown_info` into `latitude` / `longitude` columns

    Parameters
    ----------
    start_year, end_year : int
        First and last year to pull (both included).

    Returns
    -------
    pandas.DataFrame
        One row per recruit record returned by the API. The `index` column created by
        reset_index() holds each row's position within its own year's response.

    Raises
    ------
    ValueError
        If start_year is greater than end_year.
    """
    # Fail with a clear message instead of pandas' "No objects to concatenate"
    if start_year > end_year:
        raise ValueError(
            f"start_year ({start_year}) must not be greater than end_year ({end_year})"
        )

    # The API wrapper does not depend on the year, so build it once rather than per iteration
    recr_api = cfbd.RecruitingApi(api_config)

    recruits_df_list = []
    for year in range(start_year, end_year + 1):
        recruits = recr_api.get_recruiting_players(year = year)

        # Convert the response objects to a df and collect it
        recruits_df_list.append(pd.DataFrame.from_records([r.to_dict() for r in recruits]))

    # Concatenate / union each year's df together
    df_recruits_final = pd.concat(recruits_df_list).reset_index()

    # hometown_info presumably holds one dict per recruit; .str[...] looks the key up row by row
    df_recruits_final['latitude'] = df_recruits_final.hometown_info.str['latitude']
    df_recruits_final['longitude'] = df_recruits_final.hometown_info.str['longitude']

    # Return a copy without the nested column rather than mutating in place
    return df_recruits_final.drop(columns = 'hometown_info')

# Pull recruiting data for 2020 through 2022 (both bounds inclusive)
df_recruits = hs_recruits(2020, 2022)

In [20]:
df_recruits.head()

Unnamed: 0,index,id,athlete_id,recruit_type,year,ranking,name,school,committed_to,position,height,weight,stars,rating,city,state_province,country,latitude,longitude
0,0,61569,4428988.0,HighSchool,2020,1.0,Bryan Bresee,Damascus,Clemson,DT,77.0,290.0,5,0.9995,Damascus,MD,USA,39.288438,-77.203872
1,1,61570,4685720.0,HighSchool,2020,2.0,Bryce Young,Mater Dei,Alabama,DUAL,71.0,183.0,5,0.9994,Santa Ana,CA,USA,33.749495,-117.873221
2,2,61571,4430800.0,HighSchool,2020,3.0,Julian Fleming,Southern Columbia,Ohio State,WR,74.0,199.0,5,0.9979,Catawissa,PA,USA,40.954827,-76.460987
3,3,61572,4428992.0,HighSchool,2020,4.0,Kelee Ringo,Saguaro,Georgia,CB,74.0,205.0,5,0.9976,Scottsdale,AZ,USA,33.494219,-111.926018
4,4,61573,4428994.0,HighSchool,2020,5.0,Arik Gilbert,Marietta,LSU,TE,77.0,253.0,5,0.9972,Marietta,GA,USA,33.952847,-84.549615


#### Function to get every FBS college football team, its name, and some attributes

In [10]:
def team_dataset():
    """
    Fetch the FBS teams and keep a handful of descriptive attributes.

    Uses the module-level `api_config` client.

    Returns
    -------
    pandas.DataFrame
        One row per team with the columns id, school, conference, division, color and logos.
    """
    keep_columns = ['id', 'school', 'conference', 'division', 'color', 'logos']

    fbs_teams = cfbd.TeamsApi(api_config).get_fbs_teams()
    team_records = [team.to_dict() for team in fbs_teams]

    return pd.DataFrame.from_records(team_records)[keep_columns]

# Build the FBS team lookup table
df_teams = team_dataset()

In [11]:
df_teams.head()

Unnamed: 0,id,school,conference,division,color,logos
0,2005,Air Force,Mountain West,Mountain,#004a7b,[http://a.espncdn.com/i/teamlogos/ncaa/500/200...
1,2006,Akron,Mid-American,East,#00285e,[http://a.espncdn.com/i/teamlogos/ncaa/500/200...
2,333,Alabama,SEC,West,#690014,[http://a.espncdn.com/i/teamlogos/ncaa/500/333...
3,2026,Appalachian State,Sun Belt,East,#000000,[http://a.espncdn.com/i/teamlogos/ncaa/500/202...
4,12,Arizona,Pac-12,,#002449,[http://a.espncdn.com/i/teamlogos/ncaa/500/12....


#### Function to get every college football game played over a timeframe and stored in a dataframe

In [21]:
def games_non_transformed(start_year, final_year):
    """
    Fetch every postseason and regular-season game from start_year to final_year (inclusive).

    Uses the module-level `api_config` client.

    1) Request the postseason games of each year and union the years together
    2) Do the same for the regular season
    3) Union both, regular-season rows first

    Returns
    -------
    pandas.DataFrame
        One row per game. Row labels are not reset, so they repeat across years
        and season types.
    """
    games_api = cfbd.GamesApi(api_config)
    seasons = range(start_year, final_year + 1)

    def fetch_all_years(season_type):
        # One df per year for the requested season type, unioned into a single df
        yearly_frames = []
        for season in seasons:
            season_games = games_api.get_games(year=season, season_type = season_type)
            yearly_frames.append(pd.DataFrame.from_records([game.to_dict() for game in season_games]))
        return pd.concat(yearly_frames)

    # Postseason is requested before the regular season
    postseason_games_df = fetch_all_years('postseason')
    regseason_games_df = fetch_all_years('regular')

    return pd.concat([regseason_games_df, postseason_games_df])

# Pull every game for 2021 through 2024 (both bounds inclusive)
df_games = games_non_transformed(2021, 2024)

In [22]:
df_games.head()

Unnamed: 0,id,season,week,season_type,start_date,start_time_tbd,completed,neutral_site,conference_game,attendance,venue_id,venue,home_id,home_team,home_conference,home_division,home_points,home_line_scores,home_post_win_prob,home_pregame_elo,home_postgame_elo,away_id,away_team,away_conference,away_division,away_points,away_line_scores,away_post_win_prob,away_pregame_elo,away_postgame_elo,excitement_index,highlights,notes
0,401282714,2021,1,regular,2021-08-28T17:20:00.000Z,False,True,False,True,41064.0,3832.0,Memorial Stadium,356,Illinois,Big Ten,fbs,30.0,"[2, 14, 14, 0]",0.405661,1392.0,1411.0,158,Nebraska,Big Ten,fbs,22.0,"[0, 9, 7, 6]",0.594339,1503.0,1484.0,5.485981,,
1,401286187,2021,1,regular,2021-08-28T18:00:00.000Z,False,True,False,False,26043.0,3660.0,Bulldog Stadium,278,Fresno State,Mountain West,fbs,45.0,"[7, 24, 7, 7]",0.998968,1465.0,1566.0,41,Connecticut,FBS Independents,fbs,0.0,"[0, 0, 0, 0]",0.001032,1223.0,1122.0,1.535556,,
2,401329133,2021,1,regular,2021-08-28T19:00:00.000Z,False,True,True,True,,3861.0,Ohio Stadium,2119,Central State,SIAC,ii,6.0,"[0, 0, 0, 6]",,,,2310,Kentucky State,SIAC,ii,20.0,"[6, 7, 0, 7]",,,,,,
3,401309833,2021,1,regular,2021-08-28T19:30:00.000Z,False,True,False,False,32982.0,1056.0,Rose Bowl,26,UCLA,Pac-12,fbs,44.0,"[24, 7, 13, 0]",0.998924,1517.0,1605.0,62,Hawai'i,Mountain West,fbs,10.0,"[3, 0, 7, 0]",0.001076,1466.0,1378.0,1.391185,,
4,401328337,2021,1,regular,2021-08-28T22:00:00.000Z,False,True,False,False,,3829.0,Memorial Stadium,282,Indiana State,MVFC,fcs,26.0,"[7, 3, 7, 9]",,,,2197,Eastern Illinois,OVC,fcs,21.0,"[0, 7, 7, 7]",,,,,,


#### Manipulate data so it's at the team-game grain, rather than game grain
###### -- There will be duplicate games, but we can now filter for a team on one column.

###### -- Each game will have two records: one for the home team's perspective, one for the away team.

In [25]:
def _team_perspective(df_games, team):
    """Return every game involving `team`, with key columns re-expressed from that team's side."""
    # Find every home / away game for the team of interest
    df_home = df_games[df_games['home_team'] == team]
    df_away = df_games[df_games['away_team'] == team]

    # Combine home and away games into 1 table. concat returns a new frame, so the
    # column assignments below never write back into df_games.
    df_team = pd.concat([df_home, df_away])

    # Evaluate the home/away test once and reuse it for every derived column
    is_home = df_team['home_team'] == team

    def own(stat):
        # Value of the home_/away_ column that belongs to the team of interest
        return np.where(is_home, df_team['home_' + stat], df_team['away_' + stat])

    def opponent(stat):
        # Value of the home_/away_ column that belongs to the other team
        return np.where(is_home, df_team['away_' + stat], df_team['home_' + stat])

    # Column specifying which team this row of data pertains to
    df_team['main_team'] = team
    df_team['home_game_flag'] = np.where(is_home, 1, 0)

    df_team['team_id'] = own('id')

    df_team['team_conference'] = own('conference')
    df_team['opposing_conference'] = opponent('conference')

    df_team['points_for'] = own('points')
    df_team['points_against'] = opponent('points')
    df_team['point_differential'] = df_team['points_for'] - df_team['points_against']

    df_team['team_line_scores'] = own('line_scores')
    df_team['opposing_line_scores'] = opponent('line_scores')

    df_team['team_pregame_elo'] = own('pregame_elo')
    df_team['team_postgame_elo'] = own('postgame_elo')

    df_team['opponent_pregame_elo'] = opponent('pregame_elo')
    df_team['opponent_postgame_elo'] = opponent('postgame_elo')

    # 1 only for a strictly positive differential: losses, ties and rows whose
    # points are missing (NaN differential) all get 0
    df_team['win_flag'] = np.where(df_team['point_differential'] > 0, 1, 0)

    return df_team


def games_manipulation(df_games):
    """
    Convert the output of games_non_transformed() from the game grain to the team-game grain.

    Before, the grain was 1 row per game. Now each game has two rows: one from the
    perspective of each team. For example, when Ohio State lost to Michigan 42-27, Michigan
    will show a point differential of 15 and a win, and OSU will show -15 and no win.

    1) Identify every team that appears as a home or an away team (missing names are skipped).
    2) Loop over every team.
    3) Re-express key data points from the perspective of the team of interest
       (also called main_team).

    Parameters
    ----------
    df_games : pandas.DataFrame
        One row per game, with the home_* / away_* columns for team, id, conference,
        points, line_scores, pregame_elo and postgame_elo.

    Returns
    -------
    pandas.DataFrame
        The rows of df_games repeated once per participating team, with the added columns
        main_team, home_game_flag, team_id, team_conference, opposing_conference, points_for,
        points_against, point_differential, team_line_scores, opposing_line_scores,
        team_pregame_elo, team_postgame_elo, opponent_pregame_elo, opponent_postgame_elo
        and win_flag. Rows are grouped by team: home teams in order of first appearance,
        followed by teams that only ever appear as the away side.

    Raises
    ------
    ValueError
        Raised by pd.concat when df_games contains no team at all.
    """
    # Every team, not just home teams and without skipping the first one, so that each
    # game really yields one row per participant
    teams = pd.concat([df_games['home_team'], df_games['away_team']]).dropna().unique()

    df_seasons = [_team_perspective(df_games, team) for team in teams]

    return pd.concat(df_seasons)
    
# Reshape the game-grain table into the team-game grain
df = games_manipulation(df_games)

In [28]:
# Spot-check one team: Michigan's rows, latest start_date first
df[df['main_team'] == 'Michigan'].sort_values(by = 'start_date', ascending = False).head()

Unnamed: 0,id,season,week,season_type,start_date,start_time_tbd,completed,neutral_site,conference_game,attendance,venue_id,venue,home_id,home_team,home_conference,home_division,home_points,home_line_scores,home_post_win_prob,home_pregame_elo,home_postgame_elo,away_id,away_team,away_conference,away_division,away_points,away_line_scores,away_post_win_prob,away_pregame_elo,away_postgame_elo,excitement_index,highlights,notes,main_team,home_game_flag,team_id,team_conference,opposing_conference,points_for,points_against,point_differential,team_line_scores,opposing_line_scores,team_pregame_elo,team_postgame_elo,opponent_pregame_elo,opponent_postgame_elo,win_flag
47,401551789,2023,1,postseason,2024-01-09T00:30:00.000Z,False,True,True,False,,3891.0,NRG Stadium,130,Michigan,Big Ten,fbs,34.0,"[14, 3, 3, 14]",0.993855,2181.0,2210.0,264,Washington,Pac-12,fbs,13.0,"[3, 7, 3, 0]",0.006145,1908.0,1879.0,5.666155,,CFP National Championship Pres. by AT&T,Michigan,1,130,Big Ten,Pac-12,34.0,13.0,21.0,"[14, 3, 3, 14]","[3, 7, 3, 0]",2181.0,2210.0,1908.0,1879.0,1
44,401551786,2023,1,postseason,2024-01-01T22:00:00.000Z,False,True,True,False,,1056.0,Rose Bowl,130,Michigan,Big Ten,fbs,27.0,"[7, 6, 0, 7, 7]",0.98269,2174.0,2181.0,333,Alabama,SEC,fbs,20.0,"[7, 3, 0, 10, 0]",0.01731,2039.0,2032.0,8.212806,,CFP Semifinal at the Rose Bowl Game Pres. by P...,Michigan,1,130,Big Ten,SEC,27.0,20.0,7.0,"[7, 6, 0, 7, 7]","[7, 3, 0, 10, 0]",2174.0,2181.0,2039.0,2032.0,1
3597,401539480,2023,14,regular,2023-12-03T01:00:00.000Z,False,True,True,True,67842.0,3812.0,Lucas Oil Stadium,2294,Iowa,Big Ten,fbs,0.0,"[0, 0, 0, 0]",0.132754,1675.0,1666.0,130,Michigan,Big Ten,fbs,26.0,"[10, 0, 10, 6]",0.867246,2165.0,2174.0,2.719647,,Big Ten Championship,Michigan,0,130,Big Ten,Big Ten,26.0,0.0,26.0,"[10, 0, 10, 6]","[0, 0, 0, 0]",2165.0,2174.0,1675.0,1666.0,1
3543,401520434,2023,13,regular,2023-11-25T17:00:00.000Z,False,True,False,True,110615.0,3558.0,Michigan Stadium,130,Michigan,Big Ten,fbs,30.0,"[7, 7, 10, 6]",0.81158,2160.0,2165.0,194,Ohio State,Big Ten,fbs,24.0,"[3, 7, 7, 7]",0.18842,2113.0,2108.0,5.94646,,,Michigan,1,130,Big Ten,Big Ten,30.0,24.0,6.0,"[7, 7, 10, 6]","[3, 7, 7, 7]",2160.0,2165.0,2113.0,2108.0,1
3405,401520410,2023,12,regular,2023-11-18T17:00:00.000Z,False,True,False,True,49546.0,3665.0,Maryland Stadium,120,Maryland,Big Ten,fbs,24.0,"[3, 7, 14, 0]",0.45306,1596.0,1623.0,130,Michigan,Big Ten,fbs,31.0,"[16, 7, 6, 2]",0.54694,2187.0,2160.0,4.478163,,,Michigan,0,130,Big Ten,Big Ten,31.0,24.0,7.0,"[16, 7, 6, 2]","[3, 7, 14, 0]",2187.0,2160.0,1596.0,1623.0,1


## Ignore everything below, we won't need this for the Capstone Project

In [57]:
#Add in columns for quarterly breakdown of scores (points for)

def _team_quarter_points(row, quarter_index):
    """Return the entry of row['team_line_scores'] at `quarter_index`, or 0 when unavailable."""
    try:
        return list(row['team_line_scores'])[quarter_index]
    except (KeyError, IndexError, TypeError):
        # KeyError: the row has no 'team_line_scores' entry
        # TypeError: the value is not iterable (e.g. None / NaN)
        # IndexError: fewer periods are recorded than the one requested
        # Anything else (including KeyboardInterrupt) is deliberately not swallowed
        return 0


def q1_score(row):
    """Points scored by the main team in the 1st quarter (0 when unavailable)."""
    return _team_quarter_points(row, 0)


def q2_score(row):
    """Points scored by the main team in the 2nd quarter (0 when unavailable)."""
    return _team_quarter_points(row, 1)


def q3_score(row):
    """Points scored by the main team in the 3rd quarter (0 when unavailable)."""
    return _team_quarter_points(row, 2)


def q4_score(row):
    """Points scored by the main team in the 4th quarter (0 when unavailable)."""
    return _team_quarter_points(row, 3)

In [58]:
# NOTE(review): `df_season_final` is not defined in any cell visible above -- confirm where it comes from.
# Row-wise apply: each scorer receives one row and returns that quarter's entry of team_line_scores
df_season_final['q1_points_for'] = df_season_final.apply(q1_score, axis=1)
df_season_final['q2_points_for'] = df_season_final.apply(q2_score, axis=1)
df_season_final['q3_points_for'] = df_season_final.apply(q3_score, axis=1)
df_season_final['q4_points_for'] = df_season_final.apply(q4_score, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_season_final['q1_points_for'] = df_season_final.apply (lambda row: q1_score(row), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_season_final['q2_points_for'] = df_season_final.apply (lambda row: q2_score(row), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_season_final['q3

In [59]:
#Add in columns for quarterly breakdown of scores

def _opponent_quarter_points(row, quarter_index):
    """Return the entry of row['opposing_line_scores'] at `quarter_index`, or 0 when unavailable."""
    try:
        return list(row['opposing_line_scores'])[quarter_index]
    except (KeyError, IndexError, TypeError):
        # KeyError: the row has no 'opposing_line_scores' entry
        # TypeError: the value is not iterable (e.g. None / NaN)
        # IndexError: fewer periods are recorded than the one requested
        # Anything else (including KeyboardInterrupt) is deliberately not swallowed
        return 0


def q1_score_opp(row):
    """Points scored by the opponent in the 1st quarter (0 when unavailable)."""
    return _opponent_quarter_points(row, 0)


def q2_score_opp(row):
    """Points scored by the opponent in the 2nd quarter (0 when unavailable)."""
    return _opponent_quarter_points(row, 1)


def q3_score_opp(row):
    """Points scored by the opponent in the 3rd quarter (0 when unavailable)."""
    return _opponent_quarter_points(row, 2)


def q4_score_opp(row):
    """Points scored by the opponent in the 4th quarter (0 when unavailable)."""
    return _opponent_quarter_points(row, 3)

In [60]:
# NOTE(review): `df_season_final` is not defined in any cell visible above -- confirm where it comes from.
# Row-wise apply: each scorer receives one row and returns that quarter's entry of opposing_line_scores
df_season_final['q1_points_against'] = df_season_final.apply(q1_score_opp, axis=1)
df_season_final['q2_points_against'] = df_season_final.apply(q2_score_opp, axis=1)
df_season_final['q3_points_against'] = df_season_final.apply(q3_score_opp, axis=1)
df_season_final['q4_points_against'] = df_season_final.apply(q4_score_opp, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_season_final['q1_points_against'] = df_season_final.apply (lambda row: q1_score_opp(row), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_season_final['q2_points_against'] = df_season_final.apply (lambda row: q2_score_opp(row), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_