In [4]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor, XGBClassifier
from datetime import datetime
import ast
import json
import urllib
from pathlib import Path
import numpy as np
import pandas as pd
import requests
from requests.exceptions import HTTPError
from helper import flatten
import logging
from datetime import datetime
from tenacity import retry, stop_after_attempt, wait_exponential, after_log

In [54]:
# Setup Logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s:%(name)s:%(message)s')
file_handler = logging.FileHandler('extract.log')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)


@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10), after=after_log(logger, logging.DEBUG))
def games_stat_extract_alt(seasons=[folder.name for folder in Path('data/seasons/').iterdir() if folder.is_dir()], filepath=Path('data/')):

    for season in seasons:
        print(season)
        print('---------------------------')
        games = pd.read_csv(filepath/f'seasons/{season}/games.csv').id

        games_df = []
        for game_id in games:

            url = f'https://api.collegefootballdata.com/games/teams?year={season}&gameId={game_id}'
            data = requests.get(url).json()

            if len(data) > 0:

                subdata_dfs = []
                for subdata in data:
                    teams_dfs = []
                    for team in subdata['teams']:

                        df = pd.DataFrame(
                            team['stats']).set_index('category').T
                        df['game_id'] = game_id
                        df['school'] = team['school']
                        df['home_away'] = team['homeAway']
                        df['points_scored'] = team['points']

                        teams_dfs.append(df)

                    df = pd.concat(teams_dfs, axis=0)
                    df = df.fillna(0)

                    subdata_dfs.append(df)

                df = pd.concat(subdata_dfs, axis=0, ignore_index=True)
                games_df.append(df)

            else:
                pass

        df = pd.concat(games_df, axis=0, ignore_index=True)

        # # Transform Columns    
        split_columns = ['thirdDownEff', 'fourthDownEff', 'completionAttempts']
        for column in split_columns:

            success_column = f'{column}Success'
            attempt_column = f'{column}Attempts'

            df[[success_column, attempt_column]] = df[column].str.split('-', expand=True).iloc[:, :2].astype(float)
            df[column] = df[success_column] / df[attempt_column]

        df[['totalPenalties', 'totalPenaltiesYards']] = df['totalPenaltiesYards'].str.split('-', n=1, expand=True).astype(float)
        
        df['possessionTime'] = df['possessionTime'].fillna('00:00')
        df['possessionTime'] = df['possessionTime'].replace(0, '00:00')
        df['possessionTime'] = pd.to_timedelta(df['possessionTime'] + ':00').dt.total_seconds()
        
        df = df.fillna(0)
        print('saving.....')
        df.to_csv(filepath/f'seasons/{season}/games_stats_alt.csv', index=False)

In [55]:
games_stat_extract_alt(seasons=[2013,2014,2015,2016,2017,2019,2020], filepath=Path('data/'))

2013
---------------------------
2013
---------------------------
saving.....
2014
---------------------------
2013
---------------------------
saving.....
2014
---------------------------
saving.....
2015
---------------------------
saving.....
2016
---------------------------
saving.....
2017
---------------------------
saving.....
2019
---------------------------
saving.....
2020
---------------------------
saving.....


In [21]:
copydf = df.copy()
df.head()

Unnamed: 0,rushingTDs,puntReturnYards,puntReturnTDs,puntReturns,passingTDs,kickReturnYards,kickReturnTDs,kickReturns,kickingPoints,interceptionYards,...,yardsPerRushAttempt,totalPenaltiesYards,turnovers,fumblesLost,interceptions,possessionTime,game_id,school,home_away,points_scored
0,4,2,0,2,4,53,0,4,9,60.0,...,7.2,4-30,1,1,0,26:40,401013357,UMass,home,63
1,1,-3,0,1,1,160,0,8,3,0.0,...,3.6,3-22,4,1,3,33:20,401013357,Duquesne,away,15
2,3,-1,0,1,0,170,0,6,11,,...,6.2,7-54,0,0,0,39:39,401014972,Rice,home,31
3,1,0,0,0,3,31,0,3,4,,...,5.0,6-60,1,1,0,20:21,401014972,Prairie View,away,28
4,2,5,0,2,3,43,0,2,13,0.0,...,5.5,10-65,1,1,0,34:43,401022510,Hawai'i,away,43


In [22]:
df = copydf.copy()

In [23]:
split_columns = ['thirdDownEff', 'fourthDownEff', 'completionAttempts']
for column in split_columns:

    success_column = f'{column}Success'
    attempt_column = f'{column}Attempts'

    df[[success_column, attempt_column]] = df[column].str.split('-', expand=True).iloc[:, :2].astype(float)
    df[column] = df[success_column] / df[attempt_column]

df[['totalPenalties', 'totalPenaltiesYards']] = df['totalPenaltiesYards'].str.split('-', n=1, expand=True).astype(float)
df['possessionTime'] = pd.to_timedelta(df['possessionTime'] + ':00').dt.total_seconds()

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [43]:
pd.to_timedelta(df['possessionTime'] + ':00').dt.total_seconds()

0        96000.0
1       120000.0
2       142740.0
3        73260.0
4       124980.0
          ...   
1763     89820.0
1764     90600.0
1765    125400.0
1766    102180.0
1767    113820.0
Name: possessionTime, Length: 1768, dtype: float64

In [32]:
df['possessionTime'] = df['possessionTime'].fillna('00:00')
df['possessionTime'] = df['possessionTime'].replace(0, '00:00')
