In [1]:
import zipfile, os, re, datetime
import pandas as pd
import numpy as np 
from functions.functions import createvariables, yrRecode, fixTOP, removeSlashes, teamhistory, \
coach_history, previous_yrs, opponent, opp, create_variables

# Read in the football data

In [2]:
# Open the scraped-data archive for reading and record its member names
_file = os.path.join('ncaa_football_scrapy', 'Data', 'Data.zip')
zf = zipfile.ZipFile(_file, mode='r')
filenames = [member.filename for member in zf.infolist()]

# Create Program History DF

In [3]:
# Load the program history table and derive an integer 'year' from the
# first four characters of each 'Year' string
history = pd.read_csv(zf.open('history.csv'))
history['year'] = history['Year'].map(lambda season: int(season[:4]))

# Coach links

In [4]:
# Load the team-info links, keep the rows whose key is 'people', and reduce
# them to the Team / coach / year columns
links_path = os.path.join("ncaa_football_scrapy", "Links", "links_teaminfo.csv")
coach_links = pd.read_csv(links_path).rename(columns={"txt": "coach", "team": "Team"})
is_people = coach_links.key == 'people'
coach_links = coach_links[is_people][['Team', 'coach', 'year']]
# Collapse double spaces inside team names
coach_links['Team'] = coach_links.Team.str.replace("  ", " ")

# Account for coaches who get fired in the season

In [5]:
# One dict per row of coach_links, with keys 'Team', 'coach' and 'year'.
# 'records' is the documented orient name; the abbreviation 'record' was
# deprecated and is rejected by pandas >= 2.0.
coach_dict = coach_links.to_dict(orient='records')

In [6]:
# Group coach names as coach_fixed[year][team] -> [coach, ...]
coach_fixed = {}
for record in coach_dict:
    by_team = coach_fixed.setdefault(record['year'], {})
    by_team.setdefault(record['Team'], []).append(record['coach'])

In [7]:
# Flatten the nested mapping into one record per (year, team); the 'coach'
# entry holds the whole list of names collected for that pair
coaches_list = [
    {'year': season, 'Team': school, 'coach': names}
    for season, by_team in coach_fixed.items()
    for school, names in by_team.items()
]

In [8]:
# Tabulate the per-team, per-year coach records
coaches_list_fixed = pd.DataFrame(data=coaches_list)

# Create Coach DF 

In [9]:
# Load the coach table from the archive
coaches = pd.read_csv(zf.open(name='coaches.csv'))

In [10]:
# Integer 'year' from the first four characters of 'Year', then expose the
# 'Name' column under the label 'coach'
coaches['year'] = coaches['Year'].map(lambda season: int(season[:4]))
coaches = coaches.rename(columns={'Name': 'coach'})

# Create Roster DF

In [11]:
# Load the roster table from the archive
roster = pd.read_csv(zf.open(name='roster.csv'))

In [12]:
# Integer 'year' from the first four characters of each 'Year' string
roster['year'] = roster['Year'].map(lambda season: int(season[:4]))

In [13]:
# Number of non-null 'GS' entries per (Team, year, Yr)
roster_by_year = roster.groupby(['Team', 'year', 'Yr'])[['GS']].count().reset_index()

In [14]:
# Mean of 'GS' per (Team, year, Yr)
roster_games_started = roster.groupby(['Team', 'year', 'Yr'])[['GS']].mean().reset_index()

In [15]:
# Mean of 'GP' per (Team, year, Yr)
roster_games_played = roster.groupby(['Team', 'year', 'Yr'])[['GP']].mean().reset_index()

In [16]:
# Left-join the 'GS' averages onto the 'GP' averages, matching rows on
# Team, year and Yr
roster_final = roster_games_played.merge(
    roster_games_started,
    on=['Team', 'year', 'Yr'],
    how='left',
)

In [17]:
# Move the 'Yr' level into the columns, leaving one row per (Team, year)
roster_wide = roster_final.set_index(['Team', 'year', 'Yr']).unstack(level='Yr')
roster_df = roster_wide.reset_index()

In [18]:
# Flatten the two-level column labels: 'Team' and 'year' keep their name,
# every other column becomes '<first level>_<second level>'
columns = [
    top if top in ('Team', 'year') else top + "_" + sub
    for top, sub in roster_df.columns.values
]
roster_df.columns = columns

# Merge together game-by-game data

In [19]:
# Keep only the archive members whose name contains 'gamebygame'
targetfiles = list(filter(lambda member: 'gamebygame' in member, filenames))

In [20]:
# Take the team names data file out of the list of files to load
idx = targetfiles.index('gamebygame_teamnames.csv')
del targetfiles[idx]

In [21]:
# Load every game-by-game file into a dict keyed by the text before the first
# '.' in its name; each frame is indexed by (Team, Date, OffenseDefense)
dfs = {}
for f in targetfiles:
    filename = f.split(".")[0]
    frame = pd.read_csv(zf.open(f))
    frame['Date'] = pd.to_datetime(frame['Date'])
    frame.set_index(['Team', 'Date', 'OffenseDefense'], inplace=True)
    dfs[filename] = frame

In [22]:
# Place the frames side by side on their index, then keep only the first
# occurrence of each repeated column name
master = pd.concat(list(dfs.values()), axis=1)
is_repeat = master.columns.duplicated()
master = master.loc[:, ~is_repeat]

In [23]:
# Standardize time of possession format, leaving missing values untouched,
# then convert the column to timedeltas
def _clean_top(row):
    """Return the row's 'TOP' value passed through fixTOP unless it is null."""
    top = row['TOP']
    return top if pd.isnull(top) else fixTOP(top)

master['TOP'] = master.apply(_clean_top, axis=1)
master['TOP'] = pd.to_timedelta(master['TOP'])

In [24]:
# Remove slashes in column data and convert to numeric

# Every column name of master except 'TOP' (converted to timedeltas earlier)
cols = list(master.columns)
TOP_index = list(cols).index('TOP')
cols.pop(TOP_index);

# NOTE(review): removeSlashes comes from functions.functions and is not shown
# here; presumably it returns one cleaned value per entry of cols[3:] -- confirm.
# NOTE(review): cols[3:] leaves the first three remaining columns untouched --
# confirm those are the columns meant to be skipped.
master[cols[3:]] = master.apply(lambda x: removeSlashes(x, cols), axis =1)
# Convert each of the cleaned columns to a numeric dtype
master[cols[3:]] = master[cols[3:]].apply(pd.to_numeric)

# Subset Offense Defense

In [25]:
# Split the combined frame into independent copies by the value of the
# 'OffenseDefense' index level
side = master.index.get_level_values('OffenseDefense')
defense = master[side == 'Defense'].copy()
offense = master[side == 'Offense'].copy()

In [26]:
# Both subsets must have the same number of rows
assert defense.shape[0] == offense.shape[0], "Offense/Defense datasets different sizes"

# Merge together offense / defense data

In [27]:
# Turn the index levels back into columns, add the calendar year of each
# date, and drop the 'OffenseDefense', 'G' and 'Result' columns
offense = offense.reset_index()
offense['year'] = offense['Date'].dt.year
offense = offense.drop(columns=['OffenseDefense', 'G', 'Result'])

In [28]:
# Prefix every defense column with 'def_', then turn the index levels back
# into columns, add the calendar year and drop the columns not carried forward
defense.columns = ['def_' + label for label in defense.columns]
defense = defense.reset_index()
defense['year'] = defense['Date'].dt.year
defense = defense.drop(columns=['OffenseDefense', 'def_G', 'def_Result'])

In [29]:
# Left-join the defense columns onto the offense rows, matching on
# Team, Date and year
gamestats = offense.merge(
    defense,
    on=['Team', 'Date', 'year'],
    how='left',
)

In [30]:
# The left join must not have added or removed rows
assert gamestats.shape[0] == offense.shape[0], "Size change during merging"

In [31]:
# Number the rows 1, 2, 3, ... within each (Team, year) group, in row order
position_in_group = gamestats.groupby(['Team', 'year']).cumcount()
gamestats['gamenumber'] = position_in_group + 1

# Game list 

In [32]:
# Load the participation file from the archive
gamelist = pd.read_csv(zf.open(name='gamebygame_participation.csv'))

In [33]:
# Create a year variable and only keep the offense rows
gamelist['Date'] = pd.to_datetime(gamelist['Date'])
gamelist['year'] = gamelist['Date'].dt.year
# .copy() makes the filtered frame independent of the original, so the column
# assignments below and in later cells write to it directly instead of to a
# slice (which triggers SettingWithCopyWarning). This matches how the
# offense/defense subsets of `master` are taken.
gamelist = gamelist[gamelist.OffenseDefense == 'Offense'].copy()

# Add in the game count: 1, 2, 3, ... within each (Team, year), in row order
gamelist['gamenumber'] = gamelist.groupby(['Team', 'year']).cumcount() + 1

# Read in team names

In [34]:
# Build a lookup from the "short" name of an opponent to its "long" name
teamnames_df = pd.read_csv(zf.open('gamebygame_teamnames.csv'))
teamnames_df['shortName'] = teamnames_df.apply(lambda x: opponent(x, 'shortName'), axis=1)
teamnames_df = teamnames_df.drop_duplicates()
# 'records' is the documented orient name; the abbreviation 'record' was
# deprecated and is rejected by pandas >= 2.0.
teamnames = {rec["shortName"]: rec["longName"] for rec in teamnames_df.to_dict(orient='records')}

# Convert opponent names

In [35]:
# Extract the opponent
gamelist['Opponent2'] = gamelist.apply(opponent, axis=1, args=('Opponent',))

In [36]:
# Create the long opponent name, falling back to 'Missing' when the short
# name has no entry in the lookup
gamelist['opponentName'] = gamelist.apply(
    lambda row: teamnames.get(row['Opponent2'], 'Missing'),
    axis=1,
)

# Add in program history to game list  

In [37]:
# Add in team historical data
yrs = [1, 2, 3, 5, 10]
variables = ['history_wins_{yr}yrs', 'history_losses_{yr}yrs', 'history_WL_{yr}yrs']

# Column names: all three templates for each entry of yrs, then for 'max'
years = []
for span in yrs + ['max']:
    years.extend(template.format(yr=span) for template in variables)

gamelist[years] = gamelist.apply(
    lambda row: teamhistory(row['Team'], row['year'], yrs, history),
    axis=1,
)

# Add in coaching history 

In [38]:
# Print every team in the game list that does not appear in the coach list
coach_teams = coach_links.Team.unique()
unmatched = [name for name in gamelist.Team.unique() if name not in coach_teams]
for team in unmatched:
    print(team)

In [39]:
# Left-join the per-team, per-year coach lists onto the game list
gamelist = pd.merge(
    gamelist,
    coaches_list_fixed,
    on=['Team', 'year'],
    how='left',
)

In [40]:
# Compute the coach history columns row by row and store them on a copy,
# leaving `gamelist` itself unchanged
coach_variables = ['Coach_wins', 'Coach_losses', 'Coach_WL', 'Coach_years']
gamelistOut = gamelist.copy()
coach_stats = gamelist.apply(
    lambda row: coach_history(row['coach'], row['year'], coaches),
    axis=1,
)
gamelistOut[coach_variables] = coach_stats

# Add in roster data

In [41]:
# Attach the roster columns to the output frame.
# DataFrame.merge returns a new frame and leaves its inputs untouched, so the
# result has to be assigned -- a bare call adds nothing. The target is
# gamelistOut because that is the frame carried forward and written to disk.
rows_before_roster = len(gamelistOut)
gamelistOut = gamelistOut.merge(roster_df,
                                on = ['Team', 'year'],
                                how = 'left'
                               )
# A left join on (Team, year) must not change the number of games
assert len(gamelistOut) == rows_before_roster, "Size change during merging"

# Split out HomeAway and Win/Loss in game file 

In [42]:
# Fill 'HomeAway' and 'WinLoss' from create_variables applied to each row,
# then drop the columns that are not carried forward
outcome = gamelistOut.apply(create_variables, axis=1)
gamelistOut[['HomeAway', 'WinLoss']] = outcome
drop_columns = ['G', 'OffenseDefense', 'Opponent2']
gamelistOut = gamelistOut.drop(columns=drop_columns)

# Write the final files out 

In [44]:
# Write both final tables as CSV members of a single zip archive
zip_file_out = os.path.join('Data', 'finalgamedata.zip')
outputs = (("gameslist.csv", gamelistOut), ("gamestats.csv", gamestats))

with zipfile.ZipFile(zip_file_out, 'w') as csv_zip:
    for member_name, table in outputs:
        csv_zip.writestr(member_name, pd.DataFrame(table).to_csv(index=False))

# Merge in the data for the opponents

In [None]:
# Columns handed to previous_yrs and overwritten with its result
cols2 = ["Coach_WL", "Coach_years", "FG",
         "RushNetYards","PassYards","Interceptions","PassAttempts","Punts","Points","FumblesLost",
         "Penalties","Plays","def_RushNetYards","def_PassYards","def_Interceptions","def_Punts", 
         "def_FF","def_PenaltyYards","def_PDef","Sacks","Tackles","def_Plays",
        ]

# NOTE(review): `gamebygame_part` and `final` are not defined in any cell shown
# in this notebook, and the 'count' column does not match the 'gamenumber'
# column created earlier -- confirm where these come from before running.

# Filter out game 1 of season 2013
base_year_mask = ~((gamebygame_part.year == 2013) & (gamebygame_part['count'] == 1))

# previous_yrs is applied only to the rows kept by the mask.
# NOTE(review): rows excluded by the mask presumably end up as NaN in cols2
# after the assignment -- confirm that is intended.
gamebygame_part[cols2] = gamebygame_part[base_year_mask].apply(lambda x: previous_yrs(team = x['Team'], 
                                                     year = x['year'], 
                                                     game = x['count'], 
                                                     cols = cols2, 
                                                     final = final,                                                                                     
                                                     debug = False
                                                     ), axis = 1
                                                   )

In [None]:
# Unique values of the 'Team' column of the final game list, as a plain list
unique_teams = gamelistOut['Team'].unique()
gamedata = list(unique_teams)

# NOTE(review): this definition shadows the `opp` imported from
# functions.functions at the top of the notebook -- confirm that is intended.
def opp(team, date, year, cols, debug = False, final = None):
    """Calculate the game-by-game stats for the opponents.

    Looks up the row of ``final`` matching ``team`` and ``date``, reads its
    'count' value and forwards it as ``game`` to ``previous_yrs``.

    Parameters
    ----------
    team : value compared against ``final['Team']``.
    date : value compared against ``final['Date']``.
    year : passed through to ``previous_yrs``.
    cols : column names passed through to ``previous_yrs`` (previously ignored
        in favour of the notebook-level ``cols2``).
    debug : passed through to ``previous_yrs`` (previously always ``False``).
    final : DataFrame with 'Team', 'Date' and 'count' columns. When omitted,
        the notebook-level ``final`` variable is used, as before.

    Returns
    -------
    Whatever ``previous_yrs`` returns.

    Raises
    ------
    IndexError
        If no row of ``final`` matches ``team`` and ``date``.
    """
    if final is None:
        # The parameter shadows the notebook-level name, so fetch it explicitly
        final = globals()['final']

    matches = final[(final['Team'] == team) & (final['Date'] == date)]
    game = matches['count'].values[0]

    # `final` is forwarded the same way the direct previous_yrs call elsewhere
    # in this notebook passes it.
    return previous_yrs(team = team,
                        year = year,
                        game = game,
                        cols = cols,
                        final = final,
                        debug = debug
                       )

In [None]:
# Create the opponent variable names
opps = ['opp_' + x for x in cols2]

# NOTE(review): `gamedata` is assigned a list of team names in the cell above,
# but the boolean-mask indexing and the .apply call below need a DataFrame --
# confirm which frame is intended. `gamebygame_part` and `final` are also not
# defined in any cell shown in this notebook.

# Subset the dataset to games to keep it short
games = gamedata[~((gamebygame_part.year == 2013) & (gamebygame_part['count'] == 1))].copy()

# Calculate the opponent data
# NOTE(review): confirm the `opp` in scope accepts a `final` keyword.
# Rows whose 'opponentName' is not in `gamedata` get a Series of None values,
# one per entry of cols2.
games[opps] = gamedata.apply(lambda x: opp(team = x['Team'],
                                        date = x['Date'], 
                                        year = x['year'], 
                                        cols = cols2,
                                        final = final,
                                        debug = False
                                       )
                          if x['opponentName'] in gamedata
                          else pd.Series([None for x in range(0, len(cols2))]),
                          axis = 1
                         )

In [None]:
# Keep only the games that have a non-null value in both 'RushNetYards'
# and 'opp_def_RushNetYards'
has_opp_rush = games['opp_def_RushNetYards'].notnull()
has_team_rush = games['RushNetYards'].notnull()
finalgames = games[has_opp_rush & has_team_rush]

In [None]:
# Fill 'HomeAway' and 'WinLoss' from create_variables applied to each row
finalgames[['HomeAway', 'WinLoss']] = finalgames.apply(create_variables, axis=1)

In [None]:
# Columns removed from the frame before building the feature matrix
drop_variables = [
    'Date',
    'Opponent',
    'Result',
    'G',
    'Team',
    'OffenseDefense',
    'year',
    'Opponent2',
    'opponentName',
    'WinLoss',
]

In [None]:
# Target values and feature matrix as NumPy arrays
y = finalgames['WinLoss'].values
features = finalgames.drop(columns=drop_variables)
X = features.values

In [None]:
# Column labels of `final`, as a plain list
stats = [label for label in final.columns]