In [44]:
import zipfile, os, re, datetime
import pandas as pd
import numpy as np 
from functions.functions import yrRecode, fixTOP, removeSlashes, teamhistory, \
coach_history, previous_yrs, extract_name, opponent_stats, create_variables

# Read in the football data

In [45]:
# Path to the scraped data archive, relative to the notebook's working directory.
_file = os.path.join('scrapy','Data', 'Data.zip')
# The archive handle is kept open on purpose: later cells read CSV members via zf.open(...).
zf = zipfile.ZipFile(_file)
# Names of every member in the archive; used below to select the game-by-game files.
filenames = zf.namelist()

# Create Program History DF

In [None]:
# Load the program history table from the archive.
history = pd.read_csv(zf.open('history.csv'))
# 'Year' is text whose first four characters are parsed as the year; slice the
# whole column at once instead of calling a Python lambda once per row.
history['year'] = history['Year'].str[:4].astype('int64')

# Coach links

In [48]:
# Load the scraped team-info links and normalise the column names used for joins.
coach_links = pd.read_csv(os.path.join("scrapy", "Links", "links_teaminfo.csv"))
coach_links = coach_links.rename(columns={"txt": "coach", "team": "Team"})
# Keep only the 'people' rows. The explicit .copy() makes the subset an independent
# frame so the column assignments below do not write to a slice of the original
# (which triggers SettingWithCopyWarning and is ambiguous in pandas).
coach_links = coach_links.loc[coach_links["key"] == "people", ["Team", "coach", "year"]].copy()
# Collapse double spaces in team names; regex=False makes this a literal replacement
# regardless of the pandas version's default.
coach_links["Team"] = coach_links["Team"].str.replace("  ", " ", regex=False)
# Shift the year back by one so it lines up with the year used in the other tables.
coach_links["year"] = coach_links["year"] - 1

# Account for coaches who are replaced during the season (a team can have more than one coach in a year)

In [49]:
# One dict per row. 'records' is the documented orient name; the abbreviated
# 'record' spelling is deprecated and rejected by current pandas releases.
coach_dict = coach_links.to_dict(orient='records')

In [50]:
# Group coach names as {year: {team: [coach, ...]}} so that a team with several
# coaches in one year keeps all of them in a single list.
coach_fixed = {}
for x in coach_dict:
    teams_for_year = coach_fixed.setdefault(x['year'], {})
    teams_for_year.setdefault(x['Team'], []).append(x['coach'])

In [51]:
# Flatten the nested {year: {team: [coaches]}} mapping into one record per
# team-year. A comprehension keeps its loop variables local, so running this cell
# never rebinds the module-level `coaches` name that a later cell uses for the
# coach DataFrame (the previous for-loop overwrote it when the cell was re-run).
coaches_list = [
    {'year': season, 'Team': team_name, 'coach': team_coaches}
    for season, team_to_coaches in coach_fixed.items()
    for team_name, team_coaches in team_to_coaches.items()
]

In [52]:
# One row per (year, Team); the 'coach' column holds the list of coach names for that team-year.
coaches_list_fixed = pd.DataFrame(coaches_list)

# Create Coach DF 

In [53]:
# Load the coach table from the archive.
coaches = pd.read_csv(zf.open('coaches.csv'))

In [54]:
# First four characters of 'Year' parsed as an integer, minus one. Vectorised
# string slicing replaces the per-row lambda.
coaches['year'] = coaches['Year'].str[:4].astype('int64') - 1
# Rename to match the 'coach' column used by the coach tables built above;
# reassigning avoids in-place mutation.
coaches = coaches.rename(columns={'Name': 'coach'})

# Create Roster DF

In [55]:
# Load the roster table from the archive.
roster = pd.read_csv(zf.open('roster.csv'))

In [56]:
# First four characters of 'Year' parsed as an integer, minus one. Vectorised
# string slicing replaces the per-row lambda.
roster['year'] = roster['Year'].str[:4].astype('int64') - 1

In [57]:
# Number of non-null GS entries per team, year and class (Yr).
roster_by_year = (
    roster
    .groupby(['Team', 'year', 'Yr'])[['GS']]
    .count()
    .reset_index()
)

In [58]:
# Mean of GS per team, year and class (Yr).
roster_games_started = (
    roster
    .groupby(['Team', 'year', 'Yr'])[['GS']]
    .mean()
    .reset_index()
)

In [59]:
# Mean of GP per team, year and class (Yr).
roster_games_played = (
    roster
    .groupby(['Team', 'year', 'Yr'])[['GP']]
    .mean()
    .reset_index()
)

In [60]:
# Put the GP and GS averages side by side, one row per (Team, year, Yr).
roster_final = roster_games_played.merge(
    roster_games_started,
    on=['Team', 'year', 'Yr'],
    how='left',
)

In [61]:
# Pivot the Yr level into columns: one row per (Team, year), with two-level column
# labels (statistic, Yr value) that the next cell flattens.
roster_df = roster_final.set_index(['Team', 'year', 'Yr']).unstack().reset_index()

In [62]:
# Flatten the two-level column labels: key columns keep their name, every other
# column becomes "<statistic>_<Yr value>".
columns = [
    stat if stat in ['Team', 'year'] else stat + "_" + yr_label
    for stat, yr_label in roster_df.columns.values
]
roster_df.columns = columns

# Merge together game-by-game data

In [63]:
# Keep only the archive members whose name contains 'gamebygame'
targetfiles = [member for member in filenames if 'gamebygame' in member]

In [64]:
# Remove the team names data file
# Filtering instead of index() + pop() keeps this cell safe to re-run: the old
# form raised ValueError the second time because the entry was already gone.
targetfiles = [member for member in targetfiles if member != 'gamebygame_teamnames.csv']

In [65]:
# Build {file stem: DataFrame} for every game-by-game file, each indexed by
# (Team, Date, OffenseDefense) so the frames can be aligned column-wise.
dfs = {}
for f in targetfiles:
    filename = f.split(".")[0]
    frame = pd.read_csv(zf.open(f))
    frame['Date'] = pd.to_datetime(frame['Date'])
    dfs[filename] = frame.set_index(['Team', 'Date', 'OffenseDefense'])

In [66]:
# Align all game-by-game frames on their shared index, side by side
master = pd.concat(list(dfs.values()), axis=1)
# Columns that occur in several files are kept once (first occurrence wins)
is_repeat = master.columns.duplicated()
master = master.loc[:, ~is_repeat]

In [67]:
# Standardize time of possession format: run fixTOP on every non-missing value,
# leave missing values untouched, then convert the column to timedeltas
master['TOP'] = pd.to_timedelta(
    master['TOP'].map(lambda top: fixTOP(top) if pd.notnull(top) else top)
)

In [68]:
# Remove slashes in column data and convert to numeric 

# Column list without 'TOP', which is handled separately as a timedelta column.
cols = list(master.columns)
TOP_index = list(cols).index('TOP')
cols.pop(TOP_index);

# removeSlashes is a project helper called once per row with the row and the full
# `cols` list; its result is written back to every column from the fourth onward.
# NOTE(review): the first three entries of `cols` are skipped, presumably because
# they are non-numeric fields; confirm against the CSV schema and removeSlashes.
master[cols[3:]] = master.apply(lambda x: removeSlashes(x, cols), axis =1)
# Coerce the cleaned columns to numeric dtypes (raises if a value cannot be parsed).
master[cols[3:]] = master[cols[3:]].apply(pd.to_numeric)

# Subset Offense Defense

In [69]:
# Split the combined table by the OffenseDefense index level; copies are taken so
# the two halves can be modified independently of `master`.
side = master.index.get_level_values('OffenseDefense')
defense = master[side == 'Defense'].copy()
offense = master[side == 'Offense'].copy()

In [70]:
# Sanity check: the offense and defense subsets must have the same number of rows.
assert len(defense) == len(offense),  "Offense/Defense datasets different sizes"

# Merge together offense / defense data

In [71]:
# Turn the index levels back into columns, derive the calendar year of each game,
# and drop the columns that are not carried into the merged game stats.
offense = offense.reset_index()
offense['year'] = offense['Date'].dt.year
offense = offense.drop(columns=['OffenseDefense', 'G', 'Result'])

In [72]:
# Prefix every defense statistic with 'def_' so it cannot collide with the
# offense columns, then mirror the offense preparation.
defense = defense.add_prefix('def_').reset_index()
defense['year'] = defense['Date'].dt.year
defense = defense.drop(columns=['OffenseDefense', 'def_G', 'def_Result'])

In [73]:
# One row per team-game: offense statistics with the matching 'def_' columns attached.
gamestats = offense.merge(
    defense,
    on=['Team', 'Date', 'year'],
    how='left',
)

In [74]:
# Sanity check: the left merge must not have duplicated or dropped any offense row.
assert len(gamestats) == len(offense), "Size change during merging"

In [75]:
# Sequential game number (starting at 1) within each (Team, year) group.
# NOTE(review): numbering follows the current row order; this assumes rows are
# already in chronological order within a team. Confirm, or sort by Date first.
gamestats['gamenumber'] = gamestats.groupby(['Team', 'year']).cumcount()+1

# Game list 

In [76]:
# Read the participation file in; it is the base table for the game list.
gamelist = pd.read_csv(zf.open('gamebygame_participation.csv'))

In [77]:
# Create a year variable and only keep the offense
gamelist['Date'] = pd.to_datetime(gamelist['Date'])
gamelist['year'] = gamelist['Date'].dt.year
# .copy() makes the filtered table an independent frame, so the column
# assignments below (and in later cells) do not write to a slice of the full
# table, which triggers SettingWithCopyWarning and is ambiguous in pandas.
gamelist = gamelist[gamelist.OffenseDefense == 'Offense'].copy()

# Add in the game count
# NOTE(review): numbering follows the current row order within each (Team, year);
# this assumes the file lists games chronologically. Confirm.
gamelist['gamenumber'] = gamelist.groupby(['Team', 'year']).cumcount() + 1

# Read in team names

In [78]:
# Build a {short name: long name} lookup for opponents.
# The table is kept under its own name so `teamnames` is only ever the final dict.
teamnames_table = pd.read_csv(zf.open('gamebygame_teamnames.csv'))
# extract_name is a project helper applied per row to the 'shortName' field.
teamnames_table['shortName'] = teamnames_table.apply(lambda x: extract_name(x, 'shortName'), axis=1)
teamnames_table = teamnames_table.drop_duplicates()
# Pair the two columns directly. The previous to_dict(orient='record') call used a
# deprecated abbreviation that current pandas releases reject. If a short name
# occurs more than once, the last row wins, as before.
teamnames = dict(zip(teamnames_table['shortName'], teamnames_table['longName']))

# Convert opponent names

In [79]:
# Extract the opponent 
# extract_name is a project helper, applied here per row to the 'Opponent' field;
# the result is the key looked up in the team-name dictionary in the next cell.
gamelist['Opponent2'] = gamelist.apply(lambda x: extract_name(x, 'Opponent'), axis = 1)

In [80]:
# Translate the short opponent name to its long form; names without an entry in
# the lookup are labelled 'Missing'
gamelist['opponentName'] = gamelist['Opponent2'].map(
    lambda short_name: teamnames.get(short_name, 'Missing')
)

# Add in program history to game list  

In [81]:
# Column names for the team history features: three statistics for each
# look-back window, followed by the same three for the 'max' window
yrs = [1, 2, 3, 5, 10]
variables = ['history_wins_{yr}yrs', 'history_losses_{yr}yrs', 'history_WL_{yr}yrs']
years = [
    template.format(yr=window)
    for window in yrs + ['max']
    for template in variables
]

# teamhistory is a project helper called once per game row with the team, the
# game's year, the look-back windows and the history table; its per-row result
# is spread across the `years` columns.
# NOTE(review): assumes teamhistory returns one value per name in `years`, in the same order. Confirm.
gamelist[years] = gamelist.apply(lambda x: teamhistory(x['Team'], x['year'], yrs, history), axis = 1)

# Add in coaching history 

In [82]:
# Print every team that appears in the game list but has no entry in the coach
# links (no output means full coverage)
coach_teams = coach_links.Team.unique()
for team in gamelist.Team.unique():
    if team in coach_teams:
        continue
    print(team)

In [83]:
# Attach the list of coaches for each team-year to every game row
gamelist = pd.merge(
    gamelist,
    coaches_list_fixed,
    on=['Team', 'year'],
    how='left',
)

In [84]:
# Names of the coach-history columns added to the output table.
coach_variables = ['Coach_wins', 'Coach_losses', 'Coach_WL', 'Coach_years']
# Work on a copy so `gamelist` itself is left without the coach-history columns.
gamelistOut = gamelist.copy()
# coach_history is a project helper called per game row with that row's 'coach'
# value, the game's year and the coaches table.
# NOTE(review): assumes it returns one value per name in coach_variables, in the same order. Confirm.
gamelistOut[coach_variables] = gamelist.apply(lambda x: coach_history(x['coach'], x['year'], coaches), axis = 1)

# Add in roster data

In [85]:
# Attach the per-class roster averages to the table that is actually written out.
# The merge result used to be discarded, so no roster column ever reached the output.
roster_value_cols = [c for c in roster_df.columns if c not in ('Team', 'year')]
rows_before = len(gamelistOut)
gamelistOut = (
    gamelistOut
    # Dropping roster columns left by an earlier run keeps a re-run from
    # producing suffixed _x/_y duplicates.
    .drop(columns=roster_value_cols, errors='ignore')
    .merge(roster_df, on=['Team', 'year'], how='left')
)
# roster_df has one row per (Team, year), so a left merge must not change the row count.
assert len(gamelistOut) == rows_before, "Size change during merging"

# Split out HomeAway and Win/Loss in game file 

In [86]:
# Columns that are not written to the output file.
drop_columns = ['G', 'OffenseDefense', 'Opponent2']
# create_variables is a project helper applied per row; its result fills the
# HomeAway and WinLoss columns.
gamelistOut[['HomeAway', 'WinLoss']] = gamelistOut.apply(create_variables, axis=1)
gamelistOut = gamelistOut.drop(columns=drop_columns)

# Write the final files out 

In [87]:
zip_file_out = os.path.join('Data', 'finalgamedata.zip')

# Create the output directory on a fresh checkout instead of failing on open.
os.makedirs(os.path.dirname(zip_file_out), exist_ok=True)

# ZIP_DEFLATED compresses the CSV text; the ZipFile default (ZIP_STORED) only
# stores it uncompressed. Both tables are already DataFrames, so they are
# serialised directly.
with zipfile.ZipFile(zip_file_out, 'w', compression=zipfile.ZIP_DEFLATED) as csv_zip:
    csv_zip.writestr("gameslist.csv", gamelistOut.to_csv(index=False))
    csv_zip.writestr("gamestats.csv", gamestats.to_csv(index=False))