In [1]:
import zipfile, os, re
import pandas as pd
import numpy as np 

# Read in the football data

In [2]:
# Path to the zipped data, relative to the notebook's working directory.
_file = os.path.join('ncaa_football_scrapy','Data', 'Data.zip')
# The archive handle stays open: later cells call zf.open(...) on it.
zf = zipfile.ZipFile(_file)
# Names of all archive members; filtered further down for 'gamebygame' files.
filenames = zf.namelist()

# Create Program History DF

In [3]:
# Load history.csv from the archive; teamhistory() later reads its Team, year,
# Wins and Losses columns.
history = pd.read_csv(zf.open('history.csv'))

In [4]:
# Integer `year` taken from the first four characters of the `Year` string.
# Vectorised string slicing replaces the row-by-row apply(axis=1).
history['year'] = history['Year'].str[:4].astype(int)

# Coach links

In [5]:
# Read the scraped team-info links, rename txt -> coach and team -> Team, and
# keep only the rows whose key is 'people' (columns Team, coach, year).
coach_links = (
    pd.read_csv(os.path.join("ncaa_football_scrapy", "Links", "links_teaminfo.csv"))
    .rename(columns={"txt": "coach", "team": "Team"})
    .loc[lambda links: links.key == 'people', ['Team', 'coach', 'year']]
)

# Create Coach DF 

In [6]:
# Load coaches.csv from the archive; coach_history() later reads its coach
# (renamed from Name), year, Wins, Losses and WL columns.
coaches = pd.read_csv(zf.open('coaches.csv'))

In [7]:
# Integer `year` taken from the first four characters of the `Year` string
# (vectorised instead of a row-by-row apply).
coaches['year'] = coaches['Year'].str[:4].astype(int)
# Rebind rather than rename in place, so the statement reads as a plain
# transformation; the resulting column set is the same.
coaches = coaches.rename(columns = {'Name':'coach'})

# Create Roster DF

In [8]:
# Load roster.csv from the archive; later cells use its Team, Year, Yr, GP and
# GS columns.
roster = pd.read_csv(zf.open('roster.csv'))

In [9]:
# Integer `year` taken from the first four characters of the `Year` string.
# Vectorised string slicing replaces the row-by-row apply(axis=1).
roster['year'] = roster['Year'].str[:4].astype(int)

In [10]:
def yrRecode(data):
    """Map a class-year code to an ordinal: Fr=0, So=1, Jr=2, Sr=3.

    Any other value raises ``KeyError``.
    """
    return {'Fr': 0, 'So': 1, 'Jr': 2, 'Sr': 3}[data]

In [11]:
# Number of non-null GS entries per (Team, year, Yr).
roster_by_year = roster.groupby(['Team', 'year', 'Yr'])[['GS']].count().reset_index()

In [12]:
# Mean GS per (Team, year, Yr).
roster_games_started = roster.groupby(['Team', 'year', 'Yr'])[['GS']].mean().reset_index()

In [13]:
# Mean GP per (Team, year, Yr).
roster_games_played = roster.groupby(['Team', 'year', 'Yr'])[['GP']].mean().reset_index()

In [14]:
# Attach the mean-GS column to the mean-GP table; both share the same key
# columns, so a single `on` list is enough.
roster_final = roster_games_played.merge(
    roster_games_started,
    on = ['Team', 'year', 'Yr'],
    how = 'left',
)

In [15]:
# Move the Yr level into the columns: one row per (Team, year) with two-level
# column labels (GP or GS, Yr value). The following cell flattens those labels.
roster_final2 = roster_final.set_index(['Team', 'year', 'Yr']).unstack().reset_index()

In [16]:
# Flatten the two-level labels left by unstack(): the key columns keep their
# name, value columns become "<stat>_<Yr>" (e.g. "GP_Fr").
# Labels that are already plain strings pass through unchanged, so re-running
# this cell does not mangle the names (indexing a string label with [0] would
# take its first character).
columns = [
    label if isinstance(label, str)
    else label[0] if label[0] in ('Team', 'year')
    else label[0] + "_" + label[1]
    for label in roster_final2.columns.values
]
roster_final2.columns = columns

In [17]:
# Preview the first three rows of the flattened roster table.
roster_final2.head(3)

Unnamed: 0,Team,year,GP_Fr,GP_Jr,GP_So,GP_Sr,GS_Fr,GS_Jr,GS_So,GS_Sr
0,Air Force Falcons,2013,5.133333,7.783784,6.956522,9.0,0.666667,2.459459,3.521739,4.411765
1,Air Force Falcons,2014,1.0,8.0,6.363636,9.9,0.0,3.84,1.575758,4.466667
2,Air Force Falcons,2015,1.8,9.555556,6.714286,9.142857,0.0,4.611111,0.885714,5.047619


In [18]:
def _first_int(pattern, text):
    """Return capture group 1 of the first ``pattern`` match in ``text`` as an int, or None."""
    found = re.search(pattern, text)
    return int(found.group(1)) if found else None


def createvariables(data):
    """Create Opponent, Home, Win/Loss, Overtime, and Scores for one game row.

    Parameters
    ----------
    data : mapping (e.g. a DataFrame row)
        Must provide ``data['Opponent']`` and ``data['Result']`` as strings,
        for example ``'@ Boise St.'`` and ``'L 20 - 42'``.

    Returns
    -------
    pd.Series
        Six positional values:
        opponent        - opponent name with any '@' marker removed
        home            - 1 when the Opponent text has no '@', otherwise 0
        WinLoss         - first 'W' or 'L' found in Result, or None if absent
        Overtime        - int N from a '(NOT)' suffix in Result, otherwise 0
        team_score      - int before the ' -' in Result, or None if absent
        opponent_score  - int after the '- ' in Result, or None if absent
    """
    opponent_text = data['Opponent']
    result_text = data['Result']

    # Opponent: "<name> @ <place>" keeps the text before the '@';
    # "@ <name>" keeps the text after it. Splitting on the '@' itself (rather
    # than matching the name with [\w\s]+) keeps names containing other
    # punctuation, such as "Miami (FL)", intact.
    if '@' in opponent_text:
        before_at, _, after_at = opponent_text.partition('@')
        opponent = before_at.strip() or after_at.strip()
        home = 0
    else:
        opponent = opponent_text
        home = 1

    # Win/Loss: None instead of an AttributeError when Result has no W or L.
    outcome = re.search(r"[WL]", result_text)
    WinLoss = outcome.group() if outcome else None

    # Overtime: always an int, so the column does not mix 0 with digit strings.
    overtime_count = _first_int(r"\((\d+)OT\)", result_text)
    Overtime = overtime_count if overtime_count is not None else 0

    # Scores as ints (None when the pattern is missing from Result).
    team_score = _first_int(r"(\d+)\s-", result_text)
    opponent_score = _first_int(r"-\s(\d+)", result_text)

    return pd.Series([opponent, home, WinLoss, Overtime, team_score, opponent_score])

# Start merging

In [19]:
# Keep only the archive members whose name contains 'gamebygame'
targetfiles = [member for member in filenames if 'gamebygame' in member]

In [20]:
# Read every game-by-game CSV into a dict keyed by the member name up to its
# first '.'; each frame gets a parsed Date and a (Team, Date, OffenseDefense)
# multi-index.
dfs = {}
for f in targetfiles:
    filename = f.split(".")[0]
    frame = pd.read_csv(zf.open(f))
    frame['Date'] = pd.to_datetime(frame['Date'])
    dfs[filename] = frame.set_index(['Team', 'Date', 'OffenseDefense'])

In [21]:
# Place all game-by-game frames side by side on their shared index, then keep
# only the first occurrence of every repeated column name.
master = pd.concat(list(dfs.values()), axis = 1)
is_repeat = master.columns.duplicated()
master = master.loc[:, ~is_repeat]

In [22]:
# Derive the six per-game variables row by row; createvariables returns them
# positionally in this column order.
master[['opponent', 'home', 'WinLoss','Overtime', 'team_score', 'opponent_score']] = master.apply(createvariables, axis = 1)

In [23]:
# Display only the rows whose OffenseDefense index level equals 'Offense'.
master[master.index.get_level_values('OffenseDefense') == 'Offense']

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Opponent,Result,G,RushNetYards,ReceivingYards,IntRYds,PuntRetYds,FRetYds,KORetYds,APY,...,PenaltyYards,PenaltyYdsPerGame,TotalOffYardsG,FumblesLost,opponent,home,WinLoss,Overtime,team_score,opponent_score
Team,Date,OffenseDefense,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Air Force Falcons,2013-08-31,Offense,Colgate,W 38 - 13,1/,409,72,,14,,,495,...,75,,,,Colgate,1,W,0,38,13
Air Force Falcons,2013-09-07,Offense,Utah St.,L 20 - 52,1/,162,108,,,,15,285,...,43,,,1,Utah St.,1,L,0,20,52
Air Force Falcons,2013-09-13,Offense,@ Boise St.,L 20 - 42,1/,188,99,17,,,33,337,...,23,23.0,287.0,,Boise St.,0,L,0,20,42
Air Force Falcons,2013-09-21,Offense,Wyoming,L 23 - 56,1/,346,127,,-8,,72,537,...,61,61.0,473.0,1,Wyoming,1,L,0,23,56
Air Force Falcons,2013-09-28,Offense,@ Nevada,L 42 - 45,1/,375,78,21,,,70,544,...,20,20.0,453.0,,Nevada,0,L,0,42,45
Air Force Falcons,2013-10-05,Offense,@ Navy,L 10 - 28,1/,231,82,,1,,61,375,...,5,5.0,313.0,1,Navy,0,L,0,10,28
Air Force Falcons,2013-10-10,Offense,San Diego St.,L 20 - 27,1/,169,150,,,,,319,...,45,45.0,319.0,1,San Diego St.,1,L,0,20,27
Air Force Falcons,2013-10-26,Offense,Notre Dame,L 10 - 45,1/,290,49,,16,,29,384,...,40,40.0,339.0,2,Notre Dame,1,L,0,10,45
Air Force Falcons,2013-11-02,Offense,Army West Point,W 42 - 28,1/,343,111,,,,27,505,...,31,31.0,454.0,1,Army West Point,1,W,0,42,28
Air Force Falcons,2013-11-08,Offense,@ New Mexico,L 37 - 45,1/,257,188,,,,162,607,...,30,30.0,445.0,,New Mexico,0,L,0,37,45


In [24]:
# Turn the Team / Date / OffenseDefense index levels back into ordinary columns.
master = master.reset_index()

In [25]:
# Calendar year of each game date; used below as the join key against the
# roster, history and coach tables.
# NOTE(review): those tables take `year` from the first four characters of
# their `Year` column. If that column is a season label, games played in the
# following January would be keyed to the next season here - confirm.
master['year'] = master['Date'].dt.year

In [26]:
# Preview the roster table again before merging it onto the game rows.
roster_final2.head()

Unnamed: 0,Team,year,GP_Fr,GP_Jr,GP_So,GP_Sr,GS_Fr,GS_Jr,GS_So,GS_Sr
0,Air Force Falcons,2013,5.133333,7.783784,6.956522,9.0,0.666667,2.459459,3.521739,4.411765
1,Air Force Falcons,2014,1.0,8.0,6.363636,9.9,0.0,3.84,1.575758,4.466667
2,Air Force Falcons,2015,1.8,9.555556,6.714286,9.142857,0.0,4.611111,0.885714,5.047619
3,Air Force Falcons,2016,9.0,6.545455,3.848485,10.484848,0.0,2.454545,0.212121,5.939394
4,Akron Zips,2013,2.0,7.592593,9.043478,10.090909,0.241379,2.851852,3.173913,4.772727


In [27]:
# Left-join the per-team, per-year roster averages onto every game row.
tmp = master.merge(
    roster_final2,
    on = ['Team', 'year'],
    how = 'left',
)

In [28]:
def teamhistory(team, year, duration):
    """Summarise a team's win/loss record over several look-back windows.

    Reads the module-level ``history`` frame (columns Team, year, Wins, Losses).

    Parameters
    ----------
    team : value compared against ``history.Team``
    year : int
        Target season. Only seasons strictly before it are counted, matching
        ``coach_history`` (which also uses ``year < target``), so the target
        season's own record never feeds its features.
    duration : iterable of int
        Window lengths in seasons. A window of ``n`` covers the ``n`` seasons
        ``year - n`` through ``year - 1``.

    Returns
    -------
    pd.Series
        ``[wins, losses, wins / (wins + losses)]`` for each window, in
        ``duration`` order, followed by the same triple over every season
        before ``year``. The ratio is NaN when the window holds no games.
    """
    team_rows = history[history.Team == team]
    prior = team_rows[team_rows.year < year]

    def record(rows):
        # Totals for the selected seasons; an empty selection sums to 0/0,
        # which would otherwise raise ZeroDivisionError in the ratio.
        wins, losses = list(rows[['Wins', 'Losses']].sum())
        games = wins + losses
        return [wins, losses, wins / games if games else float('nan')]

    out = []
    # Individual windows
    for yr in duration:
        out.extend(record(prior[prior.year >= year - yr]))
    # All seasons before the target year
    out.extend(record(prior))
    return pd.Series(out)

In [29]:
# Add in team historical data
yrs = [1, 2, 3, 5, 10]
variables = ['history_wins_{yr}yrs', 'history_losses_{yr}yrs', 'history_WL_{yr}yrs']
# One (wins, losses, WL) name triple per look-back window, then the 'max' triple,
# in the same order teamhistory returns its values.
years = [template.format(yr = span) for span in yrs + ['max'] for template in variables]

tmp[years] = tmp.apply(lambda game: teamhistory(game['Team'], game['year'], yrs), axis = 1)

In [30]:
# Add in coach info
# No `how` is passed, so this is pandas' default inner join: game rows with no
# matching (Team, year) in coach_links are dropped.
tmp = tmp.merge(coach_links, on = ['Team', 'year'])

In [33]:
def coach_history(coach, year):
    """Summarise a coach's record over the seasons before ``year``.

    Reads the module-level ``coaches`` frame (columns coach, year, Wins,
    Losses, WL).

    Parameters
    ----------
    coach : value compared against ``coaches.coach``
    year : int
        Only rows with ``coaches.year < year`` are counted.

    Returns
    -------
    pd.Series
        Indexed by Coach_wins (sum of Wins), Coach_losses (sum of Losses),
        Coach_WL (mean of WL) and Coach_years (number of rows counted).
        A coach with no earlier rows gets 0 wins, 0 losses, 0 years and a NaN
        Coach_WL instead of raising IndexError.
    """
    prior = coaches[(coaches.coach == coach) & (coaches.year < year)]
    # Aggregate the four needed values directly. This avoids building twelve
    # grouped aggregates and dropping eight, and it avoids
    # to_dict(orient='record'), an abbreviated alias that pandas 2.x rejects.
    return pd.Series({
        'Coach_wins': prior['Wins'].sum(),
        'Coach_losses': prior['Losses'].sum(),
        'Coach_WL': prior['WL'].mean(),
        'Coach_years': prior['year'].count(),
    })

In [34]:
coach_variables = ['Coach_wins', 'Coach_losses', 'Coach_WL', 'Coach_years']
# Row-wise lookup of each game's coach record before that game's year.
tmp[coach_variables] = tmp.apply(lambda game: coach_history(game['coach'], game['year']), axis = 1)

In [35]:
# List the distinct teams still present after the merges above.
tmp.Team.unique()

array(['Air Force Falcons', 'Appalachian St. Mountaineers',
       'Arizona St. Sun Devils', 'Arkansas St. Red Wolves',
       'Army West Point Black Knights', 'Auburn Tigers', 'BYU Cougars',
       'Ball St. Cardinals', 'Baylor Bears', 'Boise St. Broncos',
       'Boston College Eagles', 'Charlotte 49ers', 'Clemson Tigers',
       'Colorado St. Rams', 'Duke Blue Devils', 'East Carolina Pirates',
       'Eastern Mich. Eagles', 'FIU Panthers', 'Fla. Atlantic Owls',
       'Florida St. Seminoles', 'Fresno St. Bulldogs',
       'Ga. Southern Eagles', 'Georgia St. Panthers',
       'Georgia Tech Yellow Jackets', 'Iowa St. Cyclones',
       'Kansas St. Wildcats', 'Kent St. Golden Flashes',
       'LSU Fighting Tigers', 'La.-Monroe Warhawks',
       "Louisiana Ragin' Cajuns", 'Louisiana Tech Bulldogs',
       'Miami (FL) Hurricanes', 'Miami (OH) RedHawks',
       'Michigan St. Spartans', 'Middle Tenn. Blue Raiders',
       'Mississippi St. Bulldogs', 'NC State Wolfpack', 'Navy Midshipmen',
 