In [1]:
import zipfile, os, re, datetime
import pandas as pd
import numpy as np 

# Read in the football data

In [2]:
# Locate the scraped-data archive and keep it open: later cells read
# individual CSV members straight out of `zf`.
_file = os.path.join('ncaa_football_scrapy', 'Data', 'Data.zip')
zf = zipfile.ZipFile(_file, 'r')
filenames = [member.filename for member in zf.infolist()]

# Create Program History DF

In [3]:
# Program history table. Read through a context manager so the archive
# member handle is closed once parsing finishes (read_csv does not close
# handles it did not open itself).
with zf.open('history.csv') as history_file:
    history = pd.read_csv(history_file)

In [4]:
# Integer year taken from the first four characters of the 'Year' label.
# Vectorised string slicing replaces the row-by-row apply.
history['year'] = history['Year'].str[:4].astype(int)

# Coach links

In [5]:
# Team-info links: keep only the 'people' rows (coach names) and collapse
# double spaces in team names. Built as one chain ending in .assign() so the
# Team column is written on a fresh frame rather than on a filtered slice
# (the original assignment could raise SettingWithCopyWarning).
coach_links = (
    pd.read_csv(os.path.join("ncaa_football_scrapy", "Links", "links_teaminfo.csv"))
    .rename(columns={"txt": "coach", 'team': "Team"})
    .loc[lambda links: links['key'] == 'people', ['Team', 'coach', 'year']]
    .assign(Team=lambda links: links['Team'].str.replace("  ", " "))
)

# Account for coaches who get fired in the season

In [6]:
# One dict per (Team, coach, year) row. 'records' is the documented orient
# name; the abbreviated 'record' alias is deprecated and rejected by newer
# pandas releases.
coach_dict = coach_links.to_dict(orient='records')

In [7]:
# Nested mapping year -> team -> [coach, ...]; a team that lists more than
# one coach in a season ends up with a multi-element list.
coach_fixed = {}
for record in coach_dict:
    teams_for_year = coach_fixed.setdefault(record['year'], {})
    teams_for_year.setdefault(record['Team'], []).append(record['coach'])

In [8]:
# Flatten the nested year -> team -> coaches mapping into one record per
# (year, team), keeping the list of coaches as a single value.
coaches_list = [
    {'year': season, 'Team': school, 'coach': staff}
    for season, squads in coach_fixed.items()
    for school, staff in squads.items()
]

In [9]:
# One row per (year, Team); the 'coach' column holds a list of names.
coaches_list_fixed = pd.DataFrame(data=coaches_list)

# Create Coach DF 

In [10]:
# Coach records table. The context manager closes the archive member handle
# after parsing instead of leaving it open until garbage collection.
with zf.open('coaches.csv') as coaches_file:
    coaches = pd.read_csv(coaches_file)

In [11]:
# Integer year from the first four characters of 'Year' (vectorised instead
# of a row-wise apply), then rename 'Name' to 'coach' without mutating in
# place so the cell gives the same result when re-run.
coaches['year'] = coaches['Year'].str[:4].astype(int)
coaches = coaches.rename(columns={'Name': 'coach'})

# Create Roster DF

In [12]:
# Player roster table. The context manager closes the archive member handle
# as soon as the CSV has been parsed.
with zf.open('roster.csv') as roster_file:
    roster = pd.read_csv(roster_file)

In [13]:
# Integer year taken from the first four characters of the 'Year' label.
# Vectorised string slicing replaces the row-by-row apply.
roster['year'] = roster['Year'].str[:4].astype(int)

In [14]:
def yrRecode(data):
    """Translate a class-year label ('Fr', 'So', 'Jr', 'Sr') to 0-3.

    Raises KeyError for any other label.
    """
    ranks = dict(zip(('Fr', 'So', 'Jr', 'Sr'), range(4)))
    return ranks[data]

In [15]:
# Non-null GS entries per team, season and class year.
roster_by_year = roster.groupby(['Team', 'year', 'Yr'])[['GS']].count().reset_index()

In [16]:
# Mean GS per team, season and class year.
roster_games_started = roster.groupby(['Team', 'year', 'Yr'])[['GS']].mean().reset_index()

In [17]:
# Mean GP per team, season and class year.
roster_games_played = roster.groupby(['Team', 'year', 'Yr'])[['GP']].mean().reset_index()

In [18]:
# Put the GP and GS means side by side; the join keys are named identically
# on both sides.
roster_final = roster_games_played.merge(
    roster_games_started,
    on=['Team', 'year', 'Yr'],
    how='left',
)

In [19]:
# Pivot class year into the columns: one row per (Team, year), with a
# (statistic, class-year) column pair for each combination.
roster_by_class = roster_final.set_index(['Team', 'year', 'Yr'])
roster_final2 = roster_by_class.unstack(level='Yr').reset_index()

In [20]:
# Flatten the two-level column labels: key columns keep their name, the
# others become "<statistic>_<class year>".
columns = [
    top if top in ('Team', 'year') else top + "_" + sub
    for top, sub in roster_final2.columns.values
]
roster_final2.columns = columns

In [21]:
def createvariables(data):
    """Derive opponent, venue, result, overtime and score fields for one game.

    Parameters
    ----------
    data : mapping (e.g. a DataFrame row)
        Must provide ``'Opponent'`` and ``'Result'``. Result strings look
        like ``"W 45 - 38 (1OT)"``.

    Returns
    -------
    pandas.Series
        Six positional values, in this order:
        ``opponent`` - opponent name with any ``@`` marker / venue removed;
        ``home`` - 1 when the opponent text has no ``@``, otherwise 0
        (``None`` when the opponent is not a string);
        ``WinLoss`` - ``'W'`` or ``'L'``, ``None`` when neither is present;
        ``Overtime`` - number of overtime periods as an int, 0 when none;
        ``team_score`` / ``opponent_score`` - digit strings, ``None`` when
        the result cannot be parsed.
    """
    opponent_raw = data['Opponent']
    result_raw = data['Result']

    # Opponent / venue
    if not isinstance(opponent_raw, str):
        # Missing opponent: pass the value through instead of raising.
        opponent, home = opponent_raw, None
    elif '@' in opponent_raw:
        home = 0
        label = opponent_raw.strip()
        if label.startswith('@'):
            # "@ Wyoming": the marker precedes the opponent name.
            label = label[1:]
        # "Navy @ Annapolis": presumably opponent @ venue, so keep the text
        # before the marker - TODO confirm against the scraped data. A plain
        # split also works for names ending in punctuation such as
        # "Miami (FL)", which the previous regex test let through with the
        # venue still attached.
        opponent = label.split('@')[0].strip()
    else:
        opponent = opponent_raw
        home = 1

    # A missing Result (NaN) is treated as an empty string so every search
    # below simply finds nothing.
    result = result_raw if isinstance(result_raw, str) else ''

    # Win/Loss
    wl_match = re.search(r"[WL]", result)
    WinLoss = wl_match.group() if wl_match else None

    # Overtime: always an int, so the resulting column is not a mix of the
    # int 0 and digit strings.
    ot_match = re.search(r"\((\d+)OT\)", result)
    Overtime = int(ot_match.group(1)) if ot_match else 0

    # Team score: the digits before " -"
    team_match = re.search(r"(\d+)\s\-", result)
    team_score = team_match.group(1) if team_match else None

    # Opponent score: the digits after "- "
    opp_match = re.search(r"\-\s(\d+)", result)
    opponent_score = opp_match.group(1) if opp_match else None

    return pd.Series([opponent, home, WinLoss, Overtime, team_score, opponent_score])

# Start merging

In [22]:
# Keep only the archive members whose name contains 'gamebygame'
targetfiles = list(filter(lambda member: 'gamebygame' in member, filenames))

In [23]:
# Load every game-by-game CSV into a dict keyed by file name (text before the
# first "."), each indexed by (Team, Date, OffenseDefense) so the frames can
# be aligned side by side.
dfs = {}
for f in targetfiles:
    filename = f.split(".")[0]
    frame = pd.read_csv(zf.open(f))
    frame['Date'] = pd.to_datetime(frame['Date'])
    dfs[filename] = frame.set_index(['Team', 'Date', 'OffenseDefense'])

In [24]:
# Align all game-by-game frames on their shared index, then keep only the
# first occurrence of each repeated column name.
master = pd.concat(list(dfs.values()), axis=1)
is_repeat = master.columns.duplicated()
master = master.loc[:, ~is_repeat]

In [25]:
def fixTOP(row):
    """Convert one time-of-possession value to a timedelta-style string.

    A string containing ":" is read as "minutes:seconds" (only the first two
    fields are used); anything else is passed to ``int()`` and treated as a
    number of seconds. Returns a one-element Series holding
    ``str(datetime.timedelta(...))``, e.g. ``'0:28:00'``.
    """
    if isinstance(row, str) and ":" in row:
        pieces = row.split(":")
        total_seconds = int(pieces[0]) * 60 + int(pieces[1])
    else:
        total_seconds = int(row)
    return pd.Series([str(datetime.timedelta(seconds=total_seconds))])

In [26]:
def removeSlashes(row):
    """Strip "/" from string cells of one row and convert them to float.

    Walks the module-level ``cols`` list from its fourth entry onward. A cell
    named 'TOP', a non-string cell, or a string without "/" is passed through
    unchanged. Returns the cleaned values as a positional Series.
    """
    cleaned = []
    for name in cols[3:]:
        value = row[name]
        if name != 'TOP' and isinstance(value, str) and '/' in value:
            cleaned.append(float(value.replace("/", "")))
        else:
            cleaned.append(value)
    return pd.Series(cleaned)

In [27]:
# Fix Time of Possession
# Rows with a non-null TOP are passed through fixTOP(); null TOP values are
# kept unchanged.
# NOTE(review): fixTOP returns a one-element Series rather than a scalar, so
# this row-wise apply mixes Series and scalar results - confirm the assigned
# column holds plain strings before changing this cell.
master['TOP'] = master.apply(lambda x: fixTOP(x['TOP']) if pd.isnull(x['TOP']) == False else x['TOP'], axis =1)
# Parse the TOP column into timedelta values.
master['TOP'] = pd.to_timedelta(master['TOP'])

In [28]:
# Remove slashes and convert to numeric
# `cols` is every column of master except TOP, which is already a timedelta.
cols = master.columns.tolist()
TOP_index = cols.index('TOP')
cols.pop(TOP_index)

'TOP'

In [29]:
# Clean every column from the fourth entry of `cols` onward, row by row.
master[cols[3:]] = master.apply(removeSlashes, axis=1)

In [30]:
# Convert the cleaned columns to numeric dtypes, one column at a time.
numeric_cols = cols[3:]
master[numeric_cols] = master[numeric_cols].apply(pd.to_numeric)

In [31]:
# Spot-check the converted time of possession for a single game, selecting
# on the three index levels.
idx = master.index
spot_check = (
    (idx.get_level_values('Team') == 'Air Force Falcons')
    & (idx.get_level_values('OffenseDefense') == 'Defense')
    & (idx.get_level_values('Date') == '2016-10-01 00:00:00')
)
master.iloc[spot_check]['TOP']

Team               Date        OffenseDefense
Air Force Falcons  2016-10-01  Defense          00:28:00
Name: TOP, dtype: timedelta64[ns]

In [32]:
# Create new variables: one createvariables() call per game row, spread over
# six new columns.
derived_columns = ['opponent', 'home', 'WinLoss', 'Overtime', 'team_score', 'opponent_score']
master[derived_columns] = master.apply(createvariables, axis=1)

# Subset Offense Defense

In [33]:
# Independent copy of the rows whose OffenseDefense index level is 'Defense'.
is_defense = master.index.get_level_values('OffenseDefense') == 'Defense'
defense = master.loc[is_defense].copy()

In [34]:
# Independent copy of the rows whose OffenseDefense index level is 'Offense'.
is_offense = master.index.get_level_values('OffenseDefense') == 'Offense'
offense = master.loc[is_offense].copy()

In [35]:
# Both subsets must have the same number of rows.
assert defense.shape[0] == offense.shape[0], "Datasets different sizes"

# Manipulate

In [36]:
# Move Team / Date / OffenseDefense out of the index into ordinary columns.
offense = offense.reset_index()

In [37]:
# Calendar year of the game date.
# NOTE(review): a game played in January gets the following calendar year,
# while the roster/history/coach tables derive `year` from the first four
# characters of their 'Year' label - confirm the two conventions line up.
offense['year'] = offense.Date.dt.year

In [38]:
# Attach the per-class roster averages to every offensive game row.
base = offense.merge(
    roster_final2,
    on=['Team', 'year'],
    how='left',
)

In [39]:
def teamhistory(team, year, duration):
    """Summarise a program's win/loss record over several look-back windows.

    Reads the module-level ``history`` frame (columns Team, year, Wins,
    Losses).

    Parameters
    ----------
    team : value compared against ``history.Team``
    year : int
        Target year; each window ends at this year, inclusive.
    duration : iterable of int
        Look-back lengths. A length ``n`` selects rows with
        ``year - n <= history.year <= year``.

    Returns
    -------
    pandas.Series
        ``[wins, losses, wins / (wins + losses)]`` for each entry of
        ``duration`` in order, followed by the same three values computed
        over every row recorded for the team. The ratio is NaN when the
        window holds no games (previously a ZeroDivisionError).

    NOTE(review): the windows include the target year itself, whereas
    coach_history() only uses years strictly before the target year -
    confirm this difference is intended.
    """
    def _record(rows):
        # Totals plus win ratio for one slice of the history table.
        wins, losses = list(rows[['Wins', 'Losses']].sum())
        played = wins + losses
        return [wins, losses, wins / played if played else float('nan')]

    # Filter by team once; every window is a sub-slice of these rows.
    team_rows = history[history.Team == team]

    out = []
    # Individual
    for yr in duration:
        in_window = (team_rows.year <= year) & (team_rows.year >= year - yr)
        out.extend(_record(team_rows[in_window]))
    # Max
    out.extend(_record(team_rows))
    return pd.Series(out)

In [40]:
# Add in team historical data
# Column names come in (wins, losses, WL) triples, one triple per look-back
# length plus a final 'max' triple, matching the order teamhistory() returns.
yrs = [1, 2, 3, 5, 10]
variables = ['history_wins_{yr}yrs', 'history_losses_{yr}yrs', 'history_WL_{yr}yrs']
years = []
for span in yrs + ['max']:
    years.extend(template.format(yr = span) for template in variables)

base[years] = base.apply(lambda game: teamhistory(game['Team'], game['year'], yrs), axis = 1)

In [41]:
# The left merge and the added columns must not change the row count.
assert base.shape[0] == offense.shape[0], "Size change during merging"

In [42]:
# Print every team in the game data that has no entry in the coach links.
coach_teams = coach_links.Team.unique()
unmatched_teams = [name for name in base.Team.unique() if name not in coach_teams]
for team in unmatched_teams:
    print(team)

In [43]:
# Attach each team-season's list of coaches to the game rows.
tmp = base.merge(
    coaches_list_fixed,
    on=['Team', 'year'],
    how='left',
)

In [44]:
# The coach merge must not add or drop rows.
assert base.shape[0] == tmp.shape[0], "Dataset size changed"

# Fix the coach history to get the average of coaches

In [45]:
def coach_history(data, year):
    """Average the prior career records of the coaches listed for a team-season.

    Reads the module-level ``coaches`` frame (columns coach, Wins, Losses,
    WL, year).

    Parameters
    ----------
    data : list of coach names, or any non-list value when no coach is known
    year : int
        Only rows with ``coaches.year < year`` count toward a coach's record.

    Returns
    -------
    pandas.Series
        Keys ``Coach_wins`` (summed wins), ``Coach_losses`` (summed losses),
        ``Coach_WL`` (mean of WL) and ``Coach_years`` (number of prior rows),
        each averaged over the listed coaches. A coach with no rows before
        ``year`` is left out of the average; previously such a coach raised
        IndexError. All four values are None when ``data`` is not a
        non-empty list or no listed coach has a prior record.
    """
    keys = ['Coach_wins', 'Coach_losses', 'Coach_WL', 'Coach_years']
    no_record = pd.Series(dict.fromkeys(keys))

    if not isinstance(data, list) or len(data) == 0:
        return no_record

    records = []
    for coach in data:
        prior = coaches[(coaches.coach == coach) & (coaches.year < year)]
        if prior.empty:
            # Nothing on file before `year` for this coach.
            continue
        records.append({'Coach_wins': prior['Wins'].sum(),
                        'Coach_losses': prior['Losses'].sum(),
                        'Coach_WL': prior['WL'].mean(),
                        'Coach_years': prior['year'].count()
                       })

    if not records:
        return no_record
    return pd.Series(pd.DataFrame(records, columns=keys).mean().to_dict())

In [46]:
# Add the averaged coach record to every game row; work on a copy of the
# merged frame.
coach_variables = ['Coach_wins', 'Coach_losses', 'Coach_WL', 'Coach_years']
tmp = tmp.copy()
tmp[coach_variables] = tmp.apply(lambda game: coach_history(game['coach'], game['year']), axis = 1)

# Edit Defense

In [47]:
# Prefix every defensive column with 'def_' so it cannot clash with the
# offensive columns, then turn the index levels back into columns.
defense = defense.add_prefix('def_').reset_index()

In [48]:
# Calendar year of the game date.
# NOTE(review): a January game gets the following calendar year - confirm
# this matches the year convention of the tables it is compared with.
defense['year'] = defense.Date.dt.year

# Merge offense defense

In [49]:
# Offense-side and defense-side frames must have the same number of rows.
assert tmp.shape[0] == defense.shape[0], "Dataset file difference"

In [50]:
# Join each offensive game row with the same team's defensive row for that
# date; column names present on both sides get the default _x / _y suffixes.
final = tmp.merge(
    defense,
    on=['Team', 'Date'],
    how='left',
)

In [51]:
# 1-based running game number within each (Team, year_x) group, in the
# frame's current row order.
final['count'] = final.groupby(['Team', 'year_x']).cumcount().add(1)

# Game by game participation - Starting point


In [162]:
# Game-by-game participation table. The context manager closes the archive
# member handle once the CSV has been parsed.
with zf.open('gamebygame_participation.csv') as participation_file:
    gamebygame_part = pd.read_csv(participation_file)

In [163]:
# Parse dates and derive the calendar year.
# NOTE(review): a January game gets the following calendar year - confirm
# this matches the `year_y` values it is later compared with in `final`.
gamebygame_part['Date'] = pd.to_datetime(gamebygame_part['Date'])
gamebygame_part['year'] = gamebygame_part['Date'].dt.year
# Keep the offense rows as an independent copy: columns are assigned to this
# frame in later cells, which on a bare filtered slice triggers
# SettingWithCopyWarning.
gamebygame_part = gamebygame_part[gamebygame_part.OffenseDefense == 'Offense'].copy()

In [164]:
# 1-based running game number within each (Team, year) group, in the frame's
# current row order.
gamebygame_part['count'] = gamebygame_part.groupby(['Team', 'year']).cumcount().add(1)

# Previous Years

In [188]:
def previous_yrs(team, year, game, cols, debug = False):
    """Average a team's per-game statistics up to a given game of a season.

    Reads the module-level ``final`` frame (columns Team, year_y, count and
    the statistics named in ``cols``).

    Parameters
    ----------
    team : value compared against ``final['Team']``
        Previously ignored: the lookup was hard-coded to one team, so every
        caller received that team's numbers.
    year : int
        Season, compared against ``final['year_y']``.
    game : int
        Game number within the season; rows with ``count <= game`` are used.
    cols : list of str
        Statistic columns to average.
    debug : bool, default False
        When true, return the selected rows instead of their means.

    Returns
    -------
    pandas.Series
        Mean of each numeric column in ``cols`` (``debug`` false), or
    pandas.DataFrame
        the selected rows with columns Team, year_y, count + ``cols``.

    NOTE(review): ``count <= game`` includes the game being described, not
    only earlier ones - confirm that is intended for a "previous" feature.
    """
    stat_cols = list(cols)
    selected = ['Team', 'year_y', 'count'] + stat_cols
    is_team = final['Team'] == team

    frames = [final.loc[is_team & (final['year_y'] == year) & (final['count'] <= game),
                        selected]]
    if game <= 3:
        # In the first three games, add every game of the prior season.
        frames.append(final.loc[is_team & (final['year_y'] == year - 1), selected])

    window = pd.concat(frames)
    if debug:
        return window
    # Average only the requested statistics; the string Team column and the
    # bookkeeping columns are left out of the mean.
    return window[stat_cols].mean(numeric_only=True)


In [189]:
# Statistic columns of `final` to average with previous_yrs().
cols2 = [
    "RushNetYards", "PassYards", "Interceptions", "PassAttempts", "Punts",
    "Points", "FumblesLost", "Penalties", "Plays",
    "def_RushNetYards", "def_PassYards", "def_Interceptions", "def_Punts",
    "def_FF", "def_PenaltyYards", "def_PDef",
    "Sacks", "Tackles", "def_Plays",
]

In [191]:
# Compute the running averages for the 2014 rows only; rows from other years
# get NaN in the new columns because the assignment aligns on the row index.
season_2014 = gamebygame_part[gamebygame_part.year == 2014]
gamebygame_part[cols2] = season_2014.apply(
    lambda game_row: previous_yrs(game_row['Team'],
                                  game_row['year'],
                                  game_row['count'],
                                  cols2,
                                  False),
    axis = 1,
)

In [193]:
# Show the 2014 rows with their newly added average columns.
gamebygame_part.loc[gamebygame_part['year'] == 2014]

Unnamed: 0,Date,Opponent,Result,G,Team,OffenseDefense,year,count,RushNetYards,PassYards,...,def_RushNetYards,def_PassYards,def_Interceptions,def_Punts,def_FF,def_PenaltyYards,def_PDef,Sacks,Tackles,def_Plays
78,2014-08-30,Nicholls St.,W 44 - 16,1/,Air Force Falcons,Offense,2014,1,1.125000,1.333333,...,1.900000,77.769231,1.444444,1.000000,2.100000,236.615385,35.923077,75.307692,3.454545,236.153846
80,2014-09-06,@ Wyoming,L 13 - 17,1/,Air Force Falcons,Offense,2014,2,1.222222,1.333333,...,1.900000,76.214286,1.500000,1.000000,2.090909,239.857143,34.428571,74.428571,3.833333,222.928571
82,2014-09-13,@ Georgia St.,W 48 - 38,1/,Air Force Falcons,Offense,2014,3,1.222222,1.333333,...,2.090909,74.733333,1.500000,1.000000,2.090909,251.466667,36.266667,74.000000,3.769231,215.933333
84,2014-09-27,Boise St.,W 28 - 14,1/,Air Force Falcons,Offense,2014,4,2.000000,,...,3.666667,57.250000,2.000000,3.000000,1.500000,316.500000,46.500000,69.750000,5.250000,82.250000
86,2014-10-04,Navy,W 30 - 21,1/,Air Force Falcons,Offense,2014,5,2.000000,,...,3.500000,61.200000,2.000000,2.333333,1.666667,281.600000,48.200000,70.000000,5.200000,116.000000
88,2014-10-11,@ Utah St.,L 16 - 34,1/,Air Force Falcons,Offense,2014,6,2.000000,2.000000,...,3.600000,63.666667,1.666667,2.333333,3.000000,286.500000,47.333333,68.166667,5.333333,109.500000
90,2014-10-18,New Mexico,W 35 - 31,1/,Air Force Falcons,Offense,2014,7,1.750000,2.000000,...,3.166667,65.285714,1.500000,2.333333,2.800000,250.285714,42.714286,67.285714,5.285714,146.285714
92,2014-11-01,@ Army West Point,W 23 - 6,1/,Air Force Falcons,Offense,2014,8,1.750000,1.500000,...,3.285714,63.625000,1.500000,2.333333,2.833333,224.875000,42.500000,65.250000,5.375000,143.250000
94,2014-11-08,@ UNLV,W 48 - 21,1/,Air Force Falcons,Offense,2014,9,1.600000,1.500000,...,3.500000,63.777778,1.500000,2.333333,2.833333,233.888889,41.111111,66.333333,5.444444,131.000000
96,2014-11-15,Nevada,W 45 - 38 (1OT),1/,Air Force Falcons,Offense,2014,10,1.600000,1.500000,...,3.500000,66.000000,1.400000,2.000000,2.714286,238.000000,41.200000,67.800000,5.100000,140.200000
