In [1]:
import zipfile, os, re
import pandas as pd
import numpy as np 

# Read in the football data

In [2]:
# Path to the zipped scrape output, relative to the notebook's working directory.
_file = os.path.join('ncaa_football_scrapy','Data', 'Data.zip')
# The archive handle is kept open: later cells read member CSVs through zf.open().
zf = zipfile.ZipFile(_file)
# Names of every archive member; filtered later to select the game-by-game files.
filenames = zf.namelist()

# Create Program History DF

In [3]:
# Load the program history table from the archive. Later cells read its
# 'Year', 'Team', 'Wins' and 'Losses' columns.
history = pd.read_csv(zf.open('history.csv'))

In [4]:
# Integer season key taken from the first four characters of the 'Year' string.
# Uses the vectorised .str accessor rather than a row-wise apply(), which calls a
# Python lambda once per row for the same result.
history['year'] = history['Year'].str[:4].astype(int)

# Coach links

In [5]:
# Read the scraped team-info links, keep only the rows whose key is 'people', and
# collapse double spaces in team names so they can be matched against other tables.
coach_links = (
    pd.read_csv(os.path.join("ncaa_football_scrapy", "Links", "links_teaminfo.csv"))
    .rename(columns={"txt": "coach", 'team': "Team"})
    .loc[lambda links: links.key == 'people', ['Team', 'coach', 'year']]
    .assign(Team=lambda links: links.Team.str.replace("  ", " "))
)

# Account for coaches who get fired in the season

In [6]:
# One dict per row of coach_links. 'records' is the full orient name; the abbreviated
# spelling 'record' is rejected by pandas 2.0 and later.
coach_dict = coach_links.to_dict(orient='records')

In [7]:
# Nest the coach records as {year: {team: [coach, ...]}} so that a team listed with
# more than one coach in a year keeps all of them under a single key.
coach_fixed = {}
for record in coach_dict:
    teams_in_year = coach_fixed.setdefault(record['year'], {})
    teams_in_year.setdefault(record['Team'], []).append(record['coach'])

In [8]:
# Flatten the nested mapping back into one record per (year, team); the 'coach'
# entry is the list of every coach collected for that team and year.
coaches_list = [
    {'year': yr, 'Team': team, 'coach': team_coaches}
    for yr, teams in coach_fixed.items()
    for team, team_coaches in teams.items()
]

In [9]:
# One row per (year, Team); the 'coach' column holds a list of coach names.
coaches_list_fixed = pd.DataFrame(coaches_list)

# Create Coach DF 

In [10]:
# Load the per-coach season records from the archive.
coaches = pd.read_csv(zf.open('coaches.csv'))

In [11]:
# Integer season key taken from the first four characters of the 'Year' string
# (vectorised .str accessor instead of a row-wise apply()).
coaches['year'] = coaches['Year'].str[:4].astype(int)
# Align the coach-name column with the 'coach' column used by the coach link tables.
coaches = coaches.rename(columns={'Name': 'coach'})

# Create Roster DF

In [12]:
# Load the player roster table from the archive.
roster = pd.read_csv(zf.open('roster.csv'))

In [13]:
# Integer season key taken from the first four characters of the 'Year' string
# (vectorised .str accessor instead of a row-wise apply()).
roster['year'] = roster['Year'].str[:4].astype(int)

In [14]:
def yrRecode(data):
    """Map a class-year abbreviation to its ordinal position.

    'Fr' -> 0, 'So' -> 1, 'Jr' -> 2, 'Sr' -> 3. Any other value raises KeyError.
    """
    class_order = ('Fr', 'So', 'Jr', 'Sr')
    lookup = {label: position for position, label in enumerate(class_order)}
    return lookup[data]

In [15]:
# Number of non-null 'GS' entries for each (Team, year, class year) group.
roster_by_year = (
    roster.loc[:, ['Team', 'Yr', 'year', 'GS']]
    .groupby(['Team', 'year', 'Yr'])
    .count()
    .reset_index()
)

In [16]:
# Mean of 'GS' for each (Team, year, class year) group.
roster_games_started = (
    roster.loc[:, ['Team', 'year', 'Yr', 'GS']]
    .groupby(['Team', 'year', "Yr"])
    .mean()
    .reset_index()
)

In [17]:
# Mean of 'GP' for each (Team, year, class year) group.
roster_games_played = (
    roster.loc[:, ['Team', 'year', 'Yr', 'GP']]
    .groupby(['Team', 'year', "Yr"])
    .mean()
    .reset_index()
)

In [18]:
# Put the GP and GS averages side by side, one row per (Team, year, class year).
# The key columns have the same names on both sides, so a single `on=` is enough.
roster_final = roster_games_played.merge(
    roster_games_started,
    on=['Team', 'year', 'Yr'],
    how='left',
)

In [19]:
# Pivot the class-year level ('Yr', the innermost index level) into columns, giving
# one row per (Team, year) with a column for every statistic/class-year pair.
roster_final2 = (
    roster_final
    .set_index(['Team', 'year', 'Yr'])
    .unstack('Yr')
    .reset_index()
)

In [20]:
# Flatten the two-level column labels: 'Team' and 'year' stay as they are, every
# other label becomes "<statistic>_<class year>" (for example "GP_Fr").
columns = []
for top, sub in roster_final2.columns.values:
    columns.append(top if top in ['Team', 'year'] else top + "_" + sub)
roster_final2.columns = columns

In [21]:
def createvariables(data):
    """Derive opponent, venue, outcome, overtime and score fields for one game row.

    Parameters
    ----------
    data : mapping or pandas.Series
        Must provide an 'Opponent' string and a 'Result' entry. A Result that is
        not a string (for example NaN) is treated as empty.

    Returns
    -------
    pandas.Series
        Six positional values, in this order:
        opponent       - opponent text with any '@' marker removed
        home           - 1 when 'Opponent' contains no '@', otherwise 0
        WinLoss        - first 'W' or 'L' found in Result, or None if there is none
        Overtime       - digits from an "(<n>OT)" marker as a string, or 0 if absent
        team_score     - digits before " -" as a string, or None
        opponent_score - digits after "- " as a string, or None

    Numeric captures are deliberately left as strings (and the Overtime default as
    the integer 0), exactly as before, so downstream cells see the same values.
    """
    opponent_text = data['Opponent']
    result = data['Result']
    if not isinstance(result, str):
        # Missing results would otherwise make re.search raise TypeError.
        result = ''

    # Opponent / venue. Any '@' means the game was not at home.
    if '@' in opponent_text:
        home = 0
        if re.search(r"[\w\s]+\.?\s@\s\w+", opponent_text):
            # "<opponent> @ <somewhere>": the opponent is the text before the '@'.
            opponent = opponent_text.split("@")[0].strip()
        else:
            # "@ <opponent>": just drop the marker.
            opponent = opponent_text.replace("@", "").strip()
    else:
        opponent = opponent_text
        home = 1

    # Win/Loss: None (instead of an AttributeError) when neither letter is present.
    outcome_match = re.search(r"[WL]", result)
    WinLoss = outcome_match.group() if outcome_match else None

    # Overtime periods, e.g. "(2OT)" -> '2'.
    overtime_match = re.search(r"\((\d+)OT\)", result)
    Overtime = overtime_match.group(1) if overtime_match else 0

    # Team score: the digits immediately before " -".
    team_match = re.search(r"(\d+)\s-", result)
    team_score = team_match.group(1) if team_match else None

    # Opponent score: the digits immediately after "- ".
    opponent_match = re.search(r"-\s(\d+)", result)
    opponent_score = opponent_match.group(1) if opponent_match else None

    return pd.Series([opponent, home, WinLoss, Overtime, team_score, opponent_score])

# Start merging

In [22]:
# Keep only the archive members whose name contains 'gamebygame'.
targetfiles = [member for member in filenames if 'gamebygame' in member]

In [23]:
# Load every game-by-game CSV into a dict keyed by the member name up to its first
# ".", parse the dates, and index each frame by (Team, Date, OffenseDefense) so the
# frames can be aligned column-wise afterwards.
dfs = {}
for member in targetfiles:
    frame = pd.read_csv(zf.open(member))
    frame['Date'] = pd.to_datetime(frame['Date'])
    dfs[member.split(".")[0]] = frame.set_index(['Team', 'Date', 'OffenseDefense'])

In [24]:
# Align all game-by-game frames side by side on their shared index, then keep only
# the first occurrence of each column name.
master = (
    pd.concat(list(dfs.values()), axis=1)
    .loc[:, lambda combined: ~combined.columns.duplicated()]
)

In [25]:
# Derive the opponent/venue/outcome/score columns row by row with createvariables.
master[
    ['opponent', 'home', 'WinLoss', 'Overtime', 'team_score', 'opponent_score']
] = master.apply(createvariables, axis=1)

# Subset Offense Defense

In [26]:
# Independent copy of the rows whose OffenseDefense index level is 'Defense'.
defense = master.loc[
    master.index.get_level_values('OffenseDefense') == 'Defense'
].copy()

In [27]:
# Independent copy of the rows whose OffenseDefense index level is 'Offense'.
offense = master.loc[
    master.index.get_level_values('OffenseDefense') == 'Offense'
].copy()

In [28]:
# Sanity check: the Offense and Defense subsets are expected to have the same
# number of rows before they are processed separately and merged back together.
assert len(defense) == len(offense),  "Datasets different sizes"

# Manipulate

In [29]:
# Turn the (Team, Date, OffenseDefense) index levels back into ordinary columns.
offense = offense.reset_index()

In [30]:
# Season key used to join the roster, history and coach tables. Those tables take
# their 'year' from the first four characters of a 'Year' string, which is assumed
# here to be the season's starting year -- TODO confirm against the data. A game
# played before August (for example a January bowl game) therefore belongs to the
# season that started in the previous calendar year, not to its own calendar year.
offense['year'] = offense['Date'].dt.year - (offense['Date'].dt.month < 8).astype(int)

In [31]:
# Attach the per-season roster summary to every offensive game row. A left join
# keeps games for which no roster row exists.
base = offense.merge(
    roster_final2,
    on=['Team', 'year'],
    how='left',
)

In [32]:
def teamhistory(team, year, duration, history_df=None):
    """Summarise a team's win/loss record over trailing windows of seasons.

    Parameters
    ----------
    team : str
        Value matched against the 'Team' column.
    year : int
        Target season. Only seasons strictly before it enter the windows.
    duration : iterable of int
        Window lengths in seasons. A length of N covers seasons year-N .. year-1.
    history_df : pandas.DataFrame, optional
        Table with 'Team', 'year', 'Wins' and 'Losses' columns. Defaults to the
        notebook-level `history` frame.

    Returns
    -------
    pandas.Series
        For each window, in order: wins, losses, wins / (wins + losses). These are
        followed by the same three values over every season the table holds for
        the team. The ratio is NaN when the window contains no games.

    Notes
    -----
    The windows used to run from year-N through `year` inclusive, i.e. N+1 seasons
    including the target season itself. They now hold exactly N completed seasons,
    which matches the strict `< year` cut-off used by coach_history.
    NOTE(review): the final all-seasons block still includes the target season and
    any later ones; confirm whether that is intended.
    """
    if history_df is None:
        history_df = history
    team_seasons = history_df[history_df.Team == team]

    def _record(seasons):
        # Wins, losses and win ratio for the given rows; NaN ratio when no games.
        wins, losses = seasons[['Wins', 'Losses']].sum().tolist()
        games = wins + losses
        return [wins, losses, wins / games if games else np.nan]

    out = []
    for yr in duration:
        in_window = (team_seasons.year < year) & (team_seasons.year >= year - yr)
        out.extend(_record(team_seasons[in_window]))
    # Whole table for this team.
    out.extend(_record(team_seasons))
    return pd.Series(out)

In [33]:
# Add in team historical data: three columns (wins, losses, win ratio) for each
# trailing window length, followed by the same three for the 'max' block.
yrs = [1, 2, 3, 5, 10]
variables = ['history_wins_{yr}yrs', 'history_losses_{yr}yrs', 'history_WL_{yr}yrs']
years = [
    template.format(yr=span)
    for span in yrs + ['max']
    for template in variables
]

base[years] = base.apply(lambda row: teamhistory(row['Team'], row['year'], yrs), axis=1)

In [34]:
# Guard: the left merge with the roster table must not have added or dropped rows.
assert len(base) == len(offense), "Size change during merging"

In [35]:
# List every team in the game data that never appears in the coach links, i.e. the
# teams whose coach columns will stay empty after the merge below.
coach_teams = coach_links.Team.unique()
for team in base.Team.unique():
    if team in coach_teams:
        continue
    print(team)

In [36]:
# Attach the list of coaches for each (Team, year). A left join keeps games that
# have no coach entry.
tmp = base.merge(
    coaches_list_fixed,
    on=['Team', 'year'],
    how='left',
)

In [37]:
# Guard: the coach merge must be one row in, one row out.
assert len(base) == len(tmp), "Dataset size changed"

# Fix the coach history to get the average of coaches

In [38]:
def coach_history(data, year, coaches_df=None):
    """Average the prior career records of the coaches attached to one game.

    Parameters
    ----------
    data : list of str, or anything else
        Coach names for the team and season. Any value that is not a non-empty
        list (for example the NaN left by an unmatched merge) yields an all-None
        result.
    year : int
        Target season. Only seasons strictly before it are counted.
    coaches_df : pandas.DataFrame, optional
        Table with 'coach', 'year', 'Wins', 'Losses' and 'WL' columns. Defaults to
        the notebook-level `coaches` frame.

    Returns
    -------
    pandas.Series
        Indexed, always in this order, by 'Coach_wins' (sum of Wins),
        'Coach_losses' (sum of Losses), 'Coach_WL' (mean of WL) and 'Coach_years'
        (number of prior seasons), each averaged over the listed coaches.

    Notes
    -----
    A coach with no season before `year` used to raise IndexError. Such a coach is
    now left out of the average, and if none of the listed coaches has a prior
    season the all-None result is returned.
    """
    fields = ['Coach_wins', 'Coach_losses', 'Coach_WL', 'Coach_years']
    no_history = pd.Series({name: None for name in fields})

    if not isinstance(data, list) or len(data) == 0:
        return no_history
    if coaches_df is None:
        coaches_df = coaches

    records = []
    for coach in data:
        prior = coaches_df[(coaches_df.coach == coach) & (coaches_df.year < year)]
        if prior.empty:
            continue
        records.append({
            'Coach_wins': prior['Wins'].sum(),
            'Coach_losses': prior['Losses'].sum(),
            'Coach_WL': prior['WL'].mean(),
            'Coach_years': prior['year'].count(),
        })

    if not records:
        return no_history
    # Explicit column order: the caller assigns the result to columns by position.
    return pd.DataFrame(records, columns=fields).mean()

In [39]:
# Target column names for the values coach_history returns.
coach_variables = ['Coach_wins', 'Coach_losses', 'Coach_WL', 'Coach_years']
# Rebind to a fresh copy before new columns are added.
tmp = tmp.copy()
# Row-wise lookup: prior record of the coach(es) listed for each game's team and year.
tmp[coach_variables] = tmp.apply(lambda x: coach_history(x['coach'], x['year']), axis = 1)

# Edit Defense

In [40]:
# Prefix every defensive column with 'def_' so it cannot collide with the offensive
# column of the same name, then turn the index levels back into columns. The index
# levels (Team, Date, OffenseDefense) are reset afterwards and so keep their names.
# Running this cell a second time would prefix those columns as well.
defense = defense.add_prefix('def_').reset_index()

In [41]:
# Season key rather than calendar year: a game played before August (for example a
# January bowl game) is attributed to the season that started the previous
# calendar year. Assumes the other tables key seasons by starting year -- TODO confirm.
defense['year'] = defense['Date'].dt.year - (defense['Date'].dt.month < 8).astype(int)

# Merge offense defense

In [42]:
# Guard: the offensive and defensive tables are expected to have the same number
# of rows before they are merged.
assert len(tmp) == len(defense), "Dataset file difference"

In [43]:
# Join each game's offensive row to its defensive row on Team and Date. Columns
# present on both sides that are not join keys (for example 'year') get the _x
# (offense) and _y (defense) suffixes.
final = tmp.merge(
    defense,
    on=['Team', 'Date'],
    how='left',
)

In [63]:
# Running game number within each (Team, season). The rows are sorted by Date first
# (stable sort) so the numbering is chronological regardless of the order in which
# the rows happened to arrive; the result is aligned back to `final` by index.
final['count'] = (
    final.sort_values('Date', kind='mergesort')
    .groupby(['Team', 'year_x'])
    .cumcount() + 1
)

# Game by game participation - Starting point


In [44]:
# Load the game-by-game participation table from the archive.
gamebygame_part = pd.read_csv(zf.open('gamebygame_participation.csv'))

In [45]:
gamebygame_part['Date'] = pd.to_datetime(gamebygame_part['Date'])
# Season key rather than calendar year: a game played before August (for example a
# January bowl game) is attributed to the season that started the previous
# calendar year. Assumes seasons are keyed by starting year elsewhere -- TODO confirm.
gamebygame_part['year'] = (
    gamebygame_part['Date'].dt.year
    - (gamebygame_part['Date'].dt.month < 8).astype(int)
)
# Keep one row per game (the Offense rows). The explicit copy makes the result an
# independent frame, so adding columns to it later does not trigger
# SettingWithCopyWarning.
gamebygame_part = gamebygame_part[gamebygame_part.OffenseDefense == 'Offense'].copy()

In [55]:
# Running game number within each (Team, year). The rows are sorted by Date first
# (stable sort) so the numbering is chronological regardless of file order; the
# result is aligned back to the frame by index.
gamebygame_part['count'] = (
    gamebygame_part.sort_values('Date', kind='mergesort')
    .groupby(['Team', 'year'])
    .cumcount() + 1
)

# Lag

In [80]:
# Exploratory check: column means over the first 12 games of one team and year,
# shown as a single-row frame. numeric_only=True skips the text columns (opponent,
# result, ...), which make DataFrame.mean raise TypeError on pandas 2.0 and later.
final[(final['Team'] == 'Air Force Falcons') &
      (final['year_y'] == 2014) &
      (final['count'] <= 12)
     ].mean(numeric_only=True).to_frame().T

Unnamed: 0,All-PurposeYardsG,CompletionsPerGame,YdsPerCompletion,Pct,PassYardsG,PuntAvg,ReceptionsPerGame,YardsPerReception,RecYdsG,RZScores,...,def_SackA,def_Pass2PtConv,def_Plays,def_YdsPlay,def_PenaltiesPerGame,def_PenaltyYdsPerGame,def_TotalOffYardsG,def_home,year_y,count
0,,9.333333,16.8385,,151.333333,42.741667,9.333333,16.8375,151.333333,4.166667,...,,,69.25,,4.166667,38.75,396.333333,0.5,2014.0,6.5


In [83]:
# Exploratory lag prototype: column means over the games played so far.
game = 3
year = 2014
# NOTE(review): `out` is only assigned when game <= 3; for a larger value the next
# cell shows whatever `out` was left over from an earlier run, or fails on a fresh
# kernel.
if game <= 3:
    # numeric_only=True skips the text columns, which make DataFrame.mean raise
    # TypeError on pandas 2.0 and later.
    out = final[(final['Team'] == 'Air Force Falcons') &
                (final['year_y'] == year) &
                (final['count'] <= game)
               ].mean(numeric_only=True)




In [84]:
# Display the column means computed in the previous cell.
out

All-PurposeYardsG                 NaN
CompletionsPerGame          10.666667
YdsPerCompletion            12.764667
Pct                               NaN
PassYardsG                 156.333333
PuntAvg                     45.033333
ReceptionsPerGame           10.666667
YardsPerReception           12.763333
RecYdsG                    156.333333
RZScores                     4.666667
RZFGMade                     1.500000
RZEndGame                         NaN
SackUA                            NaN
SackA                             NaN
Pass2PtConv                       NaN
Plays                       81.000000
YdsPlay                           NaN
PenaltiesPerGame             5.000000
PenaltyYdsPerGame           51.333333
TotalOffYardsG             491.333333
home                         0.333333
year_x                    2014.000000
GP_Fr                        1.000000
GP_Jr                        8.000000
GP_So                        6.363636
GP_Sr                        9.900000
GS_Fr       

In [70]:
# Column labels of the merged offense/defense table.
cols = final.columns

In [71]:
# Print one column name per line so the full list is visible (the default frame
# display elides the middle columns).
for col in cols: 
    print(col)

Team
Date
OffenseDefense_x
Opponent
Result
G
RushNetYards
ReceivingYards
IntRYds
PuntRetYds
FRetYds
KORetYds
APY
All-PurposeYardsG
FumblesRecovered
Int
Blkd
FF
FRet
FRetTD
FGM
FGA
FGBlocksAllowed
LongFGM
KO
KOYds
KOTB
KORet
KickRetTDs
PBU
IntRetTDs
PDef
PassAttempts
Completions
Interceptions
PassYards
PassTDs
PassEff
CompletionsPerGame
YdsPerCompletion
Pct
PassYardsG
Punts
PuntYds
PuntAvg
PuntTBs
LongPunt
PuntRet
PuntRetTDs
Rec
ReceptionsPerGame
YardsPerReception
RecTD
RecYdsG
RZScores
RZPts
RZRushTD
RZPassTD
RZFGMade
RZEndFGA
RZEndFumble
RZEndINT
RZEndDowns
RZEndHalf
RZEndGame
SackUA
SackA
SackYds
Sacks
TDs
KickPAT
PATAtt
RushPAT
Ru2PTAtt
ReceivingPAT
Pass2PtConv
Pass2PTAtt
KickReturnPAT
FumbRetPAT
FG
Safeties
Points
SoloTack
AsstTack
STFL
ATFL
TackleYds
Tackles
TOP
Plays
TotOff
YdsPlay
RushingFirstDowns
PassingFirstDowns
FirstDownsbyPenalty
Penalties
PenaltiesPerGame
PenaltyYards
PenaltyYdsPerGame
TotalOffYardsG
FumblesLost
opponent
home
WinLoss
Overtime
team_score
opponent_score
yea

In [67]:
# Display the participation table with its 'year' and 'count' columns.
gamebygame_part

Unnamed: 0,Date,Opponent,Result,G,Team,OffenseDefense,year,count
0,2015-09-05,Morgan St.,W 63 - 7,1/,Air Force Falcons,Offense,2015,1
2,2015-09-12,San Jose St.,W 37 - 16,1/,Air Force Falcons,Offense,2015,2
4,2015-09-19,@ Michigan St.,L 21 - 35,1/,Air Force Falcons,Offense,2015,3
6,2015-10-03,@ Navy,L 11 - 33,1/,Air Force Falcons,Offense,2015,4
8,2015-10-10,Wyoming,W 31 - 17,1/,Air Force Falcons,Offense,2015,5
10,2015-10-17,@ Colorado St.,L 23 - 38,1/,Air Force Falcons,Offense,2015,6
12,2015-10-24,Fresno St.,W 42 - 14,1/,Air Force Falcons,Offense,2015,7
14,2015-10-31,@ Hawaii,W 58 - 7,1/,Air Force Falcons,Offense,2015,8
16,2015-11-07,Army West Point,W 20 - 3,1/,Air Force Falcons,Offense,2015,9
18,2015-11-14,Utah St.,W 35 - 28,1/,Air Force Falcons,Offense,2015,10
