In [3]:
import zipfile, os, re
import pandas as pd
import numpy as np 

In [4]:
# Path of the zipped data set, relative to the working directory (Data/Data.zip).
_file = os.path.join('Data', 'Data.zip')
# The archive handle is kept open on purpose: later cells read individual
# CSV members through `zf.open(...)`.
zf = zipfile.ZipFile(_file)
# Names of every member stored in the archive.
filenames = zf.namelist()

In [5]:
# Display the archive's member names to see which CSV tables are available.
filenames

['all-purposeyards.csv',
 'Coaches.csv',
 'defense.csv',
 'fumbles.csv',
 'gamebygame_allpurposeyards.csv',
 'gamebygame_defense.csv',
 'gamebygame_fumbles.csv',
 'gamebygame_kicking.csv',
 'gamebygame_kickoffsandkoreturns.csv',
 'gamebygame_participation.csv',
 'gamebygame_passesdefended.csv',
 'gamebygame_passing.csv',
 'gamebygame_punting.csv',
 'gamebygame_puntreturns.csv',
 'gamebygame_receiving.csv',
 'gamebygame_redzone.csv',
 'gamebygame_sacks.csv',
 'gamebygame_scoring.csv',
 'gamebygame_tackles.csv',
 'gamebygame_totaloffense.csv',
 'gamebygame_turnovermargin.csv',
 'History.csv',
 'individualleaders.csv',
 'kicking.csv',
 'kickoffsandkoreturns.csv',
 'participation.csv',
 'passesdefended.csv',
 'passing.csv',
 'punting.csv',
 'puntreturns.csv',
 'receiving.csv',
 'redzone.csv',
 'results.csv',
 'Roster.csv',
 'rushing.csv',
 'sacks.csv',
 'scoring.csv',
 'tackles.csv',
 'teamstats.csv',
 'totaloffense.csv',
 'turnovermargin.csv',
 'yearly_allpurposeyards.csv',
 'yearly_defen

# History

In [6]:
# Read History.csv straight from the archive. pandas does not close a handle
# it did not open, so the context manager releases the member handle for us.
with zf.open('History.csv') as history_file:
    history = pd.read_csv(history_file)

In [7]:
# Keep the first four characters of `Year` as an integer. The vectorized
# string accessor replaces a Python-level row-by-row apply.
history['year_new'] = history['Year'].str[:4].astype('int64')

In [8]:
# Narrow the history table to the columns used for win/loss analysis.
history_trimmed = history.loc[:, ['Team', 'Wins', 'Losses', 'year_new', 'WL']]

In [80]:
# Team label to look up in the History table.
team = 'Nebraska Cornhuskers, Huskers'
# Last season included in the look-back window.
target_year = 2016
# First season included in the window. The filter in the next cell uses
# inclusive bounds (>= / <=), so the window covers four seasons, not three.
target_year_min = target_year - 3

In [81]:
# Mean WL for `team` over the inclusive window [target_year_min, target_year].
history.loc[
    (history.Team == team) & history.year_new.between(target_year_min, target_year),
    ['WL'],
].mean()

WL    0.6345
dtype: float64

# Coach

In [84]:
# Read Coaches.csv straight from the archive. pandas does not close a handle
# it did not open, so the context manager releases the member handle for us.
with zf.open('Coaches.csv') as coaches_file:
    coaches = pd.read_csv(coaches_file)

In [85]:
# Keep the first four characters of `Year` as an integer. The vectorized
# string accessor replaces a Python-level row-by-row apply.
coaches['year_new'] = coaches['Year'].str[:4].astype('int64')

In [86]:
# Coach whose record is aggregated in the cells below.
coach = 'Bob Diaco'
# NOTE(review): `target_year` is assigned again here with the same value used
# in the History section; keep the two in sync or define it once.
target_year = 2016

In [113]:
# Sum / count / mean of every numeric column over the coach's seasons up to
# and including `target_year`; the result has two-level column labels.
coach_record = (
    coaches
    .loc[(coaches.Name == coach) & (coaches.year_new <= target_year),
         ["Name", "Wins", "Losses", "WL", "year_new"]]
    .groupby("Name")
    .agg(['sum', 'count', 'mean'])
)

In [114]:
# Flatten the (column, aggregate) label pairs into "column_aggregate" names.
coach_record.columns = [f"{label[0]}_{label[1]}" for label in coach_record.columns.values]

In [115]:
# Discard the aggregates that are not needed; what remains is Wins_sum,
# Losses_sum, WL_mean and year_new_count.
coach_record.drop(
    columns=[
        'Wins_count', 'Wins_mean',
        'Losses_count', 'Losses_mean',
        'WL_sum', 'WL_count',
        'year_new_sum', 'year_new_mean',
    ],
    inplace=True,
)

In [116]:
# Total games = summed wins plus summed losses.
coach_record['total_games'] = coach_record['Wins_sum'] + coach_record['Losses_sum']
# Turn the `Name` group key back into an ordinary column.
coach_record.reset_index(inplace=True)

In [117]:
# Preview the aggregated record for the selected coach.
coach_record.head()

Unnamed: 0,Name,Wins_sum,Losses_sum,WL_mean,year_new_count,total_games
0,Bob Diaco,11.0,26.0,0.293,3,37.0


# Roster

In [20]:
# Read Roster.csv straight from the archive. pandas does not close a handle
# it did not open, so the context manager releases the member handle for us.
with zf.open('Roster.csv') as roster_file:
    roster = pd.read_csv(roster_file)

In [21]:
# Keep the first four characters of `Year` as an integer. The vectorized
# string accessor replaces a Python-level row-by-row apply.
roster['year_new'] = roster['Year'].str[:4].astype('int64')

In [22]:
def yrRecode(data):
    """Return the ordinal code for a class-year abbreviation.

    'Fr' -> 0, 'So' -> 1, 'Jr' -> 2, 'Sr' -> 3. Any other value raises
    KeyError.
    """
    ordinal_by_class = dict(Fr=0, So=1, Jr=2, Sr=3)
    return ordinal_by_class[data]

In [23]:
# Numeric class year. `na_action='ignore'` leaves missing `Yr` values missing
# without calling yrRecode on them, replacing the row-wise apply and its
# manual null guard. Unknown labels still raise KeyError from yrRecode.
roster['Yr_N'] = roster['Yr'].map(yrRecode, na_action='ignore')

In [24]:
# Number of non-null `GS` entries per (Team, year_new, Yr) group.
# NOTE(review): if the intent is roster size, `count()` on GS undercounts
# players whose GS is missing - confirm.
roster_by_year = (
    roster
    .groupby(['Team', 'year_new', 'Yr'])[['GS']]
    .count()
    .reset_index()
)

In [25]:
# Average `GS` per (Team, year_new, Yr) group.
roster_games_started = (
    roster
    .groupby(['Team', 'year_new', 'Yr'])[['GS']]
    .mean()
    .reset_index()
)

In [26]:
# Average `GP` per (Team, year_new, Yr) group.
roster_games_played = (
    roster
    .groupby(['Team', 'year_new', 'Yr'])[['GP']]
    .mean()
    .reset_index()
)

In [30]:
# Attach the mean GS to the mean GP for each (Team, year_new, Yr) group,
# keeping every games-played row.
roster_final = roster_games_played.merge(
    roster_games_started,
    on=['Team', 'year_new', 'Yr'],
    how='left',
)

In [68]:
# Pivot the class year into the column labels: one row per (Team, year_new),
# with GP and GS spread across the `Yr` values.
roster_final2 = (
    roster_final
    .set_index(['Team', 'year_new', 'Yr'])
    .unstack(level='Yr')
    .reset_index()
)

In [75]:
# Flatten the two-level labels: key columns keep their plain name, every
# other column becomes "<stat>_<Yr>".
columns = [
    label[0] if label[0] in ['Team', 'year_new'] else f"{label[0]}_{label[1]}"
    for label in roster_final2.columns.values
]
roster_final2.columns = columns

# Results

In [25]:
# Read results.csv straight from the archive. pandas does not close a handle
# it did not open, so the context manager releases the member handle for us.
with zf.open('results.csv') as results_file:
    game_by_game_results = pd.read_csv(results_file)

In [26]:
# 'Away' when the opponent text contains '@', otherwise 'Home'. Vectorized
# replacement for the row-wise apply; a missing opponent now stays missing
# instead of raising.
# NOTE(review): neutral-site rows written as "<opponent> @ <city>" are also
# labelled 'Away' - confirm that is intended.
game_by_game_results['homeaway'] = (
    game_by_game_results['Opponent']
    .str.contains('@', regex=False)
    .map({True: 'Away', False: 'Home'})
)

In [30]:
def extractData(data):
    """Parse a result string such as ``"W 20 - 13 (1OT)"`` into its parts.

    Parameters
    ----------
    data : str
        Result text holding a W/L flag, "<team score> - <opponent score>"
        and optionally "(<n>OT)". Non-string input (e.g. NaN) is treated as
        an unparseable result instead of raising.

    Returns
    -------
    pandas.Series
        ``[WinLoss, Overtime, team_score, opponent_score]``. ``WinLoss`` is
        ``'W'``/``'L'`` or ``None`` when no flag is present; ``Overtime`` is
        the integer overtime count (0 when absent); the two scores are
        integers, or ``None`` when they cannot be found.
    """
    # Raw strings so the backslash escapes reach the regex engine untouched;
    # each numeric pattern captures its digits directly.
    regexs = {'WL': r"[WL]",
              'team': r"(\d+)\s\-",
              'opponent': r"\-\s(\d+)",
              'OT': r"\((\d+)OT\)",
             }

    def _first_int(pattern, text, default=None):
        # Integer value of the pattern's first capture group, or `default`.
        found = re.search(pattern, text)
        return int(found.group(1)) if found else default

    # A missing result cannot be parsed; return the "nothing found" row.
    if not isinstance(data, str):
        return pd.Series([None, 0, None, None], dtype=object)

    # Win/Loss
    re_wl = re.search(regexs['WL'], data)
    WinLoss = re_wl.group() if re_wl else None

    # Overtime (always an int so the column does not mix 0 with '1')
    Overtime = _first_int(regexs['OT'], data, default=0)

    # Team
    team_score = _first_int(regexs['team'], data)

    # Opponent
    opponent_score = _first_int(regexs['opponent'], data)

    # dtype=object keeps None as None rather than coercing the row to float.
    return pd.Series([WinLoss, Overtime, team_score, opponent_score], dtype=object)

In [31]:
# Parse every `Result` string and store the four parsed fields as new columns.
game_by_game_results[['WinLoss', 'Overtime', 'team_score', 'opponent_score']] = (
    game_by_game_results.apply(lambda row: extractData(row['Result']), axis = 1)
)

In [32]:
# Preview the parsed results; show only the first rows rather than dumping
# the whole table into the notebook output.
game_by_game_results.head(10)

Unnamed: 0,Date,Opponent,Result,Team,homeaway,WinLoss,Overtime,team_score,opponent_score
0,08/30/2014,"West Virginia @ Atlanta, Georgia",W 33 - 23,Alabama Crimson Tide,Away,W,0,33,23
1,09/06/2014,Fla. Atlantic,W 41 - 0,Alabama Crimson Tide,Home,W,0,41,0
2,09/13/2014,Southern Miss.,W 52 - 12,Alabama Crimson Tide,Home,W,0,52,12
3,09/20/2014,Florida,W 42 - 21,Alabama Crimson Tide,Home,W,0,42,21
4,10/04/2014,@ Ole Miss,L 17 - 23,Alabama Crimson Tide,Away,L,0,17,23
5,10/11/2014,@ Arkansas,W 14 - 13,Alabama Crimson Tide,Away,W,0,14,13
6,10/18/2014,Texas A&M,W 59 - 0,Alabama Crimson Tide,Home,W,0,59,0
7,10/25/2014,@ Tennessee,W 34 - 20,Alabama Crimson Tide,Away,W,0,34,20
8,11/08/2014,@ LSU,W 20 - 13 (1OT),Alabama Crimson Tide,Away,W,1,20,13
9,11/15/2014,Mississippi St.,W 25 - 20,Alabama Crimson Tide,Home,W,0,25,20
