In [222]:
import os
import re
import zipfile

import numpy as np
import pandas as pd

In [223]:
# Locate the zipped data set relative to the notebook's working directory.
_file = os.path.join('Data', 'Data.zip')
# Keep the archive open: later cells read individual CSV members through `zf`.
zf = zipfile.ZipFile(_file)
# Names of every member stored in the archive.
filenames = zf.namelist()

In [224]:
# Display the archive's member list (one entry per CSV file).
filenames

['all-purposeyards.csv',
 'Coaches.csv',
 'defense.csv',
 'fumbles.csv',
 'gamebygame_allpurposeyards.csv',
 'gamebygame_defense.csv',
 'gamebygame_fumbles.csv',
 'gamebygame_kicking.csv',
 'gamebygame_kickoffsandkoreturns.csv',
 'gamebygame_participation.csv',
 'gamebygame_passesdefended.csv',
 'gamebygame_passing.csv',
 'gamebygame_punting.csv',
 'gamebygame_puntreturns.csv',
 'gamebygame_receiving.csv',
 'gamebygame_redzone.csv',
 'gamebygame_sacks.csv',
 'gamebygame_scoring.csv',
 'gamebygame_tackles.csv',
 'gamebygame_totaloffense.csv',
 'gamebygame_turnovermargin.csv',
 'History.csv',
 'individualleaders.csv',
 'kicking.csv',
 'kickoffsandkoreturns.csv',
 'participation.csv',
 'passesdefended.csv',
 'passing.csv',
 'punting.csv',
 'puntreturns.csv',
 'receiving.csv',
 'redzone.csv',
 'results.csv',
 'Roster.csv',
 'rushing.csv',
 'sacks.csv',
 'scoring.csv',
 'tackles.csv',
 'teamstats.csv',
 'totaloffense.csv',
 'turnovermargin.csv',
 'yearly_allpurposeyards.csv',
 'yearly_defen

# History

In [302]:
# Load History.csv from the archive; the context manager closes the member
# handle as soon as pandas has finished reading it.
with zf.open('History.csv') as history_csv:
    history = pd.read_csv(history_csv)

In [303]:
# The first four characters of `Year` hold the calendar year; slice them with
# the vectorized string accessor instead of a row-by-row apply.
history['year_new'] = history['Year'].str[:4].astype(int)

In [304]:
# Narrow the history table to the columns used for win/loss analysis.
history_trimmed = history.loc[:, ['Team', 'Wins', 'Losses', 'year_new', 'WL']]

In [319]:
# Value matched against the `Team` column of the history table.
team = 'Nebraska Cornhuskers, Huskers'
# Inclusive upper bound applied to `year_new` when filtering.
target_year = 2016

In [323]:
# Mean WL of the selected team over every season up to and including target_year.
history.loc[
    (history['Team'] == team) & (history['year_new'] <= target_year),
    ['WL'],
].mean()

WL    0.689882
dtype: float64

# Coach

In [408]:
# Load Coaches.csv from the archive; the context manager closes the member
# handle as soon as pandas has finished reading it.
with zf.open('Coaches.csv') as coaches_csv:
    coaches = pd.read_csv(coaches_csv)

In [409]:
# The first four characters of `Year` hold the calendar year; slice them with
# the vectorized string accessor instead of a row-by-row apply.
coaches['year_new'] = coaches['Year'].str[:4].astype(int)

In [410]:
# Value matched against the `Name` column of the coaches table.
coach = 'Bob Diaco'
# Inclusive upper bound applied to `year_new` when filtering (re-assigns the
# same name used by the History section).
target_year = 2016

In [411]:
# Mean WL of the selected coach over every season up to and including target_year.
coaches.loc[
    (coaches['Name'] == coach) & (coaches['year_new'] <= target_year),
    ['WL'],
].mean()

WL    0.293
dtype: float64

In [412]:
# Per-coach sum / count / mean of every numeric column, restricted to seasons
# up to and including target_year. The result carries two-level column labels
# of the form (column, statistic).
coach_record = (
    coaches.loc[
        coaches['year_new'] <= target_year,
        ["Name", "Wins", "Losses", "WL", "year_new"],
    ]
    .groupby("Name")
    .agg(['sum', 'count', 'mean'])
)

In [413]:
# Collapse the (column, statistic) label pairs into single "column_statistic"
# names. The guard makes the cell safe to re-run: once the labels are plain
# strings, joining x[0] and x[1] would splice individual characters together.
if isinstance(coach_record.columns, pd.MultiIndex):
    coach_record.columns = ["_".join(levels) for levels in coach_record.columns.values]

In [414]:
# Discard the aggregate columns that are not needed; what remains is
# Wins_sum, Losses_sum, WL_mean and year_new_count.
coach_record.drop(
    labels=[
        'Wins_count', 'Wins_mean',
        'Losses_count', 'Losses_mean',
        'WL_sum', 'WL_count',
        'year_new_sum', 'year_new_mean',
    ],
    axis='columns',
    inplace=True,
)

In [415]:
# Total games is the sum of the summed wins and summed losses.
coach_record['total_games'] = coach_record['Wins_sum'] + coach_record['Losses_sum']
# Move the group key out of the index so `Name` becomes an ordinary column.
coach_record.reset_index(inplace=True)

In [419]:
# Distinct coach names present in the aggregated table.
coach_record['Name'].unique()

array(['Adam Scheier', 'Barney Cotton', 'Barry Alvarez', 'Barry Odom',
       'Bill Blankenship', "Bill O'Brien", 'Bill Snyder', 'Blake Anderson',
       'Bob Diaco', 'Bob Gregory', 'Bob Stoops', 'Bobby Wilder',
       'Brad Lambert', 'Brian Polian', 'Brian Wright', 'Bryan McClendon',
       'Carl Pelini', 'Chad Morris', 'Charley Molnar', 'Charlie Partridge',
       'Chris Ash', 'Chris Naeole', 'Clay Helton', 'Clint Bowen',
       'Curtis Johnson', 'Dabo Swinney', 'Dan Mullen', 'Dana Holgorsen',
       'Danny Barrett', 'Dave Christensen', 'David Beaty', 'David Gibbs',
       'David Shaw', 'Dell McGee', 'Derek Mason', 'Don Treadwell',
       'Eric Kiesau', 'Frank Wilson', 'Garrick McGee', 'Gary Patterson',
       'Gerad Parker', 'Jason Candle', 'Jim L. Mora', 'Jimbo Fisher',
       'Joe Moglia', 'Joe Rudolph', 'John "Doc" Holliday', 'John Bonamego',
       'Kalani Sitake', 'Ken Niumatalolo', 'Kevin Wilson', 'Kirby Smart',
       'Kliff Kingsbury', 'Kyle Flood', 'Kyle Whittingham',
     

# Roster

In [307]:
# Load Roster.csv from the archive; the context manager closes the member
# handle as soon as pandas has finished reading it.
with zf.open('Roster.csv') as roster_csv:
    roster = pd.read_csv(roster_csv)

In [308]:
# The first four characters of `Year` hold the calendar year; slice them with
# the vectorized string accessor instead of a row-by-row apply.
roster['year_new'] = roster['Year'].str[:4].astype(int)

In [309]:
def yrRecode(data):
    """Map a class-year code to its ordinal: Fr -> 0, So -> 1, Jr -> 2, Sr -> 3.

    Any other value raises KeyError.
    """
    class_rank = dict(Fr=0, So=1, Jr=2, Sr=3)
    return class_rank[data]

In [310]:
# Numeric class year per player; rows without a `Yr` value get None instead of
# being passed to yrRecode.
roster['Yr_N'] = roster.apply(
    lambda row: None if pd.isnull(row['Yr']) else yrRecode(row['Yr']),
    axis=1,
)

In [311]:
# Number of non-null GS entries per team, season and class year.
roster_by_year = (
    roster
    .groupby(['Team', 'year_new', 'Yr'])[['GS']]
    .count()
    .reset_index()
)

In [312]:
# Mean GS per team, season and class year.
roster_games_started = (
    roster
    .groupby(['Team', 'year_new', 'Yr'])[['GS']]
    .mean()
    .reset_index()
)

In [313]:
# Mean GP per team, season and class year.
roster_games_played = (
    roster
    .groupby(['Team', 'year_new', 'Yr'])[['GP']]
    .mean()
    .reset_index()
)

# Results

In [288]:
# Load results.csv from the archive; the context manager closes the member
# handle as soon as pandas has finished reading it.
with zf.open('results.csv') as results_csv:
    game_by_game_results = pd.read_csv(results_csv)

In [289]:
# An '@' anywhere in the opponent text marks an away game. The vectorized
# string test replaces the row-wise apply; a missing Opponent stays missing
# in `homeaway` instead of raising inside the lambda.
game_by_game_results['homeaway'] = (
    game_by_game_results['Opponent']
    .str.contains('@', regex=False)
    .map({True: 'Away', False: 'Home'})
)

In [290]:
def extractData(data):
    """Split one `Result` entry into win/loss flag, overtime count and scores.

    The patterns look for a ``W`` or ``L`` character, digits followed by
    whitespace and a dash (team score), a dash followed by whitespace and
    digits (opponent score), and an optional ``(<n>OT)`` marker.

    Parameters
    ----------
    data : str
        Raw result text. A non-string value (e.g. NaN for a missing result)
        is treated as empty text, so nothing is extracted from it.

    Returns
    -------
    pd.Series
        ``[WinLoss, Overtime, team_score, opponent_score]`` with object dtype.
        ``WinLoss`` is ``'W'``/``'L'`` or None when neither letter is present.
        ``Overtime`` is the digit string from the OT marker, or the integer 0
        when there is no marker. Scores are digit strings, or None when the
        corresponding pattern does not match.
    """
    # Raw strings keep the regex escapes (\d, \s, \() away from Python's own
    # string-escape processing.
    regexs = {'WL': r"[WL]",
              'team': [r"\d+\s\-", r"\d+"],
              'opponent': [r"\-\s\d+", r"\d+"],
              'OT': [r"\(\d+OT\)", r"\d+"],
              }

    # re.search rejects non-strings; treat them as text with nothing to find.
    text = data if isinstance(data, str) else ""

    def _number_in(key, default=None):
        """Return the digits inside the first match of regexs[key], else default."""
        outer, inner = regexs[key]
        found = re.search(outer, text)
        if not found:
            return default
        return re.search(inner, found.group()).group()

    # Win/Loss -- None when the text carries neither letter (e.g. a tie).
    wl_match = re.search(regexs['WL'], text)
    WinLoss = wl_match.group() if wl_match else None

    # Overtime defaults to the integer 0 when no "(nOT)" marker is present.
    Overtime = _number_in('OT', default=0)

    # Team and opponent scores
    team_score = _number_in('team')
    opponent_score = _number_in('opponent')

    # Explicit object dtype so None is kept as None even when every extracted
    # field is empty.
    return pd.Series([WinLoss, Overtime, team_score, opponent_score], dtype=object)

In [291]:
# Parse every Result entry and spread the four extracted fields over new columns.
game_by_game_results[['WinLoss', 'Overtime', 'team_score', 'opponent_score']] = (
    game_by_game_results['Result'].apply(extractData)
)