In [1]:
import zipfile, os, re
import pandas as pd
import numpy as np 

In [4]:
# Location of the zipped CSV bundle, relative to the working directory.
_file = os.path.join('ncaa_football_scrapy','Data', 'Data.zip')
# The archive handle is intentionally left open: later cells read individual
# members from it with zf.open(...).
zf = zipfile.ZipFile(_file)
# Names of every member stored in the archive.
filenames = zf.namelist()

In [5]:
# Show the archive's member names.
filenames

['all-purposeyards.csv',
 'coaches.csv',
 'defense.csv',
 'fumbles.csv',
 'gamebygame_allpurposeyards.csv',
 'gamebygame_defense.csv',
 'gamebygame_fumbles.csv',
 'gamebygame_kicking.csv',
 'gamebygame_kickoffsandkoreturns.csv',
 'gamebygame_participation.csv',
 'gamebygame_passesdefended.csv',
 'gamebygame_passing.csv',
 'gamebygame_punting.csv',
 'gamebygame_puntreturns.csv',
 'gamebygame_receiving.csv',
 'gamebygame_redzone.csv',
 'gamebygame_sacks.csv',
 'gamebygame_scoring.csv',
 'gamebygame_tackles.csv',
 'gamebygame_totaloffense.csv',
 'gamebygame_turnovermargin.csv',
 'history.csv',
 'individualleaders.csv',
 'kicking.csv',
 'kickoffsandkoreturns.csv',
 'participation.csv',
 'passesdefended.csv',
 'passing.csv',
 'punting.csv',
 'puntreturns.csv',
 'receiving.csv',
 'redzone.csv',
 'results.csv',
 'roster.csv',
 'rushing.csv',
 'sacks.csv',
 'scoring.csv',
 'tackles.csv',
 'teamstats.csv',
 'totaloffense.csv',
 'turnovermargin.csv',
 'yearly_allpurposeyards.csv',
 'yearly_defen

# History

In [7]:
# Load the history table from the archive. pandas does not close a handle
# it is given, so the context manager releases the archive member once the
# CSV has been parsed.
with zf.open('history.csv') as _history_fh:
    history = pd.read_csv(_history_fh)

In [8]:
# The first four characters of the Year label are parsed as the integer
# season year. Vectorised string slicing replaces the row-by-row apply.
history['year_new'] = history['Year'].str[:4].astype(int)

In [9]:
# Narrow copy of the history table holding only the record columns.
history_trimmed = history.loc[
    :, ['Team', 'Wins', 'Losses', 'year_new', 'WL']
]

In [10]:
# Team label compared against history.Team.
team = 'Nebraska Cornhuskers, Huskers'
# Last season of the look-back window (inclusive).
target_year = 2016
# First season of the look-back window: three seasons before target_year.
target_year_min = target_year - 3

In [11]:
# Mean WL for the selected team over the inclusive window
# [target_year_min, target_year].
history.loc[
    (history['Team'] == team)
    & history['year_new'].between(target_year_min, target_year),
    ['WL'],
].mean()

WL    0.6345
dtype: float64

# Coach

In [12]:
# Load the coaches table from the archive. pandas does not close a handle
# it is given, so the context manager releases the archive member once the
# CSV has been parsed.
with zf.open('coaches.csv') as _coaches_fh:
    coaches = pd.read_csv(_coaches_fh)

In [13]:
# The first four characters of the Year label are parsed as the integer
# season year. Vectorised string slicing replaces the row-by-row apply.
coaches['year_new'] = coaches['Year'].str[:4].astype(int)

In [14]:
# Coach name compared against coaches.Name.
coach = 'Bob Diaco'
# Last season (inclusive) counted towards the coach's career record.
target_year = 2016

In [15]:
# Sum / count / mean of each record column for the selected coach, using
# every season up to and including target_year. The result has two-level
# (column, statistic) column labels.
coach_record = (
    coaches.loc[
        (coaches['Name'] == coach) & (coaches['year_new'] <= target_year),
        ['Name', 'Wins', 'Losses', 'WL', 'year_new'],
    ]
    .groupby('Name')
    .agg(['sum', 'count', 'mean'])
)

In [16]:
# Flatten the two-level (column, statistic) labels produced by .agg([...])
# into "column_statistic" names. The MultiIndex guard keeps the cell
# idempotent: without it, re-running on already-flat string labels would
# join their first two characters and yield names like "W_i".
if isinstance(coach_record.columns, pd.MultiIndex):
    coach_record.columns = [
        field + "_" + stat for field, stat in coach_record.columns.values
    ]

In [17]:
# Discard the aggregates that are not needed; what remains is Wins_sum,
# Losses_sum, WL_mean and year_new_count.
coach_record.drop(
    columns=[
        'Wins_count', 'Wins_mean',
        'Losses_count', 'Losses_mean',
        'WL_sum', 'WL_count',
        'year_new_sum', 'year_new_mean',
    ],
    inplace=True,
)

In [18]:
# Total games = summed wins + summed losses.
coach_record['total_games'] = (
    coach_record['Wins_sum'] + coach_record['Losses_sum']
)
# Turn the Name index created by the groupby back into an ordinary column.
coach_record.reset_index(inplace=True)

In [19]:
# Preview the flattened per-coach summary.
coach_record.head()

Unnamed: 0,Name,Wins_sum,Losses_sum,WL_mean,year_new_count,total_games
0,Bob Diaco,11.0,26.0,0.293,3,37.0


# Roster

In [20]:
# Load the roster table from the archive. pandas does not close a handle
# it is given, so the context manager releases the archive member once the
# CSV has been parsed.
with zf.open('roster.csv') as _roster_fh:
    roster = pd.read_csv(_roster_fh)

In [21]:
# The first four characters of the Year label are parsed as the integer
# season year. Vectorised string slicing replaces the row-by-row apply.
roster['year_new'] = roster['Year'].str[:4].astype(int)

In [22]:
def yrRecode(data):
    """Translate a class-year label into its ordinal rank.

    'Fr' -> 0, 'So' -> 1, 'Jr' -> 2, 'Sr' -> 3. Any other label raises
    KeyError.
    """
    ranks = dict(Fr=0, So=1, Jr=2, Sr=3)
    return ranks[data]

In [23]:
# Numeric class year; missing Yr values are skipped and stay missing.
roster['Yr_N'] = roster['Yr'].map(yrRecode, na_action='ignore')

In [24]:
# Number of non-null GS entries per (Team, year_new, Yr) group.
roster_by_year = (
    roster
    .groupby(['Team', 'year_new', 'Yr'])[['GS']]
    .count()
    .reset_index()
)

In [25]:
# Mean GS per (Team, year_new, Yr) group.
roster_games_started = (
    roster
    .groupby(['Team', 'year_new', 'Yr'])[['GS']]
    .mean()
    .reset_index()
)

In [26]:
# Mean GP per (Team, year_new, Yr) group.
roster_games_played = (
    roster
    .groupby(['Team', 'year_new', 'Yr'])[['GP']]
    .mean()
    .reset_index()
)

In [27]:
# Attach the mean-GS columns to the mean-GP table; both are keyed on
# (Team, year_new, Yr) and every row of the GP table is kept.
roster_final = roster_games_played.merge(
    roster_games_started,
    on=['Team', 'year_new', 'Yr'],
    how='left',
)

In [28]:
# Pivot the Yr level into columns so each (Team, year_new) pair is one row
# with a value per (statistic, Yr) combination.
roster_final2 = (
    roster_final
    .set_index(['Team', 'year_new', 'Yr'])
    .unstack('Yr')
    .reset_index()
)

In [29]:
# Flatten the two-level (statistic, Yr) labels left by unstack() into
# "statistic_Yr" names; the key columns Team and year_new keep their name.
# The MultiIndex guard keeps the cell idempotent: without it, re-running on
# already-flat string labels would index into the strings themselves and
# produce names like "G_P".
if isinstance(roster_final2.columns, pd.MultiIndex):
    columns = [
        top if top in ['Team', 'year_new'] else top + "_" + sub
        for top, sub in roster_final2.columns.values
    ]
else:
    columns = list(roster_final2.columns)
roster_final2.columns = columns

# Results

In [30]:
# Load the results table from the archive. pandas does not close a handle
# it is given, so the context manager releases the archive member once the
# CSV has been parsed.
with zf.open('results.csv') as _results_fh:
    game_by_game_results = pd.read_csv(_results_fh)

In [31]:
# Label a game 'Away' when the Opponent text contains '@', else 'Home'.
# NOTE(review): any '@' counts, so "X @ Y" style entries are also labelled
# 'Away' - confirm that is intended.
game_by_game_results['homeaway'] = game_by_game_results['Opponent'].map(
    lambda opp: 'Away' if '@' in opp else 'Home'
)

In [32]:
def extractData(data):
    """Split a game result string such as ``"W 35 - 33 (2OT)"`` into fields.

    Parameters
    ----------
    data : str
        Result text. Non-string input (e.g. NaN for a missing result) is
        accepted and treated as containing no information.

    Returns
    -------
    pandas.Series
        Four object-dtype entries, in order:

        * WinLoss: the first ``W`` or ``L`` found, or ``None`` if absent.
        * Overtime: the number inside ``(<n>OT)`` as an ``int``, or ``0``
          when there is no such suffix.
        * team_score: the digits before ``" -"`` as a string, or ``None``.
        * opponent_score: the digits after ``"- "`` as a string, or ``None``.
    """
    # Raw strings: "\d", "\s", "\-" and "\(" are not valid escapes in a
    # normal string literal.
    wl_pattern = r"[WL]"
    overtime_pattern = r"\((\d+)OT\)"
    team_pattern = r"(\d+)\s\-"
    opponent_pattern = r"\-\s(\d+)"

    # dtype=object keeps None as None instead of letting pandas coerce an
    # all-missing row to float NaN.
    if not isinstance(data, str):
        return pd.Series([None, 0, None, None], dtype=object)

    # Win/Loss: missing flag yields None rather than an AttributeError.
    wl_match = re.search(wl_pattern, data)
    WinLoss = wl_match.group() if wl_match else None

    # Overtime: always an int so the column is not a mix of int 0 and
    # strings like '2'.
    overtime_match = re.search(overtime_pattern, data)
    Overtime = int(overtime_match.group(1)) if overtime_match else 0

    # Team score (kept as the matched digit string).
    team_match = re.search(team_pattern, data)
    team_score = team_match.group(1) if team_match else None

    # Opponent score (kept as the matched digit string).
    opponent_match = re.search(opponent_pattern, data)
    opponent_score = opponent_match.group(1) if opponent_match else None

    return pd.Series(
        [WinLoss, Overtime, team_score, opponent_score], dtype=object
    )

In [33]:
# Parse every Result string and store the four extracted fields as columns.
game_by_game_results[
    ['WinLoss', 'Overtime', 'team_score', 'opponent_score']
] = game_by_game_results['Result'].apply(extractData)

In [46]:
def opponent(data):
    """Extract the opponent name from an Opponent field.

    * ``"@ Name"`` (leading ``@``) returns ``"Name"``.
    * ``"Name @ Other"`` returns ``"Name"``, the text before the ``@``.
    * Text without ``@`` is returned unchanged.
    * Non-string input (e.g. NaN) is returned unchanged.

    Splitting on the ``@`` itself, rather than matching a word-character
    regex in front of it, also handles names that end in punctuation: for
    ``"Miami (FL) @ Site"`` the previous regex did not match, so the ``@``
    was simply deleted and ``"Miami (FL)  Site"`` came back.
    """
    if not isinstance(data, str) or '@' not in data:
        return data

    text = data.strip()
    # A leading '@' is only a marker in front of the name; drop it.
    if text.startswith('@'):
        text = text[1:]
    # Anything after a remaining '@' is not part of the opponent name.
    return text.partition('@')[0].strip()

In [47]:
# Cleaned opponent name derived from the raw Opponent text.
game_by_game_results['opp2'] = game_by_game_results['Opponent'].map(opponent)

In [49]:
# Preview the results table with the derived columns.
game_by_game_results.head()

Unnamed: 0,Date,Opponent,Result,Team,homeaway,WinLoss,Overtime,team_score,opponent_score,opp2
0,08/29/2013,@ UCF,L 7 - 38,Akron Zips,Away,L,0,7,38,UCF
1,09/07/2013,James Madison,W 35 - 33,Akron Zips,Home,W,0,35,33,James Madison
2,09/14/2013,@ Michigan,L 24 - 28,Akron Zips,Away,L,0,24,28,Michigan
3,09/21/2013,Louisiana,L 30 - 35,Akron Zips,Home,L,0,30,35,Louisiana
4,09/28/2013,@ Bowling Green,L 14 - 31,Akron Zips,Away,L,0,14,31,Bowling Green
