In [17]:
import os
import pandas as pd
import requests, scrapy

In [18]:
# Load the link list stored at Links/gamebygamelinks.csv into a DataFrame.
data = pd.read_csv(
    os.path.join("Links", "gamebygamelinks.csv")
)

In [20]:
# Convert the frame into a list of one-dict-per-row records.
# The orient must be spelled 'records': the abbreviated 'record' was only ever
# accepted as a deprecated shorthand and is rejected by current pandas.
data = data.to_dict(orient='records')

In [21]:
# Display the converted records to check the available keys.
data

[{'link': 'http://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=11520&org_id=721&stats_player_seq=-100',
  'team': 'Air Force Falcons',
  'year': '2013-14'}]

In [64]:
# Fetch the page behind every record in `data`, take its third HTML table,
# keep only the columns before the first auto-named ("Unnamed: N") column,
# drop the last three rows, and store the cleaned frame in `test`.
test = {}
for x in data:
    # The records displayed above use lowercase keys ('link', 'team', 'year'),
    # so the original x['Link'] lookup raises KeyError on them. Accept either
    # spelling of the link key.
    link = x['Link'] if 'Link' in x else x['link']
    # NOTE(review): no 'Stat' field is visible in the displayed records; when it
    # is missing this falls back to (team, year) as the key - confirm the
    # intended key before relying on it.
    key = x['Stat'] if 'Stat' in x else (x['team'], x['year'])
    r = requests.get(link)
    # Fail on an HTTP error instead of trying to parse an error page.
    r.raise_for_status()
    tables = pd.read_html(r.content)
    table = tables[2]
    # Position of the first "Unnamed: N" column; everything from there on is dropped.
    stop_index = [1 if 'Unnamed: ' in column else 0 for column in table.columns].index(1)
    # Strip spaces, slashes and periods from the kept column names.
    columnNamesFixed = {col: col.replace(" ", "").replace("/", "").replace(".", "") for col in table.columns[:stop_index]}
    final_table = table.rename(columns=columnNamesFixed)[:-3].iloc[:, :stop_index]
    test[key] = final_table

In [19]:
def table_cleaner(html, target_table, trim_n_rows = 0):
    """Read HTML and return one cleaned table from it.

       html : HTML string (anything accepted by pd.read_html)
       target_table : Target table number (index into the list of parsed tables)
       trim_n_rows : Trim X number of rows from the end of the table
                     (enter as positive number); 0 keeps every row

       Returns a DataFrame holding only the columns that precede the first
       auto-named ("Unnamed: N") column, with spaces, slashes and periods
       stripped from the column names. If the table has no "Unnamed: N"
       column, every column is kept.
    """
    table = pd.read_html(html)[target_table]
    # Position of the first "Unnamed: N" column; fall back to the column count
    # so a table without such a column is kept whole instead of raising an
    # opaque ValueError from list.index.
    stop_index = next(
        (pos for pos, column in enumerate(table.columns) if 'Unnamed: ' in column),
        len(table.columns),
    )
    kept = table.iloc[:, :stop_index]
    kept = kept.rename(
        columns={col: col.replace(" ", "").replace("/", "").replace(".", "") for col in kept.columns}
    )
    # table[:-0] is table[:0], i.e. an EMPTY frame, so only slice when there is
    # actually something to trim.
    if trim_n_rows:
        kept = kept[:-trim_n_rows]
    return kept
    

In [22]:
# Look at the first record on its own.
data[0]

{'link': 'http://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=11520&org_id=721&stats_player_seq=-100',
 'team': 'Air Force Falcons',
 'year': '2013-14'}

In [24]:
# Hard-coded sample page; its parsed tables are what yearlystats is run on below.
url = "http://stats.ncaa.org/player/index?id=11520&org_id=721&stats_player_seq=-100&year_stat_category_id=10392"
r = requests.get(url)
# Stop here on an HTTP error rather than handing an error page to pd.read_html.
r.raise_for_status()

In [25]:
# Parse the tables found in the response body; later cells index into this list.
tables = pd.read_html(r.content)

In [198]:
def yearlystats(table):
    """Clean up the yearly game-to-game stats table.

        table : pandas DF you want to clean. Its first row is discarded and its
                second row is used as the header; the header must yield 'Year'
                and 'Team' columns.

        Returns a new DataFrame (the input is left untouched) in which:
          * column names have spaces, slashes and periods removed,
          * 'Year' is forward filled,
          * 'OffenseDefense' holds the original 'Team' value, with rows equal
            to the first row's team relabelled 'Offense',
          * 'Team' holds the first row's team on every row.
    """
    # Drop unneeded header row; copy so nothing below can write into `table`
    tmp = table.iloc[1:].copy()
    # Create a header row removing spaces, /'s, and periods
    tmp.columns = [x.replace(" ", "").replace("/", "").replace(".", "") for x in tmp.iloc[0]]
    # Drop the row used to create header row
    tmp = tmp.drop(tmp.index[0])
    # Forward fill the year for analysis later. Assign the result back: a
    # chained tmp['Year'].fillna(..., inplace=True) acts on a temporary Series
    # and is not guaranteed to reach tmp.
    tmp['Year'] = tmp['Year'].ffill()
    # Figure out which team we are working with (second column of the first row)
    curr_team = tmp.iloc[0, 1]
    # Create the offense/defense variable from the original Team values,
    # labelling rows that carry the team's own name as 'Offense'
    tmp['OffenseDefense'] = tmp['Team'].mask(tmp['Team'] == curr_team, 'Offense')
    # Overwrite Team so every row carries the team name
    tmp['Team'] = curr_team
    return tmp

In [199]:
# Clean the third parsed table and display the result.
yearlystats(tables[2])

Unnamed: 0,Year,Team,GP,G,PassAttempts,Completions,Interceptions,PassYards,PassTDs,PassEff,CompletionsPerGame,YdsPerCompletion,CompletionPct,PassYardsG,OffenseDefense
2,2013-14,Air Force,12,12,174,93,4,1259,8,124.8,7.75,13.538,0.534,104.92,Offense
3,2013-14,Air Force,0,12,326,237,4,2876,26,170.67,19.75,12.135,0.727,239.67,Defensive
4,2014-15,Air Force,13,13,206,117,4,1893,15,154.13,9.0,16.179,0.568,145.62,Offense
5,2014-15,Air Force,0,13,434,239,10,3369,20,130.87,18.38,14.096,0.551,259.15,Defensive
6,2015-16,Air Force,14,14,170,87,12,1847,15,157.44,6.21,21.23,0.512,131.93,Offense
7,2015-16,Air Force,0,14,387,205,11,2942,23,130.76,14.64,14.351,0.53,210.14,Defensive
8,2016-17,Air Force,13,13,150,68,8,1752,14,163.58,5.23,25.765,0.453,134.77,Offense
9,2016-17,Air Force,0,13,383,201,15,3259,23,1778.72,201.0,232.815,6.467,3259.0,Defensive
10,2017-18,Air Force,12,12,143,69,6,1321,10,140.53,5.75,19.145,0.483,110.08,Offense
11,2017-18,Air Force,0,12,243,135,5,2053,16,144.14,11.25,15.207,0.556,171.08,Defensive


In [196]:
def gamestats(table, curr_team):
    """Clean up game stats table.

        table : pandas DF you want to clean. Its first row is discarded and its
                second row is used as the header; the header must yield 'Date',
                'Opponent' and 'Result' columns.
        curr_team : team name you want to add

        Returns a new DataFrame (the input is left untouched) in which:
          * column names have spaces, slashes and periods removed,
          * 'Team' is set to curr_team on every row,
          * 'OffenseDefense' is 'Defense' on rows whose opponent was
            "Defensive Totals" and 'Offense' on all other rows,
          * 'Date', 'Opponent' and 'Result' are forward filled, so each
            "Defensive Totals" row inherits the values of the row above it.
    """
    # Drop unneeded header; copy so nothing below can write into `table`
    tmp = table.iloc[1:].copy()
    # Fix the column names by reading line 0
    tmp.columns = [x.replace(" ", "").replace("/", "").replace(".", "") for x in tmp.iloc[0]]
    # Drop row zero which held the header row
    tmp = tmp.drop(tmp.index[0])
    # Forward fill the dates for defensive split later. Results are assigned
    # back because a chained tmp[col].fillna(..., inplace=True) acts on a
    # temporary Series and is not guaranteed to reach tmp.
    tmp['Date'] = tmp['Date'].ffill()
    # Add in the team
    tmp['Team'] = curr_team
    # Rows labelled "Defensive Totals" are defense; everything else is offense
    is_defense = tmp['Opponent'] == "Defensive Totals"
    tmp['OffenseDefense'] = "Offense"
    tmp.loc[is_defense, 'OffenseDefense'] = "Defense"
    # Blank out the "Defensive Totals" label in the opponent variable, then
    # forward fill the real opponent in for analysis later
    tmp['Opponent'] = tmp['Opponent'].mask(is_defense).ffill()
    # Forward fill the results in for analysis later
    tmp['Result'] = tmp['Result'].ffill()
    return tmp
    