In [85]:
import pandas as pd
import numpy as np
import random

<h3>Load in historical NFL scores from www.pro-football-reference.com</h3>

In [31]:
DATA_URL = 'http://www.pro-football-reference.com/years/{year}/games.htm'

def load_score_data(min_year, max_year):
    """Load NFL game results for seasons ``min_year`` up to ``max_year - 1``.

    One table is read per season from ``DATA_URL`` and tagged with a ``year``
    column; the per-season tables are then stacked into a single frame.
    Seasons that cannot be parsed are reported and skipped.

    Parameters
    ----------
    min_year : int
        First season to load (inclusive).
    max_year : int
        End of the season range (exclusive, as with ``range``).

    Returns
    -------
    pandas.DataFrame
        The stacked rows whose ``PtsW`` value is non-null and is not the
        literal string ``'PtsW'`` (i.e. repeated header rows are dropped).

    Raises
    ------
    ValueError
        If no season in the requested range could be loaded.
    """
    season_frames = []
    for year in range(min_year, max_year):
        try:
            season = pd.read_html(DATA_URL.format(year=year))[0]
        except (IndexError, ValueError):
            # read_html raises ValueError ("No tables found") instead of
            # returning an empty list, so IndexError alone never fires.
            print('error loading data for year {}'.format(year))
            continue
        season['year'] = year
        season_frames.append(season)

    if not season_frames:
        # pd.concat([]) would otherwise fail with "No objects to concatenate".
        raise ValueError(
            'no score data could be loaded for years in range({}, {})'.format(
                min_year, max_year))

    scores = pd.concat(season_frames)
    # drop extra header rows
    return scores[(scores['PtsW'] != 'PtsW') & pd.notnull(scores['PtsW'])]

# Seasons 2000 through 2014: load_score_data iterates
# range(min_year, max_year), so the upper bound 2015 is exclusive.
df = load_score_data(2000, 2015)

In [32]:
# Preview the first rows of the loaded score table.
df.head()

Unnamed: 0,Week,Day,Date,Unnamed: 3,Winner/tie,Unnamed: 5,Loser/tie,PtsW,PtsL,YdsW,TOW,YdsL,TOL,year
0,1,Sun,September 3,boxscore,Washington Redskins,,Carolina Panthers,20,17,396,0,236,1,2000
1,1,Sun,September 3,boxscore,Tampa Bay Buccaneers,@,New England Patriots,21,16,296,1,278,1,2000
2,1,Sun,September 3,boxscore,Atlanta Falcons,,San Francisco 49ers,36,28,359,1,339,1,2000
3,1,Sun,September 3,boxscore,Miami Dolphins,,Seattle Seahawks,23,0,308,1,143,6,2000
4,1,Sun,September 3,boxscore,Oakland Raiders,,San Diego Chargers,9,6,233,0,255,4,2000


<h3>Calculate square frequencies and run simulation</h3>

In [86]:
def score_frequencies(frame):
    """Return the relative frequency of each (PtsW, PtsL) score pair.

    Every row of ``frame`` contributes one ``(PtsW, PtsL)`` tuple; the
    result is a Series indexed by those pairs, most common first, whose
    values are each pair's share of all rows.
    """
    outcomes = [(winner_pts, loser_pts)
                for winner_pts, loser_pts in zip(frame['PtsW'], frame['PtsL'])]
    outcome_series = pd.Series(outcomes)
    n_games = len(outcome_series)
    counts = outcome_series.value_counts()
    return counts / n_games

def sb_square_frequencies(frame):
    """Return how often each Super Bowl squares digit pair occurs in ``frame``.

    Each game is reduced to the last digit of ``PtsW`` and the last digit of
    ``PtsL``. Because it is unknown whether the winning team sits on the row
    or the column axis of the grid, the two digits are swapped with
    probability 0.5 before counting.

    The global ``random`` generator is re-seeded on every call, so the swaps
    (and therefore the returned frequencies) are repeatable.
    """
    random.seed(242016)

    def last_digit(points):
        return int(points) % 10

    winner_digits = frame['PtsW'].map(last_digit)
    loser_digits = frame['PtsL'].map(last_digit)

    squares = []
    for w_digit, l_digit in zip(winner_digits, loser_digits):
        # exactly one random draw per game, taken in row order
        if random.random() < 0.5:
            squares.append((l_digit, w_digit))
        else:
            squares.append((w_digit, l_digit))

    square_series = pd.Series(squares)
    return square_series.value_counts() / len(square_series)

In [100]:
# Frequency of each squares digit pair across the loaded games.
# NOTE(review): `freqs` was not defined in any visible cell, so this cell
# failed on a fresh kernel. It is assumed to be the squares frequencies of
# `df` - confirm against the original analysis.
# sb_square_frequencies re-seeds the global `random` module, so the draws
# below are repeatable from run to run.
freqs = sb_square_frequencies(df)

N_TRIALS = 10000
squares = list(freqs.index)  # loop-invariant: materialise the index once

# Each trial picks a first square at random, then compares two ways of adding
# a second square: one sharing a row or column with the first, and one
# sharing neither. The combined frequency of both squares is recorded.
same_results = []
diff_results = []
for _ in range(N_TRIALS):
    first_choice = random.choice(squares)
    same_row_col_set = [pair for pair in squares if pair != first_choice
                        and (pair[0] == first_choice[0] or pair[1] == first_choice[1])]
    diff_row_col_set = [pair for pair in squares if pair[0] != first_choice[0]
                        and pair[1] != first_choice[1]]
    same_choice = random.choice(same_row_col_set)
    diff_choice = random.choice(diff_row_col_set)
    same_win_pct = freqs[first_choice] + freqs[same_choice]
    diff_win_pct = freqs[first_choice] + freqs[diff_choice]
    same_results.append(same_win_pct)
    diff_results.append(diff_win_pct)

In [101]:
# Mean combined frequency of the two squares under each strategy:
# (second square shares a row/column, second square shares neither).
np.mean(same_results), np.mean(diff_results)

(0.019865404863374278, 0.020139984958636246)