# Logistic Regression


Adapted from Jonathan Stray's risk-ratios repo
https://github.com/jstray/risk-ratios

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
pd.set_option('max_columns', None)



Consider this recent [Miami Herald story](https://www.miamiherald.com/sports/nfl/article258302943.html) about the hiring of Black head coaches in the NFL:

> The chances of landing an NFL head coaching position were three times better for white candidates compared to their non-white counterparts — even after including the most recent hires and controlling for age, number of opportunities, previous coaching position and years of experience in the league.

How do we get to this conclusion? Let's look at the data, which can be downloaded from the Herald [here](https://docs.google.com/spreadsheets/d/1lVPgIu7OKg40trVMnVlzg5EvnAByxXJCWQqGL1qDBis/edit#gid=0). For this exercise we'll use a slightly reformmated version of the data, with one row per candidate per year (candidates often apply for multiple jobs in the same year).

In [2]:
url = "https://raw.githubusercontent.com/jstray/risk-ratios/main/FINAL_coaches_by_year.csv"
df = pd.read_csv(url)

df

Unnamed: 0,Unit_of_Analysis,Coach_ID,Name,Age,Hired,Year,Number_of_Interviews_That_Year,Previous_Job,Previous_Job_Coded,NFL_Playing_Experience,NFL_Coaching_Experience,Total_NFL_Experience,Black,White,Minority,OC,DC,HC
0,Aaron Glenn 2021,1,Aaron Glenn,48,0,2021,1,Other NFL Job,5,15,8,23,1,0,1,0,0,0
1,Aaron Glenn 2022,1,Aaron Glenn,49,0,2022,2,Defensive Coordinator,4,15,9,24,1,0,1,0,1,0
2,Adam Gase 2015,2,Adam Gase,36,0,2015,5,Offensive Coordinator,3,0,12,12,0,1,0,1,0,0
3,Adam Gase 2016,2,Adam Gase,37,1,2016,4,Offensive Coordinator,3,0,13,13,0,1,0,1,0,0
4,Adam Gase 2019,2,Adam Gase,40,1,2019,2,Head Coach Previous Season,1,0,16,16,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,Vic Fangio 2018,119,Vic Fangio,59,0,2018,1,Defensive Coordinator,4,0,31,31,0,1,0,0,1,0
204,Vic Fangio 2019,119,Vic Fangio,60,1,2019,1,Defensive Coordinator,4,0,32,32,0,1,0,0,1,0
205,Vic Fangio 2022,119,Vic Fangio,63,0,2022,1,Head Coach Previous Season,1,0,35,35,0,1,0,0,0,1
206,Winston Moss 2018,120,Winston Moss,52,0,2018,1,Other NFL Job,5,11,12,23,1,0,1,0,0,0


# Exercise 1:  Exploratory pivot tables

In [3]:
piv = df.pivot_table(index='Hired', columns='White', aggfunc='count', values='Coach_ID')
display(piv)

# pct_hired = piv.apply(lambda x: x/sum(x)).round(2)
# display(pct_hired)

White,0,1
Hired,Unnamed: 1_level_1,Unnamed: 2_level_1
0,61,91
1,11,45


*  What does the pivot table above tell us?

👉 _(your answer here)_



### Now you try!
Create a few more pivot tables to help explain the probability of getting hired, for each pivot table you make...write a sentence explaining your takeaway. You can copy and paste the code from above and switch around the "index" and "columns" fields as needed.

_note: Pivot tables help you quickly see the relationships between categorical variables. If you decide to investigate continuous variables, a histogram faceted by "Hired" may be more helpful than a pivot table._

In [4]:
df.pivot_table(index='Hired', columns='Black', aggfunc='count', values='Coach_ID')

Black,0,1
Hired,Unnamed: 1_level_1,Unnamed: 2_level_1
0,93,59
1,48,8


In [5]:
df.pivot_table(index='Hired', columns='Minority', aggfunc='count', values='Coach_ID')

Minority,0,1
Hired,Unnamed: 1_level_1,Unnamed: 2_level_1
0,92,60
1,45,11


In [6]:
# would probably need age buckets
df.pivot_table(index='Hired', columns='Age', aggfunc='count', values='Coach_ID')

Age,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,69,70
Hired,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1
0,1.0,1.0,2.0,1.0,3.0,3.0,5.0,4.0,7.0,5.0,8.0,8.0,5.0,6.0,6.0,5.0,4.0,6.0,6.0,8.0,9.0,8.0,5.0,3.0,3.0,4.0,5.0,6.0,1.0,1.0,3.0,3.0,2.0,1.0,,1.0,1.0,1.0,1.0
1,1.0,,,,1.0,1.0,4.0,5.0,4.0,1.0,,4.0,1.0,4.0,2.0,2.0,,3.0,1.0,1.0,3.0,4.0,1.0,3.0,,3.0,1.0,1.0,,2.0,,,1.0,,1.0,1.0,,,


In [7]:
# would also probably need buckets
df.pivot_table(index='Hired', columns='NFL_Coaching_Experience', aggfunc='count', values='Coach_ID')

NFL_Coaching_Experience,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,31,32,35
Hired,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
0,5.0,2.0,3.0,4.0,2.0,2.0,4.0,4.0,5.0,10.0,6.0,6.0,12.0,7.0,11.0,5.0,9.0,7.0,8.0,7.0,6.0,3.0,5.0,2.0,4.0,1.0,2.0,4.0,2.0,2.0,1.0,,1.0
1,2.0,1.0,,1.0,2.0,,1.0,2.0,2.0,2.0,2.0,2.0,7.0,6.0,2.0,3.0,2.0,2.0,4.0,1.0,2.0,4.0,,1.0,,1.0,2.0,1.0,,,,1.0,


In [8]:
pd.crosstab(df.Hired, df.Black, normalize='index')

Black,0,1
Hired,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.611842,0.388158
1,0.857143,0.142857


In [9]:
pd.crosstab(df.Hired, df.Minority, normalize='index')

Minority,0,1
Hired,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.605263,0.394737
1,0.803571,0.196429


# Exercise 2: Odds Ratio

Calculate the odds ratio of the chances of landing an NFL head coaching position white candidates compared to their non-white counterparts


In [10]:
df.pivot_table(index='Hired', columns='White', aggfunc='count', values='Coach_ID')

White,0,1
Hired,Unnamed: 1_level_1,Unnamed: 2_level_1
0,61,91
1,11,45


In [11]:
a = 45 # a: are white and did get hired
b = 11 # b: not white and did get hired
c = 91 # c: white and not hired
d = 61 # d: not white and not hired

odds_ratio_of_coaching_job_if_white = (a/b) / (c/d)

odds_ratio_of_coaching_job_if_white

# the odds of a white coach getting hired if you were interviewed are 2.7 times *higher* than if you were a non-white coach
# (don't say more likely, more likely is PROBABILITY)

2.742257742257742



**Bonus** (optional) 🤖

Calculate the risk ratio. How did the two compare?


In [12]:
risk_ratio_of_coaching_job_if_white = (a/(a+b)) / (c/(c+d))

risk_ratio_of_coaching_job_if_white

# the *chance* of a white coach being hired after being interviewed would be 1.34 times *more likely* than an
# interviewed, non-white coach
# risk ratios are PERCENTS

1.3422291993720565

# Exercise 3: Logistic Regression

Try to calculate an odds ratio of landing an NFL head coaching job, but controlling for various other factors using a logistic regression. You can reference this guide at investigate.ai for python syntax https://investigate.ai/regression/logistic-regression-quickstart/

In [13]:
import statsmodels.formula.api as smf

# Here is a basic logistic regression to get you started. 
# You'll need to replace Y And X. Remember, you can add more variables like this
# Y ~ X1 + X2 + X3

model = smf.logit("White ~ Hired", data=df)
results = model.fit()
results.summary()

Optimization terminated successfully.
         Current function value: 0.625583
         Iterations 5


0,1,2,3
Dep. Variable:,White,No. Observations:,208.0
Model:,Logit,Df Residuals:,206.0
Method:,MLE,Df Model:,1.0
Date:,"Tue, 19 Apr 2022",Pseudo R-squ.:,0.03015
Time:,15:46:25,Log-Likelihood:,-130.12
converged:,True,LL-Null:,-134.17
Covariance Type:,nonrobust,LLR p-value:,0.004448

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.4000,0.165,2.417,0.016,0.076,0.724
Hired,1.0088,0.375,2.691,0.007,0.274,1.743


Hmmm...that coefficient is a logit. How do we make sense of that? 

You can exponentiate in order to convert the coefficient into an odds ratio
https://investigate.ai/regression/logistic-regression-quickstart/#Converting-coefficient-to-odds-ratio


In [14]:
coefs = pd.DataFrame({
    'coef': results.params.values,
    'odds ratio': np.exp(results.params.values),
    'pvalue': results.pvalues,
    'name': results.params.index
})

coefs

# A white coach who was interviewed is nearly three times more likely to be hired than a non-white coach who was interviewed

Unnamed: 0,coef,odds ratio,pvalue,name
Intercept,0.399986,1.491803,0.015641,Intercept
Hired,1.008782,2.742258,0.007121,Hired


In [15]:
df['predicted_proba'] = results.predict()
# note that logistic regression gives you predicted probabilitites
# but there is no such thing as a residual here as the outcome is binary...
df.sort_values(by='predicted_proba', ascending=False)

Unnamed: 0,Unit_of_Analysis,Coach_ID,Name,Age,Hired,Year,Number_of_Interviews_That_Year,Previous_Job,Previous_Job_Coded,NFL_Playing_Experience,NFL_Coaching_Experience,Total_NFL_Experience,Black,White,Minority,OC,DC,HC,predicted_proba
207,Zac Taylor 2019,121,Zac Taylor,35,1,2019,3,Other NFL Job,5,0,6,6,0,1,0,0,0,0,0.803571
172,Robert Saleh 2021,102,Robert Saleh,42,1,2021,6,Defensive Coordinator,4,0,16,16,0,0,1,0,1,0,0.803571
98,John Fox 2015,61,John Fox,60,1,2015,1,Head Coach Previous Season,1,0,26,26,0,1,0,0,0,1,0.803571
136,Matt LaFleur 2019,80,Matt LaFleur,39,1,2019,1,Offensive Coordinator,3,0,10,10,0,1,0,1,0,0,0.803571
40,David Culley 2021,24,David Culley,65,1,2021,1,Other NFL Job,5,0,27,27,1,0,1,0,0,0,0.803571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,Jim Caldwell 2021,51,Jim Caldwell,66,0,2021,1,Former Head Coach,2,0,18,18,1,0,1,0,0,0,0.598684
85,Jim Caldwell 2022,51,Jim Caldwell,67,0,2022,2,Former Head Coach,2,0,18,18,1,0,1,0,0,0,0.598684
86,Jim Harbaugh 2022,52,Jim Harbaugh,58,0,2022,1,Former Head Coach,2,14,6,20,0,1,0,0,0,0,0.598684
87,Jim Schwartz 2015,53,Jim Schwartz,48,0,2015,1,Defensive Coordinator,4,0,22,22,0,1,0,0,1,0,0.598684


## Trying to find the expected values

In [17]:
# assume equal opportunity means random selection from this set
# total people hired
df.Hired.sum()

56

In [18]:
# total people
len(df)

208

In [19]:
# percent of people hired (hired/total) times the number of Black interviewees --> is the prediction
56/208 * df.Black.sum()

18.038461538461537

In [None]:
# 

# Discussion

What can we conclude based on our analysis so far?


What can't we conclude based on our analysis so far?


What questions does this analysis leave us with?
- reporting questions
- quant questions