"""Automatic Simpson's Paradox Detector"""
from itertools import permutations
from IPython.display import display
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.preprocessing import KBinsDiscretizer
import utilities
class SimpsonsParadox:
"""A class to automatically detect Simpson's Paradox in a dataset"""
def __init__(self, df, dv, model='', ignore_columns=None, bin_columns=None,
bin_method='quantile', min_corr=0.01, max_pvalue=0.05,
min_coeff=0.00001, standardize=True, output_plots=False,
target_category=None, weighting=True, quiet=False):
"""
Attributes
----------
df: dataset to check for Simpson's pairs in
dv: name of the dataset's dependent variable
model: type of regression models to build
ignore_columns: list of variables to ignore
bin_columns: list of variables to bin
bin_method: method to bin large variables
min_corr: minimum correlation to check for Simpson's Paradox
max_pvalue: maximum p-value to filter out Simpson's pairs
min_coeff: minimum coefficient to filter out Simpson's pairs
standardize: option to z-score standardize continuous variables
output_plots: option to display plots and summary statistics
target_category: target category for 1-versus-all regression
weighting: option to include or exclude weak Simpson's pairs
quiet: option to silence all warnings and verbosity
Methods
-------
encode_variables():
Encodes string and Boolean columns to integers.
standardize_variables(columns_to_scale=[]):
Standardizes variables with high cardinality.
get_correlations():
Computes correlations for numeric variables.
bin_variable(variable=''):
Discretizes a large variable into 5 bins.
linear_regression(dv='', iv=''):
Builds and outputs linear regression model.
logistic_regression(dv='', iv=''):
Builds and outputs logistic regression model.
build_model(dv='', iv=''):
Builds and outputs regression model results.
create_subgroups(iv='', cv=''):
Disaggregates dataset and builds models.
plot_trendline(cv='', iv='', predictions=[]):
Plots trendlines using linear regression.
get_simpsons_pairs():
Checks every pair for Simpson's Paradox.
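        Example
        -------
        A minimal usage sketch; 'outcome' is a hypothetical dependent
        variable in a pandas DataFrame df:
            sp = SimpsonsParadox(df=df, dv='outcome', model='logistic')
            pairs = sp.get_simpsons_pairs()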
"""
self.df = df
self.dv = dv
self.model = model
self.ignore_columns = ignore_columns
self.bin_columns = bin_columns
self.output_plots = output_plots
self.standardize = standardize
self.bin_method = bin_method
self.max_pvalue = max_pvalue
self.min_coeff = min_coeff
self.min_corr = min_corr
self.target_category = target_category
self.weighting = weighting
self.quiet = quiet
def encode_variables(self):
"""Encodes string or Boolean columns to integers.
Returns:
df: analysis dataset with encoded columns
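        Example (illustrative; 'flag' is a hypothetical Boolean column
        holding [True, False, True]):
            df = sp.encode_variables()
            df['flag'].tolist()  # -> [1, 0, 1]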
"""
columns_to_encode = self.df.select_dtypes(include=[
'bool', 'object']).columns
self.df[columns_to_encode] = pd.DataFrame(data={
column: self.df[column].astype('category').cat.codes
for column in columns_to_encode}, index=self.df.index)
return self.df
def standardize_variables(self, columns_to_scale):
"""Standardizes variables with cardinality greater than 10.
Assumes all the variables in the dataset are numeric.
Args:
columns_to_scale: dataset's independent variables
Returns:
df: analysis dataset with standardized columns
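        Example (illustrative; assumes a numeric column 'age' with
        more than 10 unique values):
            df = sp.standardize_variables(['age'])
            # adds 'age_scaled' with mean 0 and unit variance;
            # the original 'age' column is kept for plotting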
"""
columns_to_scale = [column for column in columns_to_scale
if self.df[column].nunique() > 10]
def scale(values):
return (values - np.mean(values)) / np.std(values)
        if columns_to_scale:
scaled_columns = [column+'_scaled' for column in columns_to_scale]
self.df[scaled_columns] = self.df[columns_to_scale].apply(scale)
return self.df
def get_correlations(self):
"""Computes correlations for numeric variables.
Returns:
corr_matrix: correlation matrix
"""
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_variables = self.df.select_dtypes(include=numerics).columns
if len(numeric_variables) > 0:
corr_matrix = self.df[numeric_variables].corr(method='pearson')
else:
corr_matrix = pd.DataFrame([])
return corr_matrix
def bin_variable(self, variable):
"""Function to discretize a variable into 5 bins.
The choices are: 'quantile', 'kmeans', 'uniform'
Args:
variable: variable to bin
Returns:
binned variable
"""
        return KBinsDiscretizer(n_bins=5, encode='ordinal',
                                strategy=self.bin_method).fit_transform(
                                    variable)
def linear_regression(self, df, iv):
"""Builds linear regression model.
Args:
df: dataset or subset of dataset
iv: desired independent variable
Returns:
predictions: model's predictions
output_df: model's summary table
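        Example (a sketch; 'age' is a hypothetical numeric IV):
            predictions, output_df = sp.linear_regression(sp.df, 'age')
            # output_df holds the fitted coefficient and p-value for
            # 'age'; the intercept row is dropped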
"""
# Fit linear regression
X, y = df[iv], df[self.dv]
x_ = sm.add_constant(X)
model = sm.OLS(y, x_).fit()
# Get coefficients and their p-values
coefficients, pvalues = model.params, model.pvalues
# Get model predictions
predictions = model.predict()
# Build model output table
output_df = pd.DataFrame(data={
'coefficient': coefficients, 'pvalue': pvalues
}).reset_index()
output_df = output_df.rename(columns={'index': 'variable'})
output_df = output_df[output_df.variable != 'const']
return predictions, output_df
def logistic_regression(self, df, iv):
"""Builds logistic regression model.
Args:
df: dataset or subset of dataset
iv: desired independent variable
Returns:
predictions: model's predictions
output_df: model's summary table
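        Example (a sketch; assumes a binary DV and a numeric IV 'age'):
            predictions, output_df = sp.logistic_regression(sp.df, 'age')
            # output_df adds the odds ratio for 'age' and its 95%
            # confidence interval (the '-95%' and '+95%' columns)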
"""
# Fit logistic regression
X, y = df[iv], df[self.dv]
x_ = sm.add_constant(X)
model = sm.Logit(y, x_).fit(method='bfgs', disp=False)
# Get coefficients and their p-values
coefficients, pvalues = model.params, model.pvalues
# Get odds and their confidence intervals
odds, odds_conf = np.exp(coefficients), np.exp(model.conf_int())
lower_conf = odds_conf[odds_conf.index != 'const'][0].values[0]
upper_conf = odds_conf[odds_conf.index != 'const'][1].values[0]
# Get model predictions
predictions = model.predict()
# Build model output table
output_df = pd.DataFrame(data={
'coefficient': coefficients, 'pvalue': pvalues,
'-95%': lower_conf, 'odds': odds, '+95%': upper_conf
}).reset_index()
output_df = output_df.rename(columns={'index': 'variable'})
output_df = output_df[output_df.variable != 'const']
return predictions, output_df
def build_model(self, df, iv):
"""Builds single-variable regression model and stores model output.
        If the user hasn't specified a model type: builds logistic
        regression if the DV is binary, one-versus-all logistic
        regression if the DV is discrete, and linear regression otherwise.
Uses 'bfgs' for optimization to prevent singular matrix error.
Args:
df: dataset or subset of dataset
iv: desired independent variable
Returns:
predictions: model's predictions
output_df: model's summary table
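        Example (a sketch; 'churn' is a hypothetical 0/1 DV):
            sp = SimpsonsParadox(df=df, dv='churn')
            predictions, output_df = sp.build_model(sp.df, 'age')
            # dispatches to logistic_regression because the DV is binary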
"""
if self.model == 'logistic' and df[self.dv].nunique() == 2:
# Build logistic model
predictions, output_df = self.logistic_regression(df, iv)
        elif self.model == 'logistic' and self.target_category is not None:
            # Create one-versus-all DV on a copy if the target category
            # is defined, so the caller's column is not overwritten
            df = df.copy()
            df[self.dv] = np.where(
                df[self.dv] == self.target_category, 1, 0)
            # Build logistic model
            predictions, output_df = self.logistic_regression(df, iv)
        elif self.model == 'logistic' and 2 < df[self.dv].nunique() <= 10:
            raise ValueError('You have a non-binary DV. Pass a value to '
                             'the target_category argument, or re-bin '
                             'your DV before calling this function.')
elif self.model == 'linear':
# Build linear model
predictions, output_df = self.linear_regression(df, iv)
else:
            # If user specified a target category
            if self.target_category is not None:
                # Create one-versus-all DV on a copy, so the caller's
                # column is not overwritten
                df = df.copy()
                df[self.dv] = np.where(
                    df[self.dv] == self.target_category, 1, 0)
                # Build logistic model
                predictions, output_df = self.logistic_regression(df, iv)
            elif (self.target_category is None and
                    2 < df[self.dv].nunique() <= 10):
                raise ValueError('You have a non-binary DV. Pass a value '
                                 'to the target_category argument, or '
                                 're-bin your DV before calling this '
                                 'function.')
elif df[self.dv].nunique() == 2:
# Build logistic model
predictions, output_df = self.logistic_regression(df, iv)
else:
# Build linear model
predictions, output_df = self.linear_regression(df, iv)
return predictions, output_df
def create_subgroups(self, iv, cv):
"""Creates summary statistics table for subgroup models.
Builds regression models using the scaled IV,
and stores model summaries for every subgroup.
Args:
iv: desired independent variable
cv: desired conditioning variable
Returns:
reg_table: model's summary table for all subgroups
coef_sign_sum: sum of coefficient signs of all subgroups
wt_coef_sign_sum: weighted sum of all coefficient signs
df: analysis dataset with model predictions column
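        Example (a sketch; conditions a scaled IV on a hypothetical
        'region' variable):
            reg_table, signs, wt_signs, df = sp.create_subgroups(
                'age_scaled', 'region')
            # reg_table holds one row per non-constant subgroup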
"""
coef_sign_sum, wt_coef_sign_sum = 0, 0
bin_sizes, bin_names = [], []
reg_table = pd.DataFrame([])
# For each subgroup...
for subgroup in self.df[cv].unique():
subgroup_df = self.df[self.df[cv] == subgroup]
# If the subgroup does not have constant IV and DV
if (subgroup_df[iv].nunique() > 1 and
subgroup_df[self.dv].nunique() > 1):
# Build a regression model for the subgroup
predictions, model_df = self.build_model(subgroup_df, iv)
# Add model output to the table
                reg_table = pd.concat([reg_table, model_df])
# Add IV coefficient sign
coef_sign_sum += np.sign(model_df.loc[(
model_df['variable'] == iv), :]['coefficient'].values[0])
# Add IV coefficient sign, weighted by subgroup size
wt_coef_sign_sum += np.multiply(
np.sign(
model_df.loc[(model_df['variable'] == iv),
:]['coefficient'].values[0]),
subgroup_df.shape[0])
# Store subgroup title and size
bin_names.append('bin '+str(subgroup))
bin_sizes.append(subgroup_df.shape[0])
# Store DV predictions from this IV for current subgroup
self.df.loc[(
self.df[cv] == subgroup), iv+'_predictions'] = predictions
# Add subgroup titles and sizes
reg_table['bin_size'], reg_table['variable'] = bin_sizes, bin_names
return reg_table, coef_sign_sum, wt_coef_sign_sum, self.df
def plot_trendline(self, cv, iv, predictions):
"""Plots trend lines for scaled IV and predicted DV on an
unscaled IV x-axis.
        If the IV has fewer than 10 categories, creates x-axis ticks
based on the number of categories.
If the DV is binary, sets the y-axis scale at (0, 1).
Args:
cv: conditioning variable for disaggregated trend lines
iv: unscaled independent variable to plot on the x-axis
predictions: model-generated predictions using scaled IV
"""
if not cv:
# Plot trend line for aggregate data
sns.regplot(x=iv, y=predictions, data=self.df, logistic=False,
ci=None, scatter_kws={'s': 0})
# Set binary y-axis for binary DV
if self.df[self.dv].nunique() == 2:
plt.ylim(0, 1)
# Set x-axis ticks for discrete IV
if self.df[iv].nunique() < 10:
plt.locator_params(axis='x', nbins=self.df[iv].nunique()-1)
# Set plot labels
plt.title(label="Predicting {} by {}".format(self.dv, iv))
plt.ylabel(self.dv)
else:
# Sets color palette based on number of subgroups
colors = sns.color_palette(palette='bright',
n_colors=self.df[cv].nunique())
_, ax = plt.subplots()
# For each subgroup...
for i, subgroup in enumerate(self.df[cv].unique()):
subgroup_df = self.df[self.df[cv] == subgroup]
# If the IV and DV values aren't constant
if (subgroup_df[iv].nunique() > 1 and
subgroup_df[self.dv].nunique() > 1):
# Plot trend line for subgroup
sns.regplot(x=iv, y=predictions, data=subgroup_df,
logistic=False, ci=None, scatter_kws={'s': 0},
ax=ax, color=colors[i],
line_kws={
'label': "bin {}".format(int(subgroup))})
# Set binary y-axis for binary DV
if self.df[self.dv].nunique() == 2:
plt.ylim(0, 1)
# Set x-axis ticks for discrete IV
if self.df[iv].nunique() < 10:
plt.locator_params(axis='x',
nbins=self.df[iv].nunique()-1)
# Set plot labels
plt.legend(loc='lower right', prop={'size': 12})
plt.title("Predicting {} by {}, conditioned on {}".format(
self.dv, iv, cv))
plt.ylabel(self.dv)
return plt.draw()
def get_simpsons_pairs(self):
"""Checks all pairs of variables in a dataset for Simpson's Paradox.
Uses the standardized IV for model-building and plotting trendlines,
but keeps the unstandardized IV for the plot's x-axis scale.
Returns:
simpsons_pairs: a list of Simpson's Pairs
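        Example (a sketch):
            pairs = sp.get_simpsons_pairs()
            # each pair is an (IV, CV) tuple, e.g. ('age', 'region')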
"""
# Suppress warnings
if self.quiet:
utilities.suppress_warnings()
# Ignore columns
if self.ignore_columns:
self.df = self.df.drop(self.ignore_columns, axis=1)
# Encode variables
self.df = self.encode_variables()
# Get correlations
corr_matrix = self.get_correlations()
# Create candidate pairs list
variables = [variable for variable in self.df if variable != self.dv]
candidate_pairs = list(permutations(variables, 2))
# Add standardized columns
if self.standardize:
self.df = self.standardize_variables(variables)
simpsons_pairs = []
# For each candidate pair...
for pair in candidate_pairs:
# Unpack pair
iv, cv = pair[0], pair[1]
# Use scaled IV column name
if iv+'_scaled' in self.df:
ind_var = iv+'_scaled'
else:
ind_var = iv
# Filter out pair if little correlation
binary_dv = self.df[self.dv].nunique() == 2
if binary_dv:
if all(variable in corr_matrix for variable in pair):
if np.abs(corr_matrix.loc[pair]) < self.min_corr:
continue
else:
if (all(variable in corr_matrix for variable in pair) and
self.dv in corr_matrix):
first_corr = corr_matrix.loc[pair]
second_corr = corr_matrix.loc[(cv, self.dv)]
if (np.abs(first_corr) < self.min_corr or
np.abs(second_corr) < self.min_corr):
continue
# Build single-variable regression
predictions, simple_reg = self.build_model(self.df, ind_var)
self.df[iv+'_predictions'] = predictions
# If no columns specified to bin
if not self.bin_columns:
                # And this CV has many categories
if self.df[cv].nunique() > 10:
# Bin the CV and build the models
self.df[cv+'_bin'] = self.bin_variable(self.df[[cv]])
(multiple_reg, coef_sign_sum, wt_coef_sign_sum,
self.df) = self.create_subgroups(ind_var, cv+'_bin')
# Otherwise, build the models without binning
else:
(multiple_reg, coef_sign_sum, wt_coef_sign_sum,
self.df) = self.create_subgroups(ind_var, cv)
else:
# If user hasn't specified this CV for binning, and it's small
if cv not in self.bin_columns and self.df[cv].nunique() <= 10:
# Build the models without binning
(multiple_reg, coef_sign_sum, wt_coef_sign_sum,
self.df) = self.create_subgroups(ind_var, cv)
            # If user hasn't specified it for binning, but it's large
elif cv not in self.bin_columns and self.df[cv].nunique() > 10:
if not self.quiet:
# Give warning and build without binning
print('Warning: You are building disaggregate models '
'using a conditioning variable that has more '
'than 10 categories. Consider adding this '
'variable to the list of columns to bin, '
'or binning prior to using this function.')
(multiple_reg, coef_sign_sum, wt_coef_sign_sum,
self.df) = self.create_subgroups(ind_var, cv)
# Otherwise, bin and build the models
else:
self.df[cv+'_bin'] = self.bin_variable(self.df[[cv]])
(multiple_reg, coef_sign_sum, wt_coef_sign_sum,
self.df) = self.create_subgroups(ind_var, cv+'_bin')
            # If all subgroups in this pair have a
            # constant DV or IV, skip the pair
if multiple_reg.shape[0] == 0:
continue
# Get coefficient and p-value from single-variable regression
coef = simple_reg.loc[(
simple_reg['variable'] == ind_var), :]['coefficient'].values[0]
pvalue = simple_reg.loc[(
simple_reg['variable'] == ind_var), :]['pvalue'].values[0]
# Get odds confidence intervals from single-variable regression
if binary_dv:
lower_odds = simple_reg.loc[(
simple_reg['variable'] == ind_var), :]['-95%'].values[0]
upper_odds = simple_reg.loc[(
simple_reg['variable'] == ind_var), :]['+95%'].values[0]
else:
lower_odds, upper_odds = False, False
# Check if weighted sum of coefficient signs reverses
if self.weighting:
weighted_reverses = np.sign(coef) != np.sign(wt_coef_sign_sum)
else:
weighted_reverses = True
            # Store as a Simpson's pair if it meets these criteria:
# 1. Sign of the IV coefficients reverses
# 2. Sign of the IV coefficient is non-zero
# 3. IV coefficient is greater than minimum
# 4. IV coef p-value is smaller than maximum
# 5. Weighted IV coefficient sign reverses
if (np.sign(coef) != np.sign(coef_sign_sum) and
np.sign(coef_sign_sum) != 0 and
(np.abs(coef) > self.min_coeff or
(lower_odds < 1 < upper_odds)) and
pvalue < self.max_pvalue and
weighted_reverses):
simpsons_pairs.append(pair)
if not self.quiet:
print('================================================='
'==============================\nWarning! Simpson’s '
'Paradox was detected in this pair of variables:'
'\n{}\n============================================='
'=================================='.format(pair))
if self.output_plots:
# Plot single-variable regression
print('Independent Variable:', iv)
display(simple_reg)
self.plot_trendline(cv=None, iv=iv,
predictions=iv+'_predictions')
                    # Display and plot the disaggregated subgroup models
print('Conditioned on:', cv)
display(multiple_reg)
if cv+'_bin' in self.df:
self.plot_trendline(cv=cv+'_bin', iv=iv,
predictions=ind_var+'_predictions')
else:
self.plot_trendline(cv=cv, iv=iv,
predictions=ind_var+'_predictions')
plt.show()
if not self.quiet:
            if not simpsons_pairs:
                print('Congratulations! No Simpson’s pairs were detected (i.e.'
                      ' Simpson’s Paradox was not detected in your dataset).')
else:
print('%d Simpson’s Pair(s) were detected in your dataset.' %
len(simpsons_pairs))
return simpsons_pairs
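

# Illustrative usage (a minimal sketch): builds a synthetic dataset in
# which y falls with x inside each subgroup but rises with x in the
# aggregate, then runs the detector on it.
if __name__ == '__main__':
    rng = np.random.default_rng(0)
    n = 500
    group = rng.integers(0, 2, n)
    x = rng.normal(loc=3 * group, scale=1, size=n)
    # The per-group offset of +6 reverses the aggregate trend
    y = -x + 6 * group + rng.normal(scale=0.5, size=n)
    demo_df = pd.DataFrame({'x': x, 'group': group, 'y': y})
    detector = SimpsonsParadox(df=demo_df, dv='y', model='linear')
    print(detector.get_simpsons_pairs())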