In [None]:
# If using JupyterLab: uncomment and run the command below, then restart JupyterLab so the progress bar is displayed.
# If an error occurs, try running the cell again.
# !jupyter labextension install @jupyter-widgets/jupyterlab-manager

In [None]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from simpsons_paradox import SimpsonsParadox

In [None]:
# Notebook-wide plot defaults: 10x7 figures, seaborn's 'darkgrid' style,
# and fonts scaled by 1.5. The figure size is set before the seaborn call,
# in the same order as before.
plt.rc('figure', figsize=(10, 7))
sns.set(style='darkgrid', font_scale=1.5)

In [None]:
# Silence two known sources of noise before running the analysis.
# 1. pandas SettingWithCopyWarning: turn the chained-assignment check off.
pd.set_option('mode.chained_assignment', None)
# 2. scikit-learn KBinsDiscretizer warning: it is a UserWarning, so ignore that
#    category. Note that this hides every other UserWarning as well.
warnings.simplefilter('ignore', UserWarning)

### Get Simpson's Pairs

In [None]:
# ================================================================================================================== #
# ==================================================== Load Data =================================================== #
# ================================================================================================================== #

## Example 1: Disease Data
# Active example. The four names below are picked up by the `params` dict
# in the "Get Simpson's Pairs" section of this cell.
df = pd.read_csv(filepath_or_buffer='data/SP_Data.txt')
dv = 'Survived'                 # passed to SimpsonsParadox as `dv`
ignore_columns = ['PatientId']  # passed as `ignore_columns`
bin_columns = []                # passed as `bin_columns`; none listed for this dataset

## Example 2: Census Data 
## Needs weighting=False to display weak Simpson's pair
# df = pd.read_csv('data/CensusModified.csv')
# dv = 'Income'
# bin_columns = []
# ignore_columns = []

## Example 3: Baseball Data 
## Needs max_pvalue=1
# df = pd.read_csv('data/baseball_data.csv')
# dv = 'outcome'
# bin_columns = []
# ignore_columns = ['Unnamed: 0']

## Example 4: Admissions Data
# df = pd.read_csv('data/admissions_data.csv')
# dv = 'Admit'
# bin_columns = []
# ignore_columns = ['Unnamed: 0']

## Example 5: Khan Academy Data
# df = pd.read_csv('data/small_khancademy.csv')
# dv = 'performance'
# bin_columns = ['timestamp', 'solve_time', 'attempts', 'tspp', 'session_num', 'session_index',
#               'session_length', 'all_first_attempts', 'signup_duration', 'total_solving_time',
#               'all_attempts', 'all_problems', 'all_sequences', 'month', 'join_month']
# ignore_columns = ["user_id", "problem_id"]

## Example 6: Raw Continuous Data
## Needs max_pvalue=1
# x_b = [1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5, 5.5, 6, 6.5, 7, 7.5, 8, 8.5, 9, 9.5, 10]
# y_b = [10, 10.5, 11, 11.5, 12, 12.5, 13, 13.5, 14, 14.5, 15, 15.5, 16, 16.5, 17, 17.5, 18, 18.5, 19]
# cluster_b = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
# blue = pd.DataFrame(np.column_stack([x_b, y_b, cluster_b]))
# x_r = [5, 5.5, 6, 6.5, 7, 7.5, 8, 8.5, 9, 9.5, 10, 10.5, 11, 11.5, 12, 12.5, 13, 13.5, 14]
# y_r = [1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5, 5.5, 6, 6.5, 7, 7.5, 8, 8.5, 9, 9.5, 10, 10.5]
# cluster_r = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
# red = pd.DataFrame(np.column_stack([x_r, y_r, cluster_r]))
# df = pd.concat([blue, red])  # DataFrame.append was removed in pandas 2.0
# df.columns = ['x', 'y', 'subgroups']
# dv = 'y'
# bin_columns = []
# ignore_columns = []

## Example 7: Coffee Data
# df = pd.read_csv('data/coffee_data.csv')
# dv = 'neuroticism'
# bin_columns = []
# ignore_columns = ['Unnamed: 0']

## Example 8: COVID-19 Data
## No Simpson's pairs expected
# df = pd.read_csv('data/conposcovidloc.csv')
# dv = 'Outcome1'
# bin_columns = []
# ignore_columns = ['Row_ID']

## Example 9: Student Performance Data
## No Simpson's pairs expected
# df = pd.read_csv('data/student-mat.csv', sep=";") 
# df['final_grade'] = np.where(df['G3'] >= 10, 1, 0)
# dv = 'final_grade'
# bin_columns = []
# ignore_columns = []

## Example 10: T.A. Evaluation Data
## No Simpson's pairs expected
# df = pd.read_csv('data/TeachingAssistant_EvaluationDataset.csv')
# dv = 'Score'
# bin_columns = []
# ignore_columns = []

## Example 11: Breast Cancer Data
## No Simpson's pairs expected
# df = pd.read_csv('data/breastcancer_data.csv')
# dv = 'class'
# bin_columns = []
# ignore_columns = []

# ================================================================================================================== #
# =============================================== Get Simpson's Pairs ============================================== #
# ================================================================================================================== #

# Keyword arguments for SimpsonsParadox, gathered in one dict so the
# configuration can still be inspected after the cell has run.
params = dict(
    df=df,
    dv=dv,
    model='logistic',
    ignore_columns=ignore_columns,
    bin_columns=bin_columns,
    output_plots=True,
)

# Build the detector from `params` and keep whatever get_simpsons_pairs() returns.
simpsons_pairs = SimpsonsParadox(**params).get_simpsons_pairs()