In [3]:
import pandas as pd
from scipy import stats
import numpy as np

# This is just a random assortment of 100 people with the following answers to some yes or no question

data = [['Young', 'yes'], ['Young', 'no'], 
        ['Old', 'yes'], ['Old', 'yes'], 
        ['Old', 'no'], ['Old', 'no'], ['Old', 'no'], 
        ['Old', 'no'], ['Old', 'no'], ['Old', 'no']]

df_raw = pd.DataFrame(data, columns = ['Age', 'Answer'])
  
ct_raw = pd.crosstab(df_raw.Age, df_raw.Answer)
    
ct_raw_margins = pd.crosstab(df_raw.Age, df_raw.Answer, margins = True)*10
ct_raw_margins_percent = pd.crosstab(df_raw.Age, df_raw.Answer, margins = True, normalize = True)

ct_raw_margins_percent = ct_raw_margins_percent.style.format('{:.0%}')

negative_answer_significance_raw = np.array(
    [ct_raw_margins.loc['Old'][:2], 
     ct_raw_margins.loc['Young'][:2]])

p_value_raw = stats.chi2_contingency(negative_answer_significance_raw)[1]

In [5]:
# Respondent-level input: one row per person, with columns Age and Answer.
df_raw

Unnamed: 0,Age,Answer
0,Young,yes
1,Young,no
2,Old,yes
3,Old,yes
4,Old,no
5,Old,no
6,Old,no
7,Old,no
8,Old,no
9,Old,no


# Raw Data

In [4]:
# Age x Answer counts with 'All' totals, after the x10 scaling applied when the table was built.
ct_raw_margins

Answer,no,yes,All
Age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Old,60,20,80
Young,10,10,20
All,70,30,100


In [3]:
# Styled view of the cross-tab normalised over the whole sample, formatted as whole-number percentages.
ct_raw_margins_percent

Answer,no,yes,All
Age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Old,60%,20%,80%
Young,10%,10%,20%
All,70%,30%,100%


This leaves us with 70% of the sample saying "no" and 30% saying "yes."

In [4]:
# p-value returned by stats.chi2_contingency for the Old/Young x no/yes counts.
p_value_raw

0.05620966783119149

The difference between old and young people who say "no" is not statistically significant at the 0.05 level (p ≈ 0.056).

# Current Weighting

Currently our weighting (as I understand it) might multiply young people up by 2 and divide old people down by 2

In [5]:
# Per-group multipliers: young respondents count double, old respondents count half.
current_weight = pd.Series(data=[2, 1/2], index=['Young', 'Old'])

# Scale each age group's row (no / yes / All) by that group's weight.
weighted_rows = [
    ct_raw_margins.loc[age_group] * current_weight.loc[age_group]
    for age_group in ('Old', 'Young')
]
ct_current_weighting = pd.DataFrame(weighted_rows)

# Rebuild the totals row from the weighted rows.
ct_current_weighting.loc['All'] = ct_current_weighting.sum()

# Weighted respondent count: the grand total in the All/All cell.
ct_current_weighting_respondents = ct_current_weighting.loc['All', 'All']

# Each weighted cell as a share of the weighted total.
ct_current_weighting_percent = ct_current_weighting.div(ct_current_weighting_respondents)

ct_current_weighting

Answer,no,yes,All
Old,30.0,10.0,40.0
Young,20.0,20.0,40.0
All,50.0,30.0,80.0


This leaves us with 80 respondents of the original 100 and...

In [6]:
# Each weighted cell as a share of the weighted total, shown as whole-number percentages.
ct_current_weighting_percent.style.format('{:.0%}')

Answer,no,yes,All
Old,38%,12%,50%
Young,25%,25%,50%
All,62%,38%,100%


...62% of the sample saying "no" and 38% saying "yes"...

In [7]:
# 2x2 table of weighted counts (rows Old/Young, columns no/yes). Selecting by
# label keeps the 'All' totals out regardless of how the columns are ordered.
negative_answer_significance_current_weighting = ct_current_weighting.loc[
    ['Old', 'Young'], ['no', 'yes']
].to_numpy()

# Index 1 of the chi2_contingency result is the p-value.
p_value_current_weighting = stats.chi2_contingency(negative_answer_significance_current_weighting)[1]

p_value_current_weighting

0.03766692222862869

...and the difference is statistically significant at the 0.05 level (p ≈ 0.038).

# Divide Down Weighting

If instead we divided the old folks *down* to the younger folks (in this case, divide by four):

In [8]:
# Down-weight the old group so that its weighted total equals the young group's
# total. The factor is derived from the table instead of being hard-coded; with
# 20 young and 80 old respondents it evaluates to 1/4.
dd_weight = pd.Series(
    [ct_raw_margins.loc['Young', 'All'] / ct_raw_margins.loc['Old', 'All']],
    index=['Old'],
)

# Old row scaled down; Young row kept at its raw counts.
ct_dd_weighting = pd.DataFrame(
    [ct_raw_margins.loc['Old'] * dd_weight.loc['Old'],
     ct_raw_margins.loc['Young']]
)

# Rebuild the totals row from the weighted rows.
ct_dd_weighting.loc['All'] = ct_dd_weighting.sum()

# Weighted respondent count: the grand total in the All/All cell.
ct_dd_weighting_respondents = ct_dd_weighting.loc['All', 'All']

# Each weighted cell as a share of the weighted total.
ct_dd_weighting_percent = ct_dd_weighting / ct_dd_weighting_respondents

ct_dd_weighting

Answer,no,yes,All
Old,15.0,5.0,20.0
Young,10.0,10.0,20.0
All,25.0,15.0,40.0


This leaves us with 40 respondents of the original 100 and...

In [9]:
# Each weighted cell as a share of the weighted total, shown as whole-number percentages.
ct_dd_weighting_percent.style.format('{:.0%}')

Answer,no,yes,All
Old,38%,12%,50%
Young,25%,25%,50%
All,62%,38%,100%


...the same proportions as the current weighting method for "no" and "yes"...

In [10]:
# 2x2 table of divide-down weighted counts (rows Old/Young, columns no/yes).
# Selecting by label keeps the 'All' totals out regardless of column order.
negative_answer_significance_dd_weighting = ct_dd_weighting.loc[
    ['Old', 'Young'], ['no', 'yes']
].to_numpy()

# Index 1 of the chi2_contingency result is the p-value.
p_value_dd_weighting = stats.chi2_contingency(negative_answer_significance_dd_weighting)[1]

p_value_dd_weighting

0.19141842523760358

...and a difference that is NOT statistically significant (p ≈ 0.19), i.e. a more conservative result.

This would be equivalent to averaging over many randomly selected subsets of 20 of the original 80 old people. The advantage of the divide-down method is that it maintains computational simplicity for live data manipulation in the InsightStore.