In [1]:
from scipy import stats
import numpy as np
import pandas as pd

import sys
import os

In [2]:
def check_srm(df, groups, proportion, dimensions=None):
    """Function takes pandas data frame, column name for groups names, expected proportion between the groups, and names of dimensions
    and returns data frame with dimension name, p_value and adjusted p_value.
    df - pandas data frame with data for experiment;  
    groups - name of a column in data frame that contains user groups names;
    proportion - dict of expected proportion of each user group. Sum of proportions should be equal to 1;
    dimensions - list of columns that indicate dimensions (like country, os, demographical categories etc). None by default.
    """
    
    results = {
        'split': [],
        'p value': []
    }
    
    # calculate for groups
    observed = []
    expected = []
    
    # for every class in the column calculate observed and expected number of records
    for key in proportion.keys():
        observed.append(df[df[groups]==key].shape[0])
        expected.append(proportion[key] * df.shape[0])
    
    contingency_table = np.array([observed, expected])
    
    # execute chi2 test and get p value
    p_value = stats.chi2_contingency(contingency_table, correction=False)[1]
    
    results['split'].append('groups')
    results['p value'].append(round(p_value, 3))
    
    
    # if dimensions are specified, compare distribution of classes between test and control groups
    if dimensions is not None:
        for dimension in dimensions:
            # get unique classes
            classes = set(df[dimension])
            
            # create dict to store proportions
            groups_dimension = {}
            
            # for each group 
            for key in proportion.keys():
                groups_dimension[key] = []
                
                for c in classes:
                    # append number of records for the group & class combination
                    groups_dimension[key].append(df[(df[groups]==key) & (df[dimension]==c)].shape[0])
            
            
            contingency_table = np.array([groups_dimension[key] for key in groups_dimension.keys()])
            
            p_value = stats.chi2_contingency(contingency_table, correction=False)[1]
    
            results['split'].append(dimension)
            results['p value'].append(round(p_value, 3))
    
    return pd.DataFrame(results)

### Generate synthetic data with data_generator to test the function  
data generator: https://github.com/dzianissokalau/data_generator


In [3]:
sys.path.append(os.path.realpath('../../'))
import data_generator

In [12]:
# set the dataset parameters
params = {
     'categorical': {
         'group': {
             'categories': ['experimental', 'control'],
             'probs': [0.5, 0.5]
         },
         'country': {
             'categories': ['UK', 'DE', 'FR', 'IT'],
             'probs': [0.31, 0.27, 0.23, 0.19]
         },
        'platform': {
            'categories': ['android', 'ios', 'web'],
            'probs': [0.41, 0.29, 0.3]
        }
    },
    'values': {
        'payment': {
            'distribution': 'binomial',
            'parameters': {
                'n': 1, 
                'p': 0.2
            },
            'indices': False
        }
    }
}


df = data_generator.create_dataset(rows=100000, params=params, seed=1)
df.head()

Unnamed: 0,group,country,platform,payment
0,control,IT,android,0
1,control,UK,android,1
2,experimental,UK,ios,0
3,experimental,UK,ios,0
4,control,FR,web,0


### Test SRM

In [13]:
check_srm(
    df=df, 
    groups='group', 
    proportion={'experimental':0.5, 'control':0.5}, 
    dimensions=['country', 'platform'])

Unnamed: 0,split,p value
0,groups,0.035
1,country,0.114
2,platform,0.714


The function doesn't see problems in country and platform dimensions, but see that something can be wrong with the users split by group. let's check the numbers.

In [15]:
df.groupby('group')['payment'].count()

group
control         50472
experimental    49528
Name: payment, dtype: int64