In [1]:
import pandas as pd
import numpy as np

In [5]:
df1 = pd.DataFrame(data={'segmenter':['alpha','alpha','alpha','alpha','alpha','omega','omega','omega','omega','omega'],
                         'lookup1':['a',np.nan,'c','d','e','a',np.nan,'c','d','e'],
                         'lookup2':['apple',np.nan,'banana','pear','pineapple','apple',np.nan,'banana','pear','pineapple'],
                         'custs':[1,2,3,4,5,1,2,3,4,5]})
df2 = pd.DataFrame(data={'lookup1':['a',np.nan,'i','o','u'],'lookup2':['apple',np.nan,'hyena','pear','pineapple'],
                         'custs':[2,4,6,8,10]})

In [51]:
def divider(df1, column_list, segmenter):

    """
    Divides the last column of a row in a df by the sum of all the other rows that have the same combination of NaN/not NaN
    in their other columns. This is designed for profiling - if you have two columns, one showing one profile, and one
    showing another, rows will only be divided by the other relevant rows
    """
    
    # replaces missing values
    df1.replace(to_replace=np.nan,value='missing value',inplace=True)
    df_check = pd.DataFrame(columns=column_list)
    
    # generates boolean list of true/false for row's values - later we will divide each row's last column by the sum
        # of all the other columns that have their same true/false list
    for row in df1[column_list].itertuples(index=False):
        
        blank_check = pd.Series([i == "missing value" for i in list(row)],index = column_list)
        
        # appends this list into df, so you have two dfs - one indicating whether the other has NaNs
        df_check = df_check.append(blank_check,ignore_index=True)
    
    # changes the missing values back to NaN
    df1.replace(to_replace='missing value',value=np.nan,inplace=True)

    #################################
    
    # for a row, generates a list of all the indices from df_check that have the same true/false booleans
    
    matching_indices = []
    
    for row in df_check.iterrows():
    
        index = row[0]
        row = row[1]
        
        matching_indices.append([i[0] for i in df_check.iterrows() if i[1].all() == row.all()])
    
#     divides the value in the last column for that row by the sum of all the columns that have the same null/not null
#     truthfulness

    df_out = pd.DataFrame(columns = column_list)
    
    for row in df1.iterrows():
        
        index = row[0]
        row = row[1]    
        
        row['%'] = row['custs'] / df1.iloc[matching_indices[index]]['custs'].sum()
        
        df_out = df_out.append(row)
        
        # reorders to put the % value as last
        df_out = df_out[[col for col in df_out if col != '%']+['%']]
        
    return df_out

In [52]:
test = divider(df1,df1.columns[:-1],segmenter = True)

TypeError: 'bool' object is not subscriptable

In [48]:
# identifies the number of columns in the dataframe, ignoring the first one if the argument 'segmenter' is true
def indexer(df1,df2,segmenter=False):
    '''
    divides the values in the last column of df1 by the values in the last column in df2, joining on all the other columns in 
    the table (nulls will be joined to nulls). All columns need to match to calculate the index, otherwise NaN is returned
    
    Arguments:
        df1 - the numerator table for your index calculation.
        df2 - the denominator table for your index calculation.
        segmenter - indicates whether the first column is showing levels in a segmentation, and therefore whether
                    the first column should be included when looking for what to join on.
    '''
    #creates the list of columns that the dfs will be joined on   
    column_list = list(df1.columns)[:-1]

    df1 = divider(df1, column_list, segmenter)
    
    if segmenter:
        df2 = divider(df2, column_list[1:], segmenter = False)
    else:
        df2 = divider(df2, column_list, segmenter)
    
    # joins two dfs together - now the last two columns will be the columns to be divided
    if segmenter:
        result_df = df1.merge(df2,how='left',left_on = column_list[1:], right_on = column_list[1:])
    else:
        result_df = df1.merge(df2,how='left',left_on = column_list, right_on = column_list)
    
    # calculates index by dividing the second last column by the last column
    result_df['index'] = result_df[result_df.columns[-1]]/result_df[result_df.columns[-3]]
        
    # retains just the columns we're interested in
    column_list.append('index')
    result_df = result_df[[i for i in column_list]]
    
    return result_df