In [None]:
def GenerateColumnCombinations(df, column_names):
    """
    Generate all combinations of unique values across specified columns,
    and return them as a dictionary of dictionaries.

    Example return:
    {
        0: {'CITY': 'Toronto', 'BRANCHNAME': 'Branch A'},
        1: {'CITY': 'Vancouver', 'BRANCHNAME': 'Branch B'},
        ...
    }
    """
    # List of lists of unique values
    value_lists = [df[col].dropna().unique().tolist() for col in column_names]
    
    # Cartesian product of all value combinations
    combos = list(product(*value_lists))
    
    # Build dictionary of dictionaries
    combo_dict = {
        idx: dict(zip(column_names, values))
        for idx, values in enumerate(combos)
    }

    return combo_dict

    
def ColumnStatisticalReview(df,
                            column_name,
                            partitions=10,
                            top_x_records=10,
                            exclude_blanks_from_segments=1,
                            exclude_zeroes_from_segments=1):

    '''
    Function to Conduct a Simple Statistical Review of a Column, Including Understanding the positional distribution
    of values. 

    Args:
        column_name (str): Name of Column

        partitions (int): Number of partitions to include (Decile 10)

        exclude_blanks_from_segments (int): Binary Flag, whether to exclude Blank Values from Segment determination.
        If blank values are excluded it gives a better representation for the members of the set, however it might 
        provide a misleading representation of the population.

        exclude_zeroes_from_segments (int): As above, with respect to 0 values. Is processed after exclude_blanks, as
        such it can include both blanks and true 0 values. 

    '''
    temp_dict = {}
    
    is_numeric = pd.api.types.is_numeric_dtype(df[column_name])
    
    if is_numeric:
        temp_dict['SUM'] = df[column_name].sum()
        temp_dict['MEAN'] = df[column_name].mean()
        temp_dict['STD_DEV'] =  df[column_name].std()
        temp_dict['MEDIAN'] = df[column_name].median()
        temp_dict['MAX'] = df[column_name].max()
        temp_dict['MIN'] = df[column_name].min()
        
    temp_dict['TOTAL_RECORDS'] = len(df)
    temp_dict['UNIQUE_RECORDS'] = len(df.drop_duplicates(column_name))
    temp_dict['NA_RECORDS'] = len(df[df[column_name].isna()])
    temp_dict['NULL_RECORDS'] = len(df[df[column_name].isnull()])
    
    if is_numeric:
        temp_dict['ZERO_RECORDS'] = len(df[df[column_name]==0])
        temp_dict['NON_ZERO_RECORDS'] = len(df[df[column_name]!=0])    

    temp_df = pd.DataFrame(temp_dict.values(),index=temp_dict.keys(),columns=[column_name])

    if temp_dict['TOTAL_RECORDS']==len(df[df[column_name].isnull()]):
        return temp_df   
    
    # Add top X records Based on Frequency
    if top_x_records>0:
        top_instances = pd.DataFrame(df[column_name].value_counts(dropna=False).head(top_x_records)).reset_index()
        
        if len(top_instances)>0:
            top_instances[column_name] = top_instances.apply(lambda row: f"Value: {row[column_name]}, Frequency: {row['count']}", axis=1)
            top_instances['index'] = [f"Top {x+1}" for x in range(len(top_instances[column_name]))]
            top_instances = top_instances.drop('count',axis=1).set_index('index')
            temp_df = pd.concat([temp_df,top_instances])
        
    if (partitions>0)&(pd.api.types.is_numeric_dtype(df[column_name]))&(temp_dict['UNIQUE_RECORDS']>1):
        segment_df = ColumnPartitioner(df=df,
                                       column_name=column_name,
                                       partitions=partitions,
                                       exclude_blanks=exclude_blanks_from_segments,
                                       exclude_zeros=exclude_zeroes_from_segments,
                                       return_value='')
        
        seg_val_df = ColumnPartitioner(df=df,
                                           column_name=column_name,
                                           partitions=partitions,
                                           exclude_blanks=exclude_blanks_from_segments,
                                           exclude_zeros=exclude_zeroes_from_segments,
                                           return_value='agg_value').rename(columns={'VALUE':column_name})

        return pd.concat([temp_df,segment_df.T,seg_val_df])
    return temp_df

In [73]:
import pandas as pd
import numpy as np
import sys
sys.path.append("/Users/derekdewald/Documents/Python/Github_Repo/d_py_functions/")
from pandas.api.types import is_numeric_dtype

pd.options.display.float_format = '{:.2f}'.format

In [91]:
from FeatureEngineering import CreateRandomDFColumn,BinaryComplexEquivlancey
from DFProcessing import ColumnPartitioner

In [161]:
from DataSets import GenerateFakeMemberDF

df = GenerateFakeMemberDF(1000,2)
branch_list = [f'BRANCH_{x}' for x in range(0,50)]
city_list = ['Burnaby','Vancouver','Kelowna','Whistler']

df0 = df[df['MONTH']==0].drop('MONTH',axis=1).copy()
df1 = df[df['MONTH']==1].drop('MONTH',axis=1).copy()
df1.rename(columns={x:f'{x}_' for x in df1.columns if x not in ['MEMBERNBR']},inplace=True)

df2 = df0.merge(df1,on='MEMBERNBR',how='outer')
branch_list = [f'BRANCH_{x}' for x in range(0,10)]
city = ['Burnaby','Vancouver','Kelowna','Whistler']

CreateRandomDFColumn(df2,branch_list,'BRANCHNAME')
CreateRandomDFColumn(df2,city_list,'CITY')
CreateRandomDFColumn(df2,city_list,'CITY2')

df2.head(2)

Unnamed: 0,MEMBERNBR,CLASSIFICATION,OUTLOOK,DEPOSIT,LENDING,TXN_COUNT,TXN_VALUE,ACTIVE,CLASSIFICATION_,OUTLOOK_,DEPOSIT_,LENDING_,TXN_COUNT_,TXN_VALUE_,ACTIVE_,BRANCHNAME,CITY,CITY2
0,0,Borrower,0.0,34262.0,979555.0,2.0,398.0,1.0,Borrower,0,0,0,0,0,0,BRANCH_3,Burnaby,Kelowna
1,1,Borrower,0.0,29964.0,100000.0,0.0,0.0,1.0,Borrower,0,29386,104453,0,0,1,BRANCH_1,Vancouver,Vancouver


In [None]:
CreateRandomDFColumn(df2,city_list,'CITY')

In [178]:

def RecordElementCompare(df,
                         column_name,
                         column_name1,
                         primary_key,
                         summary_columns,
                         bracketing=[-10000,-1000,-1,0,1,1000,10000],
                         groupby_filter={'top':10,'minimum':5}):
    '''
    
    Function which takes a dataframe with 2 Columns which a desired Comparison in Necessary.
    Initial Use Case Required Minimal Number Values, focused on TEXT
        
    Parameters:
    
    
    Returns:
    
    
    Example Usage:
        
        df= df[[START_BAL, START_BAL_, ACCTNBR]],
        column_name='START_BAL'

    '''
    # Make a Copy.

    start_cols = primary_key + summary_columns + [column_name] + [column_name1]
    account_df = df[start_cols].copy()

    if is_numeric_dtype(account_df[column_name]):
        account_df[column_name] = np.where(account_df[column_name] == "", 0, account_df[column_name])
        account_df[column_name1] = np.where(account_df[column_name1] == "", 0, account_df[column_name1])
    else:
        account_df[column_name] = np.where(account_df[column_name] == "", None, account_df[column_name])
        account_df[column_name1] = np.where(account_df[column_name1] == "", None, account_df[column_name1])

    # Change Names of Individual Columns to Something Generic so datasets can be Concatenated.
    account_df = account_df.rename(columns={column_name:'DF',column_name1:'DF1'}).copy()
    
    # Calculate DIfference
    
    BinaryComplexEquivlancey(account_df,'DF','DF1','VALUES_EQUAL')
    account_df['VALUES_NOT_EQUAL'] = np.where(account_df['VALUES_EQUAL']==0,1,0)
    account_df['NULL_RECORD_DF'] = np.where(account_df['DF'].isnull(),1,0)
    account_df['NULL_RECORD_DF1'] = np.where(account_df['DF1'].isnull(),1,0)
        
    try:
        account_df['RECORD_COUNT']
    except:
        account_df['RECORD_COUNT']=1
    
    total_columns_summary = summary_columns + ['RECORD_COUNT','VALUES_EQUAL','VALUES_NOT_EQUAL','NULL_RECORD_DF','NULL_RECORD_DF1']

    summary_df = pd.DataFrame(account_df[['RECORD_COUNT','VALUES_EQUAL','VALUES_NOT_EQUAL','NULL_RECORD_DF','NULL_RECORD_DF1']].sum()).T
    summary_df['COLUMN_NAME'] = column_name
    summary_df['PERC_EQUAL'] = (summary_df['VALUES_EQUAL'] /summary_df['RECORD_COUNT'])*100
    
    total_columns_groupby = total_columns_summary + ['DF','DF1']
    gb_columns = summary_columns + ['DF','DF1']
    groupby_df = account_df[total_columns_groupby].groupby(gb_columns,dropna=False).sum().reset_index().sort_values("RECORD_COUNT",ascending=False)

    groupby_df['CUM'] = groupby_df.groupby(summary_columns,dropna=False).cumcount() + 1

    groupby_df = groupby_df[(groupby_df['CUM']<= groupby_filter['top'])&(groupby_df['RECORD_COUNT']>= groupby_filter['minimum'])]
    groupby_df['COLUMN_NAME'] = column_name
        
    return account_df,summary_df,groupby_df

account_df,summary_df,groupby_df = RecordElementCompare(df2,
                                                        column_name='LENDING',
                                                        column_name1='LENDING_',
                                                        primary_key=['MEMBERNBR'],
                                                        summary_columns=['CITY'])

In [179]:
account_df.head()

Unnamed: 0,MEMBERNBR,CITY,DF,DF1,VALUES_EQUAL,VALUES_NOT_EQUAL,NULL_RECORD_DF,NULL_RECORD_DF1,RECORD_COUNT
0,0,Burnaby,979555.0,0,0,1,0,0,1
1,1,Vancouver,100000.0,104453,0,1,0,0,1
2,2,Burnaby,16442.0,16651,0,1,0,0,1
3,3,Burnaby,0.0,0,1,0,0,0,1
4,4,Burnaby,100000.0,105222,0,1,0,0,1


In [180]:
summary_df

Unnamed: 0,RECORD_COUNT,VALUES_EQUAL,VALUES_NOT_EQUAL,NULL_RECORD_DF,NULL_RECORD_DF1,COLUMN_NAME,PERC_EQUAL
0,1056,194,862,56,0,LENDING,18.37


In [None]:

a = GenerateColumnCombinations(df2,['CITY','BRANCHNAME'])


In [225]:

b = GenerateColumnCombinations(df2,['CITY'])
b

{0: {'CITY': 'Burnaby'},
 1: {'CITY': 'Vancouver'},
 2: {'CITY': 'Whistler'},
 3: {'CITY': 'Kelowna'}}

In [233]:
def FilterDictionary(df,CombinationDictionaries,column_names):

    dict_list = list(CombinationDictionaries.values())
    keys = list(dict_list[0].keys())
    print(keys)

    final_df = pd.DataFrame()

    if len(keys)==1:
        col = keys[0]

        for list_ in dict_list:
            print(list_)
            temp_df = df[(df[col]==list_[col])]
            for column in column_names:
                temp = ColumnStatisticalReview(temp_df,column).T.reset_index().rename(columns={'index':'CALCULATED_COLUMN'})
                temp[col] = list_[col]
                final_df = pd.concat([final_df,temp])
        
        
        return final_df

    elif len(keys)==2:
        col  = keys[0]
        col1 = keys[1]

        for list_ in dict_list:
            print(list_)
            temp_df = df[(df[col]==list_[col])&(df[col1]==list_[col1])]
            for column in column_names:
                temp = ColumnStatisticalReview(temp_df,column).T.reset_index().rename(columns={'index':column})
                temp[col] = list_[col]
                temp[col1] = list_[col1]
                return temp
    elif len(keys)==3:
        
        col  = keys[0]
        col1 = keys[1]
        col2 = keys[2]

        for list_ in dict_list:
            print(list_)
            temp_df = df[(df[col]==list_[col])&(df[col1]==list_[col1])&(df[col2]==list_[col2])]

            
            return ColumnStatisticalReview(temp_df,'DEPOSIT')
        
v = FilterDictionary(df2,b,['DEPOSIT'])

v

['CITY']
{'CITY': 'Burnaby'}
{'CITY': 'Vancouver'}
{'CITY': 'Whistler'}
{'CITY': 'Kelowna'}


Unnamed: 0,CALCULATED_COLUMN,SUM,MEAN,STD_DEV,MEDIAN,MAX,MIN,TOTAL_RECORDS,UNIQUE_RECORDS,NA_RECORDS,...,Total Balance in Partion 2,Total Balance in Partion 3,Total Balance in Partion 4,Total Balance in Partion 5,Total Balance in Partion 6,Total Balance in Partion 7,Total Balance in Partion 8,Total Balance in Partion 9,Total Balance in Partion 10,CITY
0,DEPOSIT,57382271.0,228614.63,278470.85,87122.0,978145.0,2.0,266.0,248.0,15.0,...,67910.0,357115.0,924969.0,1773996.0,3705512.0,7509926.0,11035174.0,16599325.0,15407176.0,Burnaby
0,DEPOSIT,49204637.0,205877.14,268305.09,64903.0,999494.0,0.0,251.0,235.0,12.0,...,49072.0,226898.0,632080.0,1240095.0,2213848.0,4951842.0,8452206.0,12885013.0,18552585.0,Vancouver
0,DEPOSIT,71875141.0,271226.95,315811.72,91486.0,991618.0,3.0,279.0,265.0,14.0,...,133852.0,354043.0,935257.0,1767951.0,4430235.0,9475990.0,14523903.0,20043506.0,20202538.0,Whistler
0,DEPOSIT,56917471.0,232316.21,293072.82,68385.0,973124.0,1.0,260.0,243.0,15.0,...,84005.0,329079.0,712850.0,1308005.0,3007814.0,6799958.0,9799915.0,16909636.0,17965109.0,Kelowna


In [237]:
from scipy import stats
import numpy as np

def one_sample_statistical_test(values, popmean=0, alpha=0.05):
    """
    Performs a one-sample t-test on a list of numeric values.
    
    Args:
        values (list or array): The numeric data to test
        popmean (float): The population mean to test against (default is 0)
        alpha (float): Significance level (default is 0.05)
        
    Returns:
        dict: {
            'mean': float,
            'test': 'one-sample t-test',
            't_statistic': float,
            'p_value': float,
            'significant': bool
        }
    """
    values = np.array(values)
    t_stat, p_value = stats.ttest_1samp(values, popmean, nan_policy='omit')
    
    return {
        'mean': np.nanmean(values),
        'test': 'one-sample t-test',
        't_statistic': t_stat,
        'p_value': p_value,
        'significant': p_value < alpha
    }

one_sample_statistical_test(v['MEAN'].tolist())

{'mean': np.float64(234508.73077262472),
 'test': 'one-sample t-test',
 't_statistic': np.float64(17.29000274692827),
 'p_value': np.float64(0.0004215801967977757),
 'significant': np.True_}

[228614.62549800795, 205877.14225941422, 271226.94716981135, 232316.2081632653]

In [234]:
v.T

Unnamed: 0,0,0.1,0.2,0.3
CALCULATED_COLUMN,DEPOSIT,DEPOSIT,DEPOSIT,DEPOSIT
SUM,57382271.00,49204637.00,71875141.00,56917471.00
MEAN,228614.63,205877.14,271226.95,232316.21
STD_DEV,278470.85,268305.09,315811.72,293072.82
MEDIAN,87122.00,64903.00,91486.00,68385.00
MAX,978145.00,999494.00,991618.00,973124.00
MIN,2.00,0.00,3.00,1.00
TOTAL_RECORDS,266.00,251.00,279.00,260.00
UNIQUE_RECORDS,248.00,235.00,265.00,243.00
NA_RECORDS,15.00,12.00,14.00,15.00


In [210]:
%history -f text2.txt