In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import shapiro
from scipy.stats import mannwhitneyu
from scipy.stats import trimboth
import itertools
from functools import reduce
import warnings
warnings.filterwarnings(action='once')

In [2]:
#Load the enriched records, keep only metro groups holding at least 30 records,
#and derive a short 'Metro' label: the text before 'Metropolitan', stripped of
#surrounding whitespace.
df_us = (
    pd.read_csv('10_11_20_enriched.csv')
    .groupby('StdGeographyName')
    .filter(lambda grp: len(grp) >= 30)
    .assign(
        Metro = lambda kept: kept['StdGeographyName']
        .str.split('Metropolitan', expand = True)[0]
        .str.strip()
    )
)

In [3]:
class Metro():
    '''Holds the records of one metro together with the numeric column under study.

    Construction stores the frame and the column name, records the metro label
    taken from the frame's 'Metro' column, and runs a Shapiro-Wilk normality
    check whose outcome is kept in `sw`.
    '''

    def __init__(self, df, col):
        #Frame already restricted to the metro of interest.
        self.df = df
        #Name of the numeric column to evaluate.
        self.col = col
        #Metro label: first distinct value found in the 'Metro' column.
        self.geog = df.Metro.unique()[0]
        #Normality flag for this metro's values (see shap_wilk).
        self.sw = self.shap_wilk()

    def shap_wilk(self, a = .05, d = None):
        '''Runs the Shapiro-Wilk normality test on the studied column.

        a: significance level the p-value is compared against.
        d: optional alternative frame to test instead of self.df; this lets
           CompareMetro test its second frame with the same column.

        Returns True when the p-value exceeds `a` (treated as Gaussian),
        otherwise False. Called once during __init__.
        '''
        frame = self.df if d is None else d
        _, p_value = shapiro(frame.loc[:, self.col])
        return bool(p_value > a)

    def plt_met(self):
        '''Draws a histogram of the studied column for this metro, coloured by 'Metro'.'''
        sns.set_style('whitegrid')
        sns.displot(
            self.df,
            x = self.col,
            hue = 'Metro',
            kind = 'hist',
            fill = True,
            height = 6,
            aspect = 1,
        )
    
class CompareMetro(Metro):
    '''Compares the studied column between two metros.

    Extends Metro with a second frame (`df2`), its metro label (`geog2`), its
    Shapiro-Wilk flag (`sw2`), and the outcome of a Mann-Whitney U comparison:
      mw       True when the distributions are treated as the same.
      mw_pval  p-value of the last Mann-Whitney test that was run.
      mw_trim  True when the untrimmed test rejected and the trimmed re-test ran.
    '''

    def __init__(self, df, col, df2):
        super().__init__(df, col)
        #DF segmented by metro to compare against first DF.
        self.df2 = df2
        #Name of second metro comparing against the first.
        self.geog2 = df2.Metro.unique()[0]
        self.sw2 = self.shap_wilk(d = self.df2)
        #Must be initialised BEFORE man_whit() runs: man_whit() sets this flag, and
        #assigning False afterwards would erase the record that trimming happened.
        self.mw_trim = False
        #True = distributions the same. False = different.
        self.mw, self.mw_pval = self.man_whit()

    def man_whit(self, a = .05, t_amt = .025):
        '''Performs Mann-Whitney U test to see if distributions are the same.

        If the test on the full samples rejects (p-value not above `a`), both
        samples are trimmed by the proportion `t_amt` at each end and the test
        is repeated once on the trimmed values.

        a: significance level the p-value is compared against.
        t_amt: proportion cut from each end of both samples before the re-test.

        Returns (same, p_value): `same` is True when the last test run gave a
        p-value above `a`; `p_value` comes from that same last test. Also sets
        self.mw_trim to reflect whether this call had to trim.
        '''
        #NOTE(review): relies on scipy's default `alternative` for mannwhitneyu;
        #the default differs between SciPy releases - confirm it is the one intended.
        first_vals = self.df.loc[:, self.col]
        second_vals = self.df2.loc[:, self.col]
        #Reset on every call so the flag always describes the most recent test.
        self.mw_trim = False
        _, mw_p = mannwhitneyu(first_vals, second_vals)
        if mw_p > a:
            return True, mw_p
        #Full samples differ: retry once with the distribution ends trimmed.
        self.mw_trim = True
        x1 = trimboth(first_vals.to_numpy(), t_amt)
        x2 = trimboth(second_vals.to_numpy(), t_amt)
        _, mw_p = mannwhitneyu(x1, x2)
        return bool(mw_p > a), mw_p

    def plt_dists(self):
        '''Helper function to quickly plot comparison of original distributions.'''
        sns.set_style('whitegrid')
        #Stack both metros so seaborn can colour the histogram by 'Metro'.
        conc = pd.concat([self.df, self.df2], axis = 0)
        sns.displot(conc, x = self.col, hue = 'Metro', kind = 'hist', fill = True, height = 6, aspect = 1)
        
def pair_combo(l):
    '''Builds a list holding every unordered pair (2-tuple) of items from `l`,
    in the order itertools.combinations yields them.'''
    pairings = itertools.combinations(l, 2)
    return [*pairings]

def mean_median(df, cols, med = False):
    '''Aggregates a numeric column per category and returns a 2 column df.

    df: source frame.
    cols: list of two column names; cols[0] is the category to group by and
          cols[1] is the number to be aggregated.
    med: when True the per-category median is returned, otherwise the mean.

    The result is sorted ascending by the aggregated column and has the
    category back as a regular column. Means are rounded to 2 decimals;
    medians are returned unrounded.
    '''
    grouped = df[cols].groupby(cols[0])
    if med:
        #Only the median is needed, so the mean is never computed on this path.
        return grouped.median().sort_values(cols[1]).reset_index()
    res = grouped.mean().sort_values(cols[1]).reset_index()
    res.loc[:, cols[1]] = res.loc[:, cols[1]].round(2)
    return res

In [4]:
#Every unordered pair of distinct metro names (len(pairs) was 496 in the recorded run).
pairs = pair_combo(df_us['Metro'].unique())

In [5]:
#Metros whose values passed the Shapiro-Wilk check. A name is appended once per
#pair it appears in, so entries repeat; view it through set(normal).
normal = []
#Pairs the Mann-Whitney U test treats as having the same distribution.
mw_test_same = []
#Pairs for which the CompareMetro instance reported its mw_trim flag.
mw_same_trim = []

#Parallel lists describing each matching pair: both metro names and the p-value.
match1 = []
match2 = []
pval = []

#Split the frame by metro once up front instead of boolean-filtering the whole
#of df_us twice for every pair inside the loop.
metro_frames = {name: frame for name, frame in df_us.groupby('Metro')}

for p in pairs:
    #Unpacking pair
    first, second = p
    #Creating comparison instance.
    cm_inst = CompareMetro(
        metro_frames[first],
        'Total Compensation',
        metro_frames[second]
    )
    if cm_inst.sw:
        normal.append(first)
    if cm_inst.sw2:
        normal.append(second)
    if cm_inst.mw:
        mw_test_same.append(p)
        match1.append(cm_inst.geog)
        match2.append(cm_inst.geog2)
        pval.append(cm_inst.mw_pval)
    if cm_inst.mw_trim:
        mw_same_trim.append(p)

  return atmp[sl]


In [85]:
#Number of metro pairs the Mann-Whitney U test treats as the same (107 in the recorded run).
len(mw_test_same) #107
#Other checks from the recorded run, kept commented out:
# set(normal) #Only Pittsburgh Metro is normally distributed.
# len(mw_same_trim) #No arrays were trimmed to support analyses.
#NOTE(review): re-check the trimming count before relying on it - the stray
#`return atmp[sl]` fragment left in the loop cell's output looks like a warning
#raised from scipy's trimboth, which would mean trimming did run. Confirm.
# pval #107 values.
# pval #107 values.

107

In [64]:
#Grouping column first, aggregated column second.
tc = ['Metro', 'Total Compensation']
#One row per metro: mean, median and record count of total compensation,
#joined on the metro name.
compare = (
    mean_median(df_us, tc)
    .rename(columns = {tc[1]: f'Mean {tc[1]}'})
    .merge(
        mean_median(df_us, tc, med = True).rename(columns = {tc[1]: f'Median {tc[1]}'}),
        on = tc[0],
    )
    .merge(
        df_us[tc].groupby(tc[0]).count().reset_index().rename(columns = {tc[1]: 'n'}),
        on = tc[0],
    )
)

In [87]:
#Columns to carry over into the comparison table: the metro name, a fixed set
#of named columns, and every df_us column whose name contains 'sqmi'.
cols = ['Metro', 'CRMCYTOTC', 'DIVINDX_CY', 'X9001_I', 'HAI_CY', 'SQMI']
cols += [name for name in df_us.columns if 'sqmi' in name]

In [89]:
#Attach the columns listed in `cols` to the comparison table, joined on 'Metro'.
#This cell reassigns `compare` from itself, so any `cols` column already present
#on `compare` (the cell was run before) is dropped first; re-running therefore
#gives the same table instead of piling up suffixed '_x'/'_y' duplicates.
#NOTE(review): assumes df_us[cols] holds one distinct row per metro; if a metro
#has several distinct rows the merge repeats that metro - confirm.
compare = pd.merge(
    compare.drop(columns = [c for c in cols if c != 'Metro' and c in compare.columns]),
    df_us[cols].drop_duplicates(),
    on = 'Metro'
)

In [8]:
#Edge list for a graph: one row per metro pair the Mann-Whitney U test matched,
#with the p-value of that match.
matches = pd.DataFrame(dict(Match1 = match1, Match2 = match2, pval = pval))

In [90]:
#Save the per-metro comparison table for review; the index is not written.
compare.to_csv(path_or_buf = 'comparisons.csv', index = False)

In [91]:
#Save the matched-pair edge list; the index is not written.
matches.to_csv(path_or_buf = 'matches.csv', index = False)