# Demographic Data in the Cluster Weighted by Zipcode Population

In [1]:
import numpy as np
import pandas as pd
import copy

### Read the Cluster Dataframe

In [2]:
df = pd.read_csv("labeled SD zipcode 6 clusters.csv")
df['Female %'] = 100 - df['Male %']
df

Unnamed: 0,Zipcode,Population,Land Area (Sq. Miles),Population Density (People per Square Mile),Median Age,Population By Age % (Under 18 Years),Population By Age % (18 to 34),Population By Age % (35 to 64),Population By Age % (65 and Over),Male %,...,Motorcycle %,Bicycle %,Walked %,Other %.1,Worked at Home %,Health Insurance Coverage %,Married %,Median House Value $,label,Female %
0,91902,17759.0,11.22,1583.23,45.8,17.62,20.69,38.98,22.72,48.33,...,0.16,0.37,0.00,1.53,10.71,94.78,52.66,732000.0,1,51.67
1,91910,76291.0,12.59,6061.25,38.2,21.70,23.58,39.95,14.77,49.92,...,0.52,0.29,0.95,0.57,8.75,91.27,48.09,549100.0,3,50.08
2,91911,88589.0,12.73,6960.37,35.8,24.01,24.85,38.28,12.86,48.26,...,0.18,0.19,1.23,1.04,5.67,90.78,49.91,485300.0,3,51.74
3,91913,53725.0,9.10,5906.59,35.0,25.91,24.07,39.45,10.58,50.70,...,0.26,0.00,1.29,2.28,11.35,93.92,53.91,578600.0,1,49.30
4,91914,17742.0,6.64,2670.34,37.1,25.17,20.57,44.35,9.92,52.68,...,0.08,0.43,7.09,3.51,10.57,93.01,61.59,739300.0,1,47.32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,92081,29944.0,8.94,3351.09,37.2,22.70,23.30,40.92,13.08,51.13,...,0.95,0.11,1.22,0.44,12.79,91.90,49.03,599400.0,1,48.87
92,92083,38875.0,5.55,7008.14,33.1,26.01,27.68,36.28,10.03,50.05,...,0.65,0.18,1.25,2.40,5.81,86.26,47.16,467600.0,3,49.95
93,92084,49759.0,28.34,1755.69,35.1,23.92,25.97,37.22,12.89,51.87,...,0.03,0.28,0.78,0.84,9.61,87.58,47.87,600600.0,1,48.13
94,92091,1331.0,1.29,1031.56,58.2,14.73,10.74,39.52,35.01,43.35,...,0.00,0.00,0.00,0.00,22.47,93.09,49.87,1359400.0,0,56.65


In [3]:
sum_values = ['label',
              'Population', 
              'Land Area (Sq. Miles)', 
              'Population Density (People per Square Mile)']

weighted_df = df.filter(items=sum_values).groupby('label').sum()

weighted_df

Unnamed: 0_level_0,Population,Land Area (Sq. Miles),Population Density (People per Square Mile)
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,597291.0,535.62,48550.53
1,1080171.0,490.93,92181.71
2,107883.0,1738.57,1214.26
3,1219481.0,403.93,146298.97
4,410933.0,103.01,81906.77
5,2779.0,914.88,3.04


### Function to Transform a Variable Group into Proportions

In [4]:
def group_to_proportions(cols_lst):
    
    copy_df = copy.deepcopy(df).filter(items=['label', 'Population'] + cols_lst)

    for col in cols_lst:
        copy_df[col] *= copy_df['Population']

    copy_df = copy_df.groupby('label').sum()
    
    if len(cols_lst) > 1:
        copy_df['total'] = copy_df[cols_lst[0]]
        for i in range(1, len(cols_lst)):
            copy_df['total'] += copy_df[cols_lst[i]]

        for col in cols_lst:
            copy_df[col] /= copy_df['total']
            copy_df[col] *= 100
    
        copy_df = round(copy_df, 2)
        
    elif len(cols_lst) == 1:
        copy_df[cols_lst[0]] /= copy_df['Population']
        copy_df[cols_lst[0]] = round(copy_df[cols_lst[0]], 2)
    
    for col in cols_lst:
        weighted_df[col] = copy_df[col]


### Convert Groups of Demographic Features into Proportions Using the Function Above

In [5]:
# Median Age
median_age = ['Median Age']
group_to_proportions(median_age)

In [6]:
# Percentages of Range of Ages
age_percent = ['Population By Age % (Under 18 Years)', 
               'Population By Age % (18 to 34)', 
               'Population By Age % (35 to 64)', 
               'Population By Age % (65 and Over)']
group_to_proportions(age_percent)

In [7]:
# Pecentages of Genders
gender_percent = ['Male %', 'Female %']
group_to_proportions(gender_percent)

In [8]:
# Pecentages of Ethnicity
ethnicity_percent = ['White %', 'Black %', 'Asian %', 'Hispanic %', 'Other %']
group_to_proportions(ethnicity_percent)

In [9]:
# Pecentages of Levels of Education
edu_percent = ['Less Than High School %', 
               'High School Grad or Higher %', 
               'Bachelor\'s Degree or Higher %']
group_to_proportions(edu_percent)

In [10]:
# Median Household Income
income = ['Media Household Income $']
group_to_proportions(income)

In [11]:
# Gini Index
gini = ['Gini Index']
group_to_proportions(gini)

In [12]:
# Pecentage of Unemployment
unemploy = ['Unemployed % ']
group_to_proportions(unemploy)

In [13]:
# Percentage of Poverty
poverty = ['Poverty %']
group_to_proportions(poverty)

In [14]:
# Average Commute to Work in Minutes
commute_len = ['Average Commute to Work (min)']
group_to_proportions(commute_len)

In [15]:
# Percentages of Means of Commute
commute_means = ['Drove Alone %', 'Carpooled %', 
                 'Public Transit %', 'Motorcycle %', 
                 'Bicycle %', 'Walked %', 'Other %.1', 'Worked at Home %']
group_to_proportions(commute_means)

In [16]:
# Pecentage of Health Insurance Coverage
health_cover = ['Health Insurance Coverage %']
group_to_proportions(health_cover)

In [17]:
# Pecentage Married
married = ['Married %']
group_to_proportions(married)

In [18]:
# Median House Value
house_value = ['Median House Value $']
group_to_proportions(house_value)

In [19]:
weighted_df

Unnamed: 0_level_0,Population,Land Area (Sq. Miles),Population Density (People per Square Mile),Median Age,Population By Age % (Under 18 Years),Population By Age % (18 to 34),Population By Age % (35 to 64),Population By Age % (65 and Over),Male %,Female %,...,Carpooled %,Public Transit %,Motorcycle %,Bicycle %,Walked %,Other %.1,Worked at Home %,Health Insurance Coverage %,Married %,Median House Value $
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,597291.0,535.62,48550.53,41.86,23.25,17.5,42.33,16.91,49.28,50.72,...,6.19,0.96,0.4,0.53,1.5,0.9,19.56,96.27,59.61,962804.79
1,1080171.0,490.93,92181.71,37.17,22.57,24.45,38.43,14.55,49.92,50.08,...,8.17,1.53,0.44,0.3,1.24,1.15,11.62,93.53,52.28,614750.49
2,107883.0,1738.57,1214.26,43.09,21.91,19.49,39.97,18.62,51.7,48.3,...,6.72,0.24,0.26,0.0,1.3,0.55,10.71,93.22,58.02,584929.67
3,1219481.0,403.93,146298.97,34.43,16.95,19.65,26.13,37.27,50.07,49.93,...,10.93,4.23,0.41,0.27,2.06,1.08,7.51,89.41,46.58,496914.38
4,410933.0,103.01,81906.77,35.82,12.76,36.55,36.98,13.72,52.34,47.66,...,5.65,3.29,0.55,1.61,5.13,1.34,17.06,94.14,40.11,837774.98
5,2779.0,914.88,3.04,62.8,10.69,9.86,32.82,46.64,42.75,57.25,...,0.1,0.0,0.0,0.0,0.2,0.0,35.98,94.6,69.04,320600.0


### Fix Column Names

In [20]:
# 'Other %' -> 'Other Ethnicity %'
weighted_df = weighted_df.rename(columns={'Other %': 'Other Ethnicity %'})

# 'Unemployed % ' -> 'Unemployed %'
weighted_df = weighted_df.rename(columns={'Unemployed % ': 'Unemployed %'})

# 'Unemployed % ' -> 'Unemployed %'
weighted_df = weighted_df.rename(columns={'Unemployed % ': 'Unemployed %'})

# 'Other %.1' -> 'Other Means of Commute %'
weighted_df = weighted_df.rename(columns={'Other %.1': 'Other Means of Commute %'})

# 'Media Household Income $' -> 'Median Household Income $'
weighted_df = weighted_df.rename(columns={'Media Household Income $': 
                                          'Median Household Income $'})

In [21]:
weighted_df

Unnamed: 0_level_0,Population,Land Area (Sq. Miles),Population Density (People per Square Mile),Median Age,Population By Age % (Under 18 Years),Population By Age % (18 to 34),Population By Age % (35 to 64),Population By Age % (65 and Over),Male %,Female %,...,Carpooled %,Public Transit %,Motorcycle %,Bicycle %,Walked %,Other Means of Commute %,Worked at Home %,Health Insurance Coverage %,Married %,Median House Value $
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,597291.0,535.62,48550.53,41.86,23.25,17.5,42.33,16.91,49.28,50.72,...,6.19,0.96,0.4,0.53,1.5,0.9,19.56,96.27,59.61,962804.79
1,1080171.0,490.93,92181.71,37.17,22.57,24.45,38.43,14.55,49.92,50.08,...,8.17,1.53,0.44,0.3,1.24,1.15,11.62,93.53,52.28,614750.49
2,107883.0,1738.57,1214.26,43.09,21.91,19.49,39.97,18.62,51.7,48.3,...,6.72,0.24,0.26,0.0,1.3,0.55,10.71,93.22,58.02,584929.67
3,1219481.0,403.93,146298.97,34.43,16.95,19.65,26.13,37.27,50.07,49.93,...,10.93,4.23,0.41,0.27,2.06,1.08,7.51,89.41,46.58,496914.38
4,410933.0,103.01,81906.77,35.82,12.76,36.55,36.98,13.72,52.34,47.66,...,5.65,3.29,0.55,1.61,5.13,1.34,17.06,94.14,40.11,837774.98
5,2779.0,914.88,3.04,62.8,10.69,9.86,32.82,46.64,42.75,57.25,...,0.1,0.0,0.0,0.0,0.2,0.0,35.98,94.6,69.04,320600.0


### Export the Weighted Clusters DataFrame

In [22]:
# Export the dataframe as csv and Excel files
csv_filename = '6 clusters weighted by zipcode population.csv'
excel_filename = '6 clusters weighted by zipcode population.xlsx'

weighted_df.to_csv(csv_filename, header=True)
weighted_df.to_excel(excel_filename, header=True)