# Age & Sex Statistics

## Import Needed Libraries

In [310]:
import pandas as pd
from openpyxl import workbook, load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
import numpy as np
import os
import math

## Loading file to dataframes

List of dataframes and their proper order

In [311]:
# Sheet names in the order the dataframes are stored: the three
# all-population sheets first, then a total / male / female triple
# for every group suffix.
df_names = ['DF_total_all', 'DF_male_all', 'DF_female_all'] + [
    f'DF_total_{sex_part}{group}'
    for group in ('whi', 'baa', 'aian', 'aa', 'nhop', 'sor', 'tom', 'hol')
    for sex_part in ('', 'male_', 'female_')
]

### Variables for column names and type

In [312]:
# Age-count column labels, youngest to oldest. The labels from 97 upward
# carry a double space before "Years"; that spelling is kept verbatim so the
# names keep matching the sheet headers they are compared against.
cols_int = (
    ['Under 1 Year', '1 Year']
    + [f'{age} Years' for age in range(2, 97)]
    + [f'{age}  Years' for age in range(97, 100)]
    + ['100 to 104  Years', '105 to 109  Years', '110  Years and Over']
)

# Identifier columns that are cast to str when the sheets are loaded
cols_str = 'Location State County FIPS'.split()

### Loading process

In [313]:
# Dictionary to store the dataframes, keyed by sheet name
dfs = {}

# Get the path of the current working directory
cwd = os.getcwd()

# Construct the full path to the .xlsx file
file_path = os.path.join(cwd, '2020_agesex_data.xlsx')

# Open the workbook ONCE and pull every required sheet in a single call.
# Passing a list to sheet_name returns a dict {sheet name: DataFrame};
# header=None keeps the header row as the first data row.
raw_sheets = pd.read_excel(file_path, sheet_name=df_names, header=None)

for name in df_names:
    raw = raw_sheets[name]

    # Build a fresh frame from the data rows (not a view of `raw`), so the
    # column assignments below never write into a slice, then promote the
    # first row to column headers.
    df = raw.iloc[1:].reset_index(drop=True)
    df.columns = raw.iloc[0]

    # Cast only the columns that are actually present in this sheet:
    # age counts become int, identifier columns become str.
    dtype_map = {column: int for column in cols_int if column in df.columns}
    dtype_map.update({column: str for column in cols_str if column in df.columns})
    df = df.astype(dtype_map)

    dfs[name] = df

### Verifying

Print the name and shape of every dataframe that was loaded

In [None]:
# For every loaded sheet, show its name followed by its shape
print("Data Frames in 'dfs':")
for sheet_name, frame in dfs.items():
    print(sheet_name)
    print(frame.shape)

Print all the names & dtype of the dataframes loaded in

In [None]:
# For every loaded sheet, show its name followed by the dtype of each column
print("Data Frames in 'dfs':")
for sheet_name, frame in dfs.items():
    print(sheet_name)
    print(frame.dtypes)

Print all the column names of the dataframes loaded in

In [None]:
# One line per sheet listing its column labels
for sheet_name in dfs:
    column_labels = list(dfs[sheet_name].columns)
    print(f"Columns of dataframe '{sheet_name}': {column_labels}")

## Building Percent to Total column
Percent to total for male & female age groups

### Grouping Dataframes with like structures

In [306]:
# Sex-specific sheets (male then female for each group); these receive the
# percent-of-total columns.
perc_df = ['DF_male_all', 'DF_female_all'] + [
    f'DF_total_{sex}_{group}'
    for group in ('whi', 'baa', 'aian', 'aa', 'nhop', 'sor', 'tom', 'hol')
    for sex in ('male', 'female')
]

# Combined-sex sheets, one per group
perc_df_2 = ['DF_total_all'] + [
    f'DF_total_{group}'
    for group in ('whi', 'baa', 'aian', 'aa', 'nhop', 'sor', 'tom', 'hol')
]

### Finding percent to total

In [None]:
# For every sheet listed in perc_df, add a '<age>_perc' column next to each
# age-count column holding that count as a fraction of the row's 'Total'.
for df_name, df in dfs.items():
    if df_name not in perc_df:
        continue

    # Age-count columns present in this sheet, in sheet order
    columns = [col for col in df.columns if col in cols_int]
    perc_names = [f'{col}_perc' for col in columns]

    # Discard shares left by an earlier run of this cell so that re-running
    # it recomputes them instead of duplicating the '_perc' columns.
    base = df.drop(columns=perc_names, errors='ignore').reset_index(drop=True)

    # 'Total' is not among the columns cast at load time, so coerce it here.
    total = pd.to_numeric(base['Total'])

    # Compute every share in one vectorised division, then force rows whose
    # total is 0 to a share of 0 (the division itself gives NaN/inf there).
    shares = base[columns].div(total, axis=0)
    shares.loc[total == 0] = 0
    shares.columns = perc_names

    # Final layout: count, share, count, share, ... then the trailing columns
    updated_columns = [label for pair in zip(columns, perc_names) for label in pair]
    updated_columns.extend(['Total', 'Location', 'State', 'County', 'FIPS'])

    # Attach all share columns with a single concat (instead of 100+ separate
    # inserts, which fragments the frame) and store the reordered result.
    dfs[df_name] = pd.concat([base, shares], axis=1)[updated_columns]

### Verifying

In [183]:
# Show how many columns 'DF_male_all' has now, followed by their labels
male_all_columns = dfs['DF_male_all'].columns.tolist()
print(len(male_all_columns))
print(male_all_columns)

211
['Under 1 Year', 'Under 1 Year_perc', '1 Year', '1 Year_perc', '2 Years', '2 Years_perc', '3 Years', '3 Years_perc', '4 Years', '4 Years_perc', '5 Years', '5 Years_perc', '6 Years', '6 Years_perc', '7 Years', '7 Years_perc', '8 Years', '8 Years_perc', '9 Years', '9 Years_perc', '10 Years', '10 Years_perc', '11 Years', '11 Years_perc', '12 Years', '12 Years_perc', '13 Years', '13 Years_perc', '14 Years', '14 Years_perc', '15 Years', '15 Years_perc', '16 Years', '16 Years_perc', '17 Years', '17 Years_perc', '18 Years', '18 Years_perc', '19 Years', '19 Years_perc', '20 Years', '20 Years_perc', '21 Years', '21 Years_perc', '22 Years', '22 Years_perc', '23 Years', '23 Years_perc', '24 Years', '24 Years_perc', '25 Years', '25 Years_perc', '26 Years', '26 Years_perc', '27 Years', '27 Years_perc', '28 Years', '28 Years_perc', '29 Years', '29 Years_perc', '30 Years', '30 Years_perc', '31 Years', '31 Years_perc', '32 Years', '32 Years_perc', '33 Years', '33 Years_perc', '34 Years', '34 Years

### Verifying dataframes after percent of total

Print all the name and shape of the dataframes after percent of total

In [309]:
# Report name, shape and column labels of each sheet in 'dfs' that is listed in perc_df
print("Data Frames in 'perc_df':")
for sheet_name in dfs:
    if sheet_name not in perc_df:
        continue
    frame = dfs[sheet_name]
    print("Dataframe Name:", sheet_name)
    print("Shape:", frame.shape)
    print("Columns:", frame.columns.tolist())

Data Frames in 'perc_df':


## Preparing for weighted average

Removing modified dataframes from dict dfs for further processing

In [308]:
# Start a fresh dictionary for the sheets named in perc_df
dfs_perc = {}

# Transfer each listed sheet out of 'dfs' (it is removed there) into
# 'dfs_perc'; names that are not in 'dfs' are skipped.
for sheet_name in perc_df:
    moved_frame = dfs.pop(sheet_name, None)
    if moved_frame is not None:
        dfs_perc[sheet_name] = moved_frame

Printing list of both dataframe dictionaries to verify last step

We should have 9 dataframes left in dict 'dfs' and 18 in the new dict 'dfs_perc'

In [None]:
# Show size and keys of both dictionaries after the move.
# The second label names 'dfs_perc' (the dictionary actually printed);
# 'perc_df' is only the list of sheet names used to select the frames.
print("Dictionary 'dfs':")
print(len(dfs))
print(list(dfs))

print("\nDictionary 'dfs_perc':")
print(len(dfs_perc))
print(list(dfs_perc))

Dropping columns to new dict with corresponding dataframes

In [None]:
# Columns to split off from the age counts: every '<age>_perc' share column
# (same order as cols_int) followed by the total and identifier columns.
dropped_columns = [f'{age_label}_perc' for age_label in cols_int] + [
    'Total', 'Location', 'State', 'County', 'FIPS'
]

In [291]:
# Per sheet: keep a copy of the columns listed in dropped_columns, then
# replace the sheet in 'dfs_perc' with a version that no longer has them.
dropped_dropped_columns = {}

for sheet_name in list(dfs_perc):
    frame = dfs_perc[sheet_name]

    # Columns being split off, stored under the same sheet name
    dropped_dropped_columns[sheet_name] = frame[dropped_columns]

    # What stays behind in 'dfs_perc'
    dfs_perc[sheet_name] = frame.drop(columns=dropped_columns)

verifying operation

In [None]:
# Column labels remaining in each 'dfs_perc' frame
print("Dictionary 'dfs_perc':")
for df_name, df in dfs_perc.items():
    print(f"DataFrame '{df_name}' column names:")
    print(list(df.columns))

# Column labels that were split off. The dictionary holding them is
# 'dropped_dropped_columns'; no 'dropped_string_columns' is ever defined,
# so referring to that name raised a NameError.
print("\nDictionary 'dropped_dropped_columns':")
for df_name, df in dropped_dropped_columns.items():
    print(f"DataFrame '{df_name}' column names:")
    print(list(df.columns))

Creating variable for weight columns to be moved to new dataframes

In [292]:
# One '<age>_perc' share column per age label, in cols_int order
weight_columns = [f'{age_label}_perc' for age_label in cols_int]

building new dataframes from weight columns to be used for weight calc

In [293]:
# Collect the '<age>_perc' share columns of every sheet into their own
# dictionary of dataframes.
dropped_weight_columns = {}

for df_name, df in dfs_perc.items():
    # The share columns are part of dropped_columns, so an earlier cell has
    # normally already moved them out of 'dfs_perc' into
    # 'dropped_dropped_columns'. Read them from whichever place still has
    # them instead of failing with a KeyError.
    if set(weight_columns).issubset(df.columns):
        source = df
    else:
        source = dropped_dropped_columns[df_name]

    dropped_weight_columns[df_name] = source[weight_columns]

    # Make sure none of the share columns remain in 'dfs_perc'
    # (a no-op when they were already removed).
    dfs_perc[df_name] = df.drop(columns=weight_columns, errors='ignore')

verifying operation

In [None]:
# Column labels remaining in each 'dfs_perc' frame
print("Dictionary 'dfs_perc':")
for df_name, df in dfs_perc.items():
    print(f"DataFrame '{df_name}' column names:")
    print(list(df.columns))

# Column labels that were split off earlier. The dictionary holding them is
# 'dropped_dropped_columns'; 'dropped_string_columns' is never defined and
# raised a NameError here.
print("\nDictionary 'dropped_dropped_columns':")
for df_name, df in dropped_dropped_columns.items():
    print(f"DataFrame '{df_name}' column names:")
    print(list(df.columns))

# Column labels of the share ("weight") frames
print("\nDictionary 'dropped_weight_columns':")
for df_name, df in dropped_weight_columns.items():
    print(f"DataFrame '{df_name}' column names:")
    print(list(df.columns))

Changing column names and types for dict 'dfs_perc'

In [294]:
# Age-count column labels (same value as defined near the top of the notebook)
cols_int = (
    ['Under 1 Year', '1 Year']
    + [f'{age} Years' for age in range(2, 97)]
    + [f'{age}  Years' for age in range(97, 100)]
    + ['100 to 104  Years', '105 to 109  Years', '110  Years and Over']
)

# Integer replacement labels, positionally matched to cols_int: 0-99 for the
# single-year columns, then 102, 107 and 110 for the three grouped columns
# ('100 to 104', '105 to 109', '110 and Over').
cols_mod = [*range(100), 102, 107, 110]


Changing column labels and type of columns in dataframes in dict 'dfs_perc'

In [295]:
# Relabel and cast every frame held in 'dfs_perc', modifying it in place
for frame in dfs_perc.values():
    # Swap the text labels for the integer labels in cols_mod
    frame.columns = cols_mod

    # Ensure all relabelled columns hold integers
    frame[cols_mod] = frame[cols_mod].astype(int)

verify operation

Changing column labels and type of columns in dataframes in dict 'dropped_weight_columns' (this cell is currently disabled)

In [296]:
# NOTE(review): this cell is disabled (every line is commented out).
# The '_perc' columns are count / 'Total' fractions, so the astype(int) below
# would presumably truncate them to 0 -- confirm before re-enabling.
# # Iterate over the dataframes in 'dropped_weight_columns'
# for df_name, df in dropped_weight_columns.items():
#     # Change the column names to the list of integers
#     df.columns = cols_mod
    
#     # Convert the column type to integer
#     df[cols_mod] = df[cols_mod].astype(int)

verify operation

In [None]:
# For each frame in 'dropped_weight_columns', list its column labels and dtypes
for sheet_name in dropped_weight_columns:
    weights_frame = dropped_weight_columns[sheet_name]
    print(f"DataFrame '{sheet_name}' column names:")
    print(list(weights_frame.columns))
    print(f"DataFrame '{sheet_name}' column types:")
    print(weights_frame.dtypes)
    print()

### Building Weighted Average Column

In [None]:
# Add an 'Average_Age' column to every frame in 'dfs_perc': the mean of the
# integer column labels (ages) weighted by the counts stored in each row.
for df_name, df in dfs_perc.items():
    # Use the age columns only; skipping an existing 'Average_Age' column
    # lets this cell be re-run without failing on the int conversion below.
    age_columns = [col for col in df.columns if col != 'Average_Age']

    # Column labels as an integer vector (one age per column)
    values = np.array(age_columns, dtype=int)

    # All rows at once: one row of counts per dataframe row
    weights = df[age_columns].to_numpy(dtype=int)

    # Per row: sum(age * count) and sum(count)
    weighted_sum = weights @ values
    weights_sum = weights.sum(axis=1)

    # Weighted average per row. Rows whose counts sum to 0 keep NaN (the
    # value 0/0 produced before), but without a divide-by-zero warning.
    average_age = np.full(len(df), np.nan)
    np.divide(weighted_sum, weights_sum, out=average_age, where=weights_sum != 0)

    # Add the 'Average_Age' column to the dataframe (in place)
    df['Average_Age'] = average_age

    # Reset the index of the dataframe
    df.reset_index(drop=True, inplace=True)

Verifying that the 'Average_Age' column was added to the dataframes in dictionary 'dfs_perc'

In [298]:
# List the column labels of every dataframe in 'dfs_perc'
for sheet_name in dfs_perc:
    frame = dfs_perc[sheet_name]
    print(f"DataFrame '{sheet_name}' column names:")
    print(list(frame.columns))
    print()

DataFrame 'DF_male_all' column names:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 102, 107, 110, 'Average_Age']

DataFrame 'DF_female_all' column names:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 102, 107, 110, 'Average_Age']

DataFrame 'DF_total_male_whi' column names:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,

verifying values in column of specific dataframe in 'dfs_perc'

In [299]:
# Print 'DF_male_all' from 'dfs_perc', or a notice when it is not there
male_all_frame = dfs_perc.get('DF_male_all')
if male_all_frame is None:
    print("DataFrame 'DF_male_all' does not exist in 'dfs_perc' dictionary.")
else:
    print("DataFrame 'DF_male_all':")
    print(male_all_frame)

DataFrame 'DF_male_all':
        0    1    2    3    4    5    6    7    8    9  ...  94  95  96  97  \
0     437  450  511  484  513  525  576  559  549  564  ...   6   8   3   3   
1      70   70   84   91   80   90  116   67   97   81  ...   0   2   0   0   
2      72   80   83   60   82   78   69   84   67   90  ...   3   1   4   3   
3     157  203  182  171  192  161  202  186  158  184  ...   4   7   0   4   
4      64   51   65   60   62   79   85   71   76   71  ...   2   0   2   0   
...   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ..  ..  ..  ..   
3216   98   86  120  100  109  120  115   96  114   96  ...   3   2   4   0   
3217  183  124  170  184  175  185  185  160  188  197  ...   5   5   2   1   
3218  190  218  223  195  287  247  211  223  237  185  ...  26   6   3   6   
3219   20   10    4   14    9   10   16   17   18   11  ...   0   0   0   0   
3220  593  582  669  639  691  704  658  650  687  618  ...  13   5   3   9   

      98  99  102  107  11