# Age & Sex Statistics

## Import Needed Libraries

In [120]:
import pandas as pd
from openpyxl import workbook, load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
import numpy as np
import os
import math

## Loading file to dataframes

List of dataframes and their proper order

In [121]:
# Ordered sheet names used to label each dataframe read from the workbook:
# the three all-race sheets first, then a total/male/female triple for every
# race/ethnicity code.
_race_codes = ['whi', 'baa', 'aian', 'aa', 'nhop', 'sor', 'tom', 'hol']

df_names = ['DF_total_all', 'DF_male_all', 'DF_female_all']
for _code in _race_codes:
    df_names += [f'DF_total_{_code}',
                 f'DF_total_male_{_code}',
                 f'DF_total_female_{_code}']

### Variables for column names and type

In [122]:
# Age-bucket column labels that get cast to int when the sheets are loaded.
# NOTE: the labels from 97 upward contain a DOUBLE space before "Years"
# (presumably mirroring the workbook headers - verify against the file).
cols_int = (
    ['Under 1 Year', '1 Year']
    + [f'{age} Years' for age in range(2, 97)]
    + [f'{age}  Years' for age in range(97, 100)]
    + ['100 to 104  Years', '105 to 109  Years', '110  Years and Over']
)

# Identifier columns that get cast to str when the sheets are loaded.
cols_str = ['Location', 'State', 'County', 'FIPS']

### Loading process

In [123]:
# Dictionary to store the dataframes, keyed by sheet name
dfs = {}

# The workbook is looked up in the current working directory
cwd = os.getcwd()
file_path = os.path.join(cwd, '2020_agesex_data.xlsx')

# Read every sheet with ONE call: passing a list of sheet names returns a
# dict of dataframes, so the workbook is opened and parsed once instead of
# once per sheet. header=None keeps the header row as data so it can be
# promoted to column labels below.
raw_sheets = pd.read_excel(file_path, sheet_name=df_names, header=None)

for name in df_names:
    raw = raw_sheets[name]

    # Promote the first row to column headers and keep the remaining rows.
    # .copy() detaches the result from 'raw' so the later column writes do
    # not operate on a slice (avoids SettingWithCopyWarning).
    df = raw.iloc[1:].copy()
    df.columns = raw.iloc[0]
    df.reset_index(drop=True, inplace=True)

    # Cast only the columns that are actually present in this sheet:
    # age buckets -> int, identifier columns -> str
    dtype_map = {column: int for column in cols_int if column in df.columns}
    dtype_map.update(
        {column: str for column in cols_str if column in df.columns}
    )
    df = df.astype(dtype_map)

    dfs[name] = df

### Verifying

Print all the names, shapes, and dtypes of the dataframes loaded in

In [None]:
# # Print the names, shapes, and dtypes of the dataframes in 'dfs'
# print("Data Frames in 'dfs':")
# for name, df in dfs.items():
#     print(name)
#     print("Shape:", df.shape)
#     print("Dtypes:")
#     for column, dtype in df.dtypes.items():
#         print(f"{column}: {dtype}")
#     print()

## Building Percent to Total column
Percent to total for male & female age groups

### Grouping Dataframes with like structures

In [None]:
# Race/ethnicity codes shared by both groupings below
_race_codes = ['whi', 'baa', 'aian', 'aa', 'nhop', 'sor', 'tom', 'hol']

# Male/female dataframes (one pair for all races, then one pair per code)
perc_df = ['DF_male_all', 'DF_female_all']
for _code in _race_codes:
    perc_df.extend([f'DF_total_male_{_code}', f'DF_total_female_{_code}'])

# Both-sexes total dataframes (all races first, then one per code)
perc_df_2 = ['DF_total_all'] + [f'DF_total_{_code}' for _code in _race_codes]

### Percent to total process

In [None]:
# Add a '<age>_perc' share-of-'Total' column next to every age column of the
# male/female dataframes listed in 'perc_df', then keep only the age/percent
# pairs followed by the identifier columns.
id_columns = ['Total', 'Location', 'State', 'County', 'FIPS']
age_labels = set(cols_int)  # set lookup instead of scanning the list per column

for df_name in perc_df:
    if df_name not in dfs:
        continue

    # Work on a re-indexed copy so rows line up positionally with the arrays
    df = dfs[df_name].reset_index(drop=True)

    # Age columns present in this dataframe, in their existing order
    columns = [col for col in df.columns if col in age_labels]
    perc_names = [f'{col}_perc' for col in columns]

    # 'Total' is not cast during loading, so coerce it to numbers here
    totals = pd.to_numeric(df['Total']).to_numpy(dtype=float)
    counts = df[columns].to_numpy(dtype=float)

    # Rows with a total of 0 get a share of 0 rather than a division by zero;
    # the divisor is patched to 1 for those rows only to keep numpy quiet.
    zero_total = totals == 0
    divisor = np.where(zero_total, 1.0, totals)[:, None]
    shares = np.where(zero_total[:, None], 0.0, counts / divisor)

    # Attach all percentage columns in one concat instead of inserting them
    # one at a time (which fragments the frame); drop stale ones on a re-run.
    shares_frame = pd.DataFrame(shares, columns=perc_names, index=df.index)
    df = pd.concat(
        [df.drop(columns=perc_names, errors='ignore'), shares_frame], axis=1
    )

    # Interleave each age column with its percentage, identifiers last
    updated_columns = [
        label for pair in zip(columns, perc_names) for label in pair
    ]
    updated_columns.extend(id_columns)

    # Update the dataframe in the 'dfs' dictionary
    dfs[df_name] = df[updated_columns]

### Verifying

Print the name, shape, number of columns, and column names of each dataframe

In [None]:
# # Print the shape of the dataframes in 'dfs'
# print("Data Frames in 'perc_df':")
# for df_name, df in dfs.items():
#     if df_name in perc_df:
#         print("Dataframe Name:", df_name)
#         print("Shape:", df.shape)
#         print(f"Number of Columns: {len(df.columns)}")
#         print("Columns:", df.columns.tolist())

## Building Weighted Average Column for dataframes listed in 'perc_df'

Removing modified dataframes from dict dfs for further processing

In [None]:
# Move every dataframe named in 'perc_df' out of 'dfs' and into the new
# dictionary 'dfs_perc'; names missing from 'dfs' are skipped.
dfs_perc = {
    df_name: dfs.pop(df_name)
    for df_name in perc_df
    if df_name in dfs
}

### Verifying

In [None]:
# # Printing list of both dataframe dictionaries to verify last step

# # We should have 9 in dict 'dfs' and 18 in new dict 'dfs_perc'

# print("Dictionary 'dfs':")
# print(len(dfs.keys()))
# print(list(dfs.keys()))

# print("\nDictionary 'perc_df':")
# print(len(dfs_perc.keys()))
# print(list(dfs_perc.keys()))

### Variables for columns to drop to new dictionary

In [None]:
# Columns set aside before the weighted-average step: every '<age>_perc'
# column followed by the total/identifier columns.
# NOTE: labels from 97 upward keep the double space before "Years".
_age_bucket_labels = (
    ['Under 1 Year', '1 Year']
    + [f'{age} Years' for age in range(2, 97)]
    + [f'{age}  Years' for age in range(97, 100)]
    + ['100 to 104  Years', '105 to 109  Years', '110  Years and Over']
)

columns_to_drop = [f'{label}_perc' for label in _age_bucket_labels]
columns_to_drop += ['Total', 'Location', 'State', 'County', 'FIPS']

### Dropping process

In [None]:
# Split each dataframe in 'dfs_perc' in two: the columns named in
# 'columns_to_drop' are parked in 'dropped_columns', the rest stay in 'dfs_perc'.
dropped_columns = {}

for df_name in list(dfs_perc):
    frame = dfs_perc[df_name]

    # Keep the data of the columns being removed
    dropped_columns[df_name] = frame[columns_to_drop]

    # Replace the entry with a version that no longer has those columns
    dfs_perc[df_name] = frame.drop(columns=columns_to_drop)

#### Verifying

In [None]:
# print("Dictionary 'dfs_perc':")
# for df_name, df in dfs_perc.items():
#     print(f"DataFrame '{df_name}' column names:")
#     print(list(df.columns))

# print("\nDictionary 'dropped_columns':")
# for df_name, df in dropped_columns.items():
#     print(f"DataFrame '{df_name}' column names:")
#     print(list(df.columns))

### Variables for columns names

In [None]:
# Age-bucket labels, redefined with SINGLE spaces throughout.
# NOTE(review): this rebinds 'cols_int'; the earlier definition spells the
# labels from 97 upward with a double space ('97  Years', ...). Re-running the
# loading or percent cells after this one will therefore see different labels
# - confirm which spelling downstream cells expect.
cols_int = (
    ['Under 1 Year', '1 Year']
    + [f'{age} Years' for age in range(2, 100)]
    + ['100 to 104 Years', '105 to 109 Years', '110 Years and Over']
)

# Integer column labels that replace the age-bucket labels, position for
# position: 0-99 for the single-year buckets, then 102, 107 and 110 for the
# three open/grouped buckets.
cols_mod = list(range(100)) + [102, 107, 110]


### Column change process

In [None]:
# Relabel the columns of every dataframe in 'dfs_perc' with the integer
# labels from 'cols_mod' (assigned by position) and cast the data to int.
for df_name in dfs_perc:
    frame = dfs_perc[df_name]

    # Positional relabel: the frame must have exactly len(cols_mod) columns
    frame.columns = cols_mod

    # Cast every relabelled column to integer, in place on the same frame
    as_int = frame[cols_mod].astype(int)
    frame[cols_mod] = as_int

#### Verifying

In [None]:
# # Iterate over the dataframes in 'dfs_perc'
# for df_name, df in dfs_perc.items():
#     print(f"DataFrame '{df_name}' column names:")
#     print(list(df.columns))
#     print(f"DataFrame '{df_name}' column types:")
#     print(df.dtypes)
#     print()

## Finding weighted Average

In [None]:
# Compute a weighted average per row for each dataframe in 'dfs_perc': the
# integer column labels are the values and the cell entries are the weights.
# The result is stored in a new 'Average_Age' column.
for df_name, df in dfs_perc.items():
    # Skip a previously computed result so the cell can be re-run safely
    age_cols = [col for col in df.columns if col != 'Average_Age']

    # Column labels as an integer vector, cell entries as an integer matrix
    values = np.array(age_cols, dtype=np.int64)
    weights = df[age_cols].to_numpy(dtype=np.int64)

    # Row-wise sum(value * weight) and sum(weight), computed for all rows at once
    weighted_sum = weights @ values
    weights_sum = weights.sum(axis=1)

    # Rows whose weights sum to 0 get NaN instead of dividing by zero
    # (same convention as the weighted-average cell for the total dataframes)
    results = np.full(len(df), np.nan, dtype=float)
    np.divide(weighted_sum, weights_sum, out=results, where=weights_sum != 0)

    # Add the 'Average_Age' column to the dataframe
    df['Average_Age'] = results

    # Reset the index of the dataframe
    df.reset_index(drop=True, inplace=True)

### Verifying

In [None]:
# # Iterate over the dictionaries in 'dropped_string_columns'
# for df_name, df_dict in dfs_perc.items():
#     print(f"DataFrame '{df_name}' column names:")
#     print(list(df_dict.keys()))
#     print()

Verifying the values of a specific dataframe in 'dfs_perc'

In [None]:
# # Check if 'DF_male_all' exists in 'dfs_perc' dictionary
# if 'DF_male_all' in dfs_perc:
#     print("DataFrame 'DF_male_all':")
#     print(dfs_perc['DF_male_all'])
# else:
#     print("DataFrame 'DF_male_all' does not exist in 'dfs_perc' dictionary.")

## Joining Dataframes

In [None]:
# Re-attach the columns parked in 'dropped_columns' to the right-hand side of
# the matching dataframe in 'dfs_perc', then renumber the rows from 0.
for df_name in list(dfs_perc):
    pieces = [dfs_perc[df_name], dropped_columns[df_name]]

    # Side-by-side (column-wise) concatenation with a fresh index
    dfs_perc[df_name] = pd.concat(pieces, axis=1).reset_index(drop=True)

### Verifying

In [None]:
# # Print the names, shapes, and dtypes of the dataframes in 'dfs_perc'
# print("Data Frames in 'dfs_perc':")
# for name, df in dfs_perc.items():
#     print(name)
#     print("Shape:", df.shape)
#     print("Dtypes:")
#     for column, dtype in df.dtypes.items():
#         print(f"{column}: {dtype}")
#     print()

Selecting a specific dataframe to verify information has migrated correctly

In [None]:
# # Temp option to see all columns
# with pd.option_context('display.max_columns', None):
#     print(dfs_perc['DF_male_all'])

        0    1    2    3    4    5    6    7    8    9   10   11   12   13  \
0     437  450  511  484  513  525  576  559  549  564  573  595  594  597   
1      70   70   84   91   80   90  116   67   97   81   97   83   85  113   
2      72   80   83   60   82   78   69   84   67   90   82   79   84  102   
3     157  203  182  171  192  161  202  186  158  184  211  220  208  213   
4      64   51   65   60   62   79   85   71   76   71   78   85   55   92   
...   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
3216   98   86  120  100  109  120  115   96  114   96  114  113  127  117   
3217  183  124  170  184  175  185  185  160  188  197  189  181  227  181   
3218  190  218  223  195  287  247  211  223  237  185  228  244  232  248   
3219   20   10    4   14    9   10   16   17   18   11   12   18   18   19   
3220  593  582  669  639  691  704  658  650  687  618  670  700  755  720   

       14   15   16   17   18   19   20   21   22   23   24   2

## Joining Dictionaries

In [None]:
# Fold 'dfs_perc' back into 'dfs', visiting names in 'df_names' order.
# A name already present in 'dfs' has the incoming frame concatenated to its
# right; a new name is simply added (appended after the existing keys).
for df_name in df_names:
    if df_name not in dfs_perc:
        continue

    incoming = dfs_perc[df_name]
    if df_name in dfs:
        dfs[df_name] = pd.concat([dfs[df_name], incoming], axis=1)
    else:
        dfs[df_name] = incoming

### Verifying

Print the number of dataframes, then the name, shape, number of columns, and column names of each

In [None]:
# # Print the number of dataframes in the 'dfs' dictionary
# print("Number of Dataframes in 'dfs':", len(dfs))

# # Print the shape of the dataframes in 'dfs'
# print("Data Frames in 'dfs':")
# for df_name, df in dfs.items():
#     print("Dataframe Name:", df_name)
#     print("Shape:", df.shape)
#     print(f"Number of Columns: {len(df.columns)}")
#     print("Columns:", df.columns.tolist())

## Copying Columns

Copy needed cols from sex dfs to race df

In [None]:
# Map each both-sexes dataframe to its [male, female] counterparts
column_map = {'DF_total_all': ['DF_male_all', 'DF_female_all']}
for _code in ['whi', 'baa', 'aian', 'aa', 'nhop', 'sor', 'tom', 'hol']:
    column_map[f'DF_total_{_code}'] = [f'DF_total_male_{_code}',
                                       f'DF_total_female_{_code}']

# Copy the male and female 'Total' columns into the both-sexes dataframe.
# The new column is named after the source dataframe minus its 'DF_' prefix
# (first three characters), e.g. 'DF_male_all' -> 'male_all'.
for key, (male_name, female_name) in column_map.items():
    target = dfs[key]
    target[male_name[3:]] = dfs_perc[male_name]['Total'].copy()
    target[female_name[3:]] = dfs_perc[female_name]['Total'].copy()

### Verifying

In [None]:
# # Verify the outcome
# for key, df in dfs.items():
#     print(f"DataFrame '{key}' column names:")
#     print(df.columns)

## Copying Columns

Copy needed cols from race df to total df

In [None]:
# 'DF_total_all' receives one 'Total' column from each race/ethnicity dataframe
column_map = {
    'DF_total_all': [
        f'DF_total_{_code}'
        for _code in ['whi', 'baa', 'aian', 'aa', 'nhop', 'sor', 'tom', 'hol']
    ]
}

# Copy each source 'Total' into the target under the source's name minus the
# 'DF_' prefix, e.g. 'DF_total_whi' -> 'total_whi'.
for key, value in column_map.items():
    target = dfs[key]
    for df_name in value:
        target[df_name[3:]] = dfs[df_name]['Total'].copy()

### Verifying

In [None]:
# # Verify the outcome
# print(f"DataFrame 'DF_total_all' column names:")
# print(dfs['DF_total_all'].columns)

## Adding Age totals for races from age DF to race DF

In [None]:
# Map each both-sexes dataframe to its [male, female] counterparts
column_map = {
    'DF_total_all': ['DF_male_all', 'DF_female_all'],
    'DF_total_whi': ['DF_total_male_whi', 'DF_total_female_whi'],
    'DF_total_baa': ['DF_total_male_baa', 'DF_total_female_baa'],
    'DF_total_aian': ['DF_total_male_aian', 'DF_total_female_aian'],
    'DF_total_aa': ['DF_total_male_aa', 'DF_total_female_aa'],
    'DF_total_nhop': ['DF_total_male_nhop', 'DF_total_female_nhop'],
    'DF_total_sor': ['DF_total_male_sor', 'DF_total_female_sor'],
    'DF_total_tom': ['DF_total_male_tom', 'DF_total_female_tom'],
    'DF_total_hol': ['DF_total_male_hol', 'DF_total_female_hol']
}

# For every both-sexes dataframe, add one column per entry of 'cols_mod'
# holding male + female for that column. 'DF_total_all' is part of
# 'column_map', so it needs no separate pass.
for key, (male_name, female_name) in column_map.items():
    target = dfs[key]

    # Sum all age columns in one frame-level operation (aligned on index and
    # column label), then align the result to the target's rows
    age_sums = dfs[male_name][cols_mod] + dfs[female_name][cols_mod]
    age_sums = age_sums.reindex(target.index)

    # Attach the whole set with a single concat instead of 103 separate
    # inserts; columns left over from an earlier run are replaced.
    stale = [column for column in cols_mod if column in target.columns]
    dfs[key] = pd.concat([target.drop(columns=stale), age_sums], axis=1)

### Verifying

In [None]:
# for key, df in dfs.items():
#     print(f"DataFrame '{key}' after modification:")
#     print(df)
#     print()

## Building Weighted Average Column for 'perc_df_2' listed dfs

Removing dataframes from dict 'dfs' for further processing

In [None]:
# Move every dataframe named in 'perc_df_2' out of 'dfs' and into the new
# dictionary 'dfs_perc_2'; names missing from 'dfs' are skipped.
dfs_perc_2 = {
    df_name: dfs.pop(df_name)
    for df_name in perc_df_2
    if df_name in dfs
}

### Verifying

Should have 9 dataframes

In [None]:
# print(len(dfs_perc_2))
# for key, df in dfs_perc_2.items():
#     print(f"DataFrame '{key}'")

### Dropping process

In [None]:
'''Dropping columns in list'cols_mod' to new dictionary.  Running weighted
average arrays in new dict 'dropped_columns' then rejoining dictionaries to the
affected dataframes from list 'perc_df_2'. I know, imagine how I feel, I'm the
one writing the things.'''

In [None]:
# Split each dataframe in 'dfs_perc_2' in two: the columns named in 'cols_mod'
# go to 'dropped_columns', everything else stays in 'dfs_perc_2'.
dropped_columns = {}

for df_name, df in dfs_perc_2.items():
    # Store the dropped columns' data. An explicit copy is taken because a
    # later cell adds an 'Average_Age' column to this frame; writing to a
    # plain selection can trigger pandas' SettingWithCopyWarning.
    dropped_columns[df_name] = df[cols_mod].copy()

    # Drop the specified columns from each dataframe
    dfs_perc_2[df_name] = df.drop(cols_mod, axis=1)

### Verifying

In [None]:
# for df_name, df in dropped_columns.items():
#     # Print the name of the dataframe
#     print(f"DataFrame Name: {df_name}")

#     # Print the list of column names for the dataframe
#     print("Column Names:")
#     column_names = df.columns.tolist()
#     print(column_names)

#     # Print the list of column types for the dataframe
#     print("Column Types:")
#     column_types = df.dtypes.tolist()
#     print(column_types)

## Finding weighted Average

For DFs in dict 'dropped_columns'

In [None]:
# Compute a weighted average per row for each dataframe in 'dropped_columns':
# the entries of 'cols_mod' are the values and the cells of the matching
# columns are the weights. The result is stored in 'Average_Age'.
values = np.array(cols_mod, dtype=np.int64)

for df_name, df in dropped_columns.items():
    # Select the 'cols_mod' columns explicitly so that an 'Average_Age'
    # column left by an earlier run is not mistaken for a weight
    weights = df[cols_mod].to_numpy(dtype=np.int64)

    # Row-wise sum(value * weight) and sum(weight), computed for all rows at once
    weighted_sum = weights @ values
    weights_sum = weights.sum(axis=1)

    # Calculate the weighted average; rows whose weights sum to 0 become NaN
    results = np.full(len(df), np.nan, dtype=float)
    np.divide(weighted_sum, weights_sum, out=results, where=weights_sum != 0)

    # Add the 'Average_Age' column to the dataframe
    df['Average_Age'] = results

    # Reset the index of the dataframe
    df.reset_index(drop=True, inplace=True)

### Verifying

In [None]:
# print("DataFrame 'DF_total_all' with 'Average_Age' column:")
# print(dropped_columns['DF_total_all'])

## Joining Dataframes

In [None]:
# Re-attach the columns parked in 'dropped_columns' to the right-hand side of
# the matching dataframe in 'dfs_perc_2', then renumber the rows from 0.
for df_name in list(dfs_perc_2):
    pieces = [dfs_perc_2[df_name], dropped_columns[df_name]]

    # Side-by-side (column-wise) concatenation with a fresh index
    dfs_perc_2[df_name] = pd.concat(pieces, axis=1).reset_index(drop=True)

### Verifying

Print all the count,name,shape,# of cols, col names of the dataframes

##### dfs_perc_2

In [None]:
# # Print the number of dataframes in the 'dfs_perc_2' dictionary
# print("Number of Dataframes in 'dfs_perc_2':", len(dfs_perc_2))

# # Print the shape of the dataframes in 'dfs_perc_2'
# print("Data Frames in 'dfs_perc_2':")
# for df_name, df in dfs_perc_2.items():
#     print("Dataframe Name:", df_name)
#     print("Shape:", df.shape)
#     print(f"Number of Columns: {len(df.columns)}")
#     print("Columns:", df.columns.tolist())

#### dfs_perc

In [None]:
# # Print the number of dataframes in the 'dfs_perc' dictionary
# print("Number of Dataframes in 'dfs_perc':", len(dfs_perc))

# # Print the shape of the dataframes in 'dfs_perc'
# print("Data Frames in 'dfs_perc':")
# for df_name, df in dfs_perc.items():
#     print("Dataframe Name:", df_name)
#     print("Shape:", df.shape)
#     print(f"Number of Columns: {len(df.columns)}")
#     print("Columns:", df.columns.tolist())

#### dropped_columns

In [None]:
# # Print the number of dataframes in the 'dropped_columns' dictionary
# print("Number of Dataframes in 'dropped_columns':", len(dropped_columns))

# # Print the shape of the dataframes in 'dropped_columns'
# print("Data Frames in 'dropped_columns':")
# for df_name, df in dropped_columns.items():
#     print("Dataframe Name:", df_name)
#     print("Shape:", df.shape)
#     print(f"Number of Columns: {len(df.columns)}")
#     print("Columns:", df.columns.tolist())

### Verifying

Print all the count,name,shape,# of cols, col names of the dataframes

In [None]:
# # Print the number of dataframes in the 'dfs' dictionary
# print("Number of Dataframes in 'dfs':", len(dfs))

# # Print the shape of the dataframes in 'dfs'
# print("Data Frames in 'dfs':")
# for df_name, df in dfs.items():
#     print("Dataframe Name:", df_name)
#     print("Shape:", df.shape)
#     print(f"Number of Columns: {len(df.columns)}")
#     print("Columns:", df.columns.tolist())

## Building Percent to Total columns

Percent to total for ages in dfs of dict 'dfs_perc_2'

In [None]:
# Add a '<age>_perc' share-of-'Total' column next to every 'cols_mod' column
# of the dataframes in 'dfs_perc_2'. The age/percent pairs are moved to the
# front; all other columns ('Total', 'Average_Age', the copied totals, the
# identifier columns, ...) are KEPT after them, because the error-rate and
# column-ordering cells further down read them from these dataframes.
age_labels = set(cols_mod)

for df_name in list(dfs_perc_2):
    # Work on a re-indexed copy so rows line up positionally with the arrays
    df = dfs_perc_2[df_name].reset_index(drop=True)

    # Age columns present in this dataframe, in their existing order
    columns = [col for col in df.columns if col in age_labels]
    perc_names = [f'{col}_perc' for col in columns]

    # 'Total' is not cast during loading, so coerce it to numbers here
    totals = pd.to_numeric(df['Total']).to_numpy(dtype=float)
    counts = df[columns].to_numpy(dtype=float)

    # Rows with a total of 0 get a share of 0 rather than a division by zero;
    # the divisor is patched to 1 for those rows only to keep numpy quiet.
    zero_total = totals == 0
    divisor = np.where(zero_total, 1.0, totals)[:, None]
    shares = np.where(zero_total[:, None], 0.0, counts / divisor)

    # Attach all percentage columns in one concat; drop stale ones on a re-run
    shares_frame = pd.DataFrame(shares, columns=perc_names, index=df.index)
    df = pd.concat(
        [df.drop(columns=perc_names, errors='ignore'), shares_frame], axis=1
    )

    # Interleave each age column with its percentage, then append every
    # remaining column unchanged
    updated_columns = [
        label for pair in zip(columns, perc_names) for label in pair
    ]
    paired = set(updated_columns)
    other_columns = [col for col in df.columns if col not in paired]

    # Update the dataframe in the 'dfs_perc_2' dictionary
    dfs_perc_2[df_name] = df[updated_columns + other_columns]

### Verifying

In [None]:
# print("DataFrame 'DF_total_all' after modification:")
# print(dfs['DF_total_all'])

## Joining Dictionaries

In [None]:
# Fold 'dfs_perc_2' back into 'dfs', visiting names in 'df_names' order.
# A name already present in 'dfs' has the incoming frame concatenated to its
# right; a new name is simply added (appended after the existing keys).
for df_name in df_names:
    if df_name not in dfs_perc_2:
        continue

    incoming = dfs_perc_2[df_name]
    if df_name in dfs:
        dfs[df_name] = pd.concat([dfs[df_name], incoming], axis=1)
    else:
        dfs[df_name] = incoming

## Calculating Error Rates

In [None]:
# Requires 'DF_total_all' to already carry the copied race 'total_*' columns
total_all = dfs['DF_total_all']

race_total_columns = ['total_whi', 'total_baa', 'total_aian', 'total_aa',
                      'total_nhop', 'total_sor', 'total_tom', 'total_hol']

# Step 1: Expected total = row-wise sum of the individual race/ethnicity
# totals. The columns are coerced to numeric first because they are not cast
# when the sheets are loaded.
expected_total = total_all[race_total_columns].apply(pd.to_numeric).sum(axis=1)

# Step 2: Error = expected total minus the reported 'Total'. This is the
# signed difference; it is stored under the existing 'Absolute Error' name.
absolute_error = expected_total - pd.to_numeric(total_all['Total'])

# Step 3: Error rate in percent of the expected total. Rows whose expected
# total is 0 yield NaN instead of dividing by zero.
error_rate = (absolute_error / expected_total.where(expected_total != 0)) * 100

# Add the 'Expected Total', 'Absolute Error', and 'Error Rate' columns
total_all['Expected Total'] = expected_total
total_all['Absolute Error'] = absolute_error
total_all['Error Rate'] = error_rate

### Verifying

In [None]:
# # Display the DataFrame with the added columns
# print(dfs['DF_total_all'])

## Updating column order

Setting the variables for column order for DF_total_all

The 'DF_total_all' dataframe is unique and needs to be handled separately

In [None]:
# Final column order for 'DF_total_all': each integer age label followed by
# its '<age>_perc' column, then the summary and identifier columns.
_age_labels = list(range(100)) + [102, 107, 110]

col_loc_tot_all = []
for _age in _age_labels:
    col_loc_tot_all += [_age, f'{_age}_perc']

col_loc_tot_all += [
    'Average_Age', 'male_all', 'female_all',
    'total_whi', 'total_baa', 'total_aian', 'total_aa',
    'total_nhop', 'total_sor', 'total_tom', 'total_hol',
    'Expected Total', 'Total', 'Absolute Error', 'Error Rate',
    'Location', 'State', 'County', 'FIPS',
]

### Ordering process

In [None]:
# Only 'DF_total_all' is touched here: select its columns in the order given
# by 'col_loc_tot_all' and store the reordered frame back in 'dfs'.
df = dfs['DF_total_all'].loc[:, col_loc_tot_all]
dfs['DF_total_all'] = df

Setting the variables for column order for total race dataframes

In [None]:
# Race/ethnicity codes of the eight both-sexes dataframes ordered below
_race_codes = ['whi', 'baa', 'aian', 'aa', 'nhop', 'sor', 'tom', 'hol']

df_perc_8 = [f'DF_total_{_code}' for _code in _race_codes]

# Shared column order: each integer age label followed by its '<age>_perc'
# column, then the summary and identifier columns.
col_loc = []
for _age in list(range(100)) + [102, 107, 110]:
    col_loc += [_age, f'{_age}_perc']
col_loc += ['Average_Age', 'Total', 'Location', 'State', 'County', 'FIPS']

# Per-dataframe extra columns: the male and female totals of that race
spec_col_dict = {
    f'DF_total_{_code}': [f'total_male_{_code}', f'total_female_{_code}']
    for _code in _race_codes
}

### Ordering process

In [None]:
# Position in the common column order where the male/female total columns
# are inserted: directly after 'Average_Age' (index 207 with the current
# 'col_loc'). Deriving it keeps the position right if 'col_loc' changes.
insert_index = col_loc.index('Average_Age') + 1

# Reorder every race-total dataframe listed in 'df_perc_8'
for df_name in df_perc_8:
    # Get the dataframe from the 'dfs' dictionary
    df = dfs[df_name]

    # Male/female total columns that belong to this dataframe
    specific_columns = spec_col_dict[df_name]

    # Columns shared by all race-total dataframes, in their target order
    common_columns = [col for col in col_loc if col not in specific_columns]

    # Final order: common columns with the specific ones spliced in
    updated_columns = (
        common_columns[:insert_index]
        + specific_columns
        + common_columns[insert_index:]
    )

    # Keep the specific columns that exist in the dataframe when selecting.
    # If the selection were limited to the common columns, the reindex below
    # would recreate the specific columns filled with NaN and their data
    # would be lost. Selecting with a list still raises KeyError when a
    # common column is missing.
    available_specific = [col for col in specific_columns if col in df.columns]
    df = df[common_columns + available_specific].reindex(columns=updated_columns)

    # Update the dataframe in the 'dfs' dictionary
    dfs[df_name] = df

## Export to excel

Create the workbook (or open it if it already exists) and export each dataframe to Excel as its own sheet.

In [None]:
# Define the file name
file_name = '2020_agesex_statistics.xlsx'

# Get the file path in the current working directory
file_path = os.path.join(os.getcwd(), file_name)

# Append to an existing workbook (replacing sheets that share a name) or
# create a new one. 'if_sheet_exists' is only passed in append mode because
# pandas rejects it in write mode.
if os.path.exists(file_path):
    writer_kwargs = {'mode': 'a', 'if_sheet_exists': 'replace'}
else:
    writer_kwargs = {'mode': 'w'}

# The context manager saves and closes the workbook even if writing one of
# the sheets raises, so the file handle is never leaked.
with pd.ExcelWriter(file_path, engine='openpyxl', **writer_kwargs) as writer:
    # Write each dataframe to its own sheet, named after its key in 'dfs'
    for df_name, df in dfs.items():
        df.to_excel(writer, sheet_name=df_name, index=False)