# Age & Sex Statistics

## Import Needed Libraries

In [8]:
import pandas as pd
from openpyxl import workbook, load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
import numpy as np
import os
# import math

## Loading file to dataframes

List of dataframes and their proper order

In [9]:
# Creating ordered list of DFs to name each sheet coming in from workbook
df_names = ['DF_total_all','DF_male_all','DF_female_all','DF_total_whi',
            'DF_total_male_whi','DF_total_female_whi','DF_total_baa',
            'DF_total_male_baa','DF_total_female_baa','DF_total_aian',
            'DF_total_male_aian','DF_total_female_aian','DF_total_aa',
            'DF_total_male_aa','DF_total_female_aa','DF_total_nhop',
            'DF_total_male_nhop','DF_total_female_nhop','DF_total_sor',
            'DF_total_male_sor','DF_total_female_sor','DF_total_tom',
            'DF_total_male_tom','DF_total_female_tom','DF_total_hol',
            'DF_total_male_hol','DF_total_female_hol'
            ]

### Variables for column names and type

In [10]:
cols_int = ['Under 1 Year','1 Year','2 Years','3 Years','4 Years','5 Years',
            '6 Years','7 Years','8 Years','9 Years','10 Years','11 Years',
            '12 Years','13 Years','14 Years','15 Years','16 Years','17 Years',
            '18 Years','19 Years','20 Years','21 Years','22 Years','23 Years',
            '24 Years','25 Years','26 Years','27 Years','28 Years','29 Years',
            '30 Years','31 Years','32 Years','33 Years','34 Years','35 Years',
            '36 Years','37 Years','38 Years','39 Years','40 Years','41 Years',
            '42 Years','43 Years','44 Years','45 Years','46 Years','47 Years',
            '48 Years','49 Years','50 Years','51 Years','52 Years','53 Years',
            '54 Years','55 Years','56 Years','57 Years','58 Years','59 Years',
            '60 Years','61 Years','62 Years','63 Years','64 Years','65 Years',
            '66 Years','67 Years','68 Years','69 Years','70 Years','71 Years',
            '72 Years','73 Years','74 Years','75 Years','76 Years','77 Years',
            '78 Years','79 Years','80 Years','81 Years','82 Years','83 Years',
            '84 Years','85 Years','86 Years','87 Years','88 Years','89 Years',
            '90 Years','91 Years','92 Years','93 Years','94 Years','95 Years',
            '96 Years','97  Years','98  Years','99  Years','100 to 104  Years',
            '105 to 109  Years','110  Years and Over'
            ]

cols_str = ['Location', 'State', 'County', 'FIPS']

### Loading process

In [11]:
# Dictionary to store the dataframes
dfs = {}

# Get the path of the current working directory
cwd = os.getcwd()

# Construct the full path to the .xlsx file
file_path = os.path.join(cwd, '2020_agesex_data.xlsx')

# Load each sheet of the .xlsx file into a named dataframe
for name in df_names:
    df = pd.read_excel(file_path, sheet_name=name, header=None)
    df.columns = df.iloc[0]  # Set the first row as the column headers
    df = df[1:]  # Exclude the first row from the data
    df.reset_index(drop=True, inplace=True)  # Reset the index
    
    # Change column types based on column names
    for column in cols_int:
        if column in df.columns:
            df[column] = df[column].astype(int)
    
    for column in cols_str:
        if column in df.columns:
            df[column] = df[column].astype(str)
    
    dfs[name] = df

### Verifying

Print all the names, shapes, and dtypes of the dataframes loaded in

In [12]:
# # Print the names, shapes, and dtypes of the dataframes in 'dfs'
# print("Data Frames in 'dfs':")
# for name, df in dfs.items():
#     print(name)
#     print("Shape:", df.shape)
#     print("Dtypes:")
#     for column, dtype in df.dtypes.items():
#         print(f"{column}: {dtype}")
#     print()

## Building Percent to Total column
Percent to total for male & female age groups

### Grouping Dataframes with like structures

In [13]:
perc_df = ['DF_male_all','DF_female_all','DF_total_male_whi',
             'DF_total_female_whi','DF_total_male_baa','DF_total_female_baa',
             'DF_total_male_aian','DF_total_female_aian','DF_total_male_aa',
             'DF_total_female_aa','DF_total_male_nhop','DF_total_female_nhop',
             'DF_total_male_sor','DF_total_female_sor','DF_total_male_tom',
             'DF_total_female_tom','DF_total_male_hol','DF_total_female_hol'
             ]

perc_df_2 = ['DF_total_all','DF_total_whi','DF_total_baa','DF_total_aian',
             'DF_total_aa','DF_total_nhop','DF_total_sor','DF_total_tom',
             'DF_total_hol'
             ]

### Percent to total process for perc_df

In [14]:
# Iterate over the dictionary of dataframes
for df_name, df in dfs.items():
    if df_name in perc_df:
        # Get the list of columns in the dataframe that match cols_int
        columns = [col for col in df.columns if col in cols_int]
        updated_columns = []

        # Reset the index to consolidate memory layout
        df.reset_index(drop=True, inplace=True)

        # Calculate the percentage values for the new column
        for col in columns:
            new_col_name = f'{col}_perc'
            updated_columns.extend([col, new_col_name])

            df[new_col_name] = [0 if total == 0 else (value / total)
                                for value, total in zip(df[col], df['Total'])]
        # Append ['Location', 'State', 'County', 'FIPS'] to updated_columns
        updated_columns.extend(['Total','Location', 'State', 'County', 'FIPS'])
        # Reorder the columns in the dataframe
        df = df[updated_columns]

        # Update the dataframe in the 'dfs' dictionary
        dfs[df_name] = df

  df[new_col_name] = [0 if total == 0 else (value / total)
  df[new_col_name] = [0 if total == 0 else (value / total)
  df[new_col_name] = [0 if total == 0 else (value / total)
  df[new_col_name] = [0 if total == 0 else (value / total)
  df[new_col_name] = [0 if total == 0 else (value / total)
  df[new_col_name] = [0 if total == 0 else (value / total)
  df[new_col_name] = [0 if total == 0 else (value / total)
  df[new_col_name] = [0 if total == 0 else (value / total)
  df[new_col_name] = [0 if total == 0 else (value / total)
  df[new_col_name] = [0 if total == 0 else (value / total)
  df[new_col_name] = [0 if total == 0 else (value / total)
  df[new_col_name] = [0 if total == 0 else (value / total)
  df[new_col_name] = [0 if total == 0 else (value / total)
  df[new_col_name] = [0 if total == 0 else (value / total)
  df[new_col_name] = [0 if total == 0 else (value / total)
  df[new_col_name] = [0 if total == 0 else (value / total)
  df[new_col_name] = [0 if total == 0 else (value / tota

### Verifying

Print all the name,shape,# of cols, col names of the dataframes

In [15]:
# # Print the shape of the dataframes in 'dfs'
# print("Data Frames in 'dfs':")
# print(len(dfs))
# for df_name, df in dfs.items():
#     if df_name in perc_df:
#         print("Dataframe Name:", df_name)
#         print("Shape:", df.shape)
#         print(f"Number of Columns: {len(df.columns)}")
#         print("Columns:", df.columns.tolist())

## Building Weighted Average Column for 'perc_df' listed dfs

Removing modified dataframes from dict dfs for further processing

In [16]:
# Initialize the new dictionary 'dfs_perc'
dfs_perc = {}

# Iterate over the dataframe names in 'perc_df' list
for df_name in perc_df:
    # Check if the dataframe name exists in 'dfs' dictionary
    if df_name in dfs:
        # Move the matching dataframe from 'dfs' to 'dfs_perc'
        dfs_perc[df_name] = dfs.pop(df_name)

### Verifying

In [17]:
# Printing list of both dataframe dictionaries to verify last step

# We should have 9 in dict 'dfs' and 18 in new dict 'perc_df'

print("Dictionary 'dfs':")
print(len(dfs.keys()))
print(list(dfs.keys()))

print("\nDictionary 'perc_df':")
print(len(dfs_perc.keys()))
print(list(dfs_perc.keys()))

Dictionary 'dfs':
9
['DF_total_all', 'DF_total_whi', 'DF_total_baa', 'DF_total_aian', 'DF_total_aa', 'DF_total_nhop', 'DF_total_sor', 'DF_total_tom', 'DF_total_hol']

Dictionary 'perc_df':
18
['DF_male_all', 'DF_female_all', 'DF_total_male_whi', 'DF_total_female_whi', 'DF_total_male_baa', 'DF_total_female_baa', 'DF_total_male_aian', 'DF_total_female_aian', 'DF_total_male_aa', 'DF_total_female_aa', 'DF_total_male_nhop', 'DF_total_female_nhop', 'DF_total_male_sor', 'DF_total_female_sor', 'DF_total_male_tom', 'DF_total_female_tom', 'DF_total_male_hol', 'DF_total_female_hol']


### Variables for columns to drop to new dictionary

In [18]:
columns_to_drop = ['Under 1 Year_perc','1 Year_perc','2 Years_perc',
                  '3 Years_perc','4 Years_perc','5 Years_perc','6 Years_perc',
                  '7 Years_perc','8 Years_perc','9 Years_perc','10 Years_perc',
                  '11 Years_perc','12 Years_perc','13 Years_perc',
                  '14 Years_perc','15 Years_perc','16 Years_perc',
                  '17 Years_perc','18 Years_perc','19 Years_perc',
                  '20 Years_perc','21 Years_perc','22 Years_perc',
                  '23 Years_perc','24 Years_perc','25 Years_perc',
                  '26 Years_perc','27 Years_perc','28 Years_perc',
                  '29 Years_perc','30 Years_perc','31 Years_perc',
                  '32 Years_perc','33 Years_perc','34 Years_perc',
                  '35 Years_perc','36 Years_perc','37 Years_perc',
                  '38 Years_perc','39 Years_perc','40 Years_perc',
                  '41 Years_perc','42 Years_perc','43 Years_perc',
                  '44 Years_perc','45 Years_perc','46 Years_perc',
                  '47 Years_perc','48 Years_perc','49 Years_perc',
                  '50 Years_perc','51 Years_perc','52 Years_perc',
                  '53 Years_perc','54 Years_perc','55 Years_perc',
                  '56 Years_perc','57 Years_perc','58 Years_perc',
                  '59 Years_perc','60 Years_perc','61 Years_perc',
                  '62 Years_perc','63 Years_perc','64 Years_perc',
                  '65 Years_perc','66 Years_perc','67 Years_perc',
                  '68 Years_perc','69 Years_perc','70 Years_perc',
                  '71 Years_perc','72 Years_perc','73 Years_perc',
                  '74 Years_perc','75 Years_perc','76 Years_perc',
                  '77 Years_perc','78 Years_perc','79 Years_perc',
                  '80 Years_perc','81 Years_perc','82 Years_perc',
                  '83 Years_perc','84 Years_perc','85 Years_perc',
                  '86 Years_perc','87 Years_perc','88 Years_perc',
                  '89 Years_perc','90 Years_perc','91 Years_perc',
                  '92 Years_perc','93 Years_perc','94 Years_perc',
                  '95 Years_perc','96 Years_perc','97  Years_perc',
                  '98  Years_perc','99  Years_perc','100 to 104  Years_perc',
                  '105 to 109  Years_perc','110  Years and Over_perc','Total',
                  'Location','State','County','FIPS'
                  ]

### Dropping process

In [19]:
dropped_columns = {}

# Iterate over the dataframes in 'dfs_perc'
for df_name, df in dfs_perc.items():
    # Store the dropped columns' data
    dropped_columns[df_name] = df[columns_to_drop]

    # Drop the specified columns from each dataframe
    dfs_perc[df_name] = df.drop(columns_to_drop, axis=1)

#### Verifying

In [20]:
# print("Dictionary 'dfs_perc':")
# for df_name, df in dfs_perc.items():
#     print(f"DataFrame '{df_name}' column names:")
#     print(list(df.columns))

# print("\nDictionary 'dropped_columns':")
# for df_name, df in dropped_columns.items():
#     print(f"DataFrame '{df_name}' column names:")
#     print(list(df.columns))

### Variables for columns names

In [21]:
# cols_int = ['Under 1 Year','1 Year','2 Years','3 Years','4 Years','5 Years',
#             '6 Years','7 Years','8 Years','9 Years','10 Years','11 Years',
#             '12 Years','13 Years','14 Years','15 Years','16 Years','17 Years',
#             '18 Years','19 Years','20 Years','21 Years','22 Years','23 Years',
#             '24 Years','25 Years','26 Years','27 Years','28 Years','29 Years',
#             '30 Years','31 Years','32 Years','33 Years','34 Years','35 Years',
#             '36 Years','37 Years','38 Years','39 Years','40 Years','41 Years',
#             '42 Years','43 Years','44 Years','45 Years','46 Years','47 Years',
#             '48 Years','49 Years','50 Years','51 Years','52 Years','53 Years',
#             '54 Years','55 Years','56 Years','57 Years','58 Years','59 Years',
#             '60 Years','61 Years','62 Years','63 Years','64 Years','65 Years',
#             '66 Years','67 Years','68 Years','69 Years','70 Years','71 Years',
#             '72 Years','73 Years','74 Years','75 Years','76 Years','77 Years',
#             '78 Years','79 Years','80 Years','81 Years','82 Years','83 Years',
#             '84 Years','85 Years','86 Years','87 Years','88 Years','89 Years',
#             '90 Years','91 Years','92 Years','93 Years','94 Years','95 Years',
#             '96 Years','97 Years','98 Years','99 Years','100 to 104 Years',
#             '105 to 109 Years','110 Years and Over'
#             ]
cols_mod = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,
            25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,
            47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,
            69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,
            91,92,93,94,95,96,97,98,99,102,107,110
            ] 


### Column change process

In [22]:
# Iterate over the dataframes in 'dfs_perc'
for df_name, df in dfs_perc.items():
    # Change the column names to the list of integers
    df.columns = cols_mod
    
    # Convert the column type to integer
    df[cols_mod] = df[cols_mod].astype(int)

#### Verifying

In [23]:
# # Iterate over the dataframes in 'dfs_perc'
# for df_name, df in dfs_perc.items():
#     print(f"DataFrame '{df_name}' column names and types:")
#     column_info = [(col, str(df[col].dtype)) for col in df.columns]
#     print(column_info)
#     print()

## Finding weighted Average

In [24]:
# Iterate over each key-value pair in dfs_perc
for df_name, df in dfs_perc.items():
    # Get the column names as an array of integers
    values = np.array(df.columns, dtype=int)

    # Initialize an empty list to store the results
    results = []

    # Iterate over each row in the dataframe
    for _, row in df.iterrows():
        # Get the row entries as an array of integers
        weights = np.array(row.values, dtype=int)

        # Perform element-wise multiplication of values and weights
        weighted_values = values * weights

        # Sum the products of the multiplications
        weighted_sum = np.sum(weighted_values)

        # Sum all items in the weights array
        weights_sum = np.sum(weights)

        # Calculate the weighted average
        weighted_average = weighted_sum / weights_sum

        # Append the weighted average to the results list
        results.append(weighted_average)

    # Add the 'Average_Age' column to the dataframe
    df['Average_Age'] = results

    # Reset the index of the dataframe
    df.reset_index(drop=True, inplace=True)

  df['Average_Age'] = results
  df['Average_Age'] = results
  df['Average_Age'] = results
  df['Average_Age'] = results
  weighted_average = weighted_sum / weights_sum
  df['Average_Age'] = results
  weighted_average = weighted_sum / weights_sum
  df['Average_Age'] = results
  weighted_average = weighted_sum / weights_sum
  df['Average_Age'] = results
  weighted_average = weighted_sum / weights_sum
  df['Average_Age'] = results
  weighted_average = weighted_sum / weights_sum
  df['Average_Age'] = results
  weighted_average = weighted_sum / weights_sum
  df['Average_Age'] = results
  weighted_average = weighted_sum / weights_sum
  df['Average_Age'] = results
  weighted_average = weighted_sum / weights_sum
  df['Average_Age'] = results
  weighted_average = weighted_sum / weights_sum
  df['Average_Age'] = results
  weighted_average = weighted_sum / weights_sum
  df['Average_Age'] = results
  df['Average_Age'] = results
  df['Average_Age'] = results
  weighted_average = weighted_sum / weig

### Verifying

verifying values in column of specific dataframe in 'dfs_perc'

In [25]:
# Iterate over the dataframes in 'dfs_perc'
for df_name, df in dfs_perc.items():
    print(f"DataFrame '{df_name}' column names and types:")
    column_info = [(col, str(df[col].dtype)) for col in df.columns]
    print(column_info)
    print()

DataFrame 'DF_male_all' column names and types:
[(0, 'int32'), (1, 'int32'), (2, 'int32'), (3, 'int32'), (4, 'int32'), (5, 'int32'), (6, 'int32'), (7, 'int32'), (8, 'int32'), (9, 'int32'), (10, 'int32'), (11, 'int32'), (12, 'int32'), (13, 'int32'), (14, 'int32'), (15, 'int32'), (16, 'int32'), (17, 'int32'), (18, 'int32'), (19, 'int32'), (20, 'int32'), (21, 'int32'), (22, 'int32'), (23, 'int32'), (24, 'int32'), (25, 'int32'), (26, 'int32'), (27, 'int32'), (28, 'int32'), (29, 'int32'), (30, 'int32'), (31, 'int32'), (32, 'int32'), (33, 'int32'), (34, 'int32'), (35, 'int32'), (36, 'int32'), (37, 'int32'), (38, 'int32'), (39, 'int32'), (40, 'int32'), (41, 'int32'), (42, 'int32'), (43, 'int32'), (44, 'int32'), (45, 'int32'), (46, 'int32'), (47, 'int32'), (48, 'int32'), (49, 'int32'), (50, 'int32'), (51, 'int32'), (52, 'int32'), (53, 'int32'), (54, 'int32'), (55, 'int32'), (56, 'int32'), (57, 'int32'), (58, 'int32'), (59, 'int32'), (60, 'int32'), (61, 'int32'), (62, 'int32'), (63, 'int32'), (

## Joining Dataframes

Test

In [26]:
# Iterate over the dataframes in 'dfs_perc'
for df_name, df_perc in dfs_perc.items():
    # Get the corresponding dataframe from 'dropped_columns'
    df_dropped = dropped_columns[df_name]

    # Concatenate the dataframes width-wise
    joined_df = pd.concat([df_perc, df_dropped], axis=1)

    # Update the 'dfs_perc' dataframe in place with the joined dataframe
    dfs_perc[df_name] = joined_df

    # Reset the index of the dataframe
    dfs_perc[df_name].reset_index(drop=True, inplace=True)

# Delete the 'dropped_columns' dictionary to free up memory
del dropped_columns

old

In [27]:
# # Iterate over the dataframes in 'dfs_perc'
# for df_name, df_perc in dfs_perc.items():
#     # Get the corresponding dataframe from 'dropped_columns'
#     df_dropped = dropped_columns[df_name]

#     # Concatenate the dataframes width-wise
#     joined_df = pd.concat([df_perc, df_dropped], axis=1)

#     # Update the 'dfs_perc' dataframe in place with the joined dataframe
#     dfs_perc[df_name] = joined_df

#     # Reset the index of the dataframe
#     dfs_perc[df_name].reset_index(drop=True, inplace=True)

# # Delete the 'dropped_columns' dictionary to free up memory
# del dropped_columns

### Verifying

In [28]:
# '''At this point, dictionary 'dfs_perc' should have
#     18 dataframes with 212 columns'''

In [29]:
print(len(dfs_perc))
# Iterate over the dataframes in 'dfs_perc'
for df_name, df in dfs_perc.items():
    print(f"DataFrame '{df_name}' column names and types ({len(df.columns)} columns):")
    column_info = [(col, str(df[col].dtype)) for col in df.columns]
    print(column_info)
    print()

18
DataFrame 'DF_male_all' column names and types (212 columns):
[(0, 'int32'), (1, 'int32'), (2, 'int32'), (3, 'int32'), (4, 'int32'), (5, 'int32'), (6, 'int32'), (7, 'int32'), (8, 'int32'), (9, 'int32'), (10, 'int32'), (11, 'int32'), (12, 'int32'), (13, 'int32'), (14, 'int32'), (15, 'int32'), (16, 'int32'), (17, 'int32'), (18, 'int32'), (19, 'int32'), (20, 'int32'), (21, 'int32'), (22, 'int32'), (23, 'int32'), (24, 'int32'), (25, 'int32'), (26, 'int32'), (27, 'int32'), (28, 'int32'), (29, 'int32'), (30, 'int32'), (31, 'int32'), (32, 'int32'), (33, 'int32'), (34, 'int32'), (35, 'int32'), (36, 'int32'), (37, 'int32'), (38, 'int32'), (39, 'int32'), (40, 'int32'), (41, 'int32'), (42, 'int32'), (43, 'int32'), (44, 'int32'), (45, 'int32'), (46, 'int32'), (47, 'int32'), (48, 'int32'), (49, 'int32'), (50, 'int32'), (51, 'int32'), (52, 'int32'), (53, 'int32'), (54, 'int32'), (55, 'int32'), (56, 'int32'), (57, 'int32'), (58, 'int32'), (59, 'int32'), (60, 'int32'), (61, 'int32'), (62, 'int32'),

Selecting a specific dataframe to verify information has migrated correctly

In [30]:
# # Temp option to see all columns
# with pd.option_context('display.max_columns', None):
#     print(dfs_perc['DF_male_all'])

## Joining Dictionaries

In [31]:
# Merge the dictionaries 'dfs' and 'dfs_perc' into 'dfs' in the order of 'df_names'
for df_name in df_names:
    if df_name in dfs_perc:
        if df_name in dfs:
            df_merged = pd.concat([dfs[df_name], dfs_perc[df_name]], axis=1)
            dfs[df_name] = df_merged
        else:
            dfs[df_name] = dfs_perc[df_name]
# Delete the 'dfs_perc' dictionary to free up memory
del dfs_perc

### Verifying

Print all the count,name,shape,# of cols, col names of the dataframes

In [32]:
# # Print the number of dataframes in the 'dfs' dictionary
# print("Number of Dataframes in 'dfs':", len(dfs))

# # Print the shape of the dataframes in 'dfs'
# print("Data Frames in 'dfs':")
# for df_name, df in dfs.items():
#     print("Dataframe Name:", df_name)
#     print("Shape:", df.shape)
#     print(f"Number of Columns: {len(df.columns)}")
#     print("Columns:", df.columns.tolist())

## Copying Columns

Copy needed cols from sex dfs to race df

In [33]:
# Create a dictionary to map the column prefixes to the corresponding dataframes
column_map = {
    'DF_total_all': ['DF_male_all', 'DF_female_all'],
    'DF_total_whi': ['DF_total_male_whi', 'DF_total_female_whi'],
    'DF_total_baa': ['DF_total_male_baa', 'DF_total_female_baa'],
    'DF_total_aian': ['DF_total_male_aian', 'DF_total_female_aian'],
    'DF_total_aa': ['DF_total_male_aa', 'DF_total_female_aa'],
    'DF_total_nhop': ['DF_total_male_nhop', 'DF_total_female_nhop'],
    'DF_total_sor': ['DF_total_male_sor', 'DF_total_female_sor'],
    'DF_total_tom': ['DF_total_male_tom', 'DF_total_female_tom'],
    'DF_total_hol': ['DF_total_male_hol', 'DF_total_female_hol']
}

# Copy 'Total' column to the correct dataframes
for key, value in column_map.items():
    # Slicing exclude the first three characters 'DF_'
    dfs[key][f'{value[0][3:]}'] = dfs[value[0]]['Total'].copy()
    dfs[key][f'{value[1][3:]}'] = dfs[value[1]]['Total'].copy()

### Verifying

In [34]:
# # Verify the outcome
# for key, df in dfs.items():
#     print(f"DataFrame '{key}' column names:")
#     print(df.columns.tolist())

## Copying Columns

Copy needed cols from race df to total df

In [35]:
# Create a dictionary to map the column prefixes to the corresponding dataframes
column_map = {
    'DF_total_all': ['DF_total_whi', 'DF_total_baa', 'DF_total_aian',
                     'DF_total_aa', 'DF_total_nhop', 'DF_total_sor',
                     'DF_total_tom', 'DF_total_hol'
                     ]}

# Copy 'Total' column to the correct dataframes
for key, value in column_map.items():
    for df_name in value:
        # Slicing exclude the first three characters 'DF_'
        dfs[key][df_name[3:]] = dfs[df_name]['Total'].copy()

### Verifying

In [36]:
# # Verify the outcome
# for key, df in dfs.items():
#     print(f"DataFrame '{key}' column names:")
#     print(df.columns.tolist())

## Adding age totals for races from age DF to race DF

In [37]:
# Create a dictionary to map the column prefixes to the corresponding dataframes
column_map = {
    'DF_total_all': ['DF_male_all', 'DF_female_all'],
    'DF_total_whi': ['DF_total_male_whi', 'DF_total_female_whi'],
    'DF_total_baa': ['DF_total_male_baa', 'DF_total_female_baa'],
    'DF_total_aian': ['DF_total_male_aian', 'DF_total_female_aian'],
    'DF_total_aa': ['DF_total_male_aa', 'DF_total_female_aa'],
    'DF_total_nhop': ['DF_total_male_nhop', 'DF_total_female_nhop'],
    'DF_total_sor': ['DF_total_male_sor', 'DF_total_female_sor'],
    'DF_total_tom': ['DF_total_male_tom', 'DF_total_female_tom'],
    'DF_total_hol': ['DF_total_male_hol', 'DF_total_female_hol']
}

# Iterate through the column_map to update other dataframes
    # Iterate through the columns in 'cols_mod'
for key, value in column_map.items():
    for column in cols_mod:
        dfs[key][column] = dfs[value[0]][column] + dfs[value[1]][column]

  dfs[key][column] = dfs[value[0]][column] + dfs[value[1]][column]
  dfs[key][column] = dfs[value[0]][column] + dfs[value[1]][column]
  dfs[key][column] = dfs[value[0]][column] + dfs[value[1]][column]
  dfs[key][column] = dfs[value[0]][column] + dfs[value[1]][column]
  dfs[key][column] = dfs[value[0]][column] + dfs[value[1]][column]
  dfs[key][column] = dfs[value[0]][column] + dfs[value[1]][column]
  dfs[key][column] = dfs[value[0]][column] + dfs[value[1]][column]
  dfs[key][column] = dfs[value[0]][column] + dfs[value[1]][column]
  dfs[key][column] = dfs[value[0]][column] + dfs[value[1]][column]
  dfs[key][column] = dfs[value[0]][column] + dfs[value[1]][column]
  dfs[key][column] = dfs[value[0]][column] + dfs[value[1]][column]
  dfs[key][column] = dfs[value[0]][column] + dfs[value[1]][column]
  dfs[key][column] = dfs[value[0]][column] + dfs[value[1]][column]
  dfs[key][column] = dfs[value[0]][column] + dfs[value[1]][column]
  dfs[key][column] = dfs[value[0]][column] + dfs[value[1]][col

### Verifying

In [38]:
# # Verify the outcome
# for key, df in dfs.items():
#     print(f"DataFrame '{key}' column names:")
#     print(df.columns.tolist())

## Building Weighted Average Column for 'perc_df_2' listed dfs

Removing dataframes from dict 'dfs' for further processing

In [39]:
# Initialize the new dictionary 'dfs_perc_2'
dfs_perc_2 = {}

# Iterate over the dataframe names in 'perc_df_2' list
for df_name in perc_df_2:
    # Check if the dataframe name exists in 'dfs' dictionary
    if df_name in dfs:
        # Move a copy of the matching dataframe from 'dfs' to 'dfs_perc_2'
        dfs_perc_2[df_name] = dfs[df_name].copy()

### Verifying

Should have 9 dfs in dict 'dfs_perc_2'<br>
Should have 27 dfs in dict 'dfs'

In [40]:
print(len(dfs_perc_2))
for key, df in dfs_perc_2.items():
    print(f"DataFrame '{key}'")
    print(df.columns.tolist())

9
DataFrame 'DF_total_all'
['Total', 'Location', 'State', 'County', 'FIPS', 'male_all', 'female_all', 'total_whi', 'total_baa', 'total_aian', 'total_aa', 'total_nhop', 'total_sor', 'total_tom', 'total_hol', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 102, 107, 110]
DataFrame 'DF_total_whi'
['Total', 'Location', 'State', 'County', 'FIPS', 'total_male_whi', 'total_female_whi', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71

In [41]:
# print(len(dfs))
# for key, df in dfs.items():
#     print(f"DataFrame '{key}'")
#     print(df.columns.tolist())

### Dropping process

In [42]:
# '''Dropping columns in list'cols_mod' to new dictionary.  Running weighted
# average arrays in new dict 'dropped_columns' then rejoining dictionaries to the
# affected dataframes from list 'perc_df_2'. I know, imagine how I feel, I'm the
# one writing the things.'''

In [43]:
dropped_columns = {}

# Iterate over the dataframes in dictionary 'dfs_perc_2'
    # Dataframe's name to 'df_name' and the dataframe itself to 'df'
for df_name, df in dfs_perc_2.items():
    # Store the dropped columns' data
        #dict[key] = only columns from list 'cols_mod'
    dropped_columns[df_name] = df[cols_mod]

    # Drop only columns from list 'cols_mod' each dataframe in dict 'dfs_perc_2'
    dfs_perc_2[df_name] = df.drop(cols_mod, axis=1)

### Verifying

#### Dict 'dropped_columns'

In [44]:
# # Print the count of dataframes and their names in 'dropped_columns'
# print("Number of Dataframes in dropped_columns:", len(dropped_columns))
# print("Dataframe Names in dropped_columns:", list(dropped_columns.keys()))

# # Print the count of columns in each dataframe in 'dropped_columns'
# for df_name, dropped_df in dropped_columns.items():
#     print(f"\nDataframe Name: {df_name}")
#     print("Number of Columns in Dataframe:", len(dropped_df.columns))
#     print("Column Names:", list(dropped_df.columns))

#### Dict 'dfs_perc_2'

In [45]:
# # Print the count of dataframes and their names in 'dfs_perc_2'
# print("Number of Dataframes in dfs_perc_2:", len(dfs_perc_2))
# print("Dataframe Names in dfs_perc_2:", list(dfs_perc_2.keys()))

# # Print the count of columns in each dataframe in 'dfs_perc_2'
# for df_name, dropped_df in dfs_perc_2.items():
#     print(f"\nDataframe Name: {df_name}")
#     print("Number of Columns in Dataframe:", len(dropped_df.columns))
#     print("Column Names:", list(dropped_df.columns))

## Finding weighted Average

For DFs in dict 'dropped_columns'

In [46]:
# Iterate through the dataframes in 'dropped_columns' whose name is in list 'perc_df_2'
    # Dataframe's name to 'df_name' and the dataframe itself to 'df' in dict 'dropped_columns'
for df_name, df in dropped_columns.items():
    if df_name in perc_df_2:
        # Get the column names as an array of integers from 'cols_mod'
        values = np.array(cols_mod, dtype=int)

        # Initialize an empty list to store the results
        results = []

        # Iterate over each row in the dataframe
        for _, row in df.iterrows():
            # Get the row entries as an array of integers
            weights = np.array(row.values, dtype=int)

            # Perform element-wise multiplication of values and weights
            weighted_values = values * weights

            # Sum the products of the multiplications
            weighted_sum = np.sum(weighted_values)

            # Sum all items in the weights array
            weights_sum = np.sum(weights)

            # Calculate the weighted average, handle division by zero
            if weights_sum != 0:
                weighted_average = weighted_sum / weights_sum
            else:
                weighted_average = np.nan

            # Append the weighted average to the results list
            results.append(weighted_average)

        # Convert the results list to a float array explicitly
        results = np.array(results, dtype=float)

        # Add the 'Average_Age' column to the dataframe
        df['Average_Age'] = results

        # Reset the index of the dataframe
        df.reset_index(drop=True, inplace=True)

### Verifying

#### Dict 'dropped_columns'

In [47]:
# Print the count of dataframes and their names in 'dropped_columns'
print("Number of Dataframes in dropped_columns:", len(dropped_columns))
print("Dataframe Names in dropped_columns:", list(dropped_columns.keys()))

# Print the count of columns in each dataframe in 'dropped_columns'
for df_name, dropped_df in dropped_columns.items():
    print(f"\nDataframe Name: {df_name}")
    print("Number of Columns in Dataframe:", len(dropped_df.columns))
    print("Column Names:", list(dropped_df.columns))

Number of Dataframes in dropped_columns: 9
Dataframe Names in dropped_columns: ['DF_total_all', 'DF_total_whi', 'DF_total_baa', 'DF_total_aian', 'DF_total_aa', 'DF_total_nhop', 'DF_total_sor', 'DF_total_tom', 'DF_total_hol']

Dataframe Name: DF_total_all
Number of Columns in Dataframe: 104
Column Names: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 102, 107, 110, 'Average_Age']

Dataframe Name: DF_total_whi
Number of Columns in Dataframe: 104
Column Names: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,

## Joining Dataframes

In [48]:
# Iterate through the dataframes in dict 'dfs_perc_2'
    # Dataframe's name to 'df_name' and the dataframe itself to 'df_perc' in dict 'dfs_perc_2'
for df_name, df_perc in dfs_perc_2.items():
    # Get the corresponding dataframe from 'dropped_columns'
    # Iterate through the dataframes in 'dfs_perc_2'
    df_dropped = dropped_columns[df_name]

    # Concatenate the dataframes width-wise
        # adding 104 columns to each dataframe, saving to var 'joined_df'
    joined_df = pd.concat([df_perc, df_dropped], axis=1)

    # Update the 'dfs_perc_2' dataframe in place with the joined dataframe
    dfs_perc_2[df_name] = joined_df

    # Reset the index of the dataframe
    dfs_perc_2[df_name].reset_index(drop=True, inplace=True)
# all data from weighted average now in dictionary of dataframes 'dfs_perc_2'
del dropped_columns

### Verifying

##### dfs_perc_2

In [49]:
# # Print the number of dataframes in the 'dfs_perc_2' dictionary
# print("Number of Dataframes in 'dfs_perc_2':", len(dfs_perc_2))

# # Print the shape of the dataframes in 'dfs_perc_2'
# print("Data Frames in 'dfs_perc_2':")
# for df_name, df in dfs_perc_2.items():
#     print("Dataframe Name:", df_name)
#     print("Shape:", df.shape)
#     print(f"Number of Columns: {len(df.columns)}")
#     print("Columns:", df.columns.tolist())

#### dfs_perc

In [50]:
# # Print the number of dataframes in the 'dfs_perc' dictionary
# print("Number of Dataframes in 'dfs_perc':", len(dfs_perc))

# # Print the shape of the dataframes in 'dfs_perc'
# print("Data Frames in 'dfs_perc':")
# for df_name, df in dfs_perc.items():
#     print("Dataframe Name:", df_name)
#     print("Shape:", df.shape)
#     print(f"Number of Columns: {len(df.columns)}")
#     print("Columns:", df.columns.tolist())

Print all the count,name,shape,# of cols, col names of the dataframes

In [51]:
# Print the number of dataframes in the 'dfs' dictionary
print("Number of Dataframes in 'dfs':", len(dfs))

# Print the shape of the dataframes in 'dfs'
print("Data Frames in 'dfs':")
for df_name, df in dfs.items():
    print("Dataframe Name:", df_name)
    print("Shape:", df.shape)
    print(f"Number of Columns: {len(df.columns)}")
    print("Columns:", df.columns.tolist())

Number of Dataframes in 'dfs': 27
Data Frames in 'dfs':
Dataframe Name: DF_total_all
Shape: (3221, 118)
Number of Columns: 118
Columns: ['Total', 'Location', 'State', 'County', 'FIPS', 'male_all', 'female_all', 'total_whi', 'total_baa', 'total_aian', 'total_aa', 'total_nhop', 'total_sor', 'total_tom', 'total_hol', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 102, 107, 110]
Dataframe Name: DF_total_whi
Shape: (3221, 110)
Number of Columns: 110
Columns: ['Total', 'Location', 'State', 'County', 'FIPS', 'total_male_whi', 'total_female_whi', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30

## Building Percent to Total columns

### Verify

some column names may have been saved as integers and will need to be strings

In [52]:
# def get_column_types(df):
#     column_types = {}
#     for column_name in df.columns:
#         column_types[column_name] = type(column_name).__name__
#     return column_types

# result_list = []
# for df_name, df in dfs_perc_2.items():
#     column_types = get_column_types(df)
#     result_list.append((df_name, df, column_types))

# # Print the results
# for df_info in result_list:
#     df_name, df, column_types = df_info
#     print(f"Dataframe: {df_name}")
#     print("Columns:")
#     for col_name, col_type in column_types.items():
#         print(f" - {col_name}: {col_type}")
#     print("\n")

### Percent to Total Process

Percent to total for ages in dfs of dict 'dfs_perc_2'

In [53]:
# # Print the shape of the dataframes in 'dfs_perc_2'
# print("Data Frames in 'dfs_perc_2':")
# for df_name, df in dfs_perc_2.items():
#     print("Dataframe Name:", df_name)
#     print("Shape:", df.shape)
#     print(f"Number of Columns: {len(df.columns)}")
#     print("Columns:", df.columns.tolist())

In [54]:
# Convert cols_mod to strings to iterate through properly
cols_mod_str = [str(col) for col in cols_mod]

# Create a new dictionary to store the modified dataframes
dfs_perc_2_updated = {}

# Iterate through each dataframe in the dictionary and change column names to type string
for key, df in dfs_perc_2.items():
    # Convert column names to strings
    df.columns = [str(col) for col in df.columns]

    # Create a copy of the original dataframe
    df_updated = df.copy()

    # Calculate the percentage values for the new columns
    for col in cols_mod_str:
        new_col_name = f'{col}_perc'
        df_updated[new_col_name] = [0 if total == 0 else (value / total)
                                    for value, total in zip(df[col], df['Total'])]

    # Add the updated dataframe to the new dictionary
    dfs_perc_2_updated[key] = df_updated

# Replace the original 'dfs_perc_2' dictionary with the updated one
dfs_perc_2 = dfs_perc_2_updated

  df_updated[new_col_name] = [0 if total == 0 else (value / total)
  df_updated[new_col_name] = [0 if total == 0 else (value / total)
  df_updated[new_col_name] = [0 if total == 0 else (value / total)
  df_updated[new_col_name] = [0 if total == 0 else (value / total)
  df_updated[new_col_name] = [0 if total == 0 else (value / total)
  df_updated[new_col_name] = [0 if total == 0 else (value / total)
  df_updated[new_col_name] = [0 if total == 0 else (value / total)
  df_updated[new_col_name] = [0 if total == 0 else (value / total)
  df_updated[new_col_name] = [0 if total == 0 else (value / total)
  df_updated[new_col_name] = [0 if total == 0 else (value / total)
  df_updated[new_col_name] = [0 if total == 0 else (value / total)
  df_updated[new_col_name] = [0 if total == 0 else (value / total)


  df_updated[new_col_name] = [0 if total == 0 else (value / total)
  df_updated[new_col_name] = [0 if total == 0 else (value / total)
  df_updated[new_col_name] = [0 if total == 0 else (value / total)
  df_updated[new_col_name] = [0 if total == 0 else (value / total)
  df_updated[new_col_name] = [0 if total == 0 else (value / total)
  df_updated[new_col_name] = [0 if total == 0 else (value / total)
  df_updated[new_col_name] = [0 if total == 0 else (value / total)
  df_updated[new_col_name] = [0 if total == 0 else (value / total)
  df_updated[new_col_name] = [0 if total == 0 else (value / total)
  df_updated[new_col_name] = [0 if total == 0 else (value / total)
  df_updated[new_col_name] = [0 if total == 0 else (value / total)
  df_updated[new_col_name] = [0 if total == 0 else (value / total)
  df_updated[new_col_name] = [0 if total == 0 else (value / total)
  df_updated[new_col_name] = [0 if total == 0 else (value / total)
  df_updated[new_col_name] = [0 if total == 0 else (value / to

old

In [55]:
# # Convert cols_mod to strings to iterate through properly
# cols_mod_str = [str(col) for col in cols_mod]

# # Iterate through each dataframe in the dictionary and change column names to type string
# for key, df in dfs_perc_2.items():
#     # Convert column names to strings
#     df.columns = [str(col) for col in df.columns]

#     updated_columns = []

#     # Calculate the percentage values for the new columns
#     for col in cols_mod_str:
#         new_col_name = f'{col}_perc'
#         updated_columns.append(new_col_name)

#         df[new_col_name] = [0 if total == 0 else (value / total)
#                             for value, total in zip(df[col], df['Total'])]

#     # Update the dataframe in the dictionary with the new columns
#     dfs_perc_2[key] = df[updated_columns]

### Verifying

In [56]:
# Function to cast column names to strings
def cast_column_names_to_string(df):
    df.columns = [str(col) for col in df.columns]
    return df

# Iterate through the dataframes in 'dfs' and 'dfs_perc_2' and cast column names to strings
for df_name, df in dfs.items():
    dfs[df_name] = cast_column_names_to_string(df)

for df_name, df in dfs_perc_2.items():
    dfs_perc_2[df_name] = cast_column_names_to_string(df)

In [57]:
# Print the shape of the dataframes in 'dfs_perc_2'
print("Data Frames in 'dfs_perc_2':")
for df_name, df in dfs_perc_2.items():
    print("Dataframe Name:", df_name)
    print("Shape:", df.shape)
    print(f"Number of Columns: {len(df.columns)}")
    print("Columns:", df.columns.tolist())

Data Frames in 'dfs_perc_2':
Dataframe Name: DF_total_all
Shape: (3221, 222)
Number of Columns: 222
Columns: ['Total', 'Location', 'State', 'County', 'FIPS', 'male_all', 'female_all', 'total_whi', 'total_baa', 'total_aian', 'total_aa', 'total_nhop', 'total_sor', 'total_tom', 'total_hol', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '102', '107', '110', 'Average_Age', '0_perc', '1_perc', '2_perc', '3_perc', '4_perc', '5_perc', '6_perc', '7_perc', '8_pe

In [58]:
# Print the shape of the dataframes in 'dfs'
print("Data Frames in 'dfs':")
for df_name, df in dfs.items():
    print("Dataframe Name:", df_name)
    print("Shape:", df.shape)
    print(f"Number of Columns: {len(df.columns)}")
    print("Columns:", df.columns.tolist())

Data Frames in 'dfs':
Dataframe Name: DF_total_all
Shape: (3221, 118)
Number of Columns: 118
Columns: ['Total', 'Location', 'State', 'County', 'FIPS', 'male_all', 'female_all', 'total_whi', 'total_baa', 'total_aian', 'total_aa', 'total_nhop', 'total_sor', 'total_tom', 'total_hol', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '102', '107', '110']
Dataframe Name: DF_total_whi
Shape: (3221, 110)
Number of Columns: 110
Columns: ['Total', 'Location', 'Stat

## Joining Dictionaries

In [59]:
# Merge the data from 'dfs_perc_2' into 'dfs'
for df_name, df_perc_2 in dfs_perc_2.items():
    # Check if the current dataframe exists in 'dfs'
    if df_name in dfs:
        # Get the original dataframe from 'dfs'
        df = dfs[df_name]

        # Check if there are any new columns in 'df_perc_2' that don't exist in 'df'
        new_columns = [col for col in df_perc_2.columns if col not in df.columns]

        # Add the new columns from 'df_perc_2' to 'df' without overwriting existing columns
        for col in new_columns:
            # Check for any duplicated columns and handle them if necessary
            new_col_name = col
            counter = 1
            while new_col_name in df.columns:
                new_col_name = f"{col}_{counter}"
                counter += 1

            df[new_col_name] = df_perc_2[col]

        # Update the dataframe in 'dfs' with the modified 'df'
        dfs[df_name] = df

# 'dfs' now contains the data from 'dfs_perc_2' with new columns added to the existing dataframes without overwriting existing columns

  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc_2[col]
  df[new_col_name] = df_perc

In [60]:
# # Merge the data from 'dfs_perc_2' into 'dfs'
# for df_name, df_perc_2 in dfs_perc_2.items():
#     # Check if the current dataframe exists in 'dfs'
#     if df_name in dfs:
#         # Get the original dataframe from 'dfs'
#         df = dfs[df_name]

#         # Check if there are any new columns in 'df_perc_2' that don't exist in 'df'
#         new_columns = [col for col in df_perc_2.columns if col not in df.columns]

#         # Add the new columns from 'df_perc_2' to 'df' without overwriting existing columns
#         for col in new_columns:
#             df[col] = df_perc_2[col]

# # 'dfs' now contains the data from 'dfs_perc_2' with new columns added to the existing dataframes without overwriting existing columns

In [61]:
# Print the shape of the dataframes in 'dfs'
print("Data Frames in 'dfs':")
for df_name, df in dfs.items():
    print("Dataframe Name:", df_name)
    print("Shape:", df.shape)
    print(f"Number of Columns: {len(df.columns)}")
    print("Columns:", df.columns.tolist())

Data Frames in 'dfs':
Dataframe Name: DF_total_all
Shape: (3221, 222)
Number of Columns: 222
Columns: ['Total', 'Location', 'State', 'County', 'FIPS', 'male_all', 'female_all', 'total_whi', 'total_baa', 'total_aian', 'total_aa', 'total_nhop', 'total_sor', 'total_tom', 'total_hol', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '102', '107', '110', 'Average_Age', '0_perc', '1_perc', '2_perc', '3_perc', '4_perc', '5_perc', '6_perc', '7_perc', '8_perc', '9

old

In [62]:
# # Create a new dictionary to store the merged dataframes from both 'dfs' and 'dfs_perc_2'
# merged_dfs = dfs.copy()

# # Merge the data from 'dfs_perc_2' into 'merged_dfs'
# for df_name in df_names:
#     # Check if the current dataframe exists in 'dfs_perc_2'
#     if df_name in dfs_perc_2:
#         # Check if the current dataframe also exists in 'merged_dfs'
#         if df_name in merged_dfs:
#             # If the dataframe exists in both 'merged_dfs' and 'dfs_perc_2', merge them horizontally (concatenation)
#             df_merged = pd.concat([merged_dfs[df_name], dfs_perc_2[df_name]], axis=1)
#             # Store the merged dataframe in the 'merged_dfs' dictionary
#             merged_dfs[df_name] = df_merged
#         else:
#             # If the dataframe only exists in 'dfs_perc_2', store it directly in the 'merged_dfs' dictionary
#             merged_dfs[df_name] = dfs_perc_2[df_name]

# # 'merged_dfs' now contains the data from both 'dfs' and 'dfs_perc_2' without overwriting 'dfs'
# # If you want to keep both original dictionaries, you don't need to delete 'dfs_perc_2'
# # If you still want to delete 'dfs_perc_2', you can do it separately
# del dfs_perc_2

## Calculating Error Rates

In [63]:
# Step 1: Calculate the expected total by summing the individual racial category columns
expected_total = dfs['DF_total_all'][['total_whi', 'total_baa', 'total_aian', 'total_aa', 'total_nhop', 'total_sor', 'total_tom', 'total_hol']].sum(axis=1)

# Step 2: Find the absolute error by subtracting the 'Total' column from the expected total
absolute_error = expected_total - dfs['DF_total_all']['Total']

# Step 3: Calculate the error rate as the absolute error divided by the expected total, multiplied by 100
error_rate = (absolute_error / expected_total) * 100

# Add the 'expected_total', 'absolute_error', and 'error_rate' columns to the DataFrame
dfs['DF_total_all']['Expected Total'] = expected_total
dfs['DF_total_all']['Absolute Error'] = absolute_error
dfs['DF_total_all']['Error Rate'] = error_rate

  dfs['DF_total_all']['Expected Total'] = expected_total
  dfs['DF_total_all']['Absolute Error'] = absolute_error
  dfs['DF_total_all']['Error Rate'] = error_rate


## Building Weighted Average Column for 'DF_total_all'

Copy dataframe 'DF_total_all' from dict 'dfs' for further processing

In [64]:
# # # Access dataframe from the 'dfs' dictionary 
# df = dfs['DF_total_all']

# # Iterate over each column in the dataframe list in 'cols_mod' change to int
# for col in cols_mod:
#     df.rename(columns={str(col): int(col)}, inplace=True)

# # Update the 'DF_total_all' dataframe in the 'dfs' dictionary with the modified column names
# dfs['DF_total_all'] = df

# # Access dataframe from the 'dfs' dictionary
# df = dfs['DF_total_all'].copy()

# # Keep columns from the copy for further processing
# df_dropped = df[cols_mod]

# # Calc the w ave and add 'Average_Age' column
# # Get the column names as an array of integers from 'cols_mod'
# values = np.array(cols_mod, dtype=int)

# # Initialize an empty list to store the results
# results = []

# # Iterate over each row in the dataframe
# for _, row in df_dropped.iterrows():
#     # Get the row entries as an array of integers
#     weights = np.array(row.values, dtype=int)

#     # Perform element-wise multiplication of values and weights
#     weighted_values = values * weights

#     # Sum the products of the multiplications
#     weighted_sum = np.sum(weighted_values)

#     # Sum all items in the weights array
#     weights_sum = np.sum(weights)

#     # Calculate the weighted average, handle division by zero
#     if weights_sum != 0:
#         weighted_average = weighted_sum / weights_sum
#     else:
#         weighted_average = np.nan

#     # Append the weighted average to the results list
#     results.append(weighted_average)

# # Convert the results list to a float array explicitly
# results = np.array(results, dtype=float)

# # Add the 'Average_Age' column to the dataframe using .loc
# df_dropped.loc[:, 'Average_Age'] = results

# # Step 4: Join 'df_dropped' with the original DataFrame in the 'dfs' dictionary
# dfs['DF_total_all'] = df.join(df_dropped['Average_Age'])

# del df_dropped

In [65]:
# # # Convert column names to strings
# # df.columns = [str(col) for col in df.columns]

# # Access dataframe from the 'dfs' dictionary
# df = dfs['DF_total_all'].copy()

# # Keep columns from the copy for further processing
# df_dropped = df[cols_mod]

# # Add total column
# df_dropped['Total'] = df['Total']

# # updated_columns = []

# # Calculate the percentage values for the new column
# for col in df_dropped.columns:
#     new_col_name = f'{col}_perc'
#     # updated_columns.extend([col, new_col_name])
    
#     # Calculate the percentage values for the new columns and add them to 'df_dropped'
#     df_dropped[new_col_name] = [0 if total == 0 else (value / total)
#                             for value, total in zip(df_dropped[col], df_dropped['Total'])]

# # Update the DataFrame in the 'dfs' dictionary with the modified 'df_dropped' columns only
# for col in df_dropped.columns:
#     if col not in df.columns:
#         df[col] = df_dropped[col]

# # Update 'dfs' with the modified 'df'
# dfs['DF_total_all'] = df

### Verifying

In [66]:
# # Display the DataFrame with the added columns
# print(dfs['DF_total_all'])

## Updating column order

Ensure all column names are strings since earlier column names were used as int.

In [67]:
# Loop through all the DataFrames in 'dfs' dictionary
for df_name, df in dfs.items():
    # Convert column names to strings
    df.columns = df.columns.astype(str)

    # Update the DataFrame in the 'dfs' dictionary
    dfs[df_name] = df

The 'total all' DF is unique and needs to be done seperately

In [68]:
col_loc_tot_all = ['0','0_perc','1','1_perc','2','2_perc','3','3_perc','4','4_perc','5',
                   '5_perc','6','6_perc','7','7_perc','8','8_perc','9','9_perc','10',
                   '10_perc','11','11_perc','12','12_perc','13','13_perc','14',
                   '14_perc','15','15_perc','16','16_perc','17','17_perc','18',
                   '18_perc','19','19_perc','20','20_perc','21','21_perc','22',
                   '22_perc','23','23_perc','24','24_perc','25','25_perc','26',
                   '26_perc','27','27_perc','28','28_perc','29','29_perc','30',
                   '30_perc','31','31_perc','32','32_perc','33','33_perc','34',
                   '34_perc','35','35_perc','36','36_perc','37','37_perc','38',
                   '38_perc','39','39_perc','40','40_perc','41','41_perc','42',
                   '42_perc','43','43_perc','44','44_perc','45','45_perc','46',
                   '46_perc','47','47_perc','48','48_perc','49','49_perc','50',
                   '50_perc','51','51_perc','52','52_perc','53','53_perc','54',
                   '54_perc','55','55_perc','56','56_perc','57','57_perc','58',
                   '58_perc','59','59_perc','60','60_perc','61','61_perc','62',
                   '62_perc','63','63_perc','64','64_perc','65','65_perc','66',
                   '66_perc','67','67_perc','68','68_perc','69','69_perc','70',
                   '70_perc','71','71_perc','72','72_perc','73','73_perc','74',
                   '74_perc','75','75_perc','76','76_perc','77','77_perc','78',
                   '78_perc','79','79_perc','80','80_perc','81','81_perc','82',
                   '82_perc','83','83_perc','84','84_perc','85','85_perc','86',
                   '86_perc','87','87_perc','88','88_perc','89','89_perc','90',
                   '90_perc','91','91_perc','92','92_perc','93','93_perc','94',
                   '94_perc','95','95_perc','96','96_perc','97','97_perc','98',
                   '98_perc','99','99_perc','102','102_perc','107','107_perc',
                   '110','110_perc','Average_Age','male_all','female_all',
                   'total_whi','total_baa','total_aian','total_aa',
                   'total_nhop','total_sor','total_tom','total_hol',
                   'Expected Total','Total','Absolute Error','Error Rate',
                   'Location','State','County','FIPS'
                   ]

# Access the DataFrame in 'dfs' dictionary
df = dfs['DF_total_all']

# Reindex the DataFrame using the list
df = df.reindex(columns=col_loc_tot_all)

# Update the DataFrame in the 'dfs' dictionary
dfs['DF_total_all'] = df

# Inplace reset index to fix any fragmentation
df.reset_index(drop=True, inplace=True)

### Ordering process

Setting the variables for column order for total race dataframes

In [69]:
df_perc_8 = ['DF_total_whi','DF_total_baa','DF_total_aian','DF_total_aa',
             'DF_total_nhop','DF_total_sor','DF_total_tom','DF_total_hol'
             ]

col_loc = ['0','0_perc','1','1_perc','2','2_perc','3','3_perc','4','4_perc','5','5_perc','6',
           '6_perc','7','7_perc','8','8_perc','9','9_perc','10','10_perc','11','11_perc',
           '12','12_perc','13','13_perc','14','14_perc','15','15_perc','16','16_perc','17',
           '17_perc','18','18_perc','19','19_perc','20','20_perc','21','21_perc','22',
           '22_perc','23','23_perc','24','24_perc','25','25_perc','26','26_perc','27',
           '27_perc','28','28_perc','29','29_perc','30','30_perc','31','31_perc','32',
           '32_perc','33','33_perc','34','34_perc','35','35_perc','36','36_perc','37',
           '37_perc','38','38_perc','39','39_perc','40','40_perc','41','41_perc','42',
           '42_perc','43','43_perc','44','44_perc','45','45_perc','46','46_perc','47',
           '47_perc','48','48_perc','49','49_perc','50','50_perc','51','51_perc','52',
           '52_perc','53','53_perc','54','54_perc','55','55_perc','56','56_perc','57',
           '57_perc','58','58_perc','59','59_perc','60','60_perc','61','61_perc','62',
           '62_perc','63','63_perc','64','64_perc','65','65_perc','66','66_perc','67',
           '67_perc','68','68_perc','69','69_perc','70','70_perc','71','71_perc','72',
           '72_perc','73','73_perc','74','74_perc','75','75_perc','76','76_perc','77',
           '77_perc','78','78_perc','79','79_perc','80','80_perc','81','81_perc','82',
           '82_perc','83','83_perc','84','84_perc','85','85_perc','86','86_perc','87',
           '87_perc','88','88_perc','89','89_perc','90','90_perc','91','91_perc','92',
           '92_perc','93','93_perc','94','94_perc','95','95_perc','96','96_perc','97',
           '97_perc','98','98_perc','99','99_perc','102','102_perc','107','107_perc',
           '110','110_perc','Average_Age','Total','Location','State','County',
           'FIPS']

spec_col_dict = {
        'DF_total_whi': ['total_male_whi','total_female_whi'],
        'DF_total_baa': ['total_male_baa','total_female_baa'],
        'DF_total_aian': ['total_male_aian','total_female_aian'],
        'DF_total_aa': ['total_male_aa','total_female_aa'],
        'DF_total_nhop': ['total_male_nhop','total_female_nhop'],
        'DF_total_sor': ['total_male_sor','total_female_sor'],
        'DF_total_tom': ['total_male_tom','total_female_tom'],
        'DF_total_hol': ['total_male_hol','total_female_hol'],
        }

### Ordering process

In [70]:
# # Print the shape of the dataframes in 'dfs'
# print("Data Frames in 'dfs':")
# print(len(dfs))
# for df_name, df in dfs.items():
#     print("Dataframe Name:", df_name)
#     print("Shape:", df.shape)
#     print(f"Number of Columns: {len(df.columns)}")
#     print("Columns:", df.columns.tolist())

In [71]:
# Reorder the columns for each dataframe in 'df_perc_8'
for df_name in df_perc_8:
    # Get the dataframe from 'dfs'
    df = dfs[df_name]
    
    # Get the two unique columns specific to this dataframe
    unique_cols = spec_col_dict[df_name]
    
    # Get the columns that are not unique to this dataframe (common columns)
    common_cols = [col for col in col_loc if col not in unique_cols]
    
    # Calculate the number of columns that need to be inserted for each unique column
    num_cols_to_insert = len(common_cols)
    
    # Get the index where the unique columns should be inserted (index 207 in this case)
    insert_index = 207
    
    # Split the common columns into three groups to accommodate the insertion of unique columns
    common_cols_part1 = common_cols[:insert_index]
    common_cols_part2 = common_cols[insert_index:]
    
    # Reorder the columns as required
    df = df[common_cols_part1 + unique_cols + common_cols_part2]
    
    # Move the resulting dataframe back to the 'dfs' dictionary
    dfs[df_name] = df

In [72]:
# # Print the shape of the dataframes in 'dfs'
# print("Data Frames in 'dfs':")
# print(len(dfs))
# for df_name, df in dfs.items():
#     print("Dataframe Name:", df_name)
#     print("Shape:", df.shape)
#     print(f"Number of Columns: {len(df.columns)}")
#     print("Columns:", df.columns.tolist())

## Export to excel

Create workbook and export wanted dataframes to excel as individual sheets.

In [73]:
# Define the file name
file_name = '2020_agesex_statistics.xlsx'

# Get the file path in the current working directory
file_path = os.path.join(os.getcwd(), file_name)

# Check if the file exists
if os.path.exists(file_path):
    # Load the existing Excel file
    excel_file = pd.read_excel(file_path, engine='openpyxl')

    # Create a new ExcelWriter object using the existing file
    writer = pd.ExcelWriter(file_path, engine='openpyxl', if_sheet_exists='replace', mode='a')

    # Iterate through the dataframes in dfs
    for df_name, df in dfs.items():  # Use 'dfs.items()' to get the name (key) and dataframe (value)
        # Get the name of the dataframe
        name = df_name

        # Write each dataframe to a separate sheet in the Excel file
        df.to_excel(writer, sheet_name=name, index=False)

    # Close the writer
    writer.close()

else:
    # Create a new workbook
    writer = pd.ExcelWriter(file_path, engine='openpyxl')

    # Iterate through the dataframes in dfs
    for df_name, df in dfs.items():  # Use 'dfs.items()' to get the name (key) and dataframe (value)
        # Get the name of the dataframe
        name = df_name

        # Write each dataframe to a separate sheet in the Excel file
        df.to_excel(writer, sheet_name=name, index=False)

    # Close the writer
    writer.close()

export dataframe individually as .xlsx file, create folder as well

In [74]:
# Define the directory name
dir_name = 'Statistics_Dataframes'

# Get the directory path in the current working directory
dir_path = os.path.join(os.getcwd(), dir_name)

# Check if the directory exists, if not create it
if not os.path.exists(dir_path):
    os.makedirs(dir_path)

# Iterate through the dataframes in dfs
for df_name, df in dfs.items():  # Use 'dfs.items()' to get the name (key) and dataframe (value)
    # Define the file name using the name of the dataframe
    file_name = f'{df_name}.xlsx'
    
    # Get the file path in the directory
    file_path = os.path.join(dir_path, file_name)
    
    # Write dataframe to a new Excel file, overwrite if it already exists
    df.to_excel(file_path, index=False)