## Contents
- [Importing and Combining Total Number Data](#Importing-and-Combining-Total-Number-Data)  
- [Feature Engineering](#Feature-Engineering)
- [Dataframe to Hold Both Total Numbers and Percent Values](#Dataframe-to-Hold-Both-Total-Numbers-and-Percent-Values)
- [Dataframe to Hold Percent Values](#Dataframe-to-Hold-Percent-Values)

# Importing and Combining Total Number Data

In [1]:
import pandas as pd

In [2]:
# Load the three pre-cleaned Texas county tables: race counts, sex/age counts,
# and land area in square miles.
race = pd.read_csv('./data/preprocessing/tx_dp05_race_cleaned.csv')
sa = pd.read_csv('./data/preprocessing/tx_dp05_sex_age_cleaned.csv')
land = pd.read_csv('./data/preprocessing/tx_area_cleaned.csv')

In [3]:
# Preview the first two rows of the race table.
race.head(2)

Unnamed: 0,Geographic Area Name,race_pop,race_pop_hispanic_or_latino_(of_any_race),race_pop_white_alone,race_pop_black_or_african_american_alone,race_pop_american_indian_and_alaska_native_alone,race_pop_asian_alone,race_pop_native_hawaiian_and_other_pacific_islander_alone,race_pop_some_other_race_alone,race_pop_two_or_more_races
0,"Austin County, Texas",29565,7819,18525,2576,48,89,0,52,456
1,"Kenedy County, Texas",595,522,72,0,0,1,0,0,0


In [4]:
# Preview the first two rows of the sex/age table.
sa.head(2)

Unnamed: 0,Geographic Area Name,sex_age_pop,sex_age_pop_male,sex_age_pop_female,sex_age_pop_under_5,sex_age_pop_5_to_9,sex_age_pop_10_to_14,sex_age_pop_15_to_19,sex_age_pop_20_to_24,sex_age_pop_25_to_34,sex_age_pop_35_to_44,sex_age_pop_45_to_54,sex_age_pop_55_to_59,sex_age_pop_60_to_64,sex_age_pop_65_to_74,sex_age_pop_75_to_84,sex_age_pop_85_and_over,sex_age_median_age_in_years
0,"Austin County, Texas",29565,14684,14881,1780,1960,2118,1861,1712,3339,3275,3821,2327,1978,3243,1532,619,40.7
1,"Kenedy County, Texas",595,286,309,85,37,40,10,10,95,47,75,51,9,85,29,22,39.5


In [5]:
# Preview the first two rows of the land-area table.
land.head(2)

Unnamed: 0,Geographic Area Name,sq_mi
0,"Anderson County, Texas",1062.63
1,"Andrews County, Texas",1500.721


In [6]:
# Join the race, sex/age, and land-area tables on the shared
# 'Geographic Area Name' key to build the total-numbers dataframe.
# Both joins are inner joins (the pandas default), so only geographies
# present in all three tables are kept.
df_num = (
    race
    .merge(sa, how='inner', on='Geographic Area Name')
    .merge(land, how='inner', on='Geographic Area Name')
)
df_num.head()

Unnamed: 0,Geographic Area Name,race_pop,race_pop_hispanic_or_latino_(of_any_race),race_pop_white_alone,race_pop_black_or_african_american_alone,race_pop_american_indian_and_alaska_native_alone,race_pop_asian_alone,race_pop_native_hawaiian_and_other_pacific_islander_alone,race_pop_some_other_race_alone,race_pop_two_or_more_races,...,sex_age_pop_25_to_34,sex_age_pop_35_to_44,sex_age_pop_45_to_54,sex_age_pop_55_to_59,sex_age_pop_60_to_64,sex_age_pop_65_to_74,sex_age_pop_75_to_84,sex_age_pop_85_and_over,sex_age_median_age_in_years,sq_mi
0,"Austin County, Texas",29565,7819,18525,2576,48,89,0,52,456,...,3339,3275,3821,2327,1978,3243,1532,619,40.7,646.492
1,"Kenedy County, Texas",595,522,72,0,0,1,0,0,0,...,95,47,75,51,9,85,29,22,39.5,1458.453
2,"Nueces County, Texas",360486,228462,107652,13071,919,7134,242,226,2780,...,52547,45030,43503,22563,21051,28881,15165,5299,35.3,838.316
3,"Colorado County, Texas",21022,6200,11855,2655,27,7,0,0,278,...,2054,2233,2440,1280,1866,2467,1356,640,42.5,960.284
4,"San Patricio County, Texas",67046,38483,26032,1003,101,671,30,7,719,...,8923,8328,8078,4417,3367,5759,2935,929,35.3,693.436


In [7]:
# List the merged dataframe's column names.
df_num.columns

Index(['Geographic Area Name', 'race_pop',
       'race_pop_hispanic_or_latino_(of_any_race)', 'race_pop_white_alone',
       'race_pop_black_or_african_american_alone',
       'race_pop_american_indian_and_alaska_native_alone',
       'race_pop_asian_alone',
       'race_pop_native_hawaiian_and_other_pacific_islander_alone',
       'race_pop_some_other_race_alone', 'race_pop_two_or_more_races',
       'sex_age_pop', 'sex_age_pop_male', 'sex_age_pop_female',
       'sex_age_pop_under_5', 'sex_age_pop_5_to_9', 'sex_age_pop_10_to_14',
       'sex_age_pop_15_to_19', 'sex_age_pop_20_to_24', 'sex_age_pop_25_to_34',
       'sex_age_pop_35_to_44', 'sex_age_pop_45_to_54', 'sex_age_pop_55_to_59',
       'sex_age_pop_60_to_64', 'sex_age_pop_65_to_74', 'sex_age_pop_75_to_84',
       'sex_age_pop_85_and_over', 'sex_age_median_age_in_years', 'sq_mi'],
      dtype='object')

In [8]:
# Rename the geography column to the shorter, snake_case name `county_state`.
df_num = df_num.rename(columns={'Geographic Area Name': 'county_state'})

In [9]:
# Use the geography (`county_state`) as the row index.
# Note: re-running this cell raises a KeyError, because set_index has already
# moved `county_state` out of the columns.
df_num = df_num.set_index('county_state')

In [10]:
# Preview the first three rows of the re-indexed dataframe.
df_num.head(3)

Unnamed: 0_level_0,race_pop,race_pop_hispanic_or_latino_(of_any_race),race_pop_white_alone,race_pop_black_or_african_american_alone,race_pop_american_indian_and_alaska_native_alone,race_pop_asian_alone,race_pop_native_hawaiian_and_other_pacific_islander_alone,race_pop_some_other_race_alone,race_pop_two_or_more_races,sex_age_pop,...,sex_age_pop_25_to_34,sex_age_pop_35_to_44,sex_age_pop_45_to_54,sex_age_pop_55_to_59,sex_age_pop_60_to_64,sex_age_pop_65_to_74,sex_age_pop_75_to_84,sex_age_pop_85_and_over,sex_age_median_age_in_years,sq_mi
county_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Austin County, Texas",29565,7819,18525,2576,48,89,0,52,456,29565,...,3339,3275,3821,2327,1978,3243,1532,619,40.7,646.492
"Kenedy County, Texas",595,522,72,0,0,1,0,0,0,595,...,95,47,75,51,9,85,29,22,39.5,1458.453
"Nueces County, Texas",360486,228462,107652,13071,919,7134,242,226,2780,360486,...,52547,45030,43503,22563,21051,28881,15165,5299,35.3,838.316


# Feature Engineering

In [11]:
# Check that the race and sex/age tables report the same base population for
# every county. The displayed frame is truncated, so a visual comparison only
# covers a handful of rows; assert on all rows before relying on `race_pop`
# as the population total in later cells.
assert (df_num['race_pop'] == df_num['sex_age_pop']).all(), \
    'race_pop and sex_age_pop differ for at least one county'
df_num[['race_pop', 'sex_age_pop']]

Unnamed: 0_level_0,race_pop,sex_age_pop
county_state,Unnamed: 1_level_1,Unnamed: 2_level_1
"Austin County, Texas",29565,29565
"Kenedy County, Texas",595,595
"Nueces County, Texas",360486,360486
"Colorado County, Texas",21022,21022
"San Patricio County, Texas",67046,67046
...,...,...
"McCulloch County, Texas",8098,8098
"Lee County, Texas",16952,16952
"Ellis County, Texas",168838,168838
"Kerr County, Texas",51365,51365


In [12]:
# Add county population density: total population (race_pop) per square
# mile of land area (sq_mi).
df_num['pop_density'] = df_num['race_pop'].div(df_num['sq_mi'])

In [13]:
# Preview the dataframe, which now ends with the pop_density column.
df_num.head()

Unnamed: 0_level_0,race_pop,race_pop_hispanic_or_latino_(of_any_race),race_pop_white_alone,race_pop_black_or_african_american_alone,race_pop_american_indian_and_alaska_native_alone,race_pop_asian_alone,race_pop_native_hawaiian_and_other_pacific_islander_alone,race_pop_some_other_race_alone,race_pop_two_or_more_races,sex_age_pop,...,sex_age_pop_35_to_44,sex_age_pop_45_to_54,sex_age_pop_55_to_59,sex_age_pop_60_to_64,sex_age_pop_65_to_74,sex_age_pop_75_to_84,sex_age_pop_85_and_over,sex_age_median_age_in_years,sq_mi,pop_density
county_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Austin County, Texas",29565,7819,18525,2576,48,89,0,52,456,29565,...,3275,3821,2327,1978,3243,1532,619,40.7,646.492,45.731424
"Kenedy County, Texas",595,522,72,0,0,1,0,0,0,595,...,47,75,51,9,85,29,22,39.5,1458.453,0.407967
"Nueces County, Texas",360486,228462,107652,13071,919,7134,242,226,2780,360486,...,45030,43503,22563,21051,28881,15165,5299,35.3,838.316,430.012072
"Colorado County, Texas",21022,6200,11855,2655,27,7,0,0,278,21022,...,2233,2440,1280,1866,2467,1356,640,42.5,960.284,21.89144
"San Patricio County, Texas",67046,38483,26032,1003,101,671,30,7,719,67046,...,8328,8078,4417,3367,5759,2935,929,35.3,693.436,96.686644


In [14]:
# Export the total-number data to CSV (the county_state index is written too).
df_num.to_csv('./data/tx_cleaned_numbers.csv')

# Dataframe to Hold Both Total Numbers and Percent Values

In [15]:
# Make a copy of the dataframe that will hold total numbers AND percentages,
# so that adding the percent columns leaves df_num unchanged.
df = df_num.copy()

In [16]:
# Print every column name, one per line.
for column in df.columns:
    print(column)

race_pop
race_pop_hispanic_or_latino_(of_any_race)
race_pop_white_alone
race_pop_black_or_african_american_alone
race_pop_american_indian_and_alaska_native_alone
race_pop_asian_alone
race_pop_native_hawaiian_and_other_pacific_islander_alone
race_pop_some_other_race_alone
race_pop_two_or_more_races
sex_age_pop
sex_age_pop_male
sex_age_pop_female
sex_age_pop_under_5
sex_age_pop_5_to_9
sex_age_pop_10_to_14
sex_age_pop_15_to_19
sex_age_pop_20_to_24
sex_age_pop_25_to_34
sex_age_pop_35_to_44
sex_age_pop_45_to_54
sex_age_pop_55_to_59
sex_age_pop_60_to_64
sex_age_pop_65_to_74
sex_age_pop_75_to_84
sex_age_pop_85_and_over
sex_age_median_age_in_years
sq_mi
pop_density


# Dataframe to Hold Percent Values

In [17]:
# Define a function to create new columns with percentages.
def to_percentage(dataframe):
    """Add a ``percent_<column>`` share column for every population count column.

    Each ``race_pop_*`` column is divided by ``race_pop`` and each
    ``sex_age_pop_*`` column is divided by ``sex_age_pop``, so every share is
    measured against the total of its own table. The results are stored as
    fractions (they are not multiplied by 100).

    The dataframe is modified in place and nothing is returned. Columns that
    do not start with one of the two prefixes (the totals themselves,
    ``sex_age_median_age_in_years``, ``sq_mi``, existing ``percent_*``
    columns, ...) are left untouched, so calling the function again only
    recomputes the same ``percent_*`` columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the count columns and their ``race_pop`` /
        ``sex_age_pop`` totals.

    Raises
    ------
    KeyError
        If a prefixed count column is present but its total column is missing.
    """
    # Map each count-column prefix to the column holding that table's total.
    totals_by_prefix = {
        'race_pop_': 'race_pop',
        'sex_age_pop_': 'sex_age_pop',
    }

    # Snapshot the column names: the loop adds columns to the same dataframe.
    for column in list(dataframe.columns):
        for prefix, total_column in totals_by_prefix.items():
            if column.startswith(prefix):
                dataframe['percent_' + column] = dataframe[column] / dataframe[total_column]
                break

    return

In [18]:
# Add the percent_* columns to df in place (to_percentage returns None).
to_percentage(df)

In [19]:
# Display the first three rows, now including the new percent_* columns.
df.head(3)

Unnamed: 0_level_0,race_pop,race_pop_hispanic_or_latino_(of_any_race),race_pop_white_alone,race_pop_black_or_african_american_alone,race_pop_american_indian_and_alaska_native_alone,race_pop_asian_alone,race_pop_native_hawaiian_and_other_pacific_islander_alone,race_pop_some_other_race_alone,race_pop_two_or_more_races,sex_age_pop,...,percent_sex_age_pop_15_to_19,percent_sex_age_pop_20_to_24,percent_sex_age_pop_25_to_34,percent_sex_age_pop_35_to_44,percent_sex_age_pop_45_to_54,percent_sex_age_pop_55_to_59,percent_sex_age_pop_60_to_64,percent_sex_age_pop_65_to_74,percent_sex_age_pop_75_to_84,percent_sex_age_pop_85_and_over
county_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Austin County, Texas",29565,7819,18525,2576,48,89,0,52,456,29565,...,0.062946,0.057906,0.112938,0.110773,0.129241,0.078708,0.066903,0.109691,0.051818,0.020937
"Kenedy County, Texas",595,522,72,0,0,1,0,0,0,595,...,0.016807,0.016807,0.159664,0.078992,0.12605,0.085714,0.015126,0.142857,0.048739,0.036975
"Nueces County, Texas",360486,228462,107652,13071,919,7134,242,226,2780,360486,...,0.070804,0.073226,0.145767,0.124915,0.120679,0.062591,0.058396,0.080117,0.042068,0.0147


In [20]:
# Pull every column whose name contains 'percent' into its own dataframe.
df_percent = df.filter(like='percent', axis=1)

In [21]:
# Preview the first three rows of the percent-only dataframe.
df_percent.head(3)

Unnamed: 0_level_0,percent_race_pop_hispanic_or_latino_(of_any_race),percent_race_pop_white_alone,percent_race_pop_black_or_african_american_alone,percent_race_pop_american_indian_and_alaska_native_alone,percent_race_pop_asian_alone,percent_race_pop_native_hawaiian_and_other_pacific_islander_alone,percent_race_pop_some_other_race_alone,percent_race_pop_two_or_more_races,percent_sex_age_pop_male,percent_sex_age_pop_female,...,percent_sex_age_pop_15_to_19,percent_sex_age_pop_20_to_24,percent_sex_age_pop_25_to_34,percent_sex_age_pop_35_to_44,percent_sex_age_pop_45_to_54,percent_sex_age_pop_55_to_59,percent_sex_age_pop_60_to_64,percent_sex_age_pop_65_to_74,percent_sex_age_pop_75_to_84,percent_sex_age_pop_85_and_over
county_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Austin County, Texas",0.264468,0.626585,0.08713,0.001624,0.00301,0.0,0.001759,0.015424,0.496668,0.503332,...,0.062946,0.057906,0.112938,0.110773,0.129241,0.078708,0.066903,0.109691,0.051818,0.020937
"Kenedy County, Texas",0.877311,0.121008,0.0,0.0,0.001681,0.0,0.0,0.0,0.480672,0.519328,...,0.016807,0.016807,0.159664,0.078992,0.12605,0.085714,0.015126,0.142857,0.048739,0.036975
"Nueces County, Texas",0.633761,0.29863,0.036259,0.002549,0.01979,0.000671,0.000627,0.007712,0.493833,0.506167,...,0.070804,0.073226,0.145767,0.124915,0.120679,0.062591,0.058396,0.080117,0.042068,0.0147


## Combine percent dataframe with other key features

In [22]:
# List the percent column names.
df_percent.columns

Index(['percent_race_pop_hispanic_or_latino_(of_any_race)',
       'percent_race_pop_white_alone',
       'percent_race_pop_black_or_african_american_alone',
       'percent_race_pop_american_indian_and_alaska_native_alone',
       'percent_race_pop_asian_alone',
       'percent_race_pop_native_hawaiian_and_other_pacific_islander_alone',
       'percent_race_pop_some_other_race_alone',
       'percent_race_pop_two_or_more_races', 'percent_sex_age_pop_male',
       'percent_sex_age_pop_female', 'percent_sex_age_pop_under_5',
       'percent_sex_age_pop_5_to_9', 'percent_sex_age_pop_10_to_14',
       'percent_sex_age_pop_15_to_19', 'percent_sex_age_pop_20_to_24',
       'percent_sex_age_pop_25_to_34', 'percent_sex_age_pop_35_to_44',
       'percent_sex_age_pop_45_to_54', 'percent_sex_age_pop_55_to_59',
       'percent_sex_age_pop_60_to_64', 'percent_sex_age_pop_65_to_74',
       'percent_sex_age_pop_75_to_84', 'percent_sex_age_pop_85_and_over'],
      dtype='object')

In [23]:
# Carry over two non-percent metrics from the combined dataframe:
# median age and population density.
df_temp = df[['sex_age_median_age_in_years', 'pop_density']]

In [24]:
# Concatenate the two dataframes column-wise to get a complete feature set.
# Note: df_percent is overwritten here, so re-running this cell adds the two
# df_temp columns a second time.
df_percent = pd.concat([df_temp, df_percent], axis=1)

In [25]:
# Preview the first three rows of the final feature set.
df_percent.head(3)

Unnamed: 0_level_0,sex_age_median_age_in_years,pop_density,percent_race_pop_hispanic_or_latino_(of_any_race),percent_race_pop_white_alone,percent_race_pop_black_or_african_american_alone,percent_race_pop_american_indian_and_alaska_native_alone,percent_race_pop_asian_alone,percent_race_pop_native_hawaiian_and_other_pacific_islander_alone,percent_race_pop_some_other_race_alone,percent_race_pop_two_or_more_races,...,percent_sex_age_pop_15_to_19,percent_sex_age_pop_20_to_24,percent_sex_age_pop_25_to_34,percent_sex_age_pop_35_to_44,percent_sex_age_pop_45_to_54,percent_sex_age_pop_55_to_59,percent_sex_age_pop_60_to_64,percent_sex_age_pop_65_to_74,percent_sex_age_pop_75_to_84,percent_sex_age_pop_85_and_over
county_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Austin County, Texas",40.7,45.731424,0.264468,0.626585,0.08713,0.001624,0.00301,0.0,0.001759,0.015424,...,0.062946,0.057906,0.112938,0.110773,0.129241,0.078708,0.066903,0.109691,0.051818,0.020937
"Kenedy County, Texas",39.5,0.407967,0.877311,0.121008,0.0,0.0,0.001681,0.0,0.0,0.0,...,0.016807,0.016807,0.159664,0.078992,0.12605,0.085714,0.015126,0.142857,0.048739,0.036975
"Nueces County, Texas",35.3,430.012072,0.633761,0.29863,0.036259,0.002549,0.01979,0.000671,0.000627,0.007712,...,0.070804,0.073226,0.145767,0.124915,0.120679,0.062591,0.058396,0.080117,0.042068,0.0147


In [26]:
# Export the percent feature set to CSV (the county_state index is written too).
df_percent.to_csv('./data/tx_cleaned_percent.csv')