In [1]:
# Import pandas with alias
import pandas as pd

# Load the census data; the first CSV column is used as the row index
CENSUS_CSV_PATH = "census_data.csv"
census = pd.read_csv(CENSUS_CSV_PATH, index_col=0)

In [2]:
## Clean "birth_year": swap the "missing" placeholder for 1967, then cast to integer
birth_year_filled = census["birth_year"].replace("missing", 1967)
census["birth_year"] = birth_year_filled.astype("int")

# Inspect the result: distinct years, mean birth year, first rows of the frame
print(census["birth_year"].unique())
print(census["birth_year"].mean())
print(census.head())

[2005 1987 1992 1965 1945 1951 1963 1949 1950 1971 2007 1944 1995 1973
 1946 1954 1994 1989 1947 1993 1976 1984 1967 1966 1941 2000 1953 1956
 1960 2001 1980 1955 1985 1996 1968 1979 2006 1962 1981 1959 1977 1978
 1983 1957 1961 1982 2002 1998 1999 1952 1940 1986 1958]
1973.4
  first_name  last_name  birth_year  voted  num_children  income_year  \
0     Denise      Ratke        2005  False             0     92129.41   
1       Hali  Cummerata        1987  False             0     75649.17   
2    Salomon        Orn        1992   True             2    166313.45   
3     Sarina   Schiller        1965  False             2     71704.81   
4       Gust  Abernathy        1945  False             2    143316.08   

       higher_tax marital_status  
0        disagree         single  
1         neutral       divorced  
2           agree         single  
3  strongly agree        married  
4           agree        married  


In [3]:
## Order "higher_tax"
# Response levels listed from lowest to highest agreement
tax_opinion_levels = [
  "strongly disagree", "disagree",
  "neutral", "agree", "strongly agree"
]
census["higher_tax"] = pd.Categorical(
  census["higher_tax"], categories=tax_opinion_levels, ordered=True
)
print(census["higher_tax"].unique())

# Each code is the position of the response within the ordered levels above
census["higher_tax_codes"] = census["higher_tax"].cat.codes
print(census["higher_tax_codes"].median())
print(census.head())

['disagree', 'neutral', 'agree', 'strongly agree', 'strongly disagree']
Categories (5, object): ['strongly disagree' < 'disagree' < 'neutral' < 'agree' < 'strongly agree']
2.0
  first_name  last_name  birth_year  voted  num_children  income_year  \
0     Denise      Ratke        2005  False             0     92129.41   
1       Hali  Cummerata        1987  False             0     75649.17   
2    Salomon        Orn        1992   True             2    166313.45   
3     Sarina   Schiller        1965  False             2     71704.81   
4       Gust  Abernathy        1945  False             2    143316.08   

       higher_tax marital_status  higher_tax_codes  
0        disagree         single                 1  
1         neutral       divorced                 2  
2           agree         single                 3  
3  strongly agree        married                 4  
4           agree        married                 3  


In [4]:
# One-hot encode "marital_status"
# dtype is pinned to uint8 (the get_dummies default before pandas 2.0) so the
# indicator columns hold 0/1 on every pandas version; newer versions default to bool.
marital_status = pd.get_dummies(census["marital_status"], dtype="uint8")
print(marital_status)

# Drop indicator columns left behind by an earlier run of this cell before
# joining; without this, re-running the cell raises "columns overlap but no
# suffix specified". Assumes no unrelated census column is named after a
# marital-status label.
census = census.drop(columns=marital_status.columns, errors="ignore").join(marital_status)
print(census.head())

    divorced  married  single  widowed
0          0        0       1        0
1          1        0       0        0
2          0        0       1        0
3          0        1       0        0
4          0        1       0        0
..       ...      ...     ...      ...
95         0        1       0        0
96         0        0       1        0
97         0        0       1        0
98         0        0       0        1
99         0        0       1        0

[100 rows x 4 columns]
  first_name  last_name  birth_year  voted  num_children  income_year  \
0     Denise      Ratke        2005  False             0     92129.41   
1       Hali  Cummerata        1987  False             0     75649.17   
2    Salomon        Orn        1992   True             2    166313.45   
3     Sarina   Schiller        1965  False             2     71704.81   
4       Gust  Abernathy        1945  False             2    143316.08   

       higher_tax marital_status  higher_tax_codes  divorced  married

In [5]:
# Label-encode "marital_status"
print(census["marital_status"].unique())

# Unordered categorical; each code is the label's position in this list
marital_labels = ["divorced", "married", "single", "widowed"]
census["marital_status"] = pd.Categorical(census["marital_status"], categories=marital_labels)
census["marital_codes"] = census["marital_status"].cat.codes
print(census.head())

['single' 'divorced' 'married' 'widowed']
  first_name  last_name  birth_year  voted  num_children  income_year  \
0     Denise      Ratke        2005  False             0     92129.41   
1       Hali  Cummerata        1987  False             0     75649.17   
2    Salomon        Orn        1992   True             2    166313.45   
3     Sarina   Schiller        1965  False             2     71704.81   
4       Gust  Abernathy        1945  False             2    143316.08   

       higher_tax marital_status  higher_tax_codes  divorced  married  single  \
0        disagree         single                 1         0        0       1   
1         neutral       divorced                 2         1        0       0   
2           agree         single                 3         0        0       1   
3  strongly agree        married                 4         0        1       0   
4           agree        married                 3         0        1       0   

   widowed  marital_codes  
0   

In [6]:
# Add "age_group" variable to categorize by age group
REFERENCE_YEAR = 2021  # year against which ages are computed
BIN_WIDTH = 5          # width of each age group, in years

census["age"] = (REFERENCE_YEAR - census["birth_year"]).astype("int")

# Lower edge of each person's bin, computed on the whole column at once
# rather than with a row-wise DataFrame.apply.
bin_start = census["age"] - census["age"] % BIN_WIDTH
age_labels = bin_start.astype(str) + "-" + (bin_start + BIN_WIDTH - 1).astype(str)

# Ordered groups "10-14" through "80-84". An age outside 10-84 matches none of
# these categories, so its age_group becomes NaN and its age_codes value is -1.
age_group_levels = [str(i) + "-" + str(i + BIN_WIDTH - 1) for i in range(10, 85, BIN_WIDTH)]
census["age_group"] = pd.Categorical(age_labels, categories=age_group_levels, ordered=True)
census["age_codes"] = census["age_group"].cat.codes

print(census["age"].min(), census["age"].max())
print(census["age_group"].unique())
print(census.head())

14 81
['15-19', '30-34', '25-29', '55-59', '75-79', ..., '35-39', '80-84', '20-24', '60-64', '40-44']
Length: 15
Categories (15, object): ['10-14' < '15-19' < '20-24' < '25-29' ... '65-69' < '70-74' < '75-79' < '80-84']
  first_name  last_name  birth_year  voted  num_children  income_year  \
0     Denise      Ratke        2005  False             0     92129.41   
1       Hali  Cummerata        1987  False             0     75649.17   
2    Salomon        Orn        1992   True             2    166313.45   
3     Sarina   Schiller        1965  False             2     71704.81   
4       Gust  Abernathy        1945  False             2    143316.08   

       higher_tax marital_status  higher_tax_codes  divorced  married  single  \
0        disagree         single                 1         0        0       1   
1         neutral       divorced                 2         1        0       0   
2           agree         single                 3         0        0       1   
3  strongly agree