In [1]:
import pandas as pd

# Load the census survey; the first CSV column is used as the row index.
c = pd.read_csv(
    'census_data.csv',
    index_col=0,
)

# Preview the first rows to confirm the file parsed as expected.
c.head()

Unnamed: 0,first_name,last_name,birth_year,voted,num_children,income_year,higher_tax,marital_status
0,Denise,Ratke,2005,False,0,92129.41,disagree,single
1,Hali,Cummerata,1987,False,0,75649.17,neutral,divorced
2,Salomon,Orn,1992,True,2,166313.45,agree,single
3,Sarina,Schiller,1965,False,2,71704.81,strongly agree,married
4,Gust,Abernathy,1945,False,2,143316.08,agree,married


In [2]:
# Summarize each column's dtype and non-null count to spot columns that need cleaning.
c.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   first_name      100 non-null    object 
 1   last_name       100 non-null    object 
 2   birth_year      100 non-null    object 
 3   voted           100 non-null    bool   
 4   num_children    100 non-null    int64  
 5   income_year     100 non-null    float64
 6   higher_tax      100 non-null    object 
 7   marital_status  100 non-null    object 
dtypes: bool(1), float64(1), int64(1), object(5)
memory usage: 6.3+ KB


### Average birth year of respondents

In [3]:
# Look for missing or non-numeric entries in birth_year.
# dropna=False makes value_counts() list NaN as its own row; by default NaN is
# silently left out, so the plain call cannot reveal missing values.
c['birth_year'].value_counts(dropna=False)

1961       4
1949       4
2005       3
2007       3
1985       3
1989       3
2006       3
1954       3
1946       3
1973       3
1995       3
1971       3
1978       3
1963       3
1951       3
1992       3
1966       3
1962       2
1941       2
1998       2
1955       2
2001       2
1960       2
1987       2
1953       2
1984       2
1945       2
1994       2
2000       1
1977       1
1986       1
1940       1
1952       1
1999       1
1965       1
2002       1
1982       1
1957       1
1983       1
1950       1
1959       1
missing    1
1981       1
1944       1
1979       1
1968       1
1996       1
1947       1
1993       1
1980       1
1976       1
1956       1
1958       1
Name: birth_year, dtype: int64

In [4]:
# One entry holds the placeholder string 'missing' instead of a year, which
# leaves the column with object dtype.  Impute it, then cast the column to int.
# NOTE(review): 1967 is a hard-coded fill value whose origin is not shown
# here -- confirm it is the intended imputation.
#
# The result is assigned back to the frame instead of calling
# replace(..., inplace=True) on `c.birth_year`: that form mutates an
# intermediate Series, which pandas does not propagate to `c` when
# Copy-on-Write is enabled.
c['birth_year'] = c['birth_year'].replace('missing', 1967).astype('int')

c['birth_year'].mean()

1973.4

### Categorize higher_tax

In [5]:
# Encode the survey answers as an ordered categorical, ranked from strongest
# disagreement to strongest agreement, so order-based statistics are meaningful.
c['higher_tax'] = pd.Categorical(
    c['higher_tax'],
    categories=[
        'strongly disagree',
        'disagree',
        'neutral',
        'agree',
        'strongly agree',
    ],
    ordered=True,
)

# Show the distinct answers together with the category ordering.
c['higher_tax'].unique()

['disagree', 'neutral', 'agree', 'strongly agree', 'strongly disagree']
Categories (5, object): ['strongly disagree' < 'disagree' < 'neutral' < 'agree' < 'strongly agree']

In [6]:
# Median answer, expressed as the integer position of the category in the ordering.
c['higher_tax'].cat.codes.median()

2.0

### One-hot encode marital_status

In [7]:
# Expand marital_status into one indicator column per status; all other
# columns are carried over and `c` itself is left untouched.
c1 = pd.get_dummies(
    data=c,
    columns=['marital_status'],
)
c1.head()

Unnamed: 0,first_name,last_name,birth_year,voted,num_children,income_year,higher_tax,marital_status_divorced,marital_status_married,marital_status_single,marital_status_widowed
0,Denise,Ratke,2005,False,0,92129.41,disagree,0,0,1,0
1,Hali,Cummerata,1987,False,0,75649.17,neutral,1,0,0,0
2,Salomon,Orn,1992,True,2,166313.45,agree,0,0,1,0
3,Sarina,Schiller,1965,False,2,71704.81,strongly agree,0,1,0,0
4,Gust,Abernathy,1945,False,2,143316.08,agree,0,1,0,0


### Categorize marital_status

In [8]:
# Frequency of each marital status, most common first.
c['marital_status'].value_counts()

married     36
single      35
divorced    22
widowed      7
Name: marital_status, dtype: int64

In [9]:
# Marital status has no natural ranking: fix the category list so the integer
# codes are stable, but keep the categorical unordered.
c['marital_status'] = pd.Categorical(
    c['marital_status'],
    categories=['married', 'single', 'divorced', 'widowed'],
    ordered=False,
)

c['marital_status'].unique()

['single', 'divorced', 'married', 'widowed']
Categories (4, object): ['married', 'single', 'divorced', 'widowed']

In [10]:
# Store each status as its integer category code (position in the category list).
c['marital_codes'] = c['marital_status'].cat.codes

c.head()

Unnamed: 0,first_name,last_name,birth_year,voted,num_children,income_year,higher_tax,marital_status,marital_codes
0,Denise,Ratke,2005,False,0,92129.41,disagree,single,1
1,Hali,Cummerata,1987,False,0,75649.17,neutral,divorced,2
2,Salomon,Orn,1992,True,2,166313.45,agree,single,1
3,Sarina,Schiller,1965,False,2,71704.81,strongly agree,married,0
4,Gust,Abernathy,1945,False,2,143316.08,agree,married,0


### Group respondents into 5-year age brackets (derived from birth_year)

In [11]:
import datetime

# Age is measured against the calendar year in which this cell runs, so the
# values (and the summary below) shift when the notebook is re-run in a later year.
c['age'] = datetime.date.today().year - c['birth_year']

c['age'].describe()

count    100.000000
mean      48.600000
std       20.102264
min       15.000000
25%       30.000000
50%       50.000000
75%       67.000000
max       82.000000
Name: age, dtype: float64

In [12]:
# Bucket ages into 5-year groups labelled 'a-b' (both ends inclusive).
#
# Bins are closed on the left and open on the right ([15, 20), [20, 25), ...),
# so the label '15-19' covers exactly the integer ages 15 through 19.  With
# pd.cut's default right-closed bins and a first edge of 15, the first interval
# would be (15, 19] and the youngest respondents (age 15) would silently get NaN.
#
# Edges and labels are derived from the observed ages instead of being
# hard-coded, because `age` is computed against the current year and so the
# range drifts upward whenever the notebook is re-run later.
age_floor = int(c['age'].min()) // 5 * 5          # lowest multiple of 5 at or below the minimum
age_ceiling = int(c['age'].max()) // 5 * 5 + 5    # first multiple of 5 strictly above the maximum

bins = list(range(age_floor, age_ceiling + 1, 5))
groups = [f'{lower}-{lower + 4}' for lower in bins[:-1]]

c['age_group'] = pd.cut(c['age'], bins, labels=groups, right=False)

# Every respondent must land in a bin; a NaN here means an edge is wrong.
assert c['age_group'].notna().all(), 'some ages fall outside every age bin'

c.head()

Unnamed: 0,first_name,last_name,birth_year,voted,num_children,income_year,higher_tax,marital_status,marital_codes,age,age_group
0,Denise,Ratke,2005,False,0,92129.41,disagree,single,1,17,15-19
1,Hali,Cummerata,1987,False,0,75649.17,neutral,divorced,2,35,35-39
2,Salomon,Orn,1992,True,2,166313.45,agree,single,1,30,30-34
3,Sarina,Schiller,1965,False,2,71704.81,strongly agree,married,0,57,55-59
4,Gust,Abernathy,1945,False,2,143316.08,agree,married,0,77,75-79
