In [1]:
import pandas as pd
import os
from IPython.display import display


# Get the current working directory
script_dir = os.getcwd()
file_path = os.path.join(script_dir, 'census_data.csv')

print('Dataframe Read: \n')
census = pd.read_csv(file_path, index_col=0)
census

Dataframe Read: 



FileNotFoundError: [Errno 2] No such file or directory: '/Users/diegopons/Documents/Coding/ML-AI-Self-Learning/Python/Pandas/census_data.csv'

### Census Data Types

In [None]:
display(census.head())
print(f'census data types:\n\n{census.dtypes}')

Unnamed: 0,first_name,last_name,birth_year,voted,num_children,income_year,higher_tax,marital_status
0,Denise,Ratke,2005,False,0,92129.41,disagree,single
1,Hali,Cummerata,1987,False,0,75649.17,neutral,divorced
2,Salomon,Orn,1992,True,2,166313.45,agree,single
3,Sarina,Schiller,1965,False,2,71704.81,strongly agree,married
4,Gust,Abernathy,1945,False,2,143316.08,agree,married


census data types:

first_name         object
last_name          object
birth_year         object
voted                bool
num_children        int64
income_year       float64
higher_tax         object
marital_status     object
dtype: object


### Change Variable Types

In [None]:
census['first_name'] = census['first_name'].astype('string')
census['last_name'] = census['last_name'].astype('string')
census['birth_year'] = census['birth_year'].replace(['missing'], 1967).astype('int') #if column has missing value, it cannot be changed to int; must replace first
census['income_year'] = census['income_year'].astype('float')
print(f'census data types:\n\n{census.dtypes}')

census data types:

first_name        string[python]
last_name         string[python]
birth_year                 int64
voted                       bool
num_children               int64
income_year              float64
higher_tax                object
marital_status            object
dtype: object


### Inspecting Data Types

In [None]:
print(f'Unique Birth Years\n{census.birth_year.unique()}\n')
print(f'Birth Year Mean\n{census.birth_year.mean()}\n')
print(f'census data types:\n{census.dtypes}')

Unique Birth Years
[2005 1987 1992 1965 1945 1951 1963 1949 1950 1971 2007 1944 1995 1973
 1946 1954 1994 1989 1947 1993 1976 1984 1967 1966 1941 2000 1953 1956
 1960 2001 1980 1955 1985 1996 1968 1979 2006 1962 1981 1959 1977 1978
 1983 1957 1961 1982 2002 1998 1999 1952 1940 1986 1958]

Birth Year Mean
1973.4

census data types:
first_name        string[python]
last_name         string[python]
birth_year                 int64
voted                       bool
num_children               int64
income_year              float64
higher_tax                object
marital_status            object
dtype: object


### Create Higher Tax Category

##### This section displays the purpose of the Pandas Categorical Data Type. Python on its own does not understand a category and its inherent ranking if it has one. Using pd.Categorical(), we can create this

In [None]:
#create category for the higher_tax column with pd.Categorical
census['higher_tax'] = pd.Categorical(census['higher_tax'], categories=['strongly disagree', 'disagree', 'neutral', 'agree', 'strongly agree'], ordered=True)
print(census['higher_tax'].unique())
census



['disagree', 'neutral', 'agree', 'strongly agree', 'strongly disagree']
Categories (5, object): ['strongly disagree' < 'disagree' < 'neutral' < 'agree' < 'strongly agree']


Unnamed: 0,first_name,last_name,birth_year,voted,num_children,income_year,higher_tax,marital_status
0,Denise,Ratke,2005,False,0,92129.41,disagree,single
1,Hali,Cummerata,1987,False,0,75649.17,neutral,divorced
2,Salomon,Orn,1992,True,2,166313.45,agree,single
3,Sarina,Schiller,1965,False,2,71704.81,strongly agree,married
4,Gust,Abernathy,1945,False,2,143316.08,agree,married
...,...,...,...,...,...,...,...,...
95,Carisa,Hills,1958,False,3,157117.14,agree,married
96,Tameka,Collins,2001,False,1,61518.34,strongly disagree,single
97,Adams,Leuschke,1987,False,0,41784.87,strongly agree,single
98,Earnestine,Gutmann,1985,True,4,79021.46,disagree,widowed


##### using the cat.codes method gives the category data type a numerical representation (in this case, the higher_tax column)

In [None]:
# Use cat.codes to label encode the higher_tax variable
census['higher_tax'] = census['higher_tax'].cat.codes
print(census['higher_tax'].median())
census


2.0


Unnamed: 0,first_name,last_name,birth_year,voted,num_children,income_year,higher_tax,marital_status
0,Denise,Ratke,2005,False,0,92129.41,1,single
1,Hali,Cummerata,1987,False,0,75649.17,2,divorced
2,Salomon,Orn,1992,True,2,166313.45,3,single
3,Sarina,Schiller,1965,False,2,71704.81,4,married
4,Gust,Abernathy,1945,False,2,143316.08,3,married
...,...,...,...,...,...,...,...,...
95,Carisa,Hills,1958,False,3,157117.14,3,married
96,Tameka,Collins,2001,False,1,61518.34,0,single
97,Adams,Leuschke,1987,False,0,41784.87,4,single
98,Earnestine,Gutmann,1985,True,4,79021.46,1,widowed


### Hot Encode Marital Status
##### (One Hot Encoding / OHE) Doing an OHE rewrites the Dataframe; create a new one if you must

In [None]:
# Use pd.get_dummies to one-hot encode the marital_status variable
census_ohe = pd.get_dummies(census, columns=['marital_status'])
display(census_ohe.head())      

Unnamed: 0,first_name,last_name,birth_year,voted,num_children,income_year,higher_tax,marital_status_divorced,marital_status_married,marital_status_single,marital_status_widowed
0,Denise,Ratke,2005,False,0,92129.41,1,False,False,True,False
1,Hali,Cummerata,1987,False,0,75649.17,2,True,False,False,False
2,Salomon,Orn,1992,True,2,166313.45,3,False,False,True,False
3,Sarina,Schiller,1965,False,2,71704.81,4,False,True,False,False
4,Gust,Abernathy,1945,False,2,143316.08,3,False,True,False,False


### Bonus

In [None]:
#Create a new variable called marital_codes
census['marital_status'] = pd.Categorical(census['marital_status'], categories=['single', 'divorced', 'married', 'widowed'], ordered=False)
census['marital_status'] = census['marital_status'].cat.codes
census.head()

Unnamed: 0,first_name,last_name,birth_year,voted,num_children,income_year,higher_tax,marital_status
0,Denise,Ratke,2005,False,0,92129.41,1,0
1,Hali,Cummerata,1987,False,0,75649.17,2,1
2,Salomon,Orn,1992,True,2,166313.45,3,0
3,Sarina,Schiller,1965,False,2,71704.81,4,2
4,Gust,Abernathy,1945,False,2,143316.08,3,2
