In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from IPython.display import display
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file found under the Kaggle input directory, one full path per line.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Census Data Analysis**

In this notebook, we will perform various data preprocessing techniques on a census dataframe.

# **Reading in the Census Dataframe**

In [None]:
# Load the census CSV, using the file's first column as the row index.
census = pd.read_csv(
    '/kaggle/input/census-data/Census Variable.csv',
    index_col=0,
)

We read in the census dataframe from a CSV file named 'Census Variable.csv' using pd.read_csv(). The index_col=0 parameter specifies that the first column should be used as the index.

# **Assessing Variable Types**

In [None]:

# Render the census table with a 'YlGnBu' background gradient.
display(census.style.background_gradient(cmap='YlGnBu'))

# Tabulate the dtype of every census column as a two-column frame.
dtypes_df = census.dtypes.reset_index()
dtypes_df.columns = ['Column', 'Data Type']

# Styled view of the dtype table: left-aligned 12px cells, tinted header cells,
# a bordered table, and a caption.
styled_dtypes_df = (
    dtypes_df.style
    .set_properties(**{'text-align': 'left', 'font-size': '12px'})
    .set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#edf8b4')]},
    ])
    .set_table_attributes('style="border-collapse: collapse; border: 1px solid black;"')
    .set_caption('Data Types of Census Columns')
)

# Render the styled dtype table.
display(styled_dtypes_df)

We display the census dataframe, styled with a 'YlGnBu' background gradient, to inspect its contents. Next, we inspect the data types of each column in the census dataframe, helping us assess the variable types.

# **Inspecting Datatypes**

In [None]:
# Collect the distinct values that occur in the 'birth_year' column.
unique_birth_years = census['birth_year'].unique()

# Wrap them in a single-column frame so they can be rendered as a table.
unique_df = pd.DataFrame({'Birth Year': unique_birth_years})

# Styled view: centered bold cells, a caption, tinted header cells, bordered table.
styled_unique_df = (
    unique_df.style
    .set_properties(**{'text-align': 'center', 'font-weight': 'bold'})
    .set_caption('Unique Birth Years')
    .set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#ff9999')]},
    ])
    .set_table_attributes('style="border-collapse: collapse; border: 1px solid black;"')
)
display(styled_unique_df)

This displays the unique values in the 'birth_year' column, allowing us to identify any missing or invalid values.

# **Altering Data**

In [None]:
# Swap the 'missing' placeholder for 1967, then cast the column to int64.
# The cast raises if any other non-numeric value is still present.
# NOTE(review): the rationale for choosing 1967 as the fill value is not shown
# in this notebook — confirm it is the intended imputation.
census['birth_year'] = census['birth_year'].replace(['missing'], 1967)
census['birth_year'] = census['birth_year'].astype('int64')

# Re-list the distinct birth years after the replacement and cast.
unique_birth_years = census['birth_year'].unique()
unique_df = pd.DataFrame({'Birth Year': unique_birth_years})

# Styled view of the distinct years. The chain is parenthesised instead of using
# backslash continuations, so no trailing backslash can accidentally join the
# statement to whatever line follows it.
styled_unique_df = (
    unique_df.style
    .set_properties(**{'text-align': 'center', 'font-size': '12px'})
    .set_caption('Unique Birth Years')
    .set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#ff9999')]},
    ])
    .set_table_attributes('style="border-collapse: collapse; border: 1px solid black;"')
)
display(styled_unique_df)

# Tabulate the column dtypes again so the new 'birth_year' dtype is visible.
dtypes_df = census.dtypes.reset_index()
dtypes_df.columns = ['Column', 'Data Type']

styled_dtypes_df = (
    dtypes_df.style
    .set_properties(**{'text-align': 'left', 'font-size': '12px'})
    .set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#edf8b4')]},
    ])
    .set_table_attributes('style="border-collapse: collapse; border: 1px solid black;"')
    .set_caption('Data Types of Census Columns')
)
display(styled_dtypes_df)

# Mean birth year, printed as plain text.
print(census['birth_year'].mean())

We replace the value 'missing' in the 'birth_year' column with the integer value 1967 using census['birth_year'].replace(['missing'], 1967) to handle missing data. Then, we convert the data type of the 'birth_year' column to 'int64' using census['birth_year'].astype('int64').

We print out the unique values in the 'birth_year' column after the replacement and data type conversion, the data types of each column, and the mean value of the 'birth_year' column.

# **Assigning Categorical Order and Label Encoding**

In [None]:
# Give 'higher_tax' an explicit ordering, from 'strongly disagree' up to 'strongly agree'.
census['higher_tax'] = pd.Categorical(
    census['higher_tax'],
    categories=['strongly disagree', 'disagree', 'neutral', 'agree', 'strongly agree'],
    ordered=True,
)

# Distinct values present in the column, re-wrapped with the column's full ordered
# category list so that .codes refers to positions within that list.
unique_categories = pd.Categorical(
    census['higher_tax'].unique(),
    categories=census['higher_tax'].cat.categories,
    ordered=True,
)

# Pair each category with its integer code and order the rows by 'Category Code'.
unique_df = pd.DataFrame({
    'Higher Tax Categories': unique_categories,
    'Category Code': unique_categories.codes,
}).sort_values('Category Code')

# Styled view of the category/code table.
styled_unique_df = (
    unique_df.style
    .set_properties(**{'text-align': 'center', 'font-size': '14px', 'font-family': 'Arial'})
    .set_caption('Unique Higher Tax Categories')
    .set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#66b3ff'), ('color', 'white')]},
        {'selector': 'td', 'props': [('border', '1px solid #66b3ff'), ('padding', '5px')]},
    ])
)
display(styled_unique_df)

# Label-encode: overwrite the column with each row's integer category code.
# NOTE(review): pandas assigns code -1 to values outside the category list; whether
# such values exist in this data is not visible here — confirm before trusting the median.
census['higher_tax'] = census['higher_tax'].cat.codes

# Median of the encoded column, wrapped in a one-cell frame for display.
median_value = census['higher_tax'].median()
median_df = pd.DataFrame({'Median Higher Tax': [median_value]})

# Styled view of the median value.
styled_median_df = (
    median_df.style
    .set_properties(**{'text-align': 'center', 'font-size': '16px', 'font-weight': 'bold', 'font-family': 'Arial'})
    .set_caption('Median Higher Tax Value')
    .set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#ff9933'), ('color', 'white')]},
        {'selector': 'td', 'props': [('border', '1px solid #ff9933'), ('padding', '5px')]},
    ])
)

display(styled_median_df)

We assign a categorical order to the 'higher_tax' column using pd.Categorical() with the specified order and ordered=True. We then display the unique values in the 'higher_tax' column alongside their category codes.

Then, we label encode the 'higher_tax' column using census['higher_tax'].cat.codes and print out the median value of the encoded column.

# **One-Hot Encoding**

In [None]:
# One-hot encode 'marital_status': get_dummies swaps the column for one indicator
# column per distinct value it contains.
census = pd.get_dummies(census, columns=['marital_status'])

# Style only the first five rows so the rendered table matches its caption
# ('First 5 Rows of Census Data') instead of dumping the whole frame.
styled_census = (
    census.head().style
    .set_properties(**{'text-align': 'center', 'font-size': '12px', 'font-family': 'Arial'})
    .set_caption('First 5 Rows of Census Data')
    .set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#66b3ff'), ('color', 'white')]},
        {'selector': 'td', 'props': [('border', '1px solid #66b3ff'), ('padding', '5px')]},
    ])
    .background_gradient(cmap='YlGnBu')
)

# Display the styled preview.
display(styled_census)

We apply one-hot encoding (OHE) to the 'marital_status' column using pd.get_dummies(), creating new binary columns for each unique value in the 'marital_status' column.

Finally, we print out the first 5 rows of the updated census dataframe to inspect the changes made after applying one-hot encoding.