<a href="https://colab.research.google.com/github/emanfj/Healthcare-Disparities-Analysis/blob/main/census_2017.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Attach Google Drive to the Colab runtime so files under it can be read by path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=51ea631ca17677dbc3bf776d77ad027907a53ba47a238823f6db4c5b4738519c
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [17]:
# necessary imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count, isnan, isnull
from pyspark.sql.functions import lit,mean
from pyspark.ml.feature import Imputer

In [5]:
# Create the Spark session named "Census", or reuse one that is already running.
session_builder = SparkSession.builder.appName("Census")
spark = session_builder.getOrCreate()

In [6]:
# Read the census CSV from the mounted Drive: the first row supplies the column
# names, and Spark infers each column's type from the data.
CENSUS_CSV_PATH = '/content/drive/MyDrive/2017_census.csv'
demographics = spark.read.csv(
    CENSUS_CSV_PATH,
    header=True,
    inferSchema=True,
)

In [7]:
# Report the DataFrame's shape: count() triggers a full scan for the row total,
# while the column total comes from the schema alone.
demographics_rows, demographics_columns = demographics.count(), len(demographics.columns)
print('Rows for demographics: ', demographics_rows)
print('Columns for demographics: ', demographics_columns)

Rows for demographics:  74001
Columns for demographics:  37


In [8]:
# Print the inferred schema (column names, types, nullability) as a tree.
demographics.printSchema()

root
 |-- TractId: long (nullable = true)
 |-- State: string (nullable = true)
 |-- County: string (nullable = true)
 |-- TotalPop: integer (nullable = true)
 |-- Men: integer (nullable = true)
 |-- Women: integer (nullable = true)
 |-- Hispanic: double (nullable = true)
 |-- White: double (nullable = true)
 |-- Black: double (nullable = true)
 |-- Native: double (nullable = true)
 |-- Asian: double (nullable = true)
 |-- Pacific: double (nullable = true)
 |-- VotingAgeCitizen: integer (nullable = true)
 |-- Income: double (nullable = true)
 |-- IncomeErr: double (nullable = true)
 |-- IncomePerCap: double (nullable = true)
 |-- IncomePerCapErr: double (nullable = true)
 |-- Poverty: double (nullable = true)
 |-- ChildPoverty: double (nullable = true)
 |-- Professional: double (nullable = true)
 |-- Service: double (nullable = true)
 |-- Office: double (nullable = true)
 |-- Construction: double (nullable = true)
 |-- Production: double (nullable = true)
 |-- Drive: double (nullable = 

In [9]:
# Expand abbreviated column names into their full, readable forms.
# withColumnRenamed is a no-op for a name that is not present, so re-running
# this cell after the renames have been applied leaves the frame unchanged.
COLUMN_RENAMES = {
    "TotalPop": "TotalPopulation",
    "IncomeErr": "IncomeError",
    "IncomePerCap": "IncomePerCapita",
    "IncomePerCapErr": "IncomePerCapitaError",
    "OtherTransp": "OtherTransportation",
}
for old_name, new_name in COLUMN_RENAMES.items():
    demographics = demographics.withColumnRenamed(old_name, new_name)

In [14]:
# Share of missing values (null, or NaN for floating-point columns) per column.
# NOTE: despite the variable name, each value is a fraction in [0, 1], not a
# percentage; multiply by 100 when presenting it as one.

# isnan() is only meaningful for float/double columns. Applying it to string
# columns (State, County) relies on an implicit string -> double cast, which
# can fail under ANSI SQL mode, so it is restricted to floating-point columns.
floating_columns = {name for name, dtype in demographics.dtypes if dtype in ('double', 'float')}


def _missing_fraction(name):
    """Return a Column holding the fraction of rows where `name` is missing.

    A value is missing when it is null, or NaN if the column is float/double.
    The resulting column is aliased to `name`.
    """
    is_missing = col(name).isNull()
    if name in floating_columns:
        is_missing = is_missing | isnan(col(name))
    # when() without otherwise() yields null for non-matching rows, and count()
    # skips nulls, so the numerator counts only the missing rows.
    return (count(when(is_missing, lit(1))) / count(lit(1))).alias(name)


missing_percentage = demographics.select([_missing_fraction(c) for c in demographics.columns])
missing_percentage.show()

+-------+-----+------+---------------+---+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
|TractId|State|County|TotalPopulation|Men|Women|            Hispanic|               White|               Black|              Native|               Asian|             Pacific|VotingAgeCitizen|              Income|         IncomeError|    IncomePerCapita|IncomePerCapitaError|             Poverty|        ChildPoverty|       

In [22]:
# Mean-impute missing values in every non-string column.
# NOTE(review): this includes TractId, which looks like an identifier; a mean
# fill would be meaningless there if it ever contained nulls -- confirm it has none.
numeric_columns = [col_name for col_name, col_type in demographics.dtypes if col_type != 'string']

# One aggregation pass computes the mean of every numeric column at once.
means = demographics.select(*(mean(col(c)).alias(c) for c in numeric_columns))

# Pull the single result row back to the driver as {column name: mean}.
mean_values = means.first().asDict()

# A column that is entirely null has a mean of None, which na.fill rejects;
# such columns are skipped and left as they are.
fill_values = {name: value for name, value in mean_values.items() if value is not None}

# Fill all columns in a single na.fill call instead of one call per column,
# which would add a projection to the query plan for every column.
# NOTE(review): for integer columns Spark presumably casts the float mean to
# the column type (dropping the fractional part) -- verify if exact means matter.
if fill_values:
    demographics = demographics.na.fill(fill_values)

# Show the DataFrame after imputation (first 20 rows).
demographics.show()

+----------+-------+--------------+---------------+----+-----+--------+-----+-----+------+-----+-------+----------------+-------+-----------+---------------+--------------------+-------+------------+------------+-------+------+------------+----------+-----+-------+-------+----+-------------------+----------+-----------+--------+-----------+----------+------------+----------+------------+
|   TractId|  State|        County|TotalPopulation| Men|Women|Hispanic|White|Black|Native|Asian|Pacific|VotingAgeCitizen| Income|IncomeError|IncomePerCapita|IncomePerCapitaError|Poverty|ChildPoverty|Professional|Service|Office|Construction|Production|Drive|Carpool|Transit|Walk|OtherTransportation|WorkAtHome|MeanCommute|Employed|PrivateWork|PublicWork|SelfEmployed|FamilyWork|Unemployment|
+----------+-------+--------------+---------------+----+-----+--------+-----+-----+------+-----+-------+----------------+-------+-----------+---------------+--------------------+-------+------------+------------+------

In [23]:
# Surface fully duplicated rows (all columns equal). exceptAll keeps row
# multiplicity, so subtracting the de-duplicated frame leaves exactly the
# surplus copies; an empty result means there are no duplicates.
deduplicated_rows = demographics.dropDuplicates()
surplus_rows = demographics.exceptAll(deduplicated_rows)
surplus_rows.show()

+-------+-----+------+---------------+---+-----+--------+-----+-----+------+-----+-------+----------------+------+-----------+---------------+--------------------+-------+------------+------------+-------+------+------------+----------+-----+-------+-------+----+-------------------+----------+-----------+--------+-----------+----------+------------+----------+------------+
|TractId|State|County|TotalPopulation|Men|Women|Hispanic|White|Black|Native|Asian|Pacific|VotingAgeCitizen|Income|IncomeError|IncomePerCapita|IncomePerCapitaError|Poverty|ChildPoverty|Professional|Service|Office|Construction|Production|Drive|Carpool|Transit|Walk|OtherTransportation|WorkAtHome|MeanCommute|Employed|PrivateWork|PublicWork|SelfEmployed|FamilyWork|Unemployment|
+-------+-----+------+---------------+---+-----+--------+-----+-----+------+-----+-------+----------------+------+-----------+---------------+--------------------+-------+------------+------------+-------+------+------------+----------+-----+------

In [29]:
# List the distinct (State, County) pairs -- the two string-typed columns --
# showing up to 50 of them.
label_columns = ["State", "County"]
demographics.select(*label_columns).distinct().show(50)

+-------------+--------------------+
|        State|              County|
+-------------+--------------------+
|        Idaho|      Benewah County|
|         Iowa|      Carroll County|
|         Iowa|  Cerro Gordo County|
|     Kentucky|       Barren County|
|     Kentucky|         Bell County|
|     Kentucky|      Bracken County|
|  Mississippi|       Jasper County|
|     Missouri|         Knox County|
|     Missouri|      Laclede County|
|      Montana|       McCone County|
|     New York|       Queens County|
|      Alabama|      Chilton County|
|       Alaska|Juneau City and B...|
|       Alaska|    Nome Census Area|
|      Florida|       Martin County|
|        Idaho|   Clearwater County|
|      Indiana|       Greene County|
|         Iowa|       Dallas County|
|         Iowa|          Lee County|
|    Minnesota|       Becker County|
|    Minnesota|      Goodhue County|
|  Mississippi|      Bolivar County|
|   California|       Madera County|
|      Florida|      Osceola County|
|