In [1]:
import pandas as pd

In [9]:
from pyspark.sql import functions as F

In [2]:
# Display the active Spark session object.
# NOTE(review): `spark` is not created in any visible cell; presumably it is injected
# by the kernel / environment — confirm before running this notebook elsewhere.
spark

In [23]:
# Read the semicolon-delimited demographics CSV; the first row supplies the
# column names and Spark infers each column's type from the data.
demographics_df = (
    spark.read
    .format("csv")
    .options(sep=";", inferSchema="true", header="true")
    .load('us-cities-demographics.csv')
)

In [24]:
# Print the column names and the types Spark inferred while reading the CSV.
demographics_df.printSchema()

root
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Median Age: double (nullable = true)
 |-- Male Population: integer (nullable = true)
 |-- Female Population: integer (nullable = true)
 |-- Total Population: integer (nullable = true)
 |-- Number of Veterans: integer (nullable = true)
 |-- Foreign-born: integer (nullable = true)
 |-- Average Household Size: double (nullable = true)
 |-- State Code: string (nullable = true)
 |-- Race: string (nullable = true)
 |-- Count: integer (nullable = true)



In [25]:
# Project the raw CSV columns onto snake_case names.
# - 'City' is lower-cased; every other selected column is carried over as-is
#   under its new name.
# - 'State' (full name) and 'Number of Veterans' are not selected, so they are
#   dropped here; 'State Code' becomes the new 'state' column.
# NOTE(review): 'houshold_size' looks like a typo for 'household_size', but later
# cells group by this exact name, so it is reproduced unchanged.
# NOTE(review): this rebinds demographics_df, so re-running the cell without
# reloading the CSV will probably fail to resolve the original column names — confirm.
passthrough_aliases = [
    ('Median Age', 'median_age'),
    ('Male Population', 'male_population'),
    ('Female Population', 'female_population'),
    ('Total Population', 'total_population'),
    ('Foreign-born', 'foreign_born'),
    ('Average Household Size', 'houshold_size'),
    ('State Code', 'state'),
    ('Race', 'race'),
    ('Count', 'count_race'),
]
demographics_df = demographics_df.select(
    F.lower(F.col('City')).alias('city'),
    *[F.col(source_name).alias(target_name)
      for source_name, target_name in passthrough_aliases]
)
demographics_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- median_age: double (nullable = true)
 |-- male_population: integer (nullable = true)
 |-- female_population: integer (nullable = true)
 |-- total_population: integer (nullable = true)
 |-- foreign_born: integer (nullable = true)
 |-- houshold_size: double (nullable = true)
 |-- state: string (nullable = true)
 |-- race: string (nullable = true)
 |-- count_race: integer (nullable = true)



In [35]:
# Enumerate the distinct values of the 'race' column; these are the values
# that become columns when the frame is pivoted.
(
    demographics_df
    .select('race')
    .distinct()
    .collect()
)

[Row(race='Black or African-American'),
 Row(race='Hispanic or Latino'),
 Row(race='White'),
 Row(race='Asian'),
 Row(race='American Indian and Alaska Native')]

In [27]:
# List the current column names of demographics_df (used below as the grouping keys,
# minus 'race' and 'count_race').
demographics_df.columns

['city',
 'median_age',
 'male_population',
 'female_population',
 'total_population',
 'foreign_born',
 'houshold_size',
 'state',
 'race',
 'count_race']

In [28]:
# Turn the per-race rows into one wide row per combination of city-level
# attributes: each value in race_names becomes its own column holding the
# summed 'count_race' for that group.
# The pivot values are listed explicitly, which fixes the set of output
# columns and their order.
race_names = [
    'Black or African-American',
    'Hispanic or Latino',
    'White',
    'Asian',
    'American Indian and Alaska Native',
]

# Every column except 'race' and 'count_race' acts as a grouping key.
city_level_columns = [
    'city',
    'median_age',
    'male_population',
    'female_population',
    'total_population',
    'foreign_born',
    'houshold_size',
    'state',
]

demographics_pivot = (
    demographics_df
    .groupBy(city_level_columns)
    .pivot('race', race_names)
    .sum('count_race')
)

In [29]:
# Replace the pivot-generated race column names (which contain spaces and
# hyphens) with short snake_case names.
race_column_names = {
    'Black or African-American': 'african_american',
    'Hispanic or Latino': 'hispanic',
    'White': 'white',
    'Asian': 'asian',
    'American Indian and Alaska Native': 'native_american',
}
for pivot_name, short_name in race_column_names.items():
    demographics_pivot = demographics_pivot.withColumnRenamed(pivot_name, short_name)

In [30]:
# Print the schema of the pivoted frame to check the grouping columns and the
# per-race count columns.
demographics_pivot.printSchema()

root
 |-- city: string (nullable = true)
 |-- median_age: double (nullable = true)
 |-- male_population: integer (nullable = true)
 |-- female_population: integer (nullable = true)
 |-- total_population: integer (nullable = true)
 |-- foreign_born: integer (nullable = true)
 |-- houshold_size: double (nullable = true)
 |-- state: string (nullable = true)
 |-- african_american: long (nullable = true)
 |-- hispanic: long (nullable = true)
 |-- white: long (nullable = true)
 |-- asian: long (nullable = true)
 |-- native_american: long (nullable = true)



In [32]:
# Spot-check a single city; the value is lower-case because 'city' was
# lower-cased during the earlier select.
demographics_pivot.filter(F.col('city') == 'san francisco').show(10)

+-------------+----------+---------------+-----------------+----------------+------------+-------------+-----+----------------+--------+------+------+---------------+
|         city|median_age|male_population|female_population|total_population|foreign_born|houshold_size|state|african_american|hispanic| white| asian|native_american|
+-------------+----------+---------------+-----------------+----------------+------------+-------------+-----+----------------+--------+------+------+---------------+
|san francisco|      38.3|         439752|           425064|          864816|      297199|         2.37|   CA|           53270|  132114|442155|324034|           8997|
+-------------+----------+---------------+-----------------+----------------+------------+-------------+-----+----------------+--------+------+------+---------------+



In [33]:
# Row count of the pivoted frame: one row per distinct combination of the grouping columns.
demographics_pivot.count()

596

In [34]:
# Persist the pivoted frame as Parquet, replacing any previous output at this path.
# repartition(1) brings the data into a single partition before writing.
(
    demographics_pivot
    .repartition(1)
    .write
    .parquet('data/demographics_transformed/', mode='overwrite')
)