In [128]:
import findspark
findspark.init()

from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import Vectors, VectorUDT

import pyspark.sql.types as T
import pyspark.sql.functions as F

import pandas as pd
pd.set_option('display.max_rows', 1000)

from etl import SparkETL

In [2]:
# Project ETL helper. Used below for the Spark session (`get_spark`), the raw
# data source paths (`data_sources`) and reading/saving clean tables.
etl = SparkETL()

In [3]:
# Handle returned by the ETL helper; used below for `spark.read`, `spark.sql`
# and `spark.createDataFrame`.
spark = etl.get_spark()

22/05/06 14:24:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [210]:
# Explicit schema for the raw demographics CSV. Every column is nullable; only
# 'Total Population' is read as an integer, all other columns stay strings.
# NOTE(review): 'Count' is a string here but race_ratio() divides it by
# 'Total Population' -- this presumably relies on Spark's implicit cast; confirm.
demographics_schema = T.StructType([
    T.StructField(
        name,
        T.IntegerType() if name == 'Total Population' else T.StringType(),
        True,
    )
    for name in (
        'City',
        'State',
        'Median Age',
        'Male Population',
        'Female Population',
        'Total Population',
        'Number of Veterans',
        'Foreign-born',
        'Average Household Size',
        'State Code',
        'Race',
        'Count',
    )
])

In [211]:
# Stage the raw demographics file: a semicolon-separated CSV with a header
# row, parsed with `demographics_schema`.
demographics_staging = spark.read.load(
    etl.data_sources['demographics'],
    format='csv',
    schema=demographics_schema,
    header='true',
    sep=';',
)

In [212]:
def race_cols(df):
    """Project `df` down to the columns needed for the per-race ratios.

    Returns the result of selecting City, State, Race, Count and
    Total Population (in that order) from `df`.
    """
    needed = ('City', 'State', 'Race', 'Count', 'Total Population')
    return df.select(*needed)

In [213]:
def race_ratio(df):
    """Add a `race_ratio` column: `Count` divided by `Total Population`."""
    share = F.col('Count') / F.col('Total Population')
    return df.withColumn('race_ratio', share)

In [214]:
def pivot_race(df):
    """Pivot `race_ratio` into one column per `Race` value, per City/State.

    NOTE: a later cell defines `pivot_race` again (the SQL-based version);
    that later definition replaces this one, so this cell is effectively
    dead code and is a candidate for removal.

    Args:
        df: DataFrame with City, State, Race and race_ratio columns.

    Returns:
        DataFrame with City, State and one ratio column per distinct Race.
    """
    # Spark DataFrames have no pandas-style pivot(index=, columns=, values=);
    # the DataFrame-API equivalent is groupBy(...).pivot(...).agg(...).
    # MAX mirrors the aggregate used by the SQL-based definition.
    return (
        df
        .groupBy('City', 'State')
        .pivot('Race')
        .agg(F.max('race_ratio'))
    )

In [215]:
def race_ratios(df):
    """Reduce `df` to the race columns and add the `race_ratio` column.

    Args:
        df: demographics DataFrame containing City, State, Race, Count and
            Total Population.

    Returns:
        `df` after `race_cols` and `race_ratio` have been applied.
    """
    # Operate on the argument; previously the parameter was ignored and the
    # global `demographics_staging` was always used instead.
    return (
        df
        .pipe(race_cols)
        .pipe(race_ratio)
    )

In [216]:
def pivot_race(df):
    """Pivot per-race ratios into one row per city with a column per race.

    Registers (and overwrites) the temp view `demographics`, then pivots
    `race_ratio` by `Race` with Spark SQL.

    Args:
        df: DataFrame with City, State, Race, Count, Total Population and,
            normally, race_ratio. If `race_ratio` is missing it is computed
            here via `race_ratios`.

    Returns:
        DataFrame with City, State, population and the ratio columns native,
        asian, black, latino and white; a missing ratio becomes 0.0.
    """
    # Use the DataFrame that was passed in rather than re-deriving everything
    # from the global staging table. Callers that hand over rows without
    # `race_ratio` are still supported.
    ratios = df if 'race_ratio' in df.columns else race_ratios(df)
    ratios.createOrReplaceTempView('demographics')

    # The outer GROUP BY collapses the pivoted rows to a single row per
    # City / State / Total Population, keeping the MAX of each race column.
    # FIRST(Count) is only an intermediate column and is dropped afterwards.
    return (
        spark.sql("""
                SELECT 
                    City, 
                    State, 
                    `Total Population` as population,
                    FIRST(Count) as count, 
                    IFNULL(MAX(native), 0.0) AS native,
                    IFNULL(MAX(asian), 0.0) AS asian,
                    IFNULL(MAX(black), 0.0) AS black,
                    IFNULL(MAX(latino), 0.0) AS latino,
                    IFNULL(MAX(white), 0.0) AS white
                FROM demographics
                PIVOT (
                    MAX(race_ratio) as ratio
                    FOR Race IN (
                        'American Indian and Alaska Native' AS native, 
                        'Asian' AS asian, 
                        'Black or African-American' AS black, 
                        'Hispanic or Latino' AS latino, 
                        'White' AS white
                    )
                )
                GROUP BY City, State, `Total Population`
        """)
        .drop('count')
    )

In [217]:
@F.udf(VectorUDT())
def features_udf(ratios_array):
    """Spark UDF: convert an array of ratios into a dense ML vector."""
    dense = Vectors.dense(ratios_array)
    return dense

In [218]:
def prepare_features(df):
    """Add a `features` vector column built from the five race-ratio columns.

    The vector components are, in order: native, asian, black, latino, white.
    """
    ratio_columns = [
        F.col(name)
        for name in ('native', 'asian', 'black', 'latino', 'white')
    ]
    features = features_udf(F.array(*ratio_columns))
    return df.withColumn('features', features)

In [219]:
def ratios_kmeans(df):
    """Fit k-means (k=7, seed=1) on `df` and return `df` transformed by the model.

    NOTE(review): ethnicities() hard-codes a label for each of the 7 cluster
    ids, so k and the seed presumably must stay in sync with it -- confirm
    before changing either.
    """
    estimator = KMeans()
    estimator.setK(7)
    estimator.setSeed(1)
    fitted = estimator.fit(df)
    return fitted.transform(df)

In [220]:
def project_clusters(df):
    """Run the clustering part of the pipeline on `df`.

    Steps, in order: keep the race columns, compute race ratios, pivot them
    into one row per city, assemble the feature vector, and run k-means.

    Args:
        df: staged demographics DataFrame.

    Returns:
        The output of `ratios_kmeans` for the pivoted, feature-enriched data.
    """
    # Start from the argument; previously the parameter was ignored and the
    # global `demographics_staging` was always used instead.
    return (
        df
        .pipe(race_cols)
        .pipe(race_ratio)
        .pipe(pivot_race)
        .pipe(prepare_features)
        .pipe(ratios_kmeans)
    )

In [252]:
def ethnicities():
    """Build the lookup table mapping `ethnicity_id` to an `ethnicity` label.

    join_ethnicities() matches `ethnicity_id` against the k-means
    `prediction`, so each id here labels one cluster.
    NOTE(review): the labels look hand-assigned for one specific clustering
    run -- re-check them if the data or k-means settings change.
    """
    labels_by_cluster = [
        (0, 'white, black minority'),
        (6, 'white'),
        (2, 'white, latino minority'),
        (5, 'white, asian minority'),
        (4, 'black, white minority'),
        (1, 'latino, white minority'),
        (3, 'white latino'),
    ]
    lookup_pd = pd.DataFrame(
        labels_by_cluster,
        columns=['ethnicity_id', 'ethnicity'],
    )
    return spark.createDataFrame(lookup_pd)

In [253]:
def join_ethnicities(df):
    """Inner-join `df` with the ethnicity lookup on `prediction == ethnicity_id`."""
    lookup = ethnicities()
    same_cluster = lookup['ethnicity_id'] == df['prediction']
    return df.join(lookup, on=same_cluster, how='inner')

In [254]:
def join_state(df):
    """Left-join `df` with the clean `state` table on `State == name`.

    Because the join is a left join, every row of `df` is kept.
    """
    state_table = etl.read_clean_table('state')
    name_matches = df['State'] == state_table['name']
    return df.join(state_table, on=name_matches, how='left')

In [255]:
@F.udf(T.IntegerType())
def size_id_udf(population):
    """Spark UDF: map a city population to a size bucket id.

    Buckets: 0 for < 200,000; 1 for <= 500,000; 2 for <= 1,500,000;
    3 otherwise. Returns null when `population` is null.
    """
    # The population column is nullable; comparing None with an int raises
    # TypeError inside the UDF and fails the whole job, so pass nulls through.
    if population is None:
        return None
    if population < 200000:
        return 0
    elif population <= 500000:
        return 1
    elif population <= 1500000:
        return 2
    else:
        return 3

In [256]:
@F.udf(T.StringType())
def size_udf(population):
    """Spark UDF: map a city population to a human-readable size label.

    Uses the same thresholds as `size_id_udf` (200,000 / 500,000 /
    1,500,000). Returns null when `population` is null.
    """
    # The population column is nullable; comparing None with an int raises
    # TypeError inside the UDF and fails the whole job, so pass nulls through.
    if population is None:
        return None
    if population < 200000:
        return 'small (50K - 200K)'
    elif population <= 500000:
        return 'medium (200K - 500K)'
    elif population <= 1500000:
        return 'large (500K - 1,5M)'
    else:
        return 'very large (> 1,5M)'

In [257]:
def project_size(df):
    """Add `size_id` and `size` columns, both derived from `population`."""
    with_id = df.withColumn('size_id', size_id_udf(F.col('population')))
    return with_id.withColumn('size', size_udf(F.col('population')))

In [258]:
def project_schema(df):
    """Select the final output columns, renaming `City` to `city`."""
    output_columns = [
        'state_id',
        F.col('City').alias('city'),
        'asian',
        'black',
        'latino',
        'native',
        'white',
        'ethnicity_id',
        'ethnicity',
        'population',
        'size_id',
        'size',
    ]
    return df.select(*output_columns)

In [269]:
def demographics(df):
    """Run the full demographics pipeline on `df`.

    Applies, in order: clustering, ethnicity labels, state join, size
    buckets, and the final column projection.
    """
    stages = (
        project_clusters,
        join_ethnicities,
        join_state,
        project_size,
        project_schema,
    )
    result = df
    for stage in stages:
        result = result.pipe(stage)
    return result

In [270]:
# Run the full pipeline on the staged data and save it as the clean
# 'demographics' table. coalesce(1) reduces the result to a single partition
# before saving -- presumably so a single output file is written; confirm.
etl.save_clean_table(
    demographics(demographics_staging).coalesce(1),
    'demographics'
)

                                                                                

In [271]:
# Read the saved table back and display it as a pandas DataFrame to check the
# result. NOTE: toPandas() collects every row to the driver, so this is only
# suitable while the table stays small.
etl.read_clean_table('demographics').toPandas()

Unnamed: 0,state_id,city,asian,black,latino,native,white,ethnicity_id,ethnicity,population,size_id,size
0,IL,Evanston,0.116521,0.154536,0.120162,0.007203,0.721582,0,"white, black minority",75523,0,small (50K - 200K)
1,CA,Arden-Arcade,0.076395,0.141749,0.158638,0.026871,0.720522,0,"white, black minority",96276,0,small (50K - 200K)
2,LA,Kenner,0.040667,0.255894,0.26479,0.002354,0.668539,0,"white, black minority",67106,0,small (50K - 200K)
3,AL,Hoover,0.056094,0.214418,0.04043,0.0,0.729252,0,"white, black minority",84839,0,small (50K - 200K)
4,MI,Warren,0.089808,0.217139,0.023715,0.0112,0.711878,0,"white, black minority",135356,0,small (50K - 200K)
5,MA,Worcester,0.086209,0.148756,0.211032,0.010373,0.713711,0,"white, black minority",184806,0,small (50K - 200K)
6,IN,Indianapolis,0.034543,0.299299,0.098331,0.010202,0.652581,0,"white, black minority",848423,2,"large (500K - 1,5M)"
7,IN,South Bend,0.017233,0.289185,0.158659,0.003961,0.68806,0,"white, black minority",103757,0,small (50K - 200K)
8,IL,Rockford,0.036827,0.22178,0.193664,0.015407,0.717448,0,"white, black minority",149346,0,small (50K - 200K)
9,MN,Minneapolis,0.082383,0.219003,0.097293,0.02248,0.67617,0,"white, black minority",410935,1,medium (200K - 500K)
