In [26]:
# findspark.init() makes the pyspark package importable from this kernel,
# so it has to run before the pyspark import below.
import findspark
findspark.init()

import pyspark.sql.functions as F

import pandas as pd
# Let pandas render up to 1000 rows before truncating the displayed output.
pd.set_option('display.max_rows', 1000)

# SparkETL is used below to read and save "clean" tables
# (the `etl` module is presumably project-local -- confirm).
from etl import SparkETL

In [3]:
# Shared ETL helper; later cells call its read_clean_table / save_clean_table.
etl = SparkETL()

In [5]:
# Load the clean table named 'i94_data_dictionary'; it is the input for the
# country table built further down.
i94_data_dictionary = etl.read_clean_table('i94_data_dictionary')

In [10]:
# Inspect the schema: each column (I94ADDR, I94CIT, I94MODE, I94PORT,
# VISATYPE) is an array whose elements are arrays of strings.
i94_data_dictionary.printSchema()

root
 |-- I94ADDR: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- I94CIT: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- I94MODE: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- I94PORT: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- VISATYPE: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)



In [23]:
def explode_countries(df):
    """Flatten the nested ``I94CIT`` column into one row per entry.

    ``df`` must expose an ``I94CIT`` column holding an array of arrays.
    Each inner array becomes one output row: its first element is
    returned as ``country_id`` and its second as ``country``.

    Returns a new DataFrame with exactly those two columns.
    """
    # The exploded values are read back under the column name 'col'.
    entry = F.col('col')
    one_row_per_entry = df.select(F.explode('I94CIT'))
    # element_at uses 1-based positions for arrays.
    return one_row_per_entry.select(
        F.element_at(entry, 1).alias('country_id'),
        F.element_at(entry, 2).alias('country'),
    )

In [28]:
def clean_mexico(df):
    """Normalise the country name for id 582 to ``'MEXICO'``.

    Rows whose ``country_id`` equals 582 get ``country`` set to
    ``'MEXICO'``; every other row keeps its existing ``country`` value.
    Returns a new DataFrame with the ``country`` column replaced.

    NOTE(review): presumably the raw label for 582 carries extra text
    that this override strips -- confirm against the data dictionary.
    """
    # SQL text (whitespace included) is kept exactly as originally written.
    mexico_override = F.expr("""
                IF(
                    country_id = 582,
                    'MEXICO',
                    country
                )
            """)
    return df.withColumn('country', mexico_override)

In [31]:
def clean_country(df):
    """Build the clean country table from a data-dictionary DataFrame.

    ``df`` is the DataFrame to transform; it is passed through
    ``explode_countries`` and then ``clean_mexico``.

    Returns the DataFrame produced by ``clean_mexico`` (columns
    ``country_id`` and ``country``).

    NOTE(review): relies on the DataFrame exposing ``.pipe``; presumably
    that is provided by the ``etl`` module -- confirm.
    """
    # Start from the caller's DataFrame, not the notebook-level
    # ``i94_data_dictionary`` global, so the argument is actually honoured
    # and the function does not depend on hidden kernel state.
    return (
        df
        .pipe(explode_countries)
        .pipe(clean_mexico)
    )

In [32]:
def save_clean_country():
    """Derive the country table and save it as the clean table ``'country'``.

    Takes the notebook-level ``i94_data_dictionary``, runs it through
    ``clean_country`` and hands the result to ``etl.save_clean_table``.
    Returns ``None``.
    """
    country_table = i94_data_dictionary.pipe(clean_country)
    etl.save_clean_table(country_table, 'country')

In [34]:
# Build the country table and write it out as the clean table 'country'.
save_clean_country()

In [35]:
# Read the saved 'country' table back and display it as a pandas DataFrame
# to check the result (note the 582 -> MEXICO row).
etl.read_clean_table('country').toPandas()

Unnamed: 0,country_id,country
0,582,MEXICO
1,236,AFGHANISTAN
2,101,ALBANIA
3,316,ALGERIA
4,102,ANDORRA
5,324,ANGOLA
6,529,ANGUILLA
7,518,ANTIGUA-BARBUDA
8,687,ARGENTINA
9,151,ARMENIA
