In [1]:
# Do all imports and installs here - Done
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
import pandas as pd
import re
import configparser
import os

In [2]:
# Parse configurations from etl.cfg.
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail fast here instead of hitting a confusing
# NoSectionError in a later cell.
config = configparser.ConfigParser()
config_files_read = config.read('etl.cfg')
if not config_files_read:
    raise FileNotFoundError("Could not read configuration file 'etl.cfg'")
config_files_read

['etl.cfg']

In [3]:
# Production session: build (or reuse) a Hive-enabled Spark session that pulls
# the spark-sas7bdat package from the spark-packages repository.
spark = (
    SparkSession.builder
    .config("spark.jars.repositories", "https://repos.spark-packages.org/")
    .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")
    .enableHiveSupport()
    .getOrCreate()
)

In [30]:
def convert_column_names(df):
    '''
    Normalize the column names of a pandas DataFrame in place.

    Every column name has its leading and trailing whitespace stripped, is
    converted to lower case, and has each space and hyphen replaced by an
    underscore. Format ex: " Billing-Address " -> "billing_address".

    Note that words written in CamelCase are only lower-cased; no underscore
    is inserted between them. Ex: "AverageTemperature" -> "averagetemperature".

    Parameters
    ----------
    df : pandas.DataFrame
        The input dataframe whose column names may use mixed casing,
        surrounding white space, and space or hyphen delimiters.
        All column names must be strings.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with its columns renamed.
        The rename happens in place, so existing callers that ignore the
        return value keep working.
    '''
    df.columns = [
        # strip surrounding spaces, make lowercase, turn delimiters into underscores
        col.strip().lower().replace(" ", "_").replace("-", "_")
        for col in df.columns
    ]
    return df

In [5]:
# Section names used in etl.cfg.
dir_section = 'DIR'
data_section = 'DATA'

# Input and output directories.
input_data_source = config.get(dir_section, 'INPUT_DIR')
output_processed_data = config.get(dir_section, 'OUTPUT_DIR')

# Locations of the individual source datasets.
i94immi_dataset = config.get(data_section, 'I94_IMMI')
worldtempe_dataset = config.get(data_section, 'WORLD_TEMPE')
citydemo_dataset = config.get(data_section, 'CITY_DEMOGRAPHIC')
airport_dataset = config.get(data_section, 'AIR_PORT')
saslabel_dataset = config.get(data_section, 'SAS_LABEL')

In [6]:
# Show the world temperature CSV path read from the WORLD_TEMPE entry of etl.cfg.
worldtempe_dataset

'../../data2/GlobalLandTemperaturesByCity.csv'

In [8]:
# Load the world temperature CSV; its path is the WORLD_TEMPE entry read from etl.cfg.
# (A smaller sample such as 'GlobalLandTemperaturesByCity_part9.csv' can be configured
# there for quick local runs.)
worldtempe_df = pd.read_csv(worldtempe_dataset)  # ',' is pandas' default separator
worldtempe_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8599212 entries, 0 to 8599211
Data columns (total 7 columns):
dt                               object
AverageTemperature               float64
AverageTemperatureUncertainty    float64
City                             object
Country                          object
Latitude                         object
Longitude                        object
dtypes: float64(2), object(5)
memory usage: 459.2+ MB


In [9]:
# Preview the first rows of the raw temperature data.
worldtempe_df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E


In [10]:
# Number of rows per country, sorted from fewest to most.
worldtempe_df['Country'].value_counts().sort_values()

Papua New Guinea               1581
Oman                           1653
Djibouti                       1797
Eritrea                        1797
Botswana                       1881
Lesotho                        1881
Namibia                        1881
Swaziland                      1881
Central African Republic       1893
Congo                          1893
Costa Rica                     1953
Rwanda                         1965
Burundi                        1965
Guinea Bissau                  1977
Liberia                        1977
Mauritania                     1977
Bahrain                        2049
Qatar                          2049
Hong Kong                      2082
South Korea                    2097
Cambodia                       2265
Singapore                      2265
Suriname                       2277
Guyana                         2277
Mongolia                       2318
Laos                           2371
Azerbaijan                     2460
Jordan                      

In [11]:
# (rows, columns) of the full dataset, before any filtering.
worldtempe_df.shape

(8599212, 7)

### Cleaning Country

Keep only the rows whose `Country` is `United States`, then check the dataframe size.

In [12]:
# Keep only the United States rows. The explicit .copy() makes the result an
# independent frame, so the column assignment in the next step does not raise
# pandas' SettingWithCopyWarning (the later date filter copies in the same way).
worldtempe_df = worldtempe_df[worldtempe_df['Country'] == 'United States'].copy()

In [13]:
# Check which country values remain after the filter, with their row counts.
worldtempe_df['Country'].value_counts().sort_values()

United States    687289
Name: Country, dtype: int64

In [14]:
# (rows, columns) after the country filter.
worldtempe_df.shape

(687289, 7)

### Cleaning columns with datetime data type

In [15]:
# Parse the text `dt` column into real datetimes and store them as `dt_converted`.
# assign() returns a new frame instead of writing into a filtered slice, which
# avoids pandas' SettingWithCopyWarning; bracket access avoids relying on
# attribute-style column lookup.
worldtempe_df = worldtempe_df.assign(dt_converted=pd.to_datetime(worldtempe_df['dt']))

In [16]:
# Re-check the column dtypes now that dt_converted has been added.
worldtempe_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 687289 entries, 47555 to 8439246
Data columns (total 8 columns):
dt                               687289 non-null object
AverageTemperature               661524 non-null float64
AverageTemperatureUncertainty    661524 non-null float64
City                             687289 non-null object
Country                          687289 non-null object
Latitude                         687289 non-null object
Longitude                        687289 non-null object
dt_converted                     687289 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(2), object(5)
memory usage: 47.2+ MB


In [17]:
# Keep only readings dated strictly after the cutoff (the cutoff day itself is excluded).
cutoff_date = "1960-01-01"
is_after_cutoff = worldtempe_df['dt_converted'] > cutoff_date
worldtempe_df = worldtempe_df[is_after_cutoff].copy()

In [19]:
# (rows, columns) after the date filter.
worldtempe_df.shape

(165508, 8)

In [20]:
# Latest reading date that remains after the date filter.
worldtempe_df['dt_converted'].max()

Timestamp('2013-09-01 00:00:00')

In [21]:
# Earliest reading date that remains after the date filter.
worldtempe_df['dt_converted'].min()

Timestamp('1960-02-01 00:00:00')

In [22]:
# Count the missing values in each column.
worldtempe_df.isnull().sum()

dt                               0
AverageTemperature               1
AverageTemperatureUncertainty    1
City                             0
Country                          0
Latitude                         0
Longitude                        0
dt_converted                     0
dtype: int64

In [23]:
# Show the rows that have no AverageTemperature reading.
missing_temperature = worldtempe_df['AverageTemperature'].isna()
worldtempe_df[missing_temperature]

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,dt_converted
287781,2013-09-01,,,Anchorage,United States,61.88N,151.13W,2013-09-01


The single *NULL* value (Anchorage, 2013-09-01) doesn't affect the join with the ***i94 immigration*** data, which covers year 2016 only. Keep it as is.

### Cleaning combination `City` and `dt_converted`

In [24]:
# Rows whose (City, dt_converted) pair already appeared earlier in the frame;
# display the last few of them.
repeated_city_date = worldtempe_df.duplicated(subset=['City', 'dt_converted'])
worldtempe_df[repeated_city_date].tail()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,dt_converted
7148658,2013-05-01,14.309,0.331,Springfield,United States,42.59N,72.00W,2013-05-01
7148659,2013-06-01,19.313,0.353,Springfield,United States,42.59N,72.00W,2013-06-01
7148660,2013-07-01,23.629,0.447,Springfield,United States,42.59N,72.00W,2013-07-01
7148661,2013-08-01,19.579,0.336,Springfield,United States,42.59N,72.00W,2013-08-01
7148662,2013-09-01,15.883,1.368,Springfield,United States,42.59N,72.00W,2013-09-01


In [25]:
# Inspect every Springfield reading for one month to see why the pair repeats.
is_springfield = worldtempe_df['City'] == 'Springfield'
is_july_2013 = worldtempe_df['dt'] == '2013-07-01'
worldtempe_df[is_springfield & is_july_2013]

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,dt_converted
7142182,2013-07-01,25.132,0.211,Springfield,United States,37.78N,93.56W,2013-07-01
7145421,2013-07-01,23.824,0.142,Springfield,United States,39.38N,89.48W,2013-07-01
7148660,2013-07-01,23.629,0.447,Springfield,United States,42.59N,72.00W,2013-07-01


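It seems the same `City` name is measured at several locations. The three `Springfield` rows above have clearly different coordinates (37.78N/93.56W, 39.38N/89.48W, 42.59N/72.00W), so they are most likely different cities that share a name rather than several sensors spread across one city. The temperatures reported for the same date do not differ much. Note that `City` + `dt_converted` is therefore not a unique key; `Latitude`/`Longitude` are needed to tell the locations apart.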

### Standardizing column name format

Convert column names to ***snake_case*** format. Format ex: *customer_name, billing_address, ...*

In [33]:
# convert_column_names renames the columns on the frame itself (it assigns df.columns),
# so the result of the call is not captured. Names are only lower-cased and have
# spaces/hyphens replaced, so a CamelCase name such as AverageTemperature becomes
# averagetemperature (no underscore is inserted between the words).
convert_column_names(worldtempe_df)
worldtempe_df.columns

Index(['dt', 'averagetemperature', 'averagetemperatureuncertainty', 'city',
       'country', 'latitude', 'longitude', 'dt_converted'],
      dtype='object')

### Baseline dataframe: World Temperature

In [34]:
# Save the cleaned pandas frame as a single CSV file; index=False leaves the row index out of the file.
worldtempe_df.to_csv('worldtempe_df_clean.csv', index=False)

In [35]:
# Reload the cleaned CSV to confirm what was written. No date parsing is requested,
# so dt_converted comes back as plain text (object dtype).
worldtempe_df = pd.read_csv('worldtempe_df_clean.csv')  # ',' is pandas' default separator
worldtempe_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165508 entries, 0 to 165507
Data columns (total 8 columns):
dt                               165508 non-null object
averagetemperature               165507 non-null float64
averagetemperatureuncertainty    165507 non-null float64
city                             165508 non-null object
country                          165508 non-null object
latitude                         165508 non-null object
longitude                        165508 non-null object
dt_converted                     165508 non-null object
dtypes: float64(2), object(6)
memory usage: 10.1+ MB


In [36]:
# Preview the frame re-loaded from the cleaned CSV.
worldtempe_df.head()

Unnamed: 0,dt,averagetemperature,averagetemperatureuncertainty,city,country,latitude,longitude,dt_converted
0,1960-02-01,4.995,0.325,Abilene,United States,32.95N,100.53W,1960-02-01
1,1960-03-01,8.575,0.303,Abilene,United States,32.95N,100.53W,1960-03-01
2,1960-04-01,18.452,0.282,Abilene,United States,32.95N,100.53W,1960-04-01
3,1960-05-01,21.709,0.286,Abilene,United States,32.95N,100.53W,1960-05-01
4,1960-06-01,27.714,0.387,Abilene,United States,32.95N,100.53W,1960-06-01


In [37]:
# Read out from CSV file to spark dataframe
worldtempe_df = spark.read.csv("worldtempe_df_clean.csv")

In [38]:
# Write dataframe to CSV partitions use Spark

#rmdir(Path("worldtempe_df_clean"))
# worldtempe_df.write.options(header='True', delimiter=',').csv("worldtempe_df_clean")
worldtempe_df.write.mode('overwrite').csv("worldtempe_df_clean")

### Staging cleaned `WORLD TEMPERATURE` from saved csv partitions

In [39]:
# Load the CSV part files into the staging DataFrame, taking the column names
# from the header line and letting Spark infer the column types.
staging_reader = (
    spark.read
    .option("inferSchema", "true")
    .option("delimiter", ",")
    .option("header", "true")
)
worldtempe_df = staging_reader.csv("worldtempe_df_clean")

In [40]:
# Verify the loaded staging DataFrame by printing a sample of its rows.
worldtempe_df.show()

+-------------------+------------------+-----------------------------+-------+-------------+--------+---------+-------------------+
|                 dt|averagetemperature|averagetemperatureuncertainty|   city|      country|latitude|longitude|       dt_converted|
+-------------------+------------------+-----------------------------+-------+-------------+--------+---------+-------------------+
|1960-02-01 00:00:00|             4.995|                        0.325|Abilene|United States|  32.95N|  100.53W|1960-02-01 00:00:00|
|1960-03-01 00:00:00| 8.575000000000001|                        0.303|Abilene|United States|  32.95N|  100.53W|1960-03-01 00:00:00|
|1960-04-01 00:00:00|            18.452|                        0.282|Abilene|United States|  32.95N|  100.53W|1960-04-01 00:00:00|
|1960-05-01 00:00:00|            21.709|          0.28600000000000003|Abilene|United States|  32.95N|  100.53W|1960-05-01 00:00:00|
|1960-06-01 00:00:00|27.714000000000002|                        0.387|Abilen

In [41]:
# Register the staging DataFrame as the temporary view `worldtempe_table`
# (replacing any existing view of that name) so it can be queried with spark.sql.
worldtempe_df.createOrReplaceTempView('worldtempe_table')

In [None]:
# Sanity-check the temporary view that staging will use by counting its rows.
row_count_query = """
    SELECT COUNT(*) as amount_worldtempe_rows
    FROM worldtempe_table
"""
spark.sql(row_count_query).show()