In [1]:
# Do all imports and installs here - Done
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
import pandas as pd
import re
import configparser
import os
from pathlib import Path

In [2]:
# Parse configurations - Done
config = configparser.ConfigParser()
config.read('etl.cfg')

['etl.cfg']

In [3]:
# Run on production version
spark = SparkSession.builder\
            .config("spark.jars.repositories", "https://repos.spark-packages.org/")\
            .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")\
            .enableHiveSupport()\
            .getOrCreate()

In [4]:
def rmdir(directory):
    '''
    This procedure perform pure recursive a directory.
    
    Parameters
    ----------
    directory : string_of_path_to_dir
        The input directory is a path to target dir. This dir and all its belong child objects wil be deleted.
        Syntax note: rmdir(Path("target_path_to_dir"))
            with Path("target_path_to_dir") returns path to dir format as 'directory' input
    
    Returns
    -------
    None
    '''
    directory = Path(directory)
    for item in directory.iterdir():
        if item.is_dir():
            rmdir(item)
        else:
            item.unlink()
    directory.rmdir()

In [5]:
def convert_column_names(df):
    '''
    This procedure standardizing column names to snake case format. Format ex: customer_name, billing_address, total_price.
    
    Parameters
    ----------
    dataframe : string_of_dataframe
        The input dataframe with column names might have elements of messy columns names, including accents, different delimiters, casing and multiple white spaces.
        Snake case style replaces the white spaces and symbol delimiters with underscore and converts all characters to lower case
    
    Returns
    -------
    Dataframe with column names has been changed to snake_case format.
    '''
    cols = df.columns
    column_name_changed = []

    for col in cols:
        new_column = col.lstrip().rstrip().lower().replace (" ", "_").replace ("-", "_")
        column_name_changed.append(new_column)

    df.columns = column_name_changed

In [6]:
input_data_source = config.get('DIR','INPUT_DIR')
output_processed_data = config.get('DIR','OUTPUT_DIR')

i94immi_dataset = config.get('DATA','I94_IMMI')
worldtempe_dataset = config.get('DATA','WORLD_TEMPE')
citydemo_dataset = config.get('DATA','CITY_DEMOGRAPHIC')
airport_dataset = config.get('DATA','AIR_PORT')
saslabel_dataset = config.get('DATA','SAS_LABEL')

In [7]:
worldtempe_dataset

'../../data2/GlobalLandTemperaturesByCity.csv'

In [18]:
# For production only
# worldtempe_dataset = '../../data2/GlobalLandTemperaturesByCity.csv'

worldtempe_dataset = 'GlobalLandTemperaturesByCity_part9.csv'
worldtempe_df = pd.read_csv(worldtempe_dataset,sep=",", index_col=False)
worldtempe_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   dt                             100000 non-null  object 
 1   AverageTemperature             96552 non-null   float64
 2   AverageTemperatureUncertainty  96552 non-null   float64
 3   City                           100000 non-null  object 
 4   Country                        100000 non-null  object 
 5   Latitude                       100000 non-null  object 
 6   Longitude                      100000 non-null  object 
dtypes: float64(2), object(5)
memory usage: 5.3+ MB


In [8]:
worldtempe_df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1748-09-01,,,Belgorod,Russia,50.63N,36.76E
1,1748-10-01,,,Belgorod,Russia,50.63N,36.76E
2,1748-11-01,,,Belgorod,Russia,50.63N,36.76E
3,1748-12-01,,,Belgorod,Russia,50.63N,36.76E
4,1749-01-01,,,Belgorod,Russia,50.63N,36.76E


In [22]:
pd.Timestamp(worldtempe_df.dt)

TypeError: Cannot convert input [0        1748-09-01
1        1748-10-01
2        1748-11-01
3        1748-12-01
4        1749-01-01
            ...    
99995    1815-11-01
99996    1815-12-01
99997    1816-01-01
99998    1816-02-01
99999    1816-03-01
Name: dt, Length: 100000, dtype: object] of type <class 'pandas.core.series.Series'> to Timestamp

In [16]:
worldtempe_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   dt                             100000 non-null  object        
 1   AverageTemperature             96552 non-null   float64       
 2   AverageTemperatureUncertainty  96552 non-null   float64       
 3   City                           100000 non-null  object        
 4   Country                        100000 non-null  object        
 5   Latitude                       100000 non-null  object        
 6   Longitude                      100000 non-null  object        
 7   dt_converted                   100000 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(2), object(5)
memory usage: 6.1+ MB


In [21]:
worldtempe_df['dt_timestamp'] = pd.Timestamp(worldtempe_df.dt)

TypeError: Cannot convert input [0        1748-09-01
1        1748-10-01
2        1748-11-01
3        1748-12-01
4        1749-01-01
            ...    
99995    1815-11-01
99996    1815-12-01
99997    1816-01-01
99998    1816-02-01
99999    1816-03-01
Name: dt, Length: 100000, dtype: object] of type <class 'pandas.core.series.Series'> to Timestamp

In [10]:
worldtempe_df['Country'].value_counts().sort_values()

Papua New Guinea               1581
Oman                           1653
Djibouti                       1797
Eritrea                        1797
Swaziland                      1881
Namibia                        1881
Botswana                       1881
Lesotho                        1881
Congo                          1893
Central African Republic       1893
Costa Rica                     1953
Burundi                        1965
Rwanda                         1965
Liberia                        1977
Guinea Bissau                  1977
Mauritania                     1977
Bahrain                        2049
Qatar                          2049
Hong Kong                      2082
South Korea                    2097
Singapore                      2265
Cambodia                       2265
Guyana                         2277
Suriname                       2277
Mongolia                       2318
Laos                           2371
Azerbaijan                     2460
Jordan                      

In [12]:
worldtempe_df.shape

(8599212, 7)

### Cleaning Country

Filter out `Country` for single value `United States` and check dataframe size

In [13]:
worldtempe_df = worldtempe_df[worldtempe_df['Country']=='United States']

In [14]:
worldtempe_df['Country'].value_counts().sort_values()

United States    687289
Name: Country, dtype: int64

In [15]:
worldtempe_df.shape

(687289, 7)

### Cleaning columns with datetime datetype

In [16]:
worldtempe_df['dt_converted'] = pd.to_datetime(worldtempe_df.dt)

In [17]:
worldtempe_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 687289 entries, 47555 to 8439246
Data columns (total 8 columns):
dt                               687289 non-null object
AverageTemperature               661524 non-null float64
AverageTemperatureUncertainty    661524 non-null float64
City                             687289 non-null object
Country                          687289 non-null object
Latitude                         687289 non-null object
Longitude                        687289 non-null object
dt_converted                     687289 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(2), object(5)
memory usage: 47.2+ MB


In [19]:
worldtempe_df=worldtempe_df[worldtempe_df['dt_converted']>"1960-01-01"]

In [20]:
worldtempe_df.shape

(165508, 8)

In [21]:
worldtempe_df['dt_converted'].max()

Timestamp('2013-09-01 00:00:00')

In [22]:
worldtempe_df['dt_converted'].min()

Timestamp('1960-02-01 00:00:00')

In [23]:
worldtempe_df.isnull().sum()

dt                               0
AverageTemperature               1
AverageTemperatureUncertainty    1
City                             0
Country                          0
Latitude                         0
Longitude                        0
dt_converted                     0
dtype: int64

In [29]:
worldtempe_df[worldtempe_df['AverageTemperature'].isnull()]

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,dt_converted
287781,2013-09-01,,,Anchorage,United States,61.88N,151.13W,2013-09-01


*NULL* value doen't impact to join data ***i94 immigration*** on year 2016 only. Keep it as is.

### Cleaning combination `City` and `dt_converted`

In [31]:
worldtempe_df.nunique()

dt                                 644
AverageTemperature               34110
AverageTemperatureUncertainty      985
City                               248
Country                              1
Latitude                            15
Longitude                           95
dt_converted                       644
dtype: int64

In [37]:
worldtempe_df["City"] = worldtempe_df["City"].str.upper()

In [38]:
worldtempe_df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,dt_converted
49236,1960-02-01,4.995,0.325,ABILENE,United States,32.95N,100.53W,1960-02-01
49237,1960-03-01,8.575,0.303,ABILENE,United States,32.95N,100.53W,1960-03-01
49238,1960-04-01,18.452,0.282,ABILENE,United States,32.95N,100.53W,1960-04-01
49239,1960-05-01,21.709,0.286,ABILENE,United States,32.95N,100.53W,1960-05-01
49240,1960-06-01,27.714,0.387,ABILENE,United States,32.95N,100.53W,1960-06-01


In [49]:
worldtempe_df[worldtempe_df[['City','AverageTemperature']].duplicated()]

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,dt_converted
49499,1982-01-01,5.807,0.429,ABILENE,United States,32.95N,100.53W,1982-01-01
49552,1986-06-01,25.614,0.162,ABILENE,United States,32.95N,100.53W,1986-06-01
49641,1993-11-01,8.751,0.260,ABILENE,United States,32.95N,100.53W,1993-11-01
49678,1996-12-01,8.166,0.227,ABILENE,United States,32.95N,100.53W,1996-12-01
140079,1994-12-01,2.173,0.289,AKRON,United States,40.99N,80.95W,1994-12-01
140172,2002-09-01,20.116,0.122,AKRON,United States,40.99N,80.95W,2002-09-01
140273,2011-02-01,-1.359,0.274,AKRON,United States,40.99N,80.95W,2011-02-01
140274,2011-03-01,2.899,0.238,AKRON,United States,40.99N,80.95W,2011-03-01
170072,1986-06-01,20.277,0.342,ALBUQUERQUE,United States,34.56N,107.03W,1986-06-01
170094,1988-04-01,10.597,0.209,ALBUQUERQUE,United States,34.56N,107.03W,1988-04-01


In [55]:
worldtempe_df[worldtempe_df[['City','dt_converted']].duplicated()].tail()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,dt_converted
7148658,2013-05-01,14.309,0.331,SPRINGFIELD,United States,42.59N,72.00W,2013-05-01
7148659,2013-06-01,19.313,0.353,SPRINGFIELD,United States,42.59N,72.00W,2013-06-01
7148660,2013-07-01,23.629,0.447,SPRINGFIELD,United States,42.59N,72.00W,2013-07-01
7148661,2013-08-01,19.579,0.336,SPRINGFIELD,United States,42.59N,72.00W,2013-08-01
7148662,2013-09-01,15.883,1.368,SPRINGFIELD,United States,42.59N,72.00W,2013-09-01


In [48]:
worldtempe_df[(worldtempe_df['City'] == 'SPRINGFIELD') & (worldtempe_df.dt == '2013-08-01')]

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,dt_converted
7142183,2013-08-01,24.568,0.303,SPRINGFIELD,United States,37.78N,93.56W,2013-08-01
7145422,2013-08-01,23.645,0.378,SPRINGFIELD,United States,39.38N,89.48W,2013-08-01
7148661,2013-08-01,19.579,0.336,SPRINGFIELD,United States,42.59N,72.00W,2013-08-01


In [54]:
worldtempe_df[(worldtempe_df['City'] == 'YONKERS') & (worldtempe_df.dt == '2013-08-01')]

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,dt_converted
8439245,2013-08-01,21.001,0.323,YONKERS,United States,40.99N,74.56W,2013-08-01


In [53]:
worldtempe_df[(worldtempe_df['City'] == 'WINSTON SALEM') & (worldtempe_df.dt == '1970-02-01')]

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,dt_converted
8229698,1970-02-01,4.411,0.146,WINSTON SALEM,United States,36.17N,79.56W,1970-02-01


Seem like there are multi temperature sensor was distributed many points of a city. The temperature results of a city are difference not much.

In [42]:
worldtempe_df[worldtempe_df[['City','dt_converted','AverageTemperature']].duplicated()].tail()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,dt_converted
487934,1986-11-01,2.766,0.194,AURORA,United States,40.99N,87.34W,1986-11-01
7144796,1961-06-01,21.821,0.404,SPRINGFIELD,United States,39.38N,89.48W,1961-06-01


Unique values should be the combination of `'City','dt_converted','AverageTemperature'`

### Standalizing column names format

Convert column names to ***snake_case*** format. Format ex: *customer_name, billing_address, ...*

In [56]:
convert_column_names(worldtempe_df)
worldtempe_df.columns

Index(['dt', 'averagetemperature', 'averagetemperatureuncertainty', 'city',
       'country', 'latitude', 'longitude', 'dt_converted'],
      dtype='object')

In [57]:
staging_cols = ['dt', 'averagetemperature', 'averagetemperatureuncertainty', 'city', 'dt_converted','country']
worldtempe_df = worldtempe_df[staging_cols]
worldtempe_df.columns

Index(['dt', 'averagetemperature', 'averagetemperatureuncertainty', 'city',
       'dt_converted', 'country'],
      dtype='object')

In [58]:
worldtempe_df.head()

Unnamed: 0,dt,averagetemperature,averagetemperatureuncertainty,city,dt_converted,country
49236,1960-02-01,4.995,0.325,ABILENE,1960-02-01,United States
49237,1960-03-01,8.575,0.303,ABILENE,1960-03-01,United States
49238,1960-04-01,18.452,0.282,ABILENE,1960-04-01,United States
49239,1960-05-01,21.709,0.286,ABILENE,1960-05-01,United States
49240,1960-06-01,27.714,0.387,ABILENE,1960-06-01,United States


In [59]:
worldtempe_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 165508 entries, 49236 to 8439246
Data columns (total 6 columns):
dt                               165508 non-null object
averagetemperature               165507 non-null float64
averagetemperatureuncertainty    165507 non-null float64
city                             165508 non-null object
dt_converted                     165508 non-null datetime64[ns]
country                          165508 non-null object
dtypes: datetime64[ns](1), float64(2), object(3)
memory usage: 8.8+ MB


### Baseline dataframe World Teperature

In [60]:
worldtempe_df.to_csv('worldtempe_df_clean.csv', index=False, header=True)

In [62]:
# Verify
worldtempe_df = pd.read_csv('worldtempe_df_clean.csv',sep=",")
worldtempe_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165508 entries, 0 to 165507
Data columns (total 6 columns):
dt                               165508 non-null object
averagetemperature               165507 non-null float64
averagetemperatureuncertainty    165507 non-null float64
city                             165508 non-null object
dt_converted                     165508 non-null object
country                          165508 non-null object
dtypes: float64(2), object(4)
memory usage: 7.6+ MB


In [63]:
worldtempe_df.head()

Unnamed: 0,dt,averagetemperature,averagetemperatureuncertainty,city,dt_converted,country
0,1960-02-01,4.995,0.325,ABILENE,1960-02-01,United States
1,1960-03-01,8.575,0.303,ABILENE,1960-03-01,United States
2,1960-04-01,18.452,0.282,ABILENE,1960-04-01,United States
3,1960-05-01,21.709,0.286,ABILENE,1960-05-01,United States
4,1960-06-01,27.714,0.387,ABILENE,1960-06-01,United States


### Staging cleaned `WORLD TEMPERATURE` from saved csv partitions

In [None]:
# clear old dir
rmdir(Path("worldtempe_df_clean"))

In [65]:
worldtempe_df = spark.read.csv("worldtempe_df_clean.csv", header=True)

In [None]:
# Write dataframe to CSV partitions use Spark
# worldtempe_df.write.options(header="True", delimiter=",").csv("worldtempe_df_clean")
# worldtempe_df.write.mode('overwrite').csv("worldtempe_df_clean")
# worldtempe_df.write.options(header="True", delimiter=",").csv("worldtempe_df_clean")

# Read out from CSV file to spark dataframe
# worldtempe_df = spark.read.csv("worldtempe_df_clean.csv")
# worldtempe_df = spark.read.options(inferSchema="true", delimiter=",", header = "true").csv("worldtempe_df_clean")

In [66]:
# Verify loaded dataframe
worldtempe_df.show()

+----------+------------------+-----------------------------+-------+------------+-------------+
|        dt|averagetemperature|averagetemperatureuncertainty|   city|dt_converted|      country|
+----------+------------------+-----------------------------+-------+------------+-------------+
|1960-02-01|             4.995|                        0.325|ABILENE|  1960-02-01|United States|
|1960-03-01| 8.575000000000001|                        0.303|ABILENE|  1960-03-01|United States|
|1960-04-01|            18.452|                        0.282|ABILENE|  1960-04-01|United States|
|1960-05-01|            21.709|          0.28600000000000003|ABILENE|  1960-05-01|United States|
|1960-06-01|            27.714|                        0.387|ABILENE|  1960-06-01|United States|
|1960-07-01|            27.646|                        0.326|ABILENE|  1960-07-01|United States|
|1960-08-01|            27.481|                        0.341|ABILENE|  1960-08-01|United States|
|1960-09-01|            24.413

In [67]:
# Create table from dataframe
worldtempe_df.createOrReplaceTempView('worldtempe_table')

In [68]:
# Verify created table will be using for staging
spark.sql("""
    SELECT COUNT(*) as amount_worldtempe_rows
    FROM worldtempe_table
""").show()

+----------------------+
|amount_worldtempe_rows|
+----------------------+
|                165508|
+----------------------+

