In [1]:
import re
import boto3

from pyspark.sql.types import DateType
from pyspark.sql.functions import udf, year, month
from pyspark.sql.types import *
from datetime import datetime

In [2]:
from pyspark.sql import SparkSession
# Build (or reuse) the Spark session: request the spark-sas7bdat package via
# spark.jars.packages and turn on Hive support.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")
    .enableHiveSupport()
    .getOrCreate()
)

In [3]:
def parse_datetime(x):
    """Parse a date string in one of the supported formats.

    The formats are tried in order: ``yyyy-MM-dd`` first, then ``dd-MM-yy``.

    Args:
        x: The raw date string. ``None`` and other non-string values are
            tolerated and treated as unparseable.

    Returns:
        A ``datetime`` for the first format that matches, or ``None`` when no
        format matches.
    """
    for date_format in ("%Y-%m-%d", "%d-%m-%y"):
        try:
            return datetime.strptime(x, date_format)
        except (ValueError, TypeError):
            # ValueError: the text does not fit this format.
            # TypeError: x is not a string (e.g. None from a null column).
            # Anything else (KeyboardInterrupt, SystemExit, ...) is deliberately
            # allowed to propagate instead of being swallowed.
            continue
    return None
# Spark UDF wrapper: runs parse_datetime on each value and declares the result as DateType.
udf_parse_datetime = udf(
    lambda raw_value: parse_datetime(raw_value),
    DateType(),
)

def map_country(city, lookup=None):
    """Return the numeric code whose label equals ``city``, ignoring case.

    Args:
        city: Name to look up. ``None`` (e.g. a null column value) yields
            ``None`` instead of raising.
        lookup: Optional ``{code: label}`` mapping to search. When omitted, the
            notebook-level ``valid_city`` dict is used; it is resolved at call
            time, so it may be defined in a later cell than this function.

    Returns:
        The key of the first entry (in the mapping's iteration order) whose
        label matches ``city`` case-insensitively, or ``None`` if none does.
    """
    if city is None:
        return None
    if lookup is None:
        lookup = valid_city
    # Lowercase the input once rather than on every iteration.
    wanted = city.lower()
    for code, label in lookup.items():
        if label.lower() == wanted:
            return code
    return None
# Spark UDF wrapper: runs map_country on each value and declares the result as StringType.
udf_map_country = udf(
    lambda name: map_country(name),
    StringType(),
)

In [4]:
# Path of the temperature CSV that is loaded into Spark.
filepath = "../../data2/GlobalLandTemperaturesByCity.csv"

# S3 bucket from which the SAS label description file is downloaded.
bucket_name = "yonglun-udacity-capstone"

# Object key of the label description file inside the bucket ...
sas_description_filekey = "raw/I94_SAS_Labels_Descriptions.SAS"
# ... and the local path it is downloaded to.
sas_description_filename = "/tmp/I94_SAS_Labels_Descriptions.SAS"

In [5]:
# Load
# Read the CSV at `filepath`, taking column names from its header row.
# Only the "header" option is set; no schema is supplied here.
raw_temp_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(filepath)
)

In [6]:
# Parse Data Labels
# S3 resource used to download the label description file.
# Credentials are deliberately NOT passed as arguments: boto3 resolves them
# through its default provider chain (environment variables such as
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY, the shared credentials file, or an
# attached IAM role). This keeps secrets out of the notebook source and its
# saved outputs.
s3 = boto3.resource("s3", region_name="us-west-2")

In [7]:
# Get Label Descriptions File
s3.Bucket(bucket_name).download_file(sas_description_filekey, sas_description_filename)

# Only the raw lines are needed; the file is closed before parsing starts.
with open(sas_description_filename) as header_file:
    lines = header_file.readlines()

# valid_city: Line 10 to 298 (1-based) of the label file
# valid_city len: 289
# The pattern captures the first run of digits as the code and a later
# single-quoted string as the label, optionally followed by " ;".
city_regex = re.compile(r'([0-9]+)(.*)(\'.*\')(\s\;)?')
valid_city = {}
for line_number, line in enumerate(lines[9:298], start=10):
    match_groups = city_regex.search(line)
    if match_groups is None:
        # Fail with context rather than an opaque AttributeError on None.
        raise ValueError(
            "Line {} of {} is not a \"<code> = '<label>'\" entry: {!r}".format(
                line_number, sas_description_filename, line
            )
        )
    # Key: integer code; value: label with the surrounding quotes removed.
    valid_city[int(match_groups.group(1))] = match_groups.group(3).strip('\'')

In [9]:
#from pprint import pprint
#print(len(valid_city))
#pprint(valid_city)

In [10]:
# Clean
# Keep only rows where both the temperature and its uncertainty are present.
has_temperature = raw_temp_df["AverageTemperature"].isNotNull()
has_uncertainty = raw_temp_df["AverageTemperatureUncertainty"].isNotNull()
cleaned_temp_df = raw_temp_df.filter(has_temperature).filter(has_uncertainty)

In [11]:
# Transform
# Parse the date column, attach the code looked up from the country name,
# rename columns to snake_case, derive month/year from the parsed date, and
# drop rows for which no code was found.
transformed_temp_df = (
    cleaned_temp_df
    .select(
        "dt",
        "AverageTemperature",
        "AverageTemperatureUncertainty",
        "City",
        "Country",
        "Latitude",
        "Longitude",
    )
    .withColumn("dt", udf_parse_datetime("dt"))
    .withColumnRenamed("AverageTemperature", "avg_temp")
    .withColumnRenamed("AverageTemperatureUncertainty", "avg_temp_uncertainty")
    # Refer to the column by its actual name at this point ("Country"), so the
    # lookup does not depend on Spark's case-insensitive column resolution.
    .withColumn("city_code", udf_map_country("Country"))
    .withColumnRenamed("City", "city")
    .withColumnRenamed("Country", "country")
    .withColumnRenamed("Latitude", "latitude")
    .withColumnRenamed("Longitude", "longitude")
    .withColumnRenamed("dt", "date_time")
    .withColumn("month", month("date_time"))
    .withColumn("year", year("date_time"))
    # Explicit NULL test. Comparing with the string 'null' only removes NULL
    # rows as a side effect of SQL three-valued logic.
    .filter("city_code IS NOT NULL")
)

In [12]:
# Rows whose year is strictly greater than 2000 (the year 2000 itself is
# excluded by this comparison), followed by a preview of the first 10 rows.
year2000onwards = transformed_temp_df.where(transformed_temp_df["year"] > 2000)
year2000onwards.show(10)

+----------+-------------------+--------------------+-----+-------+--------+---------+---------+-----+----+
| date_time|           avg_temp|avg_temp_uncertainty| city|country|latitude|longitude|city_code|month|year|
+----------+-------------------+--------------------+-----+-------+--------+---------+---------+-----+----+
|2001-01-01| 1.9180000000000001|               0.381|Århus|Denmark|  57.05N|   10.33E|      108|    1|2001|
|2001-02-01|0.24100000000000002| 0.32799999999999996|Århus|Denmark|  57.05N|   10.33E|      108|    2|2001|
|2001-03-01|               1.31| 0.23600000000000002|Århus|Denmark|  57.05N|   10.33E|      108|    3|2001|
|2001-04-01|               5.89|               0.158|Århus|Denmark|  57.05N|   10.33E|      108|    4|2001|
|2001-05-01| 12.015999999999998| 0.35100000000000003|Århus|Denmark|  57.05N|   10.33E|      108|    5|2001|
|2001-06-01|             13.944| 0.35200000000000004|Århus|Denmark|  57.05N|   10.33E|      108|    6|2001|
|2001-07-01| 18.453000000000

In [None]:
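# Write (currently disabled)
# NOTE: `s3_bucket_name` is not defined in any of the cells above (only
# `bucket_name` is); define it before re-enabling this cell.
# transformed_temp_df.write\
#     .partitionBy("year", "month")\
#     .mode("append")\
#     .parquet("{}/transformed/temperature/".format(s3_bucket_name))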