# Weather Enhanced MN Phenology Dataset
Combines phenology and weather data at the county level for the state of Minnesota.

In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import DoubleType, DateType

# Folder the raw phenology, FIPS and climate input files are read from.
INPUT_FOLDER = "data/"
# Folder the analytic tables are written to as Parquet.
OUTPUT_FOLDER = "out/"

Start spark session

In [2]:
# Reuse the active Spark session if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

## Phenology and County Codes
Done together since FIPS codes aid with phenology data cleanup

In [3]:
phenology_path = INPUT_FOLDER + "2018.4.6-MNPN-WEBSITE-master-dataset.csv"
fips_path = INPUT_FOLDER + "USDA-NRCS_FIPS_Codes.json"

# Phenology observations: CSV whose first row holds the column names.
mnphn_df = spark.read.csv(phenology_path, header=True)

# County FIPS codes: a JSON document spanning multiple lines; keep Minnesota only.
fips_df = spark.read.option("multiLine", True).json(fips_path)
mnfips_df = fips_df.where(fips_df.State == "MN")

### Data clean up for phenology data
- remove empty counties
- drop duplicates
- align multiple spellings
- correct abbreviations, etc...

In [4]:
# Keep only rows that name a county, then collapse exact duplicate rows.
rows_with_county = mnphn_df.filter("COUNTY is not NULL")
mnphn_df = rows_with_county.dropDuplicates()

#### Data Quality Check
Compare phenology data's counties to FIPS codes to determine invalid county names.

In [5]:
# Anti-join: keep phenology rows whose county has no case-insensitive match
# among the Minnesota FIPS names, then list the distinct offending names.
county_matches_fips = lower(mnphn_df.COUNTY) == lower(mnfips_df.Name)
unmatched_rows = mnphn_df.join(mnfips_df, county_matches_fips, "leftanti")
unmatched_rows.select("COUNTY").distinct().show()

+---------------+
|         COUNTY|
+---------------+
|        ASHLAND|
|        RAMSEY |
|       ST.LOUIS|
|             BR|
| AITKIN /ITASCA|
|      ST. LOUIS|
|  AITKIN/ITASCA|
|MORRISON COUNTY|
+---------------+



County Changes to make
- "ASHLAND" county is in WI, remove since it's out of scope for this project
- "RAMSEY " has an extra space to take out
- "ST.LOUIS" and "ST. LOUIS" should be "ST LOUIS"
- "BR" is probably "Brown", it's the only "Br*" county and there are no counties with "BR" as initials
- "MORRISON COUNTY" should just be "MORRISON"
- "AITKIN/ITASCA" & "AITKIN /ITASCA" could be either one, remove for sake of cleanliness

In [6]:
# Drop rows whose county is out of state or names two counties at once.
mnphn_df = mnphn_df.filter("COUNTY not in ('ASHLAND', 'AITKIN /ITASCA', 'AITKIN/ITASCA')")

# (value found in the data, spelling that matches the FIPS county name)
county_fixes = [
    ("RAMSEY ", "RAMSEY"),
    ("ST.LOUIS", "ST LOUIS"),
    ("ST. LOUIS", "ST LOUIS"),
    ("BR", "BROWN"),
    ("MORRISON COUNTY", "MORRISON"),
]

# Build one CASE expression: each known misspelling maps to its fix and
# every other value passes through unchanged.
raw_county = mnphn_df.COUNTY
first_misspelled, first_canonical = county_fixes[0]
fixed_county = when(raw_county == first_misspelled, first_canonical)
for misspelled, canonical in county_fixes[1:]:
    fixed_county = fixed_county.when(raw_county == misspelled, canonical)
fixed_county = fixed_county.otherwise(raw_county)

mnphn_df = mnphn_df.withColumn("COUNTY", fixed_county)

Check to confirm no more mismatches, count should be zero.

In [7]:
# Same anti-join as the earlier quality check, now counting the distinct
# county names that still fail to match a Minnesota FIPS name.
county_matches_fips = lower(mnphn_df.COUNTY) == lower(mnfips_df.Name)
remaining_mismatches = mnphn_df.join(mnfips_df, county_matches_fips, "leftanti")
remaining_mismatches.select("COUNTY").distinct().count()

0

In [8]:
# Preview five rows of the cleaned phenology data.
mnphn_df.limit(5).toPandas()

Unnamed: 0,YEAR,DAY,EVENT,SPECIES (COMMON NAME),GENUS,SPECIES,COUNTY,LIFEFORM,GROUP,DATASET,DAY OF YEAR,INVASIVE
0,1944,25-May,LAST FLOWER,APPLE,MALUS,DOMESTICA,RAMSEY,PLANTS,WOODY,3,146,
1,1946,24-Mar,FLOWERING,AMERICAN ELM,ULMUS,AMERICANA,RAMSEY,PLANTS,WOODY,3,83,
2,1947,29-May,FULL FLOWERING,LILAC,SYRINGA,VULGARIS,RAMSEY,PLANTS,WOODY,3,149,
3,1958,25-Apr,LAST FLOWER,LEATHERWOOD,DIRCA,PALUSTRIS,HENNEPIN,PLANTS,WOODY,12,115,
4,1960,10-Jul,FLOWERING,SILVERLEAF SCURF PEA,PEDIOMELUM,ARGOPHYLLUM,HENNEPIN,PLANTS,FORB,12,192,


## Climate Data
Precipitation and temperatures per county and month.

In [9]:
# Climate tables have a variable amount of spaces as their delimiters
# Pandas can handle this where pyspark can't, so use that first
climHeader = ["FIPS-YEAR", "JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL",
              "AUG", "SEP", "OCT", "NOV", "DEC"]
# The separator is a regular expression, so it is written as a raw string:
# "\s" in a normal string literal is an invalid escape sequence.
# NOTE(review): FIPS-YEAR is read as an integer, so codes that begin with 0
# lose their leading zero - confirm that is acceptable for non-MN use.
pcpn_pd = pd.read_csv(INPUT_FOLDER + "climdiv-pcpncy-v1.0.0-20211104",
                      sep=r"\s+",
                      names=climHeader)
pcpn_df = spark.createDataFrame(pcpn_pd)

#### Data Quality Check
For precipitation, the range of valid values is listed as 00.00 to 99.99. Check for values outside that range; "-9.99" indicates no reading, so it should be null.

In [10]:
# Show up to five rows where at least one monthly precipitation value is negative.
pcpn_df.filter("JAN < 0 OR \
            FEB < 0 OR \
            MAR < 0 OR \
            APR < 0 OR \
            MAY < 0 OR \
            JUN < 0 OR \
            JUL < 0 OR \
            AUG < 0 OR \
            SEP < 0 OR \
            OCT < 0 OR \
            NOV < 0 OR \
            DEC < 0").show(5)

+----------+----+----+-----+----+----+----+----+-----+----+----+-----+-----+
| FIPS-YEAR| JAN| FEB|  MAR| APR| MAY| JUN| JUL|  AUG| SEP| OCT|  NOV|  DEC|
+----------+----+----+-----+----+----+----+----+-----+----+----+-----+-----+
|1001012021|3.19|3.04| 9.03|4.49|3.87|7.26|6.51| 5.32|4.65|5.05|-9.99|-9.99|
|1003012021|2.87|4.43| 4.99|9.82| 5.2|9.94|8.81|10.02|7.88|4.81|-9.99|-9.99|
|1005012021|5.34|5.03| 7.31|6.71|2.45| 4.7|6.54|  7.7|2.42| 5.2|-9.99|-9.99|
|1007012021| 4.2|3.76| 6.34| 4.8|6.67|8.31|8.15| 5.78|3.77|4.99|-9.99|-9.99|
|1009012021|3.94|5.45|12.57| 3.5| 5.6|9.59|7.79| 5.93| 4.4|4.77|-9.99|-9.99|
+----------+----+----+-----+----+----+----+----+-----+----+----+-----+-----+
only showing top 5 rows



In [11]:
# Show up to five rows where at least one monthly precipitation value exceeds 99.99.
pcpn_df.filter("JAN > 99.99 OR \
            FEB > 99.99 OR \
            MAR > 99.99 OR \
            APR > 99.99 OR \
            MAY > 99.99 OR \
            JUN > 99.99 OR \
            JUL > 99.99 OR \
            AUG > 99.99 OR \
            SEP > 99.99 OR \
            OCT > 99.99 OR \
            NOV > 99.99 OR \
            DEC > 99.99").show(5)

+---------+---+---+---+---+---+---+---+---+---+---+---+---+
|FIPS-YEAR|JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC|
+---------+---+---+---+---+---+---+---+---+---+---+---+---+
+---------+---+---+---+---+---+---+---+---+---+---+---+---+



Convert missing readings to null

In [12]:
@udf(returnType=DoubleType())
def pcpnMiss2null(obsv):
    '''
    Turn a missing precipitation reading into null.

    obsv -- a single precipitation value, or None when the cell is already null

    Returns None when obsv is None or below 0, otherwise obsv unchanged.
    '''
    # Nulls arrive as None, and `None < 0.0` raises TypeError in Python 3.
    # Without this guard the cleanup fails as soon as a column already holds
    # a null, e.g. when the cell applying this UDF is run a second time.
    if obsv is None or obsv < 0.0:
        return None
    return obsv

In [13]:
# Run every monthly precipitation column through the missing-value UDF.
for month_name in ("JAN", "FEB", "MAR", "APR", "MAY", "JUN",
                   "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"):
    pcpn_df = pcpn_df.withColumn(month_name, pcpnMiss2null(col(month_name)))

In [14]:
# Print the column names and types of the precipitation table.
pcpn_df.printSchema()

root
 |-- FIPS-YEAR: long (nullable = true)
 |-- JAN: double (nullable = true)
 |-- FEB: double (nullable = true)
 |-- MAR: double (nullable = true)
 |-- APR: double (nullable = true)
 |-- MAY: double (nullable = true)
 |-- JUN: double (nullable = true)
 |-- JUL: double (nullable = true)
 |-- AUG: double (nullable = true)
 |-- SEP: double (nullable = true)
 |-- OCT: double (nullable = true)
 |-- NOV: double (nullable = true)
 |-- DEC: double (nullable = true)



In [15]:
# Preview five rows of the precipitation table.
pcpn_df.limit(5).toPandas()

Unnamed: 0,FIPS-YEAR,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC
0,1001011895,7.03,2.96,8.36,3.53,3.96,5.4,3.92,3.36,0.73,2.03,1.44,3.66
1,1001011896,5.86,5.42,5.54,3.98,3.77,6.24,4.38,2.57,0.82,1.66,2.89,1.94
2,1001011897,3.27,6.63,10.94,4.35,0.81,1.57,3.96,5.02,0.87,0.75,1.84,4.38
3,1001011898,2.33,2.07,2.6,4.56,0.54,3.13,5.8,6.02,1.51,3.21,6.66,3.91
4,1001011899,5.8,6.94,3.35,2.22,2.93,2.31,6.8,2.9,0.63,3.02,1.98,5.25


#### Data Quality Check
Essentially the same procedure applies to the temperature data, but valid readings range from -50.00 to 140.00 and -99.9 marks a missing value.

In [16]:
# Monthly maximum temperatures. The separator is a regular expression, so it
# is written as a raw string: "\s" in a normal string literal is an invalid
# escape sequence.
tmpMax_pd = pd.read_csv(INPUT_FOLDER + "climdiv-tmaxcy-v1.0.0-20211104",
                        sep=r"\s+",
                        names=climHeader)
tmpMax_df = spark.createDataFrame(tmpMax_pd)

In [17]:
# Monthly minimum temperatures. The separator is a regular expression, so it
# is written as a raw string: "\s" in a normal string literal is an invalid
# escape sequence.
tmpMin_pd = pd.read_csv(INPUT_FOLDER + "climdiv-tmincy-v1.0.0-20211104",
                        sep=r"\s+",
                        names=climHeader)
tmpMin_df = spark.createDataFrame(tmpMin_pd)

In [18]:
# Monthly average temperatures. The separator is a regular expression, so it
# is written as a raw string: "\s" in a normal string literal is an invalid
# escape sequence.
tmpAvg_pd = pd.read_csv(INPUT_FOLDER + "climdiv-tmpccy-v1.0.0-20211104",
                        sep=r"\s+",
                        names=climHeader)
tmpAvg_df = spark.createDataFrame(tmpAvg_pd)

In [19]:
# SQL predicate that is true when any monthly value is below -50.0,
# the lower bound used for valid temperature readings.
tmpCheckLow = "JAN < -50.0 OR \
            FEB < -50.0 OR \
            MAR < -50.0 OR \
            APR < -50.0 OR \
            MAY < -50.0 OR \
            JUN < -50.0 OR \
            JUL < -50.0 OR \
            AUG < -50.0 OR \
            SEP < -50.0 OR \
            OCT < -50.0 OR \
            NOV < -50.0 OR \
            DEC < -50.0"

In [20]:
# SQL predicate that is true when any monthly value is above 140.0,
# the upper bound used for valid temperature readings.
tmpCheckHigh = "JAN > 140.0 OR \
            FEB > 140.0 OR \
            MAR > 140.0 OR \
            APR > 140.0 OR \
            MAY > 140.0 OR \
            JUN > 140.0 OR \
            JUL > 140.0 OR \
            AUG > 140.0 OR \
            SEP > 140.0 OR \
            OCT > 140.0 OR \
            NOV > 140.0 OR \
            DEC > 140.0"

In [21]:
# Show up to five maximum-temperature rows with a value below the valid range.
tmpMax_df.filter(tmpCheckLow).show(5)

+----------+----+----+----+----+----+----+----+----+----+----+-----+-----+
| FIPS-YEAR| JAN| FEB| MAR| APR| MAY| JUN| JUL| AUG| SEP| OCT|  NOV|  DEC|
+----------+----+----+----+----+----+----+----+----+----+----+-----+-----+
|1001272021|58.0|60.8|73.1|76.0|83.0|88.2|89.5|90.8|84.9|78.7|-99.9|-99.9|
|1003272021|61.9|61.4|73.5|75.8|82.5|88.1|89.4|90.5|85.5|81.8|-99.9|-99.9|
|1005272021|59.0|62.2|73.2|76.0|82.9|87.6|89.1|90.2|85.2|78.6|-99.9|-99.9|
|1007272021|55.4|57.8|71.3|73.9|80.7|86.2|88.0|89.2|82.6|77.7|-99.9|-99.9|
|1009272021|52.1|54.7|68.2|72.0|78.3|84.4|86.8|87.0|81.4|75.4|-99.9|-99.9|
+----------+----+----+----+----+----+----+----+----+----+----+-----+-----+
only showing top 5 rows



In [22]:
# Show up to five maximum-temperature rows with a value above the valid range.
tmpMax_df.filter(tmpCheckHigh).show(5)

+---------+---+---+---+---+---+---+---+---+---+---+---+---+
|FIPS-YEAR|JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC|
+---------+---+---+---+---+---+---+---+---+---+---+---+---+
+---------+---+---+---+---+---+---+---+---+---+---+---+---+



In [23]:
# Show up to five minimum-temperature rows with a value below the valid range.
tmpMin_df.filter(tmpCheckLow).show(5)

+----------+----+----+----+----+----+----+----+----+----+----+-----+-----+
| FIPS-YEAR| JAN| FEB| MAR| APR| MAY| JUN| JUL| AUG| SEP| OCT|  NOV|  DEC|
+----------+----+----+----+----+----+----+----+----+----+----+-----+-----+
|1001282021|38.4|37.6|49.4|49.7|59.0|68.0|71.0|72.0|65.3|57.6|-99.9|-99.9|
|1003282021|42.5|41.0|53.1|53.5|62.6|71.0|73.4|73.1|68.4|61.2|-99.9|-99.9|
|1005282021|38.9|39.6|49.7|50.4|57.4|67.6|69.3|69.9|63.9|57.6|-99.9|-99.9|
|1007282021|35.1|34.3|46.3|47.3|56.8|65.5|69.2|70.4|63.7|55.7|-99.9|-99.9|
|1009282021|35.1|34.2|47.1|48.3|57.4|65.8|68.8|70.0|63.0|56.0|-99.9|-99.9|
+----------+----+----+----+----+----+----+----+----+----+----+-----+-----+
only showing top 5 rows



In [24]:
# Show up to five minimum-temperature rows with a value above the valid range.
tmpMin_df.filter(tmpCheckHigh).show(5)

+---------+---+---+---+---+---+---+---+---+---+---+---+---+
|FIPS-YEAR|JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC|
+---------+---+---+---+---+---+---+---+---+---+---+---+---+
+---------+---+---+---+---+---+---+---+---+---+---+---+---+



In [25]:
# Show up to five average-temperature rows with a value below the valid range.
tmpAvg_df.filter(tmpCheckLow).show(5)

+----------+----+----+----+----+----+----+----+----+----+----+-----+-----+
| FIPS-YEAR| JAN| FEB| MAR| APR| MAY| JUN| JUL| AUG| SEP| OCT|  NOV|  DEC|
+----------+----+----+----+----+----+----+----+----+----+----+-----+-----+
|1001022021|48.2|49.2|61.2|62.8|71.0|78.1|80.2|81.4|75.1|68.2|-99.9|-99.9|
|1003022021|52.2|51.2|63.3|64.6|72.6|79.5|81.4|81.8|77.0|71.5|-99.9|-99.9|
|1005022021|48.9|50.9|61.4|63.2|70.2|77.6|79.2|80.1|74.6|68.1|-99.9|-99.9|
|1007022021|45.2|46.1|58.8|60.6|68.8|75.9|78.6|79.8|73.2|66.7|-99.9|-99.9|
|1009022021|43.6|44.4|57.7|60.2|67.8|75.1|77.8|78.5|72.2|65.7|-99.9|-99.9|
+----------+----+----+----+----+----+----+----+----+----+----+-----+-----+
only showing top 5 rows



In [26]:
# Show up to five average-temperature rows with a value above the valid range.
tmpAvg_df.filter(tmpCheckHigh).show(5)

+---------+---+---+---+---+---+---+---+---+---+---+---+---+
|FIPS-YEAR|JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC|
+---------+---+---+---+---+---+---+---+---+---+---+---+---+
+---------+---+---+---+---+---+---+---+---+---+---+---+---+



In [27]:
@udf(returnType=DoubleType())
def tmpMiss2null(obsv):
    '''
    Turn a missing temperature reading into null.

    obsv -- a single temperature value, or None when the cell is already null

    Returns None when obsv is None or below -50, otherwise obsv unchanged.
    '''
    # Nulls arrive as None, and `None < -50.0` raises TypeError in Python 3.
    # Without this guard the cleanup fails as soon as a column already holds
    # a null, e.g. when a cell applying this UDF is run a second time.
    if obsv is None or obsv < -50.0:
        return None
    return obsv

In [28]:
# Run every monthly maximum-temperature column through the missing-value UDF.
for month_name in ("JAN", "FEB", "MAR", "APR", "MAY", "JUN",
                   "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"):
    tmpMax_df = tmpMax_df.withColumn(month_name, tmpMiss2null(col(month_name)))

In [29]:
# Run every monthly minimum-temperature column through the missing-value UDF.
for month_name in ("JAN", "FEB", "MAR", "APR", "MAY", "JUN",
                   "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"):
    tmpMin_df = tmpMin_df.withColumn(month_name, tmpMiss2null(col(month_name)))

In [30]:
# Run every monthly average-temperature column through the missing-value UDF.
for month_name in ("JAN", "FEB", "MAR", "APR", "MAY", "JUN",
                   "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"):
    tmpAvg_df = tmpAvg_df.withColumn(month_name, tmpMiss2null(col(month_name)))

In [31]:
# Print the column names and types of the maximum-temperature table.
tmpMax_df.printSchema()

root
 |-- FIPS-YEAR: long (nullable = true)
 |-- JAN: double (nullable = true)
 |-- FEB: double (nullable = true)
 |-- MAR: double (nullable = true)
 |-- APR: double (nullable = true)
 |-- MAY: double (nullable = true)
 |-- JUN: double (nullable = true)
 |-- JUL: double (nullable = true)
 |-- AUG: double (nullable = true)
 |-- SEP: double (nullable = true)
 |-- OCT: double (nullable = true)
 |-- NOV: double (nullable = true)
 |-- DEC: double (nullable = true)



In [32]:
# Preview five rows of the maximum-temperature table.
tmpMax_df.limit(5).toPandas()

Unnamed: 0,FIPS-YEAR,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC
0,1001271895,53.7,48.7,67.6,76.4,81.9,89.2,91.1,90.4,90.9,76.0,66.6,58.0
1,1001271896,54.2,60.8,65.3,81.6,88.5,88.2,92.0,94.5,90.8,77.2,69.9,58.7
2,1001271897,54.2,63.1,71.4,75.1,83.2,95.6,93.3,89.9,88.9,81.3,68.1,58.8
3,1001271898,60.6,59.1,71.0,72.0,89.5,93.9,91.5,88.8,86.7,73.6,61.7,55.7
4,1001271899,55.6,53.4,68.8,73.4,89.3,93.7,92.2,92.6,87.5,78.4,68.1,56.6


In [33]:
# Print the column names and types of the minimum-temperature table.
tmpMin_df.printSchema()

root
 |-- FIPS-YEAR: long (nullable = true)
 |-- JAN: double (nullable = true)
 |-- FEB: double (nullable = true)
 |-- MAR: double (nullable = true)
 |-- APR: double (nullable = true)
 |-- MAY: double (nullable = true)
 |-- JUN: double (nullable = true)
 |-- JUL: double (nullable = true)
 |-- AUG: double (nullable = true)
 |-- SEP: double (nullable = true)
 |-- OCT: double (nullable = true)
 |-- NOV: double (nullable = true)
 |-- DEC: double (nullable = true)



In [34]:
# Preview five rows of the minimum-temperature table.
tmpMin_df.limit(5).toPandas()

Unnamed: 0,FIPS-YEAR,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC
0,1001281895,34.2,27.7,43.4,51.8,59.3,67.4,69.7,70.3,67.1,46.9,42.1,32.5
1,1001281896,34.4,37.2,42.6,57.0,65.0,67.9,71.4,71.7,65.0,52.2,46.1,35.9
2,1001281897,33.2,41.5,51.2,50.9,56.8,69.2,71.4,69.3,64.4,53.4,41.7,37.7
3,1001281898,39.6,34.4,49.1,47.1,60.4,69.1,70.2,69.6,65.7,50.6,38.6,32.7
4,1001281899,33.6,29.6,44.3,51.3,64.1,68.4,69.9,70.4,61.1,54.8,43.2,34.1


In [35]:
# Print the column names and types of the average-temperature table.
tmpAvg_df.printSchema()

root
 |-- FIPS-YEAR: long (nullable = true)
 |-- JAN: double (nullable = true)
 |-- FEB: double (nullable = true)
 |-- MAR: double (nullable = true)
 |-- APR: double (nullable = true)
 |-- MAY: double (nullable = true)
 |-- JUN: double (nullable = true)
 |-- JUL: double (nullable = true)
 |-- AUG: double (nullable = true)
 |-- SEP: double (nullable = true)
 |-- OCT: double (nullable = true)
 |-- NOV: double (nullable = true)
 |-- DEC: double (nullable = true)



In [36]:
# Preview five rows of the average-temperature table.
tmpAvg_df.limit(5).toPandas()

Unnamed: 0,FIPS-YEAR,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC
0,1001021895,44.0,38.2,55.5,64.1,70.6,78.3,80.4,80.4,79.0,61.4,54.4,45.3
1,1001021896,44.3,49.0,54.0,69.3,76.8,78.0,81.7,83.1,77.9,64.7,58.0,47.3
2,1001021897,43.7,52.3,61.3,63.0,70.0,82.4,82.4,79.6,76.6,67.4,54.9,48.2
3,1001021898,50.1,46.8,60.1,59.6,75.0,81.5,80.8,79.2,76.2,62.1,50.2,44.2
4,1001021899,44.6,41.5,56.6,62.3,76.7,81.0,81.0,81.5,74.3,66.6,55.7,45.3


## Create Analytic Tables
Star schema with _Observations_ as the fact table and the following dimension tables:
_Biological_, _County_, _Climate_, _Time_
### Observations (fact table)
- date
- species
- FIPS
- event

Join in the county data to the phenology set, so we can select it back out.

In [37]:
# Inner-join the FIPS columns onto each observation, matching county names
# case-insensitively.
same_county = lower(mnphn_df.COUNTY) == lower(mnfips_df.Name)
mnphn_df = mnphn_df.join(mnfips_df, on=same_county, how="inner")

In [38]:
# YEAR and DAY are joined with "-" and parsed with the pattern
# year-day-month abbreviation, then reduced to a plain date.
year_and_day = concat_ws("-", col("YEAR"), col("DAY"))
epoch_seconds = unix_timestamp(year_and_day, 'yyyy-dd-MMM')
observed_on = from_unixtime(epoch_seconds).cast("date")

# Fact table: one row per observation.
obsv_tbl = mnphn_df.select(
    observed_on.alias("date"),
    col("SPECIES").alias("species"),
    col("FIPS").alias("county"),
    col("EVENT").alias("event"),
)
obsv_tbl.printSchema()

root
 |-- date: date (nullable = true)
 |-- species: string (nullable = true)
 |-- county: string (nullable = true)
 |-- event: string (nullable = true)



In [39]:
# Preview five rows of the observations fact table.
obsv_tbl.limit(5).toPandas()

Unnamed: 0,date,species,county,event
0,1944-05-25,DOMESTICA,27123,LAST FLOWER
1,1946-03-24,AMERICANA,27123,FLOWERING
2,1947-05-29,VULGARIS,27123,FULL FLOWERING
3,1958-04-25,PALUSTRIS,27053,LAST FLOWER
4,1960-07-10,ARGOPHYLLUM,27053,FLOWERING


### Biological
- species
- common name
- genus
- lifeform
- group
- invasive


In [40]:
# Source column -> name used in the Biological dimension table.
bio_columns = {
    "SPECIES": "species",
    "SPECIES (COMMON NAME)": "common_name",
    "GENUS ": "genus",  # the source column name used here ends with a space
    "LIFEFORM": "lifeform",
    "GROUP": "group",
    "INVASIVE": "mn_invasive",
}

# One row per distinct combination of the biological attributes.
bio_tbl = mnphn_df.select(
    [col(source_name).alias(table_name) for source_name, table_name in bio_columns.items()]
).distinct()
bio_tbl.printSchema()

root
 |-- species: string (nullable = true)
 |-- common_name: string (nullable = true)
 |-- genus: string (nullable = true)
 |-- lifeform: string (nullable = true)
 |-- group: string (nullable = true)
 |-- mn_invasive: string (nullable = true)



In [41]:
# Preview five rows of the Biological dimension table.
bio_tbl.limit(5).toPandas()

Unnamed: 0,species,common_name,genus,lifeform,group,mn_invasive
0,SP,BLUE EYED GRASS,SISYRINCHIUM,PLANTS,FORB,
1,ALLEGHANIENSIS,YELLOW BIRCH,BETULA,PLANTS,WOODY,
2,FULVA,ORANGE DAY-LILY,HEMEROCALLIS,PLANTS,FORB,
3,REFLEXA,SPIDERWORT,TRADESCANTIA,PLANTS,FORB,
4,CAMPESTRIS,SACHEM,ATALOPEDES,ANIMALS,BUTTERFLIES,


### County
- name
- FIPS
- state

In [42]:
# County dimension table: name, FIPS code and state of each Minnesota county.
county_name = col("Name").alias("name")
county_fips = col("FIPS")
county_state = col("State").alias("state")

cnty_tbl = mnfips_df.select(county_name, county_fips, county_state)
cnty_tbl.printSchema()

root
 |-- name: string (nullable = true)
 |-- FIPS: string (nullable = true)
 |-- state: string (nullable = true)



In [43]:
# Preview five rows of the County dimension table.
cnty_tbl.limit(5).toPandas()

Unnamed: 0,name,FIPS,state
0,Aitkin,27001,MN
1,Anoka,27003,MN
2,Becker,27005,MN
3,Beltrami,27007,MN
4,Benton,27009,MN


### Climate
- FIPS
- year
- month
- Tmin
- Tmax
- Tavg
- Pcpn

FIPS gets a little tricky here because NOAA uses different state codes than everyone else. For this MN-restricted set it's easy to go from 21 => 27, but a national set would need more care.
Also, the climate data is monthly, so there's no day involved.

In [44]:
# filter to minnesota and change FIPS code
# Rows whose FIPS-YEAR starts with 21 are kept; adding 6000000000 turns that
# leading 21 into 27, and the result is stored as a string.
starts_with_21 = col("FIPS-YEAR").like('21%')
shifted_fips_year = (col("FIPS-YEAR").cast('bigint') + 6000000000).cast('string')

pcpn_tbl, tmpMin_tbl, tmpMax_tbl, tmpAvg_tbl = [
    national_df.filter(starts_with_21).withColumn("FIPS-YEAR", shifted_fips_year)
    for national_df in (pcpn_df, tmpMin_df, tmpMax_df, tmpAvg_df)
]

In [45]:
def convertClim(df, dataCol):
    '''
    Reshape a wide climate table into one row per FIPS code, year and month.

    df      -- DataFrame with a "FIPS-YEAR" column and the twelve month
               columns "JAN" through "DEC"
    dataCol -- name of the output column that receives the monthly reading

    Returns a DataFrame with the columns FIPS, year, month and dataCol, where
    month is the two-digit label '01'..'12'. FIPS is characters 1-5 of
    FIPS-YEAR and year is characters 8-11.
    Thanks to Doug on this post for the method: https://stackoverflow.com/a/64130519
    '''
    month_columns = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN",
                     "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]

    # Split the combined key into its FIPS and year parts.
    fips_year = col("FIPS-YEAR")
    df = df.withColumn("FIPS", fips_year.substr(1, 5)) \
           .withColumn("YEAR", fips_year.substr(8, 4)) \
           .drop("FIPS-YEAR")

    # Pair each reading with its month label, e.g. ['01', JAN], ['02', FEB], ...
    labelled_readings = [
        array(lit('%02d' % month_number), col(month_column))
        for month_number, month_column in enumerate(month_columns, start=1)
    ]

    # Exploding the array of pairs yields twelve rows per input row.
    df = df.withColumn('months', explode(array(*labelled_readings)))

    # Pull the label and the reading back out of each pair.
    df = df.withColumn('month', col('months')[0]) \
           .withColumn(dataCol, col('months')[1])
    return df.select("FIPS", col("YEAR").alias("year"), "month", dataCol)

In [46]:
# run the convert function on our four climate tables
# Each wide table is replaced by its long form; the string names the value column.
pcpn_tbl, tmpMin_tbl, tmpMax_tbl, tmpAvg_tbl = [
    convertClim(wide_tbl, value_name)
    for wide_tbl, value_name in (
        (pcpn_tbl, 'pcpn'),
        (tmpMin_tbl, 'tmpMin'),
        (tmpMax_tbl, 'tmpMax'),
        (tmpAvg_tbl, 'tmpAvg'),
    )
]

In [47]:
# Combine all four climate tables
# Start from precipitation and inner-join the temperature tables one by one
# on the shared county/year/month key.
climate_key = ["FIPS", "year", "month"]
clim_tbl = pcpn_tbl
for other_tbl in (tmpMin_tbl, tmpMax_tbl, tmpAvg_tbl):
    clim_tbl = clim_tbl.join(other_tbl, climate_key)

In [48]:
#change data types
# Column -> type it is cast to; FIPS is left as it is.
climate_types = {
    "year": 'int',
    "month": 'int',
    "pcpn": 'double',
    "tmpMin": 'double',
    "tmpMax": 'double',
    "tmpAvg": 'double',
}
for column_name, type_name in climate_types.items():
    clim_tbl = clim_tbl.withColumn(column_name, col(column_name).cast(type_name))
clim_tbl.printSchema()

root
 |-- FIPS: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- pcpn: double (nullable = true)
 |-- tmpMin: double (nullable = true)
 |-- tmpMax: double (nullable = true)
 |-- tmpAvg: double (nullable = true)



#### Data Quality Check
Ensure the joining was successful.
Row counts for all the tables should be the same; also look at a single entry to spot-check validity.

In [49]:
# Print the row count of the combined table followed by each source table.
counted_tables = (clim_tbl, tmpAvg_tbl, tmpMin_tbl, tmpMax_tbl, pcpn_tbl)
print(*[counted_tbl.count() for counted_tbl in counted_tables])

132588 132588 132588 132588 132588


In [50]:
# Show the same county/year/month entry from the combined table and from
# each source table so the values can be compared by eye.
spot_check = "FIPS == 27053 AND year == 1982 AND month == 01"
for checked_tbl in (clim_tbl, pcpn_tbl, tmpMin_tbl, tmpMax_tbl, tmpAvg_tbl):
    checked_tbl.where(spot_check).show()

+-----+----+-----+----+------+------+------+
| FIPS|year|month|pcpn|tmpMin|tmpMax|tmpAvg|
+-----+----+-----+----+------+------+------+
|27053|1982|    1| 1.7|  -9.6|   9.7|   0.1|
+-----+----+-----+----+------+------+------+

+-----+----+-----+----+
| FIPS|year|month|pcpn|
+-----+----+-----+----+
|27053|1982|   01| 1.7|
+-----+----+-----+----+

+-----+----+-----+------+
| FIPS|year|month|tmpMin|
+-----+----+-----+------+
|27053|1982|   01|  -9.6|
+-----+----+-----+------+

+-----+----+-----+------+
| FIPS|year|month|tmpMax|
+-----+----+-----+------+
|27053|1982|   01|   9.7|
+-----+----+-----+------+

+-----+----+-----+------+
| FIPS|year|month|tmpAvg|
+-----+----+-----+------+
|27053|1982|   01|   0.1|
+-----+----+-----+------+



### Time Table
The most specific time item is date from observations, so go from there.
- date
- day
- week
- month
- year
- weekday

In [51]:
# Time dimension table: one row per distinct observation date, with the
# calendar parts derived from it.
observed_date = col("date")
time_tbl = (
    obsv_tbl.select('date').distinct()
    .select(
        observed_date,
        dayofmonth(observed_date).alias("day"),
        weekofyear(observed_date).alias("week"),
        month(observed_date).alias("month"),
        year(observed_date).alias("year"),
        date_format(observed_date, "E").alias("weekday"),
    )
)

time_tbl.printSchema()

root
 |-- date: date (nullable = true)
 |-- day: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- weekday: string (nullable = true)



In [52]:
# Preview five rows of the Time dimension table.
time_tbl.limit(5).toPandas()

Unnamed: 0,date,day,week,month,year,weekday
0,2005-06-06,6,23,6,2005,Mon
1,2006-05-17,17,20,5,2006,Wed
2,1987-09-15,15,38,9,1987,Tue
3,2009-07-25,25,30,7,2009,Sat
4,1991-03-26,26,13,3,1991,Tue


## Write out Analysis Tables to Parquet files
This will fail on the Udacity workspace due to disk space constraints.

In [54]:
# (table, partition columns, output file name); an empty list means the
# table is written without partitioning.
parquet_outputs = [
    (obsv_tbl, ["date", "county"], "obsv.parquet"),
    (bio_tbl, [], "bio.parquet"),
    (cnty_tbl, ["state"], "cnty.parquet"),
    (clim_tbl, ["year", "FIPS"], "clim.parquet"),
    (time_tbl, ["year"], "time.parquet"),
]

# Write each table under OUTPUT_FOLDER, replacing any earlier output.
# NOTE(review): partitioning by date+county and by year+FIPS presumably
# produces a very large number of small files - confirm this is intended.
for table, partition_cols, file_name in parquet_outputs:
    writer = table.write
    if partition_cols:
        writer = writer.partitionBy(*partition_cols)
    writer.mode("overwrite").parquet(OUTPUT_FOLDER + file_name)