In [2]:
import pandas as pd
import numpy as np
import re
import findspark
findspark.init('/home/dave/spark-2.4.1-bin-hadoop2.7/')
import pyspark
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql import *
import sys
import os

# Path to the raw SNOTEL report for station 518 (Heavenly Valley, CA).
datafile = "/home/dave/Documents/CS691/proj/data/snotel/SNOTEL_518_Heavenly_Valley_CA"

spark = SparkSession.builder.appName("proj_exp").getOrCreate()

# Read the report as raw text: one row per line, in a single string column
# named 'value'.
df = spark.read.text(datafile)

# Drop the non-data lines. rlike() is a regex *search*, so an unanchored
# pattern would also discard any data row that merely contains '#' or the
# substring 'Date' somewhere in it. Anchor both patterns to the start of the
# line (allowing leading whitespace) so only real comment/header lines go.
df = df.filter(~df.value.rlike(r'^\s*#'))      # '#'-prefixed comment lines
df = df.filter(~df.value.rlike(r'^\s*Date'))   # column-header line ("Date,Station Name,...")

# Break each raw comma-separated line into named columns. The report's fields,
# in order, are:
#   Date, Station Name, Station Id, Snow Water Equivalent (in),
#   Change In Snow Water Equivalent (in), Snow Depth (in),
#   Change In Snow Depth (in), Air Temperature Observed (degF),
#   Barometric Pressure (inch_Hg), Dew Point Temperature (degF),
#   Wind Direction Observed (degree), Wind Speed Observed (mph)
# Only the first eight fields are extracted here.
split_col = F.split(df['value'], ',')

# Output column name for field i of the split line (position == list index).
field_names = ['Date', 'Station Name', 'sid', 'swe', 'dswe', 'sd', 'dsd', 'at']
for field_idx, field_name in enumerate(field_names):
    df = df.withColumn(field_name, split_col.getItem(field_idx))

#df.show(n=20)

# Derive 'dat': the change in air temperature relative to the previous reading.
# The window has no partitioning columns, so it spans the whole frame, ordered
# by Date.
w = Window.partitionBy().orderBy("Date")
df = df.withColumn("prev_at", F.lag("at").over(w))

# Current minus previous temperature. The difference is null when either side
# is null (e.g. the first row, where lag() has nothing to return); those rows
# get a delta of 0.
at_change = F.col("at") - F.col("prev_at")
df = df.withColumn("dat", F.when(at_change.isNull(), 0).otherwise(at_change))

# Keep the working columns only; this drops the raw 'value' line and the
# 'prev_at' helper.
df = df.select('Date', 'Station Name', 'sid', 'swe', 'dswe', 'sd', 'dsd', 'at', 'dat')

# Starting frame for the compass rose: just the identifying columns.
cr = df.select('Date', 'Station Name', 'sid')

#make compass rose
#cr = df.select('Date', 'Station Name', 'sid')

#compass rose
# array index
#---0----------1---------2--------3----------4--------5---------6--------7--------
#---west---northwest---north---northeast---east---southeast---south---southwest---

def ones(x):
    """Return a new list of eight 1s (one slot per compass-rose direction).

    The argument ``x`` is ignored; the result is the same for every input.
    """
    return [1] * 8
    
# Expose ones() to Spark as a UDF that returns an array of integers.
ones_udf_int = F.udf(ones, ArrayType(IntegerType()))

# Add one all-ones array column per treeline band. 'sid' is passed only as a
# placeholder argument; ones() ignores its value.
for band in ('below_treeline', 'near_treeline', 'above_treeline'):
    cr = cr.withColumn(band, ones_udf_int('sid'))

# Inspect the result, then shut the Spark session down.
cr.printSchema()
cr.show(n=20)

spark.stop()


root
 |-- Date: string (nullable = true)
 |-- Station Name: string (nullable = true)
 |-- sid: string (nullable = true)
 |-- below_treeline: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- near_treeline: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- above_treeline: array (nullable = true)
 |    |-- element: integer (containsNull = true)

+----------------+---------------+---+--------------------+--------------------+--------------------+
|            Date|   Station Name|sid|      below_treeline|       near_treeline|      above_treeline|
+----------------+---------------+---+--------------------+--------------------+--------------------+
|2009-04-01 00:00|Heavenly Valley|518|[1, 1, 1, 1, 1, 1...|[1, 1, 1, 1, 1, 1...|[1, 1, 1, 1, 1, 1...|
|2009-04-01 01:00|Heavenly Valley|518|[1, 1, 1, 1, 1, 1...|[1, 1, 1, 1, 1, 1...|[1, 1, 1, 1, 1, 1...|
|2009-04-01 02:00|Heavenly Valley|518|[1, 1, 1, 1, 1, 1...|[1, 1, 1, 1, 1, 1...|[1, 1, 1

In [None]:
r