In [85]:
import pandas as pd
import numpy as np
import pyarrow as pa
import re
import findspark
findspark.init('/home/dave/spark-2.4.1-bin-hadoop2.7/')
import pyspark
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql import *
import sys
import os
from IPython.display import display, HTML

# Raw SNOTEL export to load.
datafile = "/home/dave/Documents/CS691/proj/data/snotel/SNOTEL_518_Heavenly_Valley_CA"
spark = SparkSession.builder.appName("proj_exp").getOrCreate()

# Read the file one line per row, then discard every line that contains
# '#' or 'Date' (presumably the comment preamble and the header row).
df = (
    spark.read.text(datafile)
    .filter(~F.col('value').rlike('#'))
    .filter(~F.col('value').rlike('Date'))
)

# Field order within each comma-separated line:
#   0 Date, 1 Station Name, 2 Station Id, 3 Snow Water Equivalent (in),
#   4 Change In Snow Water Equivalent (in), 5 Snow Depth (in),
#   6 Change In Snow Depth (in), 7 Air Temperature Observed (degF),
#   8 Barometric Pressure (inch_Hg), 9 Dew Point Temperature (degF),
#   10 Wind Direction Observed (degree), 11 Wind Speed Observed (mph)
# Only fields 0-7 are extracted; every extracted value is still a string.
split_col = F.split(df['value'], ',')
df = df.select(
    '*',
    *[
        split_col.getItem(idx).alias(label)
        for idx, label in enumerate(
            ['Date', 'Station Name', 'sid', 'swe', 'dswe', 'sd', 'dsd', 'at']
        )
    ]
)

# Delta air temperature: difference between each row's 'at' and the 'at' of
# the preceding row when ordered by Date. The window has no partitioning
# columns, so the ordering spans the whole data set.
w = Window.partitionBy().orderBy(F.col("Date"))
df = df.withColumn("prev_at", F.lag("at").over(w))
at_delta = df.at - df.prev_at
# A null difference (e.g. the first row, which has no predecessor) becomes 0.
df = df.withColumn("dat", F.when(F.isnull(at_delta), 0).otherwise(at_delta))

# Columns carried forward after 'Date'; the raw line and prev_at are dropped.
measure_cols = ['Station Name', 'sid', 'swe', 'dswe', 'sd', 'dsd', 'at', 'dat']
df = df.select('Date', *measure_cols)
cr = df.select('Date', 'sid')

# Split the 'Date' field on the space into its date part and its time part.
split_col = F.split(df['Date'], ' ')
df = df.select(
    'Date',
    split_col.getItem(0).alias('date_only'),
    split_col.getItem(1).alias('time_only'),
    *measure_cols
)

#make compass rose
#not currently used - kept for future use
#compass rose
# array index
#---0----------1---------2--------3----------4--------5---------6--------7--------
#---west---northwest---north---northeast---east---southeast---south---southwest---

def ones(x):
    """Return a fresh eight-element list of ones; the argument is ignored."""
    return [1] * 8
    
# Spark UDF around ones(); the result column is an array of integers.
ones_udf_array = F.udf(
    lambda value: ones(value),
    returnType=ArrayType(IntegerType()),
)
        
#cr = cr.select('*', ones_udf_array('sid').alias('below_treeline'))
#cr = cr.select('*', ones_udf_array('sid').alias('near_treeline'))
#cr = cr.select('*', ones_udf_array('sid').alias('above_treeline'))

def sd(x):
    """Return a fresh list of eight ones regardless of the argument."""
    return [1 for _ in range(8)]
    
# Spark UDF around sd(); the result column is an array of integers.
sd_udf_array = F.udf(
    lambda value: sd(value),
    returnType=ArrayType(IntegerType()),
)
#cr = cr.select('*', ones_udf_array('sid').alias('below_treeline'))

#join input and output tables for processing (remove dup cols)
# 'sid' is removed from df first so it arrives only once, from cr, in the join.
df = df.drop('sid')
sc = df.join(cr, on=["Date"])

#pandas udf processing

# Keep only the grouping key plus the UDF input columns; the schema of that
# projection is what the grouped-map UDF is declared to return.
x_columns = ['time_only', 'dsd']
sc = sc.select("date_only", *x_columns)
schema = sc.schema

def snowfall(x):
    """Bucket a snow-depth change into a snowfall class label.

    Args:
        x: The change in snow depth; anything ``int()`` accepts. The
            pipeline passes the string values of the ``dsd`` column.

    Returns:
        "2" when the change is at least 4, "1" when it is at least 2, and
        "0" otherwise. Values that cannot be converted to an integer
        (None, empty or non-numeric strings, NaN, infinity) also give "0".
    """
    try:
        change = int(x)
    except (TypeError, ValueError, OverflowError):
        # Unparseable readings count as "no snowfall" instead of failing the
        # whole group. Only conversion errors are caught, so interrupts and
        # unrelated bugs still surface.
        return "0"
    if change >= 4:
        return "2"
    if change >= 2:
        return "1"
    return "0"
# pandas_udf is reached through F: none of this file's imports bind the bare
# name, so the unqualified decorator fails on a fresh kernel.
@F.pandas_udf(schema, F.PandasUDFType.GROUPED_MAP)
def dsd_calc_udf(pdf):
    """Replace each group's ``dsd`` values with their snowfall class label.

    Grouped-map pandas UDF: input and output are both a pandas.DataFrame,
    and the output is declared to follow ``schema``.

    Args:
        pdf: pandas.DataFrame holding the rows of one group.

    Returns:
        The same DataFrame object, with its ``dsd`` column mapped through
        ``snowfall`` (the frame is modified in place).
    """
    pdf["dsd"] = pdf["dsd"].apply(snowfall)
    return pdf

# Preview the first three rows of the UDF input.
sc.show(n = 3)
# Bring a small sample (at most 24 rows) to the driver as a pandas DataFrame
# so the UDF body can be tried locally.
test_data = sc.select('time_only', 'dsd').limit(24).toPandas()

print("test input")
display(test_data)

# Call the function behind the UDF directly on the pandas sample.
# NOTE(review): .func is presumably the undecorated Python function exposed by
# PySpark's UDF wrapper - confirm. The function rewrites the dsd column of the
# frame it is given, so test_data is displayed above, before this call.
test_output = dsd_calc_udf.func(test_data)

print("test output")
display(test_output)

# Apply the UDF to the full data set, one invocation per date_only group.
beta = sc.groupby("date_only").apply(dsd_calc_udf)

# Print the row counts of the UDF output and of its input for comparison; the
# UDF returns each group's frame with only dsd rewritten.
print("beta count")
print(beta.count())

print("sc count")
print(sc.count())

# Stop the Spark session; df, sc and beta cannot be evaluated after this.
spark.stop()


+----------+---------+---+
| date_only|time_only|dsd|
+----------+---------+---+
|2009-04-01|    00:00|  0|
|2009-04-01|    01:00| -1|
|2009-04-01|    02:00|  0|
+----------+---------+---+
only showing top 3 rows

test input


Unnamed: 0,time_only,dsd
0,00:00,0
1,01:00,-1
2,02:00,0
3,03:00,1
4,04:00,0
5,05:00,0
6,06:00,0
7,07:00,0
8,08:00,-2
9,09:00,0


test output


Unnamed: 0,time_only,dsd
0,00:00,0
1,01:00,0
2,02:00,0
3,03:00,0
4,04:00,0
5,05:00,0
6,06:00,0
7,07:00,0
8,08:00,0
9,09:00,0


beta count
87579
sc count
87579


In [None]:
r