# Setting up our Schema

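Spark can infer a schema for CSV files automatically, but our files have no header row, so there are no column names for it to pick up. Let's build the schema by hand from the column names listed in `features.txt`: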

In [1]:
from pyspark.sql.types import StructType, StructField, FloatType, LongType, StringType

# Build the DataFrame schema from features.txt, which holds one column name
# per line. The first two lines get dedicated types; every later line is
# treated as a float-valued feature.
feats = []
# The context manager closes the file once all names are read (the handle
# was previously left open for the lifetime of the kernel).
with open('features.txt') as f:
    for line_num, line in enumerate(f):
        if line_num == 0:
            # Timestamp
            field_type = LongType()
        elif line_num == 1:
            # Geohash
            field_type = StringType()
        else:
            # Other features
            field_type = FloatType()
        # Third argument True marks the column as nullable.
        feats.append(StructField(line.strip(), field_type, True))

schema = StructType(feats)

print(schema)


StructType(List(StructField(Timestamp,LongType,true),StructField(Geohash,StringType,true),StructField(geopotential_height_lltw,FloatType,true),StructField(water_equiv_of_accum_snow_depth_surface,FloatType,true),StructField(drag_coefficient_surface,FloatType,true),StructField(sensible_heat_net_flux_surface,FloatType,true),StructField(categorical_ice_pellets_yes1_no0_surface,FloatType,true),StructField(visibility_surface,FloatType,true),StructField(number_of_soil_layers_in_root_zone_surface,FloatType,true),StructField(categorical_freezing_rain_yes1_no0_surface,FloatType,true),StructField(pressure_reduced_to_msl_msl,FloatType,true),StructField(upward_short_wave_rad_flux_surface,FloatType,true),StructField(relative_humidity_zerodegc_isotherm,FloatType,true),StructField(categorical_snow_yes1_no0_surface,FloatType,true),StructField(u-component_of_wind_tropopause,FloatType,true),StructField(surface_wind_gust_surface,FloatType,true),StructField(total_cloud_cover_entire_atmosphere,FloatType,tru

# Creating a Dataframe

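Let's load our tab-separated data file into a DataFrame - Spark's abstraction for working with tabular data (built on top of RDDs). We use the CSV reader with a tab as the separator and the schema we defined above.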

In [3]:
# Alternate input left by the author (not loaded here): '/Volumes/evo/Datasets/NAM_2015_S/*'
# Configure a CSV reader for tab-separated input with our explicit schema...
tsv_reader = spark.read.format('csv').option('sep', '\t').schema(schema)
# ...and point it at the file on HDFS.
df = tsv_reader.load('hdfs://orion12:50000/nam_tiny.tdv')

# Peek at the first row to check that the values line up with the schema.
df.take(1)

[Row(Timestamp=1426377600000, Geohash='953rtrfmww20', geopotential_height_lltw=3975.03125, water_equiv_of_accum_snow_depth_surface=0.0, drag_coefficient_surface=0.0, sensible_heat_net_flux_surface=10.42236328125, categorical_ice_pellets_yes1_no0_surface=0.0, visibility_surface=24221.587890625, number_of_soil_layers_in_root_zone_surface=0.0, categorical_freezing_rain_yes1_no0_surface=0.0, pressure_reduced_to_msl_msl=101366.0, upward_short_wave_rad_flux_surface=18.375, relative_humidity_zerodegc_isotherm=30.0, categorical_snow_yes1_no0_surface=0.0, u-component_of_wind_tropopause=5.4028778076171875, surface_wind_gust_surface=3.15878963470459, total_cloud_cover_entire_atmosphere=100.0, upward_long_wave_rad_flux_surface=424.9310302734375, land_cover_land1_sea0_surface=0.0, vegitation_type_as_in_sib_surface=0.0, v-component_of_wind_pblri=-1.61834716796875, albedo_surface=6.0, lightning_surface=0.0, ice_cover_ice1_no_ice0_surface=0.0, convective_inhibition_surface=-0.65234375, pressure_surfac

# Playtime

In [4]:
# Count the rows whose surface temperature exceeds 320.
# NOTE(review): temperature_surface is presumably in Kelvin (320 K is about 47 C) - confirm.
really_hot = df.filter(df.temperature_surface > 320).count()
print(really_hot)

# Relative humidity is stored on a 0-100 scale in this dataset (the sample
# rows printed in this notebook show 30.0 and 34.0), so "humid" must be
# expressed as 80, not 0.8 - the old threshold matched almost any reading.
hot_and_humid = (
    df.filter(df.temperature_surface > 313)
      .filter(df.relative_humidity_zerodegc_isotherm > 80)
      .count()
)
print(hot_and_humid)

0
0


In [5]:
# Show the first five rows whose snow cover exceeds the threshold.
# NOTE(review): the snow_cover_surface values printed elsewhere in this
# notebook are 0.0 and 100.0, so the column looks like a percentage -
# confirm whether .85 or 85 is the intended cutoff.
snowy_rows = df.filter(df['snow_cover_surface'] > .85)
snowy_rows.take(5)

[Row(Timestamp=1426377600000, Geohash='cf7ecr4h2ps0', geopotential_height_lltw=136.53125, water_equiv_of_accum_snow_depth_surface=77.0, drag_coefficient_surface=0.0, sensible_heat_net_flux_surface=-39.57763671875, categorical_ice_pellets_yes1_no0_surface=0.0, visibility_surface=24221.587890625, number_of_soil_layers_in_root_zone_surface=3.0, categorical_freezing_rain_yes1_no0_surface=0.0, pressure_reduced_to_msl_msl=99602.0, upward_short_wave_rad_flux_surface=6.625, relative_humidity_zerodegc_isotherm=34.0, categorical_snow_yes1_no0_surface=0.0, u-component_of_wind_tropopause=27.527877807617188, surface_wind_gust_surface=16.158788681030273, total_cloud_cover_entire_atmosphere=100.0, upward_long_wave_rad_flux_surface=314.0560302734375, land_cover_land1_sea0_surface=1.0, vegitation_type_as_in_sib_surface=18.0, v-component_of_wind_pblri=12.31915283203125, albedo_surface=38.75, lightning_surface=0.0, ice_cover_ice1_no_ice0_surface=0.0, convective_inhibition_surface=-0.65234375, pressure_su

# SQL

In [6]:
# Creating an SQL 'table'
# Registers the DataFrame under a name that spark.sql() queries can refer to.
df.createOrReplaceTempView("TEMP_DF")

# Let's get all the snow cover values:
# (.collect() copies every selected row back to the driver as a Python list.)
snow = spark.sql("SELECT snow_cover_surface FROM TEMP_DF").collect()
# .collect() gives us a list of rows. Let's grab the first 10.
# Slicing instead of indexing 0..9 avoids an IndexError when the query
# returns fewer than 10 rows.
for row in snow[:10]:
    print(row)


# What's the maximum value?
# The result is still a list of Row objects (a single Row with field 'maxval').
snowmax = spark.sql("SELECT MAX(snow_cover_surface) as maxval FROM TEMP_DF").collect()

print('Max val observed:', snowmax)


Row(snow_cover_surface=0.0)
Row(snow_cover_surface=0.0)
Row(snow_cover_surface=0.0)
Row(snow_cover_surface=100.0)
Row(snow_cover_surface=0.0)
Row(snow_cover_surface=0.0)
Row(snow_cover_surface=100.0)
Row(snow_cover_surface=0.0)
Row(snow_cover_surface=0.0)
Row(snow_cover_surface=0.0)
Max val observed: [Row(maxval=100.0)]


In [6]:
from pyspark.sql.functions import avg

# Average of wilting_point_surface over all rows, as a one-row DataFrame.
mean_wilting_point = df.select(avg(df['wilting_point_surface']))
# .show() prints the result as a text table.
mean_wilting_point.show()

+--------------------------+
|avg(wilting_point_surface)|
+--------------------------+
|      0.029712499007582664|
+--------------------------+



# Sampling

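We can even create a sample dataset with Spark! Let's create a roughly 10% sample (without replacement). Spark samples each row independently, so the result is only approximately 10% of the rows.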

In [None]:
# Fixed seed so that repeated runs over the same input draw the same sample
# instead of a different random subset each time.
SAMPLE_SEED = 42
samp = df.sample(withReplacement=False, fraction=.1, seed=SAMPLE_SEED)

# Write it out to a file
# (Spark writes a directory of part files at this path; with the default save
# mode the write raises an error if the path already exists.)
samp.write.format('csv').save('hdfs://orion12:50000/sampled_output.csv')