# Data Preprocessing Using PySpark


In [1]:
# Initialization: point this kernel at the local Spark installation.
import os
import sys

# Root of the Spark installation and the directory holding its bundled Python libraries.
spark_home = "/home/talentum/spark"
pylib = spark_home + "/python/lib"

# NOTE: list whichever extra packages you need in PYSPARK_SUBMIT_ARGS.
# Alternatives that were tried previously:
#   '--packages com.databricks:spark-xml_2.11:0.6.0 pyspark-shell'
#   '--packages org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
#   '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
os.environ.update({
    "SPARK_HOME": spark_home,
    "PYLIB": pylib,
    # For the two interpreter settings below, use /usr/bin/python2.7 if you want Python 2.
    "PYSPARK_PYTHON": "/usr/bin/python3.6",
    "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3",
    "PYSPARK_SUBMIT_ARGS": '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell',
})

# Put the bundled pyspark and py4j archives at the front of the import path
# (pyspark.zip first, immediately followed by the py4j archive).
sys.path[:0] = [
    pylib + "/pyspark.zip",
    pylib + "/py4j-0.10.7-src.zip",
]

In [2]:
# Entry point (Spark 2.x): obtain a Hive-enabled SparkSession, reusing an
# existing one if the kernel already has it.
from pyspark.sql import SparkSession

session_builder = SparkSession.builder.appName("Spark SQL basic example")
session_builder = session_builder.enableHiveSupport()
spark = session_builder.getOrCreate()

# On yarn, specify .master("yarn") on the builder instead:
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()

# SparkContext that belongs to the session above.
sc = spark.sparkContext

In [3]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [4]:
# Import every type the schema below uses by name. MapType and ArrayType were
# previously only available through the wildcard import in an earlier cell.
from pyspark.sql.types import (
    ArrayType,
    DateType,
    DoubleType,
    IntegerType,
    MapType,
    StringType,
    StructField,
    StructType,
)

# Explicit schema for the input JSON. Only these fields are read:
#   properties.parameter.{WD10M, WS10M, WD2M} -- each a map of string key -> double value
#   geometry.coordinates                      -- an array of doubles
schema = StructType([
    StructField("properties", StructType([
        StructField("parameter", StructType([
            StructField("WD10M", MapType(StringType(), DoubleType()), True),
            StructField("WS10M", MapType(StringType(), DoubleType()), True),
            StructField("WD2M", MapType(StringType(), DoubleType()), True),
        ]), True)
    ]), True),
    StructField("geometry", StructType([
        StructField("coordinates", ArrayType(DoubleType()), True)
    ]), True)
])

# Load every JSON file matching the glob. multiLine=True is passed because a
# single JSON document may span several lines of the file.
data_df = spark.read.json(
    'file:///home/talentum/test-jupyter/Daily/Adilabad_19.4000_78.3100_*.json',
    multiLine=True,
    schema=schema,
)

# Quick inspection helpers (both print directly, so no print() wrapper is needed):
# data_df.printSchema()
# data_df.show()

In [5]:
# Flatten the nested structs: promote every field of properties.parameter to a
# top-level column and keep geometry.coordinates as "coordinates".
# NOTE: this rebinds data_df, so the cell cannot be re-run on its own -- after the
# first run the "properties" and "geometry" columns no longer exist. Re-run the
# load cell first.
flattened_columns = [
    F.col("properties.parameter.*"),
    F.col("geometry.coordinates"),
]
data_df = data_df.select(*flattened_columns)

In [7]:
# Build one long-format DataFrame per parameter (WD2M, WD10M, WS10M).
# explode() emits one row per map entry; the entry's key is exposed as "Date"
# and its value under the parameter's own name.
from pyspark.sql.functions import explode

# Only the WD2M frame carries the "coordinates" column along.
WD2M = data_df.select(
    "coordinates",
    explode(data_df["WD2M"]).alias("Date", "WD2M"),
)
WD10M = data_df.select(explode(data_df["WD10M"]).alias("Date", "WD10M"))
WS10M = data_df.select(explode(data_df["WS10M"]).alias("Date", "WS10M"))

In [8]:
# desc is not used in this cell; the import is kept in case other cells rely on it.
from pyspark.sql.functions import desc

# Combine the three per-parameter frames into one row per Date.
#
# The joins use the column *name* ("Date") rather than an equality expression,
# so the result contains a single Date column and the final select can refer
# to it by name. The earlier version selected `PS.Date`, but no `PS` DataFrame
# is defined by any cell shown above, so the cell failed with a NameError.
# (Another approach is to rename the Date columns before joining.)
final_df = (
    WD2M
    .join(WD10M, on="Date", how="inner")
    .join(WS10M, on="Date", how="inner")
    .select("Date", "WD2M", "WD10M", "WS10M")
)

In [9]:
# Print the first 20 rows of the joined result, truncating long cell values
# (these are show()'s defaults, spelled out).
# NOTE(review): the table captured beneath this cell lists key/PS/PSC/T2M/T2MWET/T2MDEW,
# which does not match the columns selected into final_df -- it looks like output
# from an earlier version; re-run to refresh.
final_df.show(n=20, truncate=True)

+----------+-----+------+-----+------+------+
|       key|   PS|   PSC|  T2M|T2MWET|T2MDEW|
+----------+-----+------+-----+------+------+
|2004010105|97.34|101.68|10.65|  8.47|  6.27|
|2004010106|97.41|101.74|10.91|  8.72|  6.52|
|2004010107|97.48|101.76|14.73| 10.87|   7.0|
|2004010108|97.54|101.78|18.01| 12.63|  7.26|
|2004010109|97.55| 101.7|23.77|  14.6|  5.43|
|2004010110|97.48|101.57|27.27| 15.38|  3.48|
|2004010111|97.36|101.43|28.83| 15.85|  2.86|
|2004010112|97.22|101.28|29.59| 16.07|  2.55|
|2004010113|97.13|101.18|29.78| 16.03|  2.29|
|2004010114|97.08|101.13| 29.4| 15.76|  2.13|
|2004010115|97.08|101.15|28.41| 15.33|  2.26|
|2004010116|97.11|101.23|24.76|  16.5|  8.24|
|2004010117|97.16|101.34| 20.2| 13.82|  7.43|
|2004010118|97.23|101.44|18.73| 12.58|  6.42|
|2004010119|97.27| 101.5|17.75|  11.8|  5.84|
|2004010120|97.28|101.52|16.85| 11.23|   5.6|
|2004010121|97.26|101.52|15.95| 10.76|  5.56|
|2004010122|97.24| 101.5|15.06| 10.37|  5.67|
|2004010123|97.21|101.49|14.23|  1

In [None]:
# for column in df2.columns:
#     attribute =  df2.select('coordinates',explode(df2[column]).alias("Date", column))

#     # Perform a left join between the empty DataFrame and the non-empty DataFrame on the "key" column
#     #joined_df = empty_df.join(attribute, on="Date", how="left")

# # Show the joined DataFrame
# #joined_df.show()
# final_df.show()
    

In [None]:
# df2 =  df2.select('coordinates',explode(df2.PS).alias("key", "PS"))

In [None]:
# print(PS.show())
# # print(PSC.show())

# PS.printSchema()
# # data_df = voter_df.filter(voter_df.VOTER_NAME.isNotNull())
# # voter_df = voter_df.fillna("abc")   if null rows are not allowed to delete than fill it with something
# # voter_df = voter_df.dropna()       to delete null values row

In [None]:
# # Define schema for the empty DataFrame
# #schema = StructType([
# #    StructField("Date", StringType(), True)
# #])

# # Create an empty DataFrame with the specified schema
# final_df = spark.createDataFrame([], schema)

