In [57]:
import logging
from pyspark.sql import SparkSession

##### Create a Spark Session. This is our entry point for DataFrames and Datasets, not RDDs.

In [58]:
def rdd_to_dataframe(data, schema):
  """
  Example: load local Python data into a Spark DataFrame by way of an RDD.

  The function obtains a SparkSession via `getOrCreate()`, turns `data` into
  an RDD through the session's SparkContext, and converts that RDD into a
  DataFrame using `schema`.

  Args:
    data: Records passed to `SparkContext.parallelize` (this notebook passes
      a list of tuples).
    schema: Schema passed to `SparkSession.createDataFrame` (this notebook
      passes a list of column names).

  Returns:
    The DataFrame built from `data`. The SparkSession is left running so the
    returned DataFrame stays usable.

  Raises:
    Exception: any error raised while building the RDD or the DataFrame is
      logged and then re-raised unchanged.
  """
  # getOrCreate() hands back the already-running session when there is one
  spark = SparkSession.builder.appName('RDDToDataFrame').getOrCreate()

  try:
    # create an RDD from the input data, using the Spark Context, not the Session!
    rdd = spark.sparkContext.parallelize(data)

    # convert the RDD to a DataFrame and return it without stopping the session
    return spark.createDataFrame(rdd, schema)

  except Exception as e:
    # Log (with traceback) and re-raise. Swallowing the error would make the
    # function return None, and stopping the session here would also break
    # every DataFrame that earlier calls already returned from this session.
    logging.exception('Error while transforming RDD to DF: %s', e)
    raise

##### Create some sample data

In [59]:
# Departments: column names first, then one tuple per row in the same order
dept_schema = ['department_id', 'department_name']
dept_data = [
    (1, 'Big Data'),
    (2, 'Finance'),
    (3, 'Marketing'),
]

In [60]:
# Employees: column names first, then one tuple per row in the same order
emp_schema = ['department_id', 'employee_name', 'age']
emp_data = [
    (1, 'Carlos', 17),
    (1, 'Bob', 30),
    (2, 'Jasmin', 26),
]

#### Let's now load the sample data into Spark DataFrames via an RDD

In [61]:
# Build one DataFrame per sample dataset, pairing each dataset with its column names
df_emp = rdd_to_dataframe(data=emp_data, schema=emp_schema)
df_dept = rdd_to_dataframe(data=dept_data, schema=dept_schema)

In [62]:
# Show the DataFrame's rows (show() prints the data, not the schema)
df_dept.show()

+-------------+---------------+
|department_id|department_name|
+-------------+---------------+
|            1|       Big Data|
|            2|        Finance|
|            3|      Marketing|
+-------------+---------------+



In [63]:
# Printing the DataFrame object shows its column names and types, not its rows
print(df_dept)

DataFrame[department_id: bigint, department_name: string]


### Use Spark SQL to join two datasets

In [64]:
# Get a handle on the session: getOrCreate() returns the already-running
# SparkSession if there is one, and only creates a new one otherwise
spark = SparkSession.builder.appName('RDDToDataFrame').getOrCreate()

In [65]:
# Register both DataFrames as temporary views so the Spark SQL queries below
# can refer to them by name ('employees' and 'departments')
df_emp.createOrReplaceTempView('employees')
df_dept.createOrReplaceTempView('departments')

In [66]:
# Sample Spark SQL query: inner-join employees to departments on department_id,
# keeping only employees aged 18 or over, and display the result
spark.sql('''
          select emp.*, dept.* 
          from employees as emp
            inner join departments as dept on (emp.department_id = dept.department_id)
          where age >= 18
          ''').show()

+-------------+-------------+---+-------------+---------------+
|department_id|employee_name|age|department_id|department_name|
+-------------+-------------+---+-------------+---------------+
|            1|          Bob| 30|            1|       Big Data|
|            2|       Jasmin| 26|            2|        Finance|
+-------------+-------------+---+-------------+---------------+



In [69]:
# Let's now save the joined result into a new temporary view, 'dept_employees'.
# Note: the age filter (where age >= 18) is still applied in this query.
spark.sql('''
          select emp.employee_name, emp.age, emp.department_id, dept.department_name
          from employees as emp
            inner join departments as dept on (emp.department_id = dept.department_id)
             where age >= 18
          ''').createOrReplaceTempView('dept_employees')

In [70]:
# Query the new 'dept_employees' temporary view, keeping only rows that have
# a department_id, and display the result
spark.sql('''
        select * from dept_employees where department_id is not null
          ''').show()

+-------------+---+-------------+---------------+
|employee_name|age|department_id|department_name|
+-------------+---+-------------+---------------+
|          Bob| 30|            1|       Big Data|
|       Jasmin| 26|            2|        Finance|
+-------------+---+-------------+---------------+



#### Save the output for our Business Data Consumers

In [71]:
# Define output location
output_location = 'output/dept_employees/'

# Let's now save the joined result set as CSV to local storage. This could be Amazon S3 or other.
# NOTE(review): mode('append') adds to whatever already exists at
# output_location, so re-running this cell writes the same rows again --
# confirm whether 'overwrite' is what is wanted here.
spark.sql('''
        select * from dept_employees where department_id is not null
          ''').write.mode('append').csv(output_location)
          