In [33]:
import logging
from pyspark.sql import SparkSession

Here are some considerations for when to create an RDD first and when to create a DataFrame directly:

**Creating an RDD first (as in the provided code):**
1. **Custom Transformation:** If you have custom transformation logic that you want to apply to the data in the RDD before creating a DataFrame, you might choose to work with RDDs first. This allows you to use RDD transformations like `map`, `filter`, and `reduce` to process the data.

2. **Non-Structured Data:** If your data is unstructured or doesn't have a well-defined schema, working with RDDs can be more flexible because RDDs, unlike DataFrames, do not impose a schema.

3. **Fine-Grained Control:** RDDs provide fine-grained control over data partitioning and processing, which can be beneficial in specific use cases where you need to optimize performance at a low level.

**Creating a DataFrame directly:**
1. **Structured Data:** If your data is structured, such as CSV, JSON, or Parquet files, you can create a DataFrame directly from those sources using built-in Spark methods. For example, you can use `spark.read.csv("file.csv")` to create a DataFrame from a CSV file.

2. **Schema Inference:** DataFrames can automatically infer the schema from the data source, which simplifies the process, especially when dealing with large and complex datasets.

3. **Optimization:** DataFrames benefit from Spark's built-in optimizations, which can result in better performance for common operations like filtering, grouping, and aggregating.

##### Create Spark Session. This is our entry point for DataFrames and Datasets; RDDs are created through the SparkContext instead.

In [34]:
def rdd_to_dataframe(data, schema):
  """
  Build a Spark DataFrame from in-memory rows and return it.

  Despite the name, no intermediate RDD is created here: the rows are handed
  straight to ``createDataFrame`` on the SparkSession.

  Args:
    data: Rows to load, e.g. a list of tuples.
    schema: Schema passed through to ``createDataFrame``, e.g. a list of
      column names.

  Returns:
    The resulting DataFrame. On success the SparkSession is left running so
    the DataFrame stays usable.

  Raises:
    Exception: Whatever ``createDataFrame`` raised. It is logged, the
      SparkSession is stopped, and the original exception is re-raised so the
      caller never receives a silent ``None``.
  """
  # Get the SparkSession (getOrCreate() reuses an existing one if present)
  spark = SparkSession.builder.appName('RDDToDataFrame').getOrCreate()

  try:
    # To go through an RDD first, use the SparkContext rather than the session:
    # rdd = spark.sparkContext.parallelize(data)

    # Build the DataFrame directly from the local rows and hand it back,
    # without stopping the SparkSession
    return spark.createDataFrame(data, schema)

  except Exception as e:
    # Log the error and stop the SparkSession.
    # NOTE(review): the session comes from getOrCreate(), so this may also stop
    # a session other DataFrames rely on -- confirm this is intended.
    logging.error('Error while transforming RDD to DF: {}'.format(e))
    spark.stop()
    # Propagate the failure instead of implicitly returning None
    raise

##### Create some sample data

In [35]:
# Department lookup: column names first, then one tuple per department
dept_schema = ['department_id', 'department_name']
dept_data = [
  (1, 'Big Data'),
  (2, 'Finance'),
  (3, 'Marketing'),
]

In [36]:
# Employees: column names first, then one tuple per employee
emp_schema = ['department_id', 'employee_name', 'age']
emp_data = [
  (1, 'Carlos', 17),
  (1, 'Bob', 30),
  (2, 'Jasmin', 26),
]

#### Let's now build Spark DataFrames from the sample data and schemas

In [37]:
# Build one DataFrame per dataset from its rows and its column names
df_emp = rdd_to_dataframe(data=emp_data, schema=emp_schema)
df_dept = rdd_to_dataframe(data=dept_data, schema=dept_schema)

In [38]:
# Show the rows of the departments DataFrame (show() prints data, not the schema)
df_dept.show()

+-------------+---------------+
|department_id|department_name|
+-------------+---------------+
|            1|       Big Data|
|            2|        Finance|
|            3|      Marketing|
+-------------+---------------+



In [39]:
# Printing the DataFrame itself shows its column names and types, not its rows
print(df_dept)

DataFrame[department_id: bigint, department_name: string]


### Use Spark SQL to join 2 datasets

In [40]:
# Get a handle on the SparkSession for the SQL cells below
# (getOrCreate() returns the running session if there is one)
spark = (
  SparkSession.builder
  .appName('RDDToDataFrame')
  .getOrCreate()
)

In [41]:
# Register both DataFrames as temporary views so Spark SQL can query them
# by name ('employees' and 'departments')
df_emp.createOrReplaceTempView('employees')
df_dept.createOrReplaceTempView('departments')

In [42]:
# Keep only employees aged 18 or over
spark.sql('''select * from employees where age >= 18 ''').show()

+-------------+-------------+---+
|department_id|employee_name|age|
+-------------+-------------+---+
|            1|          Bob| 30|
|            2|       Jasmin| 26|
+-------------+-------------+---+



In [61]:
# Query sample, using Spark SQL: inner-join employees to departments on
# department_id, keeping only employees aged 18 or over.
# emp.* and dept.* both carry department_id, so that column appears twice.
spark.sql('''select emp.*, dept.*
          from employees as emp
          inner join departments as dept on (emp.department_id = dept.department_id)
          where age >= 18
          ''').show()

+-------------+-------------+---+-------------+---------------+
|department_id|employee_name|age|department_id|department_name|
+-------------+-------------+---+-------------+---------------+
|            1|          Bob| 30|            1|       Big Data|
|            2|       Jasmin| 26|            2|        Finance|
+-------------+-------------+---+-------------+---------------+



In [62]:
# Let's now save the JOINED result into a new temporary view, 'dept_employees'.
# Note: the query still filters on age >= 18, so under-18 employees are not in the view.
spark.sql('''
          select emp.employee_name, emp.age, emp.department_id, dept.department_name
          from employees as emp
            inner join departments as dept on (emp.department_id = dept.department_id)
             where age >= 18
          ''').createOrReplaceTempView('dept_employees')

In [66]:
# Show the JOINED result set stored in the 'dept_employees' temporary view,
# keeping only rows whose department_id is not null
spark.sql('''
        select * from dept_employees where department_id is not null
          ''').show()

+-------------+---+-------------+---------------+
|employee_name|age|department_id|department_name|
+-------------+---+-------------+---------------+
|          Bob| 30|            1|       Big Data|
|       Jasmin| 26|            2|        Finance|
+-------------+---+-------------+---------------+



#### Save the output for our Business Data Consumers

In [47]:
# # Define output location
# output_location = 'output/dept_employees/'

# # let's now save the JOINED RESULTSET to local storage. This could be Amazon S3 or other.
# spark.sql('''
#         select * from dept_employees where department_id is not null
#           ''').write.mode('append').csv(output_location)
          