# RDD
- The RDD is the underlying data structure beneath both Spark tables and DataFrames.
- Whatever transformations we write with the higher-level APIs are converted into RDD operations and then processed by the Spark engine.
- RDD stands for Resilient Distributed Dataset.
- It is a dataset: a data structure that holds data, similar to a DataFrame.
- Unlike DataFrames, RDDs hold language-native objects and don't have a row/column structure or a schema.
- RDDs are internally split into partitions for distributed processing.
- They are resilient, i.e. fault tolerant, because they store information about how they were created.
    For example, if a core fails while loading a partition, the driver assigns that partition to another core, which recreates it because the RDD carries the details of how to build it.

In [23]:
from collections import namedtuple
from webbrowser import get


if __name__ == "__main__":
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession
    import pyspark.sql.functions as F

    # Configuration that the direct SparkContext route (shown disabled below)
    # would consume.
    conf = SparkConf().setAppName("RDD").setMaster("local[*]")

    # DataFrame APIs hang off a SparkSession, whereas RDD APIs hang off a
    # SparkContext. The SparkContext is the main entry point to Spark
    # functionality: it gives the application access to the cluster,
    # coordinates task execution across it, and is what creates RDDs and runs
    # operations on them.
    #
    # There are two ways to obtain a SparkContext.
    #
    # 1) Construct it directly (left disabled here):
    # sc = SparkContext(conf=conf)
    #
    # 2) Take it from a SparkSession. The session is a higher-level object
    #    built on top of the context; it still holds the context and uses it
    #    internally.
    session_builder = SparkSession.builder.appName('Hello RDD').master('local[*]')
    spark = session_builder.getOrCreate()
    sc2 = spark.sparkContext

    # Named-tuple type that acts as the schema: it gives every column of a
    # parsed record a name.
    orders = namedtuple(
        'orders',
        [
            "order_id",
            "order_date",
            "item_type",
            "unit_price",
            "unit_cost",
            "total_revenue",
            "total_cost",
            "total_profit",
        ],
    )

    def _split_fields(line):
        """Drop double quotes from a raw text line and split it on commas."""
        return line.replace('"', '').split(',')

    def _to_order(fields):
        """Build an ``orders`` record: first three fields as text, next five as floats."""
        return orders(
            fields[0],
            fields[1],
            fields[2],
            *[float(fields[k]) for k in range(3, 8)],
        )

    def _is_clothes(order):
        """Return True when the record's item type is 'Clothes'."""
        return order.item_type == 'Clothes'

    # Build an RDD from a text file: every record in the RDD is one line of
    # text from the file.
    # The SparkContext reads raw, fundamental formats (text, binary, sequence,
    # Hadoop and object files); it does not offer readers for formats such as
    # CSV or Excel. It is a low-level API that gives more control over the
    # data and over how it is processed.
    linesrdd = sc2.textFile(r"C:\Users\shubh\OneDrive\Desktop\orders.txt")

    # Processing an RDD.
    # RDDs only offer basic transformations such as map, reduce and filter.
    # Most of the work is done by passing custom functions to them, which
    # leaves the heavy lifting to the user.

    # Redistribute the lines across 2 partitions.
    partitionedrdd = linesrdd.repartition(2)

    # The file is loaded as plain lines of text, so each line has to be
    # restructured: the input is a line of text, the result is a list of
    # text fields.
    # The text file must not contain a header row, otherwise the later
    # transformations break. If a header is present, remove the first row
    # first, for example:
    #
    #     # Extract header
    #     header = rawRDD.first()
    #
    #     # Remove the header
    #     dataRDD = rawRDD.filter(lambda line: line != header)
    structuredrdd = partitionedrdd.map(_split_fields)

    # A couple of transformations: give each record named, typed columns,
    # then keep only the 'Clothes' records.
    selectRDD = structuredrdd.map(_to_order)
    filteredRDD = selectRDD.filter(_is_clothes)

    # Show up to the first 10 matching records.
    for row in filteredRDD.take(10):
        print(row)


orders(order_id='1007', order_date='01/18/2023', item_type='Clothes', unit_price=132.45, unit_cost=90.1, total_revenue=13245.0, total_cost=9010.0, total_profit=4235.0)
orders(order_id='1012', order_date='01/23/2023', item_type='Clothes', unit_price=145.9, unit_cost=112.0, total_revenue=14590.0, total_cost=11200.0, total_profit=3390.0)
orders(order_id='1017', order_date='01/28/2023', item_type='Clothes', unit_price=110.25, unit_cost=85.0, total_revenue=11025.0, total_cost=8500.0, total_profit=2525.0)
orders(order_id='1022', order_date='02/02/2023', item_type='Clothes', unit_price=128.4, unit_cost=101.3, total_revenue=12840.0, total_cost=10130.0, total_profit=2710.0)
