In [None]:
# pyspark.SparkContext is the entry point to PySpark functionality: it communicates with the cluster and is used to create RDDs, accumulators, and broadcast variables.
# Note that you can create only one SparkContext per JVM; to create another, you must first stop the existing one using its stop() method.
# At any given time, only one SparkContext instance should be active per JVM. If you want to create another, stop the existing SparkContext with stop() before creating the new one.

In [None]:
# Create a SparkContext as in Apache Spark version 1.x (prior to 2.0) - SparkContext is the low-level interaction point.

# Import required modules
import shutil

from pyspark import SparkContext, SparkConf

# Create a SparkContext object running on the local master
sc = SparkContext(master="local", appName="MySparkApplication")
print(sc.appName)

# Get active SparkContext information.
# A bare `sc` expression is only rendered when it is the last line of a cell,
# so print it explicitly to actually show the master and application name.
print(sc)

# Create a Spark configuration with custom parameters
# NOTE(review): spark.driver.memory is presumably ignored here because the driver JVM
# was already started by the first SparkContext - confirm against the Spark configuration docs.
conf = (
    SparkConf()
    .setAppName("AppName")
    .setMaster("local[*]")                  # run locally with as many threads as available
    .set("spark.executor.memory", "2g")     # memory allocated per executor
    .set("spark.driver.memory", "1g")       # memory allocated to the driver
    .set("spark.executor.cores", "2")       # cores used by each executor
    .set("spark.default.parallelism", "4")  # default number of partitions
)

# Only one SparkContext may be active at a time. Calling SparkContext(conf=conf) while
# the context created earlier is still running raises ValueError, and getOrCreate(conf)
# alone would just hand back that old context and ignore `conf`. Stop it first.
sc.stop()

# Create a Spark context from the configuration (created once; getOrCreate makes the
# line safe if a context with this configuration is ever already active)
sc = SparkContext.getOrCreate(conf)
print(sc.appName)

# Get the application ID
app_id = sc.applicationId
print("Application ID:", app_id)

# Get the PySpark version
spark_version = sc.version
print("PySpark version:", spark_version)

# Get the URL of the Spark Web UI
web_ui_url = sc.uiWebUrl
print("Spark Web UI URL:", web_ui_url)

# The SparkContext is intentionally NOT stopped here: the cells below build RDDs
# from `sc`, and every RDD operation fails once its context has been stopped.
# Call sc.stop() only after the last cell that needs the context has run.

In [None]:
# Generate some sample data for RDD processing.

import random

# Function to generate random sales data
def generate_sales_data():
    """Build one random sales record as a CSV line.

    Returns:
        str: A string of the form "product,location,sales", where product and
        location are picked at random from fixed lists and sales is a random
        integer between 1 and 1000 inclusive.
    """
    product_choices = ["ProductA", "ProductB", "ProductC"]
    location_choices = ["Location1", "Location2", "Location3"]

    # Draw order is amount -> product -> location, so a seeded generator
    # yields the same sequence of records.
    amount = random.randint(1, 1000)
    fields = [
        random.choice(product_choices),
        random.choice(location_choices),
        str(amount),
    ]
    return ",".join(fields)

# Number of records to generate
num_records = 100

# Destination file. Kept in one variable so the confirmation message below always
# names the file that was actually written (it previously said "sales_data.txt").
output_path = "sample_data.txt"

# Seed the generator so that re-running the notebook produces the same sample data
random.seed(42)

# Generate and save the sales data, one "product,location,sales" record per line
with open(output_path, "w") as file:
    for _ in range(num_records):
        file.write(generate_sales_data() + "\n")

print(f"{num_records} sales records generated and saved to {output_path}.")

In [None]:
# RDD built from a range: 0, 5, 10, ..., 1000
every_fifth = range(0, 1001, 5)
rdd_range = sc.parallelize(every_fifth)

# RDD built from a local Python list; parallelize() is what distributes it
data_list = [10, 20, 30, 40, 50]
rdd_from_list = sc.parallelize(data_list)

# RDD built from a list of dict records with the keys id, name and age
person_rows = [
    (1, "John", 25),
    (2, "Jane", 30),
    (3, "Bob", 22),
]
data_objects = [dict(zip(("id", "name", "age"), row)) for row in person_rows]
rdd_objects = sc.parallelize(data_objects)

# RDD built from a text file: one element per line of sample_data.txt
sales_rdd = sc.textFile("sample_data.txt")

In [None]:
# Once you have an RDD, you can perform transformation and action operations. Any operation you perform on an RDD runs in parallel.

# Filter, map and reduce the RDD to get the total sales of ProductA.
# Each record is "product,location,sales", so the amount is field index 2.
# Index 1 is the location string (e.g. "Location1"), and float() on it raises ValueError.
# Note: reduce() raises ValueError on an empty RDD, i.e. if no record mentions ProductA.
total_sales = (
    sales_rdd
    .filter(lambda line: "ProductA" in line)
    .map(lambda line: float(line.split(",")[2]))
    .reduce(lambda x, y: x + y)
)

print("Total sales of ProductA: ", total_sales)

# On a PySpark RDD, you can perform two kinds of operations.

# RDD transformations – Transformations are lazy operations. When you run a transformation (for example, an update),
# instead of updating the current RDD, the operation returns another RDD.
# Lazy means they don’t execute until you call an action on the RDD.
# Some transformations on RDDs are flatMap(), map(), reduceByKey(), filter(), and sortByKey(); each returns a new RDD instead of updating the current one.

# RDD actions – operations that trigger computation and return values from an RDD to the driver program (the driver node).
# In other words, any RDD function that returns something other than an RDD[T] is considered an action.

In [None]:
# Transformations examples
# Each line below only defines a new RDD; nothing is computed until an action is called.

# Map transformation to extract product names (first CSV field of each record)
product_names_rdd = sales_rdd.map(lambda line: line.split(',')[0])

# Filter transformation to get sales greater than 500 (sales amount is the third field)
high_sales_rdd = sales_rdd.filter(lambda line: int(line.split(',')[2]) > 500)

# FlatMap transformation: the function's result is iterated and flattened.
# Each element here is a string, so this yields the individual characters of every product name.
flat_mapped_rdd = product_names_rdd.flatMap(lambda name: name)

# Union transformation to combine two RDDs (duplicates are kept)
combined_rdd = product_names_rdd.union(high_sales_rdd.map(lambda line: line.split(',')[0]))

# GroupBy transformation to group records by product
grouped_rdd = sales_rdd.groupBy(lambda line: line.split(',')[0])

# Sample transformation to take a random sample of the data without replacement.
# The fraction is a per-element probability, so roughly (not exactly) 20% is returned;
# the seed makes the sample reproducible across runs.
sample_rdd = sales_rdd.sample(False, 0.2, seed=42)

# Distinct transformation to get unique products
distinct_products_rdd = product_names_rdd.distinct()

# KeyBy transformation to create (product, record) pairs with the product as the key.
# The previous key function, name[0], took only the first character, so every
# record ended up under the same key 'P'.
key_value_rdd = sales_rdd.keyBy(lambda line: line.split(',')[0])

# Cartesian transformation to pair every sales record with every element of another RDD
other_rdd = sc.parallelize([(1, "Info1"), (2, "Info2")])
cartesian_result = sales_rdd.cartesian(other_rdd)


In [None]:
# Actions examples
# Each action triggers computation and returns a non-RDD result to the driver.

# Collect action to retrieve all elements (fine here: the sample file is small)
all_elements = sales_rdd.collect()

# Count action to count the number of records
record_count = sales_rdd.count()

# First action to get the first element
first_element = sales_rdd.first()

# Take action to get a specified number of elements
sample_elements = sales_rdd.take(5)

# Reduce action to calculate the total sales across all records (third CSV field)
total_sales = sales_rdd.map(lambda line: int(line.split(',')[2])).reduce(lambda x, y: x + y)

# ForEach action to print each element.
# NOTE(review): the print runs in the worker processes, so the lines may show up in
# the Spark console/log rather than in the notebook output - confirm in your setup.
sales_rdd.foreach(lambda line: print(line))

# SaveAsTextFile action to save the RDD to a text file.
# Spark refuses to write into an existing directory, so remove the output of a
# previous run first; otherwise re-running this cell fails.
# (Assumes the default local filesystem, as used with the local master.)
shutil.rmtree("output_directory", ignore_errors=True)
sales_rdd.saveAsTextFile("output_directory")

# The SparkContext is intentionally NOT stopped here: the cells below still use
# `sc` and `sales_rdd`, and both become unusable once the context is stopped.
# Call sc.stop() only after the last cell that needs the context has run.

In [None]:
# repartition(): returns a new RDD with the requested number of partitions (4 here)
repartitioned_rdd = sales_rdd.repartition(numPartitions=4)

# coalesce(): returns a new RDD with the partition count decreased (to 2 here)
coalesced_rdd = repartitioned_rdd.coalesce(numPartitions=2)


In [None]:
# Broadcast variable: a read-only value distributed efficiently to all nodes
broadcast_variable = sc.broadcast(["Location1", "Location2", "Location3"])


def _has_broadcast_location(line):
    """Return True when the record's location (second CSV field) is in the broadcast list."""
    return line.split(',')[1] in broadcast_variable.value


filtered_broadcast_rdd = sales_rdd.filter(_has_broadcast_location)

# Accumulator: collects values added by multiple tasks; starts at 0
accumulator = sc.accumulator(0)


def _accumulate_sales(line):
    """Add the record's sales amount (third CSV field) to the accumulator."""
    accumulator.add(int(line.split(',')[2]))


# foreach is an action, so this runs the additions; the total is then read on the driver
sales_rdd.foreach(_accumulate_sales)
total_sales_acc = accumulator.value