In [None]:
# Q1. Working with RDDs:
#
# Builds a small in-memory RDD and runs a couple of transformations
# (map, filter) and actions (reduce, max, min) against it.

# a) Create an RDD from a local data source:

from pyspark import SparkConf, SparkContext

# Use two local threads rather than the single-threaded "local" master: the
# streaming cell further down attaches a socket receiver to this same context,
# and a receiver permanently occupies one thread, which would leave none for
# processing the received batches.
# getOrCreate() keeps the cell re-runnable; calling the SparkContext(...)
# constructor a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once".
spark_conf = SparkConf().setMaster("local[2]").setAppName("RDD Example")
sc = SparkContext.getOrCreate(spark_conf)
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)

# b) Implement transformations and actions on the RDD:

# map() is a lazy transformation; reduce() is the action that triggers the job.
squared_rdd = rdd.map(lambda x: x ** 2)
sum_result = squared_rdd.reduce(lambda x, y: x + y)  # 1 + 4 + 9 + 16 + 25 = 55

# c) Analyze and manipulate data using RDD operations:

# Keeps only the even elements; still lazy until an action such as collect().
filtered_rdd = rdd.filter(lambda x: x % 2 == 0)
max_value = rdd.max()  # 5
min_value = rdd.min()  # 1

In [None]:
# Q2. Spark DataFrame Operations:
#
# Loads CSV data into DataFrames, then filters, aggregates, joins and queries
# it through both the DataFrame API and Spark SQL.

# a) Load a CSV file into a Spark DataFrame:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("DataFrame Example").getOrCreate()
# header=True takes column names from the first row; inferSchema=True asks
# Spark to derive column types from the data instead of reading all strings.
df = spark.read.csv("file.csv", header=True, inferSchema=True)

# The join in part (b) needs a second DataFrame; without this definition the
# cell fails with NameError on a fresh kernel.
# NOTE(review): "other_file.csv" is a placeholder path - point it at the real
# second dataset, which must contain an `id` column for the join key.
other_df = spark.read.csv("other_file.csv", header=True, inferSchema=True)

# b) Perform common DataFrame operations:

# Rows with age above 30.
filtered_df = df.filter(df["age"] > 30)
# Average salary per department.
grouped_df = df.groupBy("department").agg({"salary": "avg"})
# Inner join on the shared `id` column.
joined_df = df.join(other_df, on="id", how="inner")

# c) Apply Spark SQL queries on the DataFrame:

# Register the DataFrame under a name that SQL statements can refer to.
df.createOrReplaceTempView("employees")
result = spark.sql("SELECT * FROM employees WHERE age > 30")

In [None]:
# Q3. Spark Streaming:
#
# Word count over lines of text read from a TCP socket, printed once per batch.

# a) Create a Spark Streaming application:

from pyspark.streaming import StreamingContext

# Built on the SparkContext `sc` created in the Q1 cell; batchDuration is in
# seconds, so a new batch is processed every second.
ssc = StreamingContext(sparkContext=sc, batchDuration=1)

# b) Configure the application to consume data from a streaming source:

# Something must be serving text on localhost:9999 for data to arrive.
stream = ssc.socketTextStream("localhost", 9999)

# c) Implement streaming transformations and actions:

# Split each line on single spaces, pair every word with 1, and sum the ones
# per word within each batch.
word_counts = (
    stream.flatMap(lambda line: line.split(" "))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
)
word_counts.pprint()

# None blocks until the stream is stopped or the kernel is interrupted.
# Set a number of seconds to bound the run, e.g. so that "Restart & Run All"
# can continue past this cell.
STREAM_TIMEOUT_SECONDS = None

ssc.start()
try:
    ssc.awaitTermination(STREAM_TIMEOUT_SECONDS)
finally:
    # Always shut the streaming context down, including on kernel interrupt:
    # otherwise the receiver keeps holding a core and the cell cannot be
    # re-run, since only one StreamingContext may be active per JVM.
    # stopSparkContext=False keeps `sc` (and the SparkSession built on it)
    # usable by the remaining cells.
    ssc.stop(stopSparkContext=False)

In [None]:
# Q4. Spark SQL and Data Source Integration:
#
# Reads a table from MySQL over JDBC, queries it with Spark SQL, and shows
# the reader API for CSV on HDFS and JSON / Parquet on S3.

# a) Connect Spark with a relational database:

import os

from pyspark.sql import SparkSession

# NOTE(review): getOrCreate() returns the already-running session when an
# earlier cell has created one. spark.driver.extraClassPath is read when the
# driver JVM starts, so in that case this setting may not take effect -
# confirm the MySQL connector jar is actually on the driver classpath.
spark = SparkSession.builder.appName("Spark SQL Example").config("spark.driver.extraClassPath", "mysql-connector-java.jar").getOrCreate()

# Take the credentials from the environment so real values never have to be
# typed into (and saved with) the notebook. The fallbacks are the placeholder
# values this example has always used.
db_user = os.environ.get("DB_USER", "username")
db_password = os.environ.get("DB_PASSWORD", "password")

df = spark.read.format("jdbc").options(
    url="jdbc:mysql://localhost:3306/database_name",
    driver="com.mysql.jdbc.Driver",
    dbtable="table_name",
    user=db_user,
    password=db_password
).load()

# b) Perform SQL operations on the data stored in the database:

# Register the JDBC-backed DataFrame under a name usable from SQL.
df.createOrReplaceTempView("data_table")
result = spark.sql("SELECT * FROM data_table WHERE age > 30")

# c) Explore integration capabilities with other data sources:

# Each read below rebinds `df`; the "data_table" view registered above still
# refers to the JDBC DataFrame. Hosts, buckets and paths are placeholders.
df = spark.read.csv("hdfs://localhost:9000/path/to/file.csv", header=True, inferSchema=True)
df = spark.read.json("s3a://bucket/path/to/file.json")
df = spark.read.parquet("s3a://bucket/path/to/file.parquet")