In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

In [2]:
# Create Spark session
spark = SparkSession.builder.appName("DataFrameSetup").getOrCreate()

In [3]:
# Create 'orders' DataFrame
orders_data = [
    (1, '2025-05-01', 1001, 250.0),
    (2, '2025-05-02', 1002, 300.0),
    (3, '2025-05-03', 1001, 150.0),
    (4, '2025-05-04', 1003, 400.0),
    (5, '2025-05-05', 1002, 500.0)
]
orders_schema = ['order_id', 'order_date', 'cust_id', 'amount']
orders = spark.createDataFrame(orders_data, orders_schema)

# Create 'products' DataFrame
products_data = [
    (1, 'Laptop'),
    (2, 'Phone'),
    (3, 'Monitor'),
    (4, 'Keyboard'),
    (5, 'Mouse')
]
products_schema = ['order_id', 'product_name']
products = spark.createDataFrame(products_data, products_schema)

# Create 'df2' DataFrame for the second join
df2_data = [
    (1, 'Electronics'),
    (2, 'Electronics'),
    (3, 'Accessories'),
    (4, 'Accessories'),
    (5, 'Accessories')
]
df2_schema = ['order_id', 'category']
df2 = spark.createDataFrame(df2_data, df2_schema)

# Show the DataFrames
orders.show()
products.show()
df2.show()


+--------+----------+-------+------+
|order_id|order_date|cust_id|amount|
+--------+----------+-------+------+
|       1|2025-05-01|   1001| 250.0|
|       2|2025-05-02|   1002| 300.0|
|       3|2025-05-03|   1001| 150.0|
|       4|2025-05-04|   1003| 400.0|
|       5|2025-05-05|   1002| 500.0|
+--------+----------+-------+------+

+--------+------------+
|order_id|product_name|
+--------+------------+
|       1|      Laptop|
|       2|       Phone|
|       3|     Monitor|
|       4|    Keyboard|
|       5|       Mouse|
+--------+------------+

+--------+-----------+
|order_id|   category|
+--------+-----------+
|       1|Electronics|
|       2|Electronics|
|       3|Accessories|
|       4|Accessories|
|       5|Accessories|
+--------+-----------+



In [8]:
#Joins data frames
df = orders.join(products, "order_id", "inner") 
df.join(df2, 'order_id').groupBy('category').count().orderBy(desc('count'))

#apply group by function and the aggregation would by any
df1=df.groupBy("cust_id").agg(sum("amount").alias("bill")) 

df.groupBy("product_name").agg(
    count("amount").alias("count"),
    sum("amount").alias("sum"),
    max("amount").alias("maximum"),
    min("amount").alias("minimum"),
    avg("amount").alias("average")
).show()

+------------+-----+-----+-------+-------+-------+
|product_name|count|  sum|maximum|minimum|average|
+------------+-----+-----+-------+-------+-------+
|       Phone|    1|300.0|  300.0|  300.0|  300.0|
|      Laptop|    1|250.0|  250.0|  250.0|  250.0|
|       Mouse|    1|500.0|  500.0|  500.0|  500.0|
|    Keyboard|    1|400.0|  400.0|  400.0|  400.0|
|     Monitor|    1|150.0|  150.0|  150.0|  150.0|
+------------+-----+-----+-------+-------+-------+



In [10]:
from pyspark.sql.functions import col, desc

# 1. Drop columns
df_dropped = df.drop("order_date", "product_name")  # Just as an example

# 2. Create a temporary table
df.createOrReplaceTempView("orders_view")

# 3. Return the first row by descending order of 'amount'
df.orderBy(desc("amount")).first()

# 4. Another way to return the highest value record
df.orderBy(col("amount").desc()).first()

# 5. Top 5 records by amount
df.orderBy(col("amount").desc()).limit(5).show()

# 6. Apply a filter (e.g., amount > 300)
df.filter(df.amount > 300).show()

# 7. Select specific columns with a filter condition
df.select("order_id", "cust_id", "amount").where(col("amount") > 300).show()

# 8. Sort by a column
df.sort("amount").show()

# 9. Rename a column
df_renamed = df.withColumnRenamed("amount", "total_amount")
df_renamed.show()


+--------+----------+-------+------+------------+
|order_id|order_date|cust_id|amount|product_name|
+--------+----------+-------+------+------------+
|       5|2025-05-05|   1002| 500.0|       Mouse|
|       4|2025-05-04|   1003| 400.0|    Keyboard|
|       2|2025-05-02|   1002| 300.0|       Phone|
|       1|2025-05-01|   1001| 250.0|      Laptop|
|       3|2025-05-03|   1001| 150.0|     Monitor|
+--------+----------+-------+------+------------+

+--------+----------+-------+------+------------+
|order_id|order_date|cust_id|amount|product_name|
+--------+----------+-------+------+------------+
|       4|2025-05-04|   1003| 400.0|    Keyboard|
|       5|2025-05-05|   1002| 500.0|       Mouse|
+--------+----------+-------+------+------------+

+--------+-------+------+
|order_id|cust_id|amount|
+--------+-------+------+
|       4|   1003| 400.0|
|       5|   1002| 500.0|
+--------+-------+------+

+--------+----------+-------+------+------------+
|order_id|order_date|cust_id|amount|produ

In [11]:
from pyspark.sql.functions import year, month, dayofmonth, quarter

# Extract and display distinct years
df.select(year("order_date").alias("year")).distinct().orderBy("year").show()

# Extract and display distinct months
df.select(month("order_date").alias("month")).distinct().orderBy("month").show()

# Extract and display distinct days
df.select(dayofmonth("order_date").alias("day")).distinct().orderBy("day").show()

# Add columns for year, month, day, and quarter
df = df.withColumn("orderyear", year(col("order_date"))) \
       .withColumn("ordermonth", month(col("order_date"))) \
       .withColumn("orderday", dayofmonth(col("order_date"))) \
       .withColumn("orderquarter", quarter(col("order_date")))

# Show the result
df.show()

+----+
|year|
+----+
|2025|
+----+

+-----+
|month|
+-----+
|    5|
+-----+

+---+
|day|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
+---+

+--------+----------+-------+------+------------+---------+----------+--------+------------+
|order_id|order_date|cust_id|amount|product_name|orderyear|ordermonth|orderday|orderquarter|
+--------+----------+-------+------+------------+---------+----------+--------+------------+
|       1|2025-05-01|   1001| 250.0|      Laptop|     2025|         5|       1|           2|
|       2|2025-05-02|   1002| 300.0|       Phone|     2025|         5|       2|           2|
|       3|2025-05-03|   1001| 150.0|     Monitor|     2025|         5|       3|           2|
|       4|2025-05-04|   1003| 400.0|    Keyboard|     2025|         5|       4|           2|
|       5|2025-05-05|   1002| 500.0|       Mouse|     2025|         5|       5|           2|
+--------+----------+-------+------+------------+---------+----------+--------+------------+

