**Task 1:** Create a simple DataFrame in Spark and explore some of the basic functions available.

In [None]:
from pyspark.sql import functions as SparkFuncs
from pyspark.sql import SparkSession

# Obtain the shared Spark session, creating one named "Test" if none is active.
spark = SparkSession.builder.appName("Test").getOrCreate()

# Main data: one (name, department, salary) tuple per employee.
columns = ["EmployeeName", "Department", "Salary"]
data = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
]
dataframe = spark.createDataFrame(data, schema=columns)
print("Main DataFrame:")
dataframe.show()

# Bonus data: only two of the four employees appear here, which is what
# makes the full join further down interesting.
bonus_columns = ["EmployeeName", "Bonus"]
bonus_data = [("James", 500), ("Maria", 3000)]
bonus_dataframe = spark.createDataFrame(bonus_data, schema=bonus_columns)
print("Bonus DataFrame:")
bonus_dataframe.show()

# Function 1: show() -- print at most the requested number of rows.
print("Show the first 2 rows:")
dataframe.show(n=2)

# Function 2: orderBy() -- sort() is the same operation under another name.
print("Ordered by salary (descending):")
dataframe.sort(SparkFuncs.col("Salary").desc()).show()

# Function 3: count() -- an action returning the number of rows.
row_total = dataframe.count()
print("Number of rows:", row_total, "\n")

# Function 4: groupBy() -- number of employees per department.
print("Group by department:")
per_department = dataframe.groupBy("Department")
per_department.count().show()

# Function 5: printSchema() -- column names and inferred types.
print("DataFrame's schema:")
dataframe.printSchema()

# Function 6: select() -- project a subset of the columns.
print("Only show the EmployeeName and Department:")
dataframe.select(["EmployeeName", "Department"]).show()

# Function 7: alias() -- expose a column under a different name.
print("Alias:")
renamed_name = dataframe["EmployeeName"].alias("Name")
dataframe.select(renamed_name).show()

# Function 8: filter() -- where() is the same operation under another name.
print("Filtered salaries larger than 4400:")
dataframe.where(SparkFuncs.col("Salary") > 4400).show()

# Function 9: join() -- a full join keeps employees with and without a bonus.
print("dataframe and bonus_dataframe joined together:")
joined = dataframe.join(bonus_dataframe, "EmployeeName", "full")
joined.show()

# Function 10: agg() -- aggregate over the whole DataFrame (no grouping).
print("Salary average:")
dataframe.agg(SparkFuncs.avg(SparkFuncs.col("Salary"))).show()

**Task 2:** Use the `filter`, `select`, and `groupBy` operations to extract information from a dataset, and perform data aggregation to gain insight into the dataset using functions such as `mean`, `max`, and `sum`.

In [None]:
from pyspark.sql import functions as SparkFuncs
from pyspark.sql import SparkSession

# Obtain the shared Spark session, creating one named "Test" if none is active.
spark = SparkSession.builder.appName("Test").getOrCreate()

# Sample employee records: (name, department, salary).
columns = ["EmployeeName", "Department", "Salary"]
data = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
]
dataframe = spark.createDataFrame(data, schema=columns)

# select(): keep only the two columns of interest.
print("Only show the EmployeeName and Salary:")
dataframe.select(["EmployeeName", "Salary"]).show()

# filter(): keep rows whose salary is strictly above 3000.
print("Filtered salaries larger than 3000:")
dataframe.where(SparkFuncs.col("Salary") > 3000).show()

# groupBy() + agg(): several salary statistics per department in one pass.
print("Average, minimum, maximum, and sum of salaries in each departments:")
salary = SparkFuncs.col("Salary")
department_stats = dataframe.groupBy("Department").agg(
    SparkFuncs.avg(salary).alias("Average"),
    SparkFuncs.min(salary).alias("Minimum"),
    SparkFuncs.max(salary).alias("Maximum"),
    SparkFuncs.sum(salary).alias("Sum"),
)
department_stats.show()

**Task 3:** Explore how to process complex data types in Spark DataFrames.

In [None]:
from pyspark.sql import functions as SparkFuncs
from pyspark.sql import SparkSession

# Obtain the shared Spark session, creating one named "Test" if none is active.
spark = SparkSession.builder.appName("Test").getOrCreate()

# Sample employee records: (name, department, salary).
data = [("James", "Sales", 3000),
        ("Michael", "Sales", 4600),
        ("Robert", "Sales", 4100),
        ("Maria", "Finance", 3000)]
columns = ["EmployeeName", "Department", "Salary"]

dataframe = spark.createDataFrame(data, schema=columns)

# Derived column: a bonus equal to 10% of the salary.
print("Add SalaryBonus column to dataframe:")
dataframe = dataframe.withColumn("SalaryBonus", dataframe["Salary"] * 0.1)
dataframe.show()

# Derived column built from another derived column: salary plus bonus.
print("Add TotalCompensation column to dataframe:")
dataframe = dataframe.withColumn("TotalCompensation", dataframe["Salary"] + dataframe["SalaryBonus"])
dataframe.show()

# Array column: every element of a Spark array must share one type, but the
# three inputs are a string, an integer and a floating-point column.  Cast
# each one to string explicitly instead of relying on implicit coercion,
# which (under ANSI SQL mode) may pick a numeric element type and then fail
# to convert the employee names.
print("Array usage example:")
array_elements = [
    SparkFuncs.col(name).cast("string")
    for name in ("EmployeeName", "Salary", "SalaryBonus")
]
dataframe.select(
    # Alias keeps the header that the un-cast expression would have produced.
    SparkFuncs.array(*array_elements).alias("array(EmployeeName, Salary, SalaryBonus)")
).show(truncate=False)

**Task 4:** Implement a window function to calculate running totals or rankings.

In [None]:
from pyspark.sql.window import Window
from pyspark.sql import functions as SparkFuncs
from pyspark.sql import SparkSession

# Obtain the shared Spark session, creating one named "Test" if none is active.
spark = SparkSession.builder.appName("Test").getOrCreate()

# Sample employee records: (name, department, salary).
columns = ["EmployeeName", "Department", "Salary"]
data = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
]
dataframe = spark.createDataFrame(data, schema=columns)

# One window per department, rows ordered by ascending salary, so rank 1 is
# the lowest salary within each department.
window_spec = Window.partitionBy(SparkFuncs.col("Department")).orderBy(
    SparkFuncs.col("Salary").asc()
)
salary_rank = SparkFuncs.rank().over(window_spec).alias("Rank")
dataframe.select("*", salary_rank).show()

**Task 5:**
- Download a large dataset from [Kaggle](https://www.kaggle.com/) or another source.
- Input the downloaded CSV data, then load and save the data into PySpark.
- After the data has been successfully loaded using PySpark, manipulate the data to obtain the required information.

[Dataset URL](https://www.kaggle.com/datasets/jahaidulislam/car-specification-dataset-1945-2020)

In [None]:
from pyspark.sql import functions as SparkFuncs
from pyspark.sql import SparkSession

# Obtain the shared Spark session, creating one named "Test" if none is active.
spark = SparkSession.builder.appName("Test").getOrCreate()

# Read the CSV with a header row and let Spark infer the column types.
dataframe = spark.read.csv("/content/Car Dataset 1945-2020.csv", header=True, inferSchema=True)
dataframe.printSchema()

# Source column -> display name.  Labels carry the unit of the source column:
# capacity_cm3 is in cubic centimetres (not litres) and maximum_torque_n_m is
# in newton-metres.
# NOTE(review): "Modle" is presumably the dataset's own (misspelled) header --
# confirm against the printSchema() output above.
selected_columns = {
    "Make": "Make",
    "Modle": "Model",
    "Trim": "Trim",
    "Year_from": "Year From",
    "Year_to": "Year To",
    "maximum_torque_n_m": "Max Torque (Nm)",
    "number_of_cylinders": "Cylinders",
    "capacity_cm3": "Displacement (cm3)",
    "engine_hp": "HP",
    "max_speed_km_per_h": "Top Speed (km/h)",
}

# Keep only the columns of interest, then drop every row that has a null in
# any of them so the analyses below work on complete records only.
dataframe = dataframe.select(
    [SparkFuncs.col(source).alias(label) for source, label in selected_columns.items()]
).dropna()

# Count the rows per maker and list the ten makers with the most rows.
print("Top 10 Car Makers with the Most Models Produced:")
models_per_make = dataframe.groupBy("Make").agg(
    SparkFuncs.count("*").alias("Models Produced")
)
models_per_make.sort(SparkFuncs.col("Models Produced").desc()).show(10)

# Show up to ten cars that have at least 10 cylinders and whose production
# started in 2000 or later.  The comparison is inclusive (>=) so that
# 10-cylinder cars match the "10 cylinders or more" wording, and the heading
# now also states the year restriction that the filter applies.
print("Top 10 cars with 10 cylinders or more (produced from 2000 onwards):")
has_ten_or_more_cylinders = SparkFuncs.col("Cylinders") >= 10
started_in_2000_or_later = SparkFuncs.col("Year From") >= 2000
dataframe.filter(has_ten_or_more_cylinders & started_in_2000_or_later).show(10)

# Production span = last production year minus first production year.
# The span is fully determined by (Make, Model, Year From, Year To), so
# de-duplicating the rows is all that is needed; no aggregation is required.
# Columns are referenced by name via col() so the expression is resolved
# against the DataFrame it is applied to.
print("Top 10 cars with the longest production span:")
production_span = SparkFuncs.col("Year To") - SparkFuncs.col("Year From")
dataframe.select("Make", "Model", "Year From", "Year To") \
  .withColumn("Production Span (Years)", production_span) \
  .distinct() \
  .orderBy("Production Span (Years)", ascending=False) \
  .show(10)