In [1]:
from pyspark.sql import SparkSession

# create a SparkSession object
spark = SparkSession.builder.appName("Restaurant").getOrCreate()

# read the CSV files and create corresponding dataframes
customers_df = spark.read.csv("customers.csv", header=True, inferSchema=True)
foods_df = spark.read.csv("foods.csv", header=True, inferSchema=True)
sales_df = spark.read.csv("sales.csv", header=True, inferSchema=True)
week2sales_df = spark.read.csv("week2sales.csv", header=True, inferSchema=True)

# print the schema of each dataframe
print("Schema of customers dataframe:")
customers_df.printSchema()

print("Schema of foods dataframe:")
foods_df.printSchema()

print("Schema of sales dataframe:")
sales_df.printSchema()

print("Schema of week2sales dataframe:")
week2sales_df.printSchema()


Schema of customers dataframe:
root
 |-- ID: integer (nullable = true)
 |-- First Name: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Occupation: string (nullable = true)

Schema of foods dataframe:
root
 |-- Food ID: integer (nullable = true)
 |-- Food Item: string (nullable = true)
 |-- Price: double (nullable = true)

Schema of sales dataframe:
root
 |-- Customer ID: integer (nullable = true)
 |-- Food ID: integer (nullable = true)

Schema of week2sales dataframe:
root
 |-- Customer ID: integer (nullable = true)
 |-- Food ID: integer (nullable = true)



In [2]:
# join the customers, saless, and foods dataframes
joined_df = customers_df.join(sales_df.join(foods_df))

# compute the total amount spent by each customer
total_spent_df = joined_df.groupBy("Customer ID").sum("Price")

# filter the customers who have spent more than $10
result_df = total_spent_df.filter(total_spent_df["sum(Price)"] > 10)

# count the number of customers who have spent more than $10
num_customers = result_df.count()

# print the result
print("Number of customers who have spent more than $10:", num_customers)


Number of customers who have spent more than $10: 221


In [3]:
# count the number of transactions made by each customer
num_transactions_df = sales_df.groupBy("Customer ID").count()

# filter the customers who have made more than 3 transactions
result_df = num_transactions_df.filter(num_transactions_df["count"] > 3)

# count the number of customers who have made more than 3 transactions
num_customers = result_df.count()

# print the result
print("Number of customers who had more than 3 transactions:", num_customers)


Number of customers who had more than 3 transactions: 1


In [4]:
# join the foods and week2sales dataframes
joined_df = week2sales_df.join(foods_df, "Food ID", "inner")

# count the number of times each food item appears in the week2sales dataframe
most_consumed_df = joined_df.groupBy("Food ID", "Food Item").count()

# sort the result in descending order of count
most_consumed_df = most_consumed_df.sort("count", ascending=False)

# get the name of the most consumed food item
most_consumed_food = most_consumed_df.select("Food Item").first()[0]

# print the result
print("The most consumed food item is:", most_consumed_food)


The most consumed food item is: Drink
