In [1]:
# https://www.linkedin.com/feed/update/urn:li:activity:7231719926451859456/
# SQL Interview Questions:
# 𝐃𝐢𝐟𝐟𝐢𝐜𝐮𝐥𝐭𝐲 - 𝐌𝐞𝐝𝐢𝐮𝐦

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType, DateType
from pyspark.sql import Window as W
import pyspark.sql.functions as F

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("SQL Interview Questions") \
    .getOrCreate()
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/06 13:37:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
#  Schema : Using Employees (EmployeeID, Name, RegionID) and Sales (SaleID, EmployeeID, Amount)
employees_data = [
    ("E101", "John Doe", "R001"),
    ("E102", "Jane Smith", "R002"),
    ("E103", "Mike Johnson", "R001"),
    ("E104", "Emily Davis", "R003"),
    ("E105", "Sarah Brown", "R001"),
    ("E106", "Michelle Ramirez", "R002"),
    ("E107", "Michael Thompson", "R003"),
    ("E108", "Jessica Taylor", "R004"),
    ("E109", "Daniel Anderson", "R002"),
    ("E110", "Laura Martinez", "R003"),
    ("E111", "Christopher Lee", "R001"),
    ("E112", "Anthony Harris", "R004"),
    ("E113", "Patricia Adams", "R005"),
    ("E114", "Jennifer Clark", "R005"),
    ("E115", "Robert Young", "R004")
]

# Create DataFrame for Employees
employees_df = spark.createDataFrame(employees_data, ["EmployeeID", "Name", "RegionID"])

# Sample data for Sales
sales_data = [
    ("S001", "E101", 5000),
    ("S002", "E102", 3000),
    ("S003", "E103", 4000),
    ("S004", "E104", 2000),
    ("S005", "E105", 3500),
    ("S006", "E106", 4500),
    ("S007", "E107", 3000),
    ("S008", "E108", 6000),
    ("S009", "E109", 7000),
    ("S010", "E110", 8000),
    ("S011", "E111", 2500),
    ("S012", "E112", 4000),
    ("S013", "E113", 1500),
    ("S014", "E114", 3000),
    ("S015", "E115", 5000)
]

# Create DataFrame for Sales
sales_df = spark.createDataFrame(sales_data, ["SaleID", "EmployeeID", "Amount"])
employees_df.createOrReplaceTempView("Employees")
sales_df.createOrReplaceTempView("Sales")

# Find the Top 3 Employees with the Highest Total Sales in Each Region

In [6]:
employees_df.show(100,False)
sales_df.show(100,False)

                                                                                

+----------+----------------+--------+
|EmployeeID|Name            |RegionID|
+----------+----------------+--------+
|E101      |John Doe        |R001    |
|E102      |Jane Smith      |R002    |
|E103      |Mike Johnson    |R001    |
|E104      |Emily Davis     |R003    |
|E105      |Sarah Brown     |R001    |
|E106      |Michelle Ramirez|R002    |
|E107      |Michael Thompson|R003    |
|E108      |Jessica Taylor  |R004    |
|E109      |Daniel Anderson |R002    |
|E110      |Laura Martinez  |R003    |
|E111      |Christopher Lee |R001    |
|E112      |Anthony Harris  |R004    |
|E113      |Patricia Adams  |R005    |
|E114      |Jennifer Clark  |R005    |
|E115      |Robert Young    |R004    |
+----------+----------------+--------+

+------+----------+------+
|SaleID|EmployeeID|Amount|
+------+----------+------+
|S001  |E101      |5000  |
|S002  |E102      |3000  |
|S003  |E103      |4000  |
|S004  |E104      |2000  |
|S005  |E105      |3500  |
|S006  |E106      |4500  |
|S007  |E107   

In [12]:
# Find the Top 3 Employees with the Highest Total Sales in Each Region
spark.sql("""
with sales as (
    select EmployeeID,sum(Amount) as Amount from sales group by 1
)
select * from (
    select *, row_number() over(partition by RegionID order by Amount desc) as rnk 
    from Employees e inner join Sales s 
    on e.EmployeeID = s.EmployeeID
    )x
where rnk <= 3
order by RegionID,rnk
""").show(100,False)

[Stage 59:>                                                         (0 + 1) / 1]

+----------+----------------+--------+----------+------+---+
|EmployeeID|Name            |RegionID|EmployeeID|Amount|rnk|
+----------+----------------+--------+----------+------+---+
|E101      |John Doe        |R001    |E101      |5000  |1  |
|E103      |Mike Johnson    |R001    |E103      |4000  |2  |
|E105      |Sarah Brown     |R001    |E105      |3500  |3  |
|E109      |Daniel Anderson |R002    |E109      |7000  |1  |
|E106      |Michelle Ramirez|R002    |E106      |4500  |2  |
|E102      |Jane Smith      |R002    |E102      |3000  |3  |
|E110      |Laura Martinez  |R003    |E110      |8000  |1  |
|E107      |Michael Thompson|R003    |E107      |3000  |2  |
|E104      |Emily Davis     |R003    |E104      |2000  |3  |
|E108      |Jessica Taylor  |R004    |E108      |6000  |1  |
|E115      |Robert Young    |R004    |E115      |5000  |2  |
|E112      |Anthony Harris  |R004    |E112      |4000  |3  |
|E114      |Jennifer Clark  |R005    |E114      |3000  |1  |
|E113      |Patricia Ada

                                                                                

In [22]:
#  Schema :  Sales (ProductID, SaleDate, Quantity).
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from datetime import date

# Define the schema for Sales
sales_schema = StructType([
    StructField("ProductID", StringType(), True),
    StructField("SaleDate", DateType(), True),
    StructField("Quantity", IntegerType(), True)
])

# Sample data for Sales (with increasing sales across three months)
sales_data = [
    ("P001", date(2024, 7, 1), 100),
    ("P001", date(2024, 8, 1), 150),
    ("P001", date(2024, 9, 1), 200),
    ("P002", date(2024, 7, 2), 80),
    ("P002", date(2024, 8, 2), 70),
    ("P002", date(2024, 9, 2), 180),
    ("P003", date(2024, 7, 3), 90),
    ("P003", date(2024, 8, 3), 140),
    ("P003", date(2024, 9, 3), 190),
    ("P004", date(2024, 7, 4), 70),
    ("P004", date(2024, 8, 4), 60),
    ("P004", date(2024, 9, 4), 130),
    ("P005", date(2024, 7, 5), 110),
    ("P005", date(2024, 8, 5), 140),
    ("P005", date(2024, 9, 5), 170),
    ("P006", date(2024, 7, 6), 60),
    ("P006", date(2024, 8, 6), 110),
    ("P006", date(2024, 9, 6), 160),
    ("P006", date(2024, 10, 1), 160),
    ("P006", date(2024, 10, 2), 160),
]

# Create DataFrame for Sales
sales_df = spark.createDataFrame(sales_data, sales_schema)
sales_df.createOrReplaceTempView("sales")
sales_df.show(100,False)
spark.sql("select ProductID,substr(SaleDate,0,7) as sale_month, sum(Quantity) from sales group by 1,2 order by 1,2").show(100,False)

+---------+----------+--------+
|ProductID|SaleDate  |Quantity|
+---------+----------+--------+
|P001     |2024-07-01|100     |
|P001     |2024-08-01|150     |
|P001     |2024-09-01|200     |
|P002     |2024-07-02|80      |
|P002     |2024-08-02|70      |
|P002     |2024-09-02|180     |
|P003     |2024-07-03|90      |
|P003     |2024-08-03|140     |
|P003     |2024-09-03|190     |
|P004     |2024-07-04|70      |
|P004     |2024-08-04|60      |
|P004     |2024-09-04|130     |
|P005     |2024-07-05|110     |
|P005     |2024-08-05|140     |
|P005     |2024-09-05|170     |
|P006     |2024-07-06|60      |
|P006     |2024-08-06|110     |
|P006     |2024-09-06|160     |
|P006     |2024-10-01|160     |
|P006     |2024-10-02|160     |
+---------+----------+--------+

+---------+----------+-------------+
|ProductID|sale_month|sum(Quantity)|
+---------+----------+-------------+
|P001     |2024-07   |100          |
|P001     |2024-08   |150          |
|P001     |2024-09   |200          |
|P002    

In [28]:
# Identify Products with Sales Increasing for Three Consecutive Months

spark.sql("""
with sales as (
select ProductID,substr(SaleDate,0,7) as sale_month, sum(Quantity) as Quantity from sales group by 1,2 order by 1,2
) 
select distinct ProductID from(
    select *, 
    lead(Quantity, 1) over(partition by ProductID order by sale_month) as month2,
    lead(Quantity, 2) over(partition by ProductID order by sale_month) as month3
    from sales
)x
where month3 > month2 and month2 > Quantity
""").show(100,False)

+---------+
|ProductID|
+---------+
|P001     |
|P003     |
|P005     |
|P006     |
+---------+



In [55]:
# Schema : Orders (OrderID, CustomerID, OrderDate)

from pyspark.sql.types import StructType, StructField, StringType, DateType
from datetime import date

# Define the schema for Orders
orders_schema = StructType([
    StructField("OrderID", StringType(), True),
    StructField("CustomerID", StringType(), True),
    StructField("OrderDate", DateType(), True)
])

# Sample data for Orders (with orders placed in consecutive months)
orders_data = [
    ("O001", "C001", date(2024, 7, 10)),
    ("O002", "C001", date(2024, 8, 12)),
    ("O003", "C001", date(2024, 9, 15)),
    ("O004", "C002", date(2024, 7, 20)),
    ("O005", "C002", date(2024, 8, 25)),
    ("O006", "C003", date(2024, 6, 5)),
    ("O007", "C003", date(2024, 7, 7)),
    ("O008", "C003", date(2024, 8, 9)),
    ("O009", "C004", date(2024, 5, 13)),
    ("O010", "C004", date(2024, 6, 14)),
    ("O011", "C004", date(2024, 7, 16)),
    ("O012", "C005", date(2024, 7, 22)),
    ("O013", "C005", date(2024, 9, 23)),
    ("O014", "C006", date(2024, 8, 3)),
    ("O015", "C006", date(2024, 9, 4)),
    ("O016", "C007", date(2024, 7, 30)),
    ("O017", "C007", date(2024, 8, 31)),
    ("O018", "C007", date(2024, 9, 1)),
]

# Create DataFrame for Orders
orders_df = spark.createDataFrame(orders_data, orders_schema)
orders_df.createOrReplaceTempView("Orders")

orders_df.show(100,False)

+-------+----------+----------+
|OrderID|CustomerID|OrderDate |
+-------+----------+----------+
|O001   |C001      |2024-07-10|
|O002   |C001      |2024-08-12|
|O003   |C001      |2024-09-15|
|O004   |C002      |2024-07-20|
|O005   |C002      |2024-08-25|
|O006   |C003      |2024-06-05|
|O007   |C003      |2024-07-07|
|O008   |C003      |2024-08-09|
|O009   |C004      |2024-05-13|
|O010   |C004      |2024-06-14|
|O011   |C004      |2024-07-16|
|O012   |C005      |2024-07-22|
|O013   |C005      |2024-09-23|
|O014   |C006      |2024-08-03|
|O015   |C006      |2024-09-04|
|O016   |C007      |2024-07-30|
|O017   |C007      |2024-08-31|
|O018   |C007      |2024-09-01|
+-------+----------+----------+



In [56]:
# List Customers Who Placed Orders in Consecutive Months
spark.sql("""
    select *, 
    lead(OrderDate , 1) over(partition by CustomerID order by OrderDate) as month2,
    lead(OrderDate , 2) over(partition by CustomerID order by OrderDate) as month3
    from Orders
""").show(100,False)

+-------+----------+----------+----------+----------+
|OrderID|CustomerID|OrderDate |month2    |month3    |
+-------+----------+----------+----------+----------+
|O001   |C001      |2024-07-10|2024-08-12|2024-09-15|
|O002   |C001      |2024-08-12|2024-09-15|NULL      |
|O003   |C001      |2024-09-15|NULL      |NULL      |
|O004   |C002      |2024-07-20|2024-08-25|NULL      |
|O005   |C002      |2024-08-25|NULL      |NULL      |
|O006   |C003      |2024-06-05|2024-07-07|2024-08-09|
|O007   |C003      |2024-07-07|2024-08-09|NULL      |
|O008   |C003      |2024-08-09|NULL      |NULL      |
|O009   |C004      |2024-05-13|2024-06-14|2024-07-16|
|O010   |C004      |2024-06-14|2024-07-16|NULL      |
|O011   |C004      |2024-07-16|NULL      |NULL      |
|O012   |C005      |2024-07-22|2024-09-23|NULL      |
|O013   |C005      |2024-09-23|NULL      |NULL      |
|O014   |C006      |2024-08-03|2024-09-04|NULL      |
|O015   |C006      |2024-09-04|NULL      |NULL      |
|O016   |C007      |2024-07-

In [57]:
# List Customers Who Placed Orders in Consecutive Months
spark.sql("""
select CustomerID from(
    select *, 
    lead(OrderDate , 1) over(partition by CustomerID order by OrderDate) as month2,
    lead(OrderDate , 2) over(partition by CustomerID order by OrderDate) as month3
    from Orders
)x
where date_diff(substr(month2,0,7),substr(OrderDate,0,7)) <= 31 and date_diff(substr(month3,0,7),substr(month2,0,7)) <= 31 
""").show(100,False)

+----------+
|CustomerID|
+----------+
|C001      |
|C003      |
|C004      |
|C007      |
+----------+



                                                                                

In [66]:
#  Schema: Employees (EmployeeID, Name, DepartmentID, Salary)
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

# Define the schema for Employees
employees_schema = StructType([
    StructField("EmployeeID", StringType(), True),
    StructField("Name", StringType(), True),
    StructField("DepartmentID", StringType(), True),
    StructField("Salary", FloatType(), True)
])

# Sample data for Employees (with varying salaries within each department)
employees_data = [
    ("E001", "John Doe", "D001", 75000.0),
    ("E002", "Jane Smith", "D001", 85000.0),
    ("E003", "Mike Johnson", "D001", 95000.0),
    ("E004", "Emily Davis", "D001", 65000.0),
    ("E005", "Sarah Brown", "D002", 72000.0),
    ("E006", "Michelle Ramirez", "D002", 82000.0),
    ("E007", "Michael Thompson", "D002", 92000.0),
    ("E008", "Jessica Taylor", "D002", 62000.0),
    ("E009", "Daniel Anderson", "D003", 87000.0),
    ("E010", "Laura Martinez", "D003", 97000.0),
    ("E011", "Christopher Lee", "D003", 87000.0),
    ("E012", "David Wilson", "D003", 67000.0),
    ("E013", "Kevin Brown", "D004", 80000.0),
    ("E014", "Nancy Green", "D004", 70000.0),
    ("E015", "Paul White", "D004", 90000.0),
    ("E016", "Laura Red", "D004", 60000.0)
]

# Create DataFrame for Employees
employees_df = spark.createDataFrame(employees_data, employees_schema)
employees_df.createOrReplaceTempView("Employees")
employees_df.show(100,False)


+----------+----------------+------------+-------+
|EmployeeID|Name            |DepartmentID|Salary |
+----------+----------------+------------+-------+
|E001      |John Doe        |D001        |75000.0|
|E002      |Jane Smith      |D001        |85000.0|
|E003      |Mike Johnson    |D001        |95000.0|
|E004      |Emily Davis     |D001        |65000.0|
|E005      |Sarah Brown     |D002        |72000.0|
|E006      |Michelle Ramirez|D002        |82000.0|
|E007      |Michael Thompson|D002        |92000.0|
|E008      |Jessica Taylor  |D002        |62000.0|
|E009      |Daniel Anderson |D003        |87000.0|
|E010      |Laura Martinez  |D003        |97000.0|
|E011      |Christopher Lee |D003        |87000.0|
|E012      |David Wilson    |D003        |67000.0|
|E013      |Kevin Brown     |D004        |80000.0|
|E014      |Nancy Green     |D004        |70000.0|
|E015      |Paul White      |D004        |90000.0|
|E016      |Laura Red       |D004        |60000.0|
+----------+----------------+--

In [73]:
# Find the Second Highest Salary in Each Department
spark.sql("""
with Employees as (
select *, rank() over(partition by DepartmentID order by Salary desc) rnk from Employees
) select * from Employees where rnk = 2 order by DepartmentID,Salary desc
""").show(100,False)

+----------+----------------+------------+-------+---+
|EmployeeID|Name            |DepartmentID|Salary |rnk|
+----------+----------------+------------+-------+---+
|E002      |Jane Smith      |D001        |85000.0|2  |
|E006      |Michelle Ramirez|D002        |82000.0|2  |
|E009      |Daniel Anderson |D003        |87000.0|2  |
|E011      |Christopher Lee |D003        |87000.0|2  |
|E013      |Kevin Brown     |D004        |80000.0|2  |
+----------+----------------+------------+-------+---+



In [75]:
#  Schema : Given Sales (SaleID, ProductID, SaleDate, Amount)
from pyspark.sql.types import StructType, StructField, StringType, DateType, FloatType
from datetime import date

# Define the schema for Sales
sales_schema = StructType([
    StructField("SaleID", StringType(), True),
    StructField("ProductID", StringType(), True),
    StructField("SaleDate", DateType(), True),
    StructField("Amount", FloatType(), True)
])

# Sample data for Sales (with sales over a span of months)
sales_data = [
    ("S001", "P001", date(2024, 1, 15), 200.0),
    ("S002", "P001", date(2024, 2, 10), 250.0),
    ("S003", "P001", date(2024, 3, 12), 300.0),
    ("S004", "P001", date(2024, 4, 15), 220.0),
    ("S005", "P002", date(2024, 1, 20), 150.0),
    ("S006", "P002", date(2024, 2, 25), 180.0),
    ("S007", "P002", date(2024, 3, 30), 210.0),
    ("S008", "P002", date(2024, 4, 10), 160.0),
    ("S009", "P003", date(2024, 1, 5), 300.0),
    ("S010", "P003", date(2024, 2, 8), 350.0),
    ("S011", "P003", date(2024, 3, 15), 400.0),
    ("S012", "P003", date(2024, 4, 12), 320.0),
    ("S013", "P004", date(2024, 1, 18), 100.0),
    ("S014", "P004", date(2024, 2, 22), 120.0),
    ("S015", "P004", date(2024, 3, 25), 130.0),
    ("S016", "P004", date(2024, 4, 20), 110.0),
]

# Create DataFrame for Sales
sales_df = spark.createDataFrame(sales_data, sales_schema)
sales_df.createOrReplaceTempView("Sales")
sales_df.sort("ProductID","SaleDate").show(100,False)

+------+---------+----------+------+
|SaleID|ProductID|SaleDate  |Amount|
+------+---------+----------+------+
|S001  |P001     |2024-01-15|200.0 |
|S002  |P001     |2024-02-10|250.0 |
|S003  |P001     |2024-03-12|300.0 |
|S004  |P001     |2024-04-15|220.0 |
|S005  |P002     |2024-01-20|150.0 |
|S006  |P002     |2024-02-25|180.0 |
|S007  |P002     |2024-03-30|210.0 |
|S008  |P002     |2024-04-10|160.0 |
|S009  |P003     |2024-01-05|300.0 |
|S010  |P003     |2024-02-08|350.0 |
|S011  |P003     |2024-03-15|400.0 |
|S012  |P003     |2024-04-12|320.0 |
|S013  |P004     |2024-01-18|100.0 |
|S014  |P004     |2024-02-22|120.0 |
|S015  |P004     |2024-03-25|130.0 |
|S016  |P004     |2024-04-20|110.0 |
+------+---------+----------+------+



In [82]:
# Calculate the Moving Average of Sales for Each Product Over the Last 3 Months
spark.sql("""
select *, 
avg(Amount) over(partition by ProductID order by SaleDate ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) as MovingAverage 
from Sales
""").show(100,False)

+------+---------+----------+------+------------------+
|SaleID|ProductID|SaleDate  |Amount|MovingAverage     |
+------+---------+----------+------+------------------+
|S001  |P001     |2024-01-15|200.0 |200.0             |
|S002  |P001     |2024-02-10|250.0 |225.0             |
|S003  |P001     |2024-03-12|300.0 |250.0             |
|S004  |P001     |2024-04-15|220.0 |256.6666666666667 |
|S005  |P002     |2024-01-20|150.0 |150.0             |
|S006  |P002     |2024-02-25|180.0 |165.0             |
|S007  |P002     |2024-03-30|210.0 |180.0             |
|S008  |P002     |2024-04-10|160.0 |183.33333333333334|
|S009  |P003     |2024-01-05|300.0 |300.0             |
|S010  |P003     |2024-02-08|350.0 |325.0             |
|S011  |P003     |2024-03-15|400.0 |350.0             |
|S012  |P003     |2024-04-12|320.0 |356.6666666666667 |
|S013  |P004     |2024-01-18|100.0 |100.0             |
|S014  |P004     |2024-02-22|120.0 |110.0             |
|S015  |P004     |2024-03-25|130.0 |116.66666666

In [None]:

# You have a table named 'employee_performance' with columns (employee_id, review_date, performance_score). Write a query to calculate the average performance score for each employee over their last three reviews.


# Given a table 'website_traffic' with columns (visit_date, page_views, unique_visitors), write a query to calculate the percentage change in unique visitors from the previous day to the current day.


# You have two tables: 'orders' with columns (order_id, order_date, customer_id, order_total) and 'customers' with columns (customer_id, customer_name, signup_date). Write a query to find the total order amount for each customer who has made purchases in the last 6 months, along with their signup date.


# Using a table 'inventory' with columns (product_id, quantity_in_stock, reorder_level), write a query to identify products that need to be reordered, along with the percentage of stock remaining based on the reorder level.


# Given a table 'user_activity' with columns (user_id, activity_date, activity_type), write a query to determine the most common activity type for each user over the last month, along with the count of that activity.


In [None]:

# 𝐆𝐨𝐨𝐠𝐥𝐞'𝐬 𝐒𝐐𝐋 𝐢𝐧𝐭𝐞𝐫𝐯𝐢𝐞𝐰 𝐐𝐮𝐞𝐬𝐭𝐢𝐨𝐧 !


# For each unique user in the dataset, find the latest date when their flags got reviewed. Then, find total number of distinct videos that were removed on that date (by any user).

# Output the the first and last name of the user (in two columns), the date and the number of removed videos. Only include these users who had at least one of their flags reviewed by Youtube. If no videos got removed on a certain date, output 0.

# Tables: 

# user_flags

# user_firstname : varchar
# user_lastname : varchar
# video_id : varchar 
# flag_id : varchar

# flag_review

# flag_id : varchar
# reviewed_by_yt : bool
# reviewed_date : datetime
# reviewed_outcome : varchar

In [None]:
# the question is :

# Suppose we have a cluster of 10 Nodes,16 cores per Node and 
# 64GB RAM per Node. How are you going to decide Distribution of Executors, Cores and Memory for a Spark Application execution :

# Let’s assign 5 core per executors => --executor-cores = 5 (for good HDFS throughput)

# Leave 1 core per node for Hadoop/Yarn daemons => Num cores available per node = 16-1 = 15
# So, Total available of cores in cluster = 15 x 10 = 150

# Number of available executors = (total cores/num-cores-per-executor) = 150/5 = 30

# Leaving 1 executor for ApplicationManager => --num-executors = 29
# Number of executors per node = 30/10 = 3

# Memory per executor = 64GB/3 = 21GB
# Counting off heap overhead = 7% of 21GB = 3GB. 

# So, actual --executor-memory = 21 - 3 = 18GB

# So, recommended config is: 29 executors, 18GB memory each and 5 cores each!!

# *You can keep the number of cores per executor in the range of 2-5*

In [None]:
#Deloitte
# 𝐒𝐐𝐋-𝐫𝐞𝐥𝐚𝐭𝐞𝐝 𝐐𝐮𝐞𝐬𝐭𝐢𝐨𝐧𝐬:

# 1. Given a table named "financial_transaction" with columns like transaction_id, transaction_date, amount, and transaction_type (credit or debit), 
# write an SQL query to calculate the total balance for each day, considering both credit and debit transactions. Display the result with the columns "transaction_date" and "total_balance".

# 2. Given three tables: Employees, Departments, and Projects, write a SQL query to find the employees who have been involved in at least two projects from different departments. 
# The query should return the employee's name and the number of distinct departments they have been involved with.

# 𝐃𝐚𝐭𝐚 𝐖𝐚𝐫𝐞𝐡𝐨𝐮𝐬𝐢𝐧𝐠 𝐂𝐨𝐧𝐜𝐞𝐩𝐭𝐬:

# 3. Explain the difference between a Star Schema and a Snowflake Schema, including their respective structures and when one would be preferred over the other.

# 4. In a Star Schema, how would you handle slowly changing dimensions? Discuss the different types of slowly changing dimensions (Type 1, Type 2, Type 3) and techniques for managing them.

# 5. Why did you prefer the Snowflake Schema in your project? What were the key components you kept in mind while designing a Snowflake Schema?

# 𝐏𝐲𝐒𝐩𝐚𝐫𝐤-𝐫𝐞𝐥𝐚𝐭𝐞𝐝 𝐐𝐮𝐞𝐬𝐭𝐢𝐨𝐧𝐬:

# 6. Explain how the Spark architecture works internally. Why is Spark faster than MapReduce?

# 7. Write code to implement incremental load in PySpark.

# 8. What optimizations have you used in your previous project? What is salting?

# 9. Why should we not use wide transformations? Give some scenarios where you can eliminate the effects of using wide transformations.

# 10. You are given a PySpark DataFrame `transactions` representing e-commerce transactions with a schema containing columns like order_date, customer_id, product_id, quantity, and price. Write PySpark code to perform the following tasks:
#  - Calculate the total revenue generated from all orders.
#  - Find the top 10 customers by total revenue.
#  - For each product, calculate the average price and the total quantity sold.
#  - Identify the products that have been sold in all months of the year.

In [None]:
# Resume Crafting: I’ve updated my resume to show my projects, skills, and achievements, including my experience with data platforms and cloud pipelines.
# Skill Enhancement: I’m focusing on problem-solving with SQL, Python, and PySpark by practising mock interviews with my mentor.
# Job Searching: I’ve updated my profile on Naukri, LinkedIn, and Instahyre, and also reaching out to my connections for referrals to maximize my opportunities.

In [None]:
# We have two tables, 
# 🚀 table A 
# 1
# 1
# 1
# 0

# 🚀 Table B
# 1
# 1
# 0
# 1
# Null

# ⭕ Now tell me the count for Left Join, Right Join, Inner Join, Full Join?

In [None]:
# PwC:
# Generally there were 3 Rounds of Interviews --

# Round 1 --
# Very crucial and nervous

# Current Project Explanation.
# Any Major issues resolved in the current project.
# What all optimization techniques have you used in your project
# Hadoop Architecture
# Three SQL questions majorly on joins, subqueries, Group By, inline view,with Clause, timestamp.
# Coding Questions Pyspark or scala Spark.
# What are different constraints in sql and why do they different?

# Round 2 --
# Mostly focused on Scenario based questions. Those include--

# How do you increase mappers?
# What if Running Job fails in sqoop?
# How do you update with Latest data on Sqoop?
# What you do if data skewing happens?
# What kind of file formats do you use? and where?
# Why we need RDD?
# What is the use driver in spark?
# How do you tackle Memory exceptions errors in spark?
# What kind of join do you use if one partition has more data and others have less data?
# Why do we need to use containers in spark?
# What happens if we increase More partitions in spark?
# Write down command to increase no.of partitions?
# Write down UDF query?

# Round 3 --
# Important to get through or rejected

# Architecture of MapReduce?
# What is Outliers in MapReduce?
# What is Partition, shuffle & sort?
# What is block report in Hadoop?
# Partition By Vs Bucketing in Hive?
# Difference between Cassandra and HBase?
# What is Catalyst optimizer?
# Explain the ETL tools have you used?
# Explain basic cloud concepts and more questions on specific cloud we worked?
# DataFrame Vs Dataset?
# Explain Broadcast join?

In [None]:
# SQL:
# Explain the various types of Joins?
# Explain Equi join?
# Explain the Right Outer Join?
# How do you alter the name of a column?
# how can you build a stored procedure?
# Distinguish between MongoDB and MySQL?
# Compare the 'Having' and 'Where' clauses in detail?
# What are the differences between COALESCE() and ISNULL()?
# What is the difference between “Stored Procedure” and “Function”?
# What is the difference between the “DELETE” and “TRUNCATE” commands?
# What is difference between “Clustered Index” and “Non Clustered Index”?
# What is the difference between “Primary Key” and “Unique Key”?
# What is the difference between a “Local Temporary Table” and “Global Temporary Table”?
# What is the difference between primary key and unique constraints?
# What are the differences between DDL, DML and DCL in SQL?
# What is a view in SQL? How to create view?
# What is a Trigger?
# What is the difference between Trigger and Stored Procedure?
# What are indexes?
# What are Primary Keys and Foreign Keys?
# What are wildcards used in database for Pattern Matching?
# What is Union, minus and Interact commands?
# What is RDBMS?
# What is OLTP?
# What is Aggregate Functions?
# What is the difference between UNION and UNION ALL?
# What is a foreign key, and what is it used for?

# Scenario Based Questions--
#  SQL Query to find second highest salary of Employee?
#  SQL Query to find Max Salary from each department?
#  Write SQL Query to display current date?
#  Write an SQL Query to check whether date passed to Query is date of given format or not?
#  Write a SQL Query to print the name of distinct employee whose DOB is between 01/01/1960 to 31/12/1975?
#  Write an SQL Query to find employee whose Salary is equal or greater than 10000?
#  Write an SQL Query to find name of employee whose name Start with ‘M’?
#  Find the 3rd MAX salary in the emp table?
#  Suppose there is annual salary information provided by emp table. 
# How to fetch monthly salary of each and every employee?
# Display the list of employees who have joined the company before 30th June 90 or after 31st dec 90?

In [None]:
# #apachespark is the most discussed topic in all of my interviews. Hence here iam sharing you the interviewquestions have asked in my interview rounds.

# Those includes --
# What is spark? Explain Architecture
# Explain where did you use spark in your project?
# What all optimization techniques have you used in spark?
# Explain transformations and actions have you used?
# What happens when you use shuffle in spark?
# Difference between ReduceByKey Vs GroupByKey?
# Explain the issues you resolved when you working with spark?
# Compare Spark vs Hadoop MapReduce?
# Difference between Narrow & wide transformations?
# What is partition and how spark Partitions the data?
# What is RDD?
# what is broadcast variable?
# Difference between Sparkcontext Vs Sparksession?
# Explain about transformations and actions in the spark?
# what is Executor memory in spark?
# What is lineage graph?
# What is DAG?
# Explain libraries that Spark Ecosystem supports?
# What is a DStream?
# What is Catalyst optimizer and explain it?
# Why parquet file format is best for spark?
# Difference between dataframe Vs Dataset Vs RDD?
# Explain features of Apache Spark?
# Explain Lazy evaluation and why is it need?
# Explain Pair RDD?
# What is Spark Core?
# What is the difference between persist() and cache()?
# What are the various levels of persistence in Apache Spark?
# Does Apache Spark provide check pointing?
# How can you achieve high availability in Apache Spark?
# Explain Executor Memory in a Spark?
# What are the disadvantages of using Apache Spark?
# What is the default level of parallelism in apache spark?
# Compare map() and flatMap() in Spark?
# Difference between repartition Vs coalesce?
# Explain Spark Streaming?
# Explain accumulators?
# What is the use of broadcast join?

# What is PySpark Architecture?
# What's the difference between an RDD, a DataFrame & DataSet?
# How can you create a DataFrame a) using existing RDD, and b) from a CSV file?
# Explain the use of StructType and StructField classes in PySpark with examples?
# What are the different ways to handle row duplication in a PySpark DataFrame?
# Explain PySpark UDF with the help of an example?
# Discuss the map() transformation in PySpark DataFrame
# what do you mean by ‘joins’ in PySpark DataFrame? What are the different types of joins?
# What is PySpark ArrayType?
# What is PySpark Partition?
# What is meant by PySpark MapType? How can you create a MapType using StructType?
# How can PySpark DataFrame be converted to Pandas DataFrame?
# What is the function of PySpark's pivot() method?
# In PySpark, how do you generate broadcast variables?
# When to use Client and Cluster modes used for deployment?
# How can data transfers be kept to a minimum while using PySpark?
# What are Sparse Vectors? What distinguishes them from dense vectors?
# What API does PySpark utilize to implement graphs?
# What is meant by Piping in PySpark?
# What are the various levels of persistence that exist in PySpark?
# List some of the benefits of using PySpark?
# Why do we use PySpark SparkFiles?
# Does PySpark provide a machine learning API?
# What are the types of PySpark’s shared variables and why are they useful?
# What PySpark DAGScheduler?

In [None]:
# Common spark interview questions that might help for cracking interviews as well as for better learning!!

# Picking up another conceptual topic related to spark which is most frequently asked i.e. :

# Off heap vs on heap memory :

# Talking about off heap memory :

# Each executor within worker node has access to off heap memory and it can be used by spark explicitly for storing its data.
# Amount of off heap memory can be governed by spark.memory.offHeap.size and can be enabled by setting spark.memory.offHeap.use to true.

# Talking about on heap memory :

# On heap memory comprises of three sections:

# 1. Reserved memory : Reserved by spark for internal purposes and is equal to 300 mb per executor.

# 2. User memory : For storing the data structures created and managed by the user's code and comprises of 40% of on heap memory.

# 3. Unified memory : It comprises of 60% of on heap memory.


# Unified memory = Execution memory + storage memory 

# Execution memory : JVM heap space used by data structures during shuffle operations(join and aggregations)

# Storage memory : JVM heap space reserved for cached data 

# To summarise , enabling off heap memory avoids Garbage collection scan overhead and accessing off heap is slightly slower than accessing on heap storage but still faster than reading or writing from a disk because of serialization /deserialization overload .On heap memory is managed and controlled by Garbage collector where as off heap memory is manged by OS.

In [None]:
# Common spark interview questions that might help for cracking interviews as well as for better learning!!

# Picking up another theoretical topic related to spark which is most frequently asked i.e. :

# Difference between parquet , avro and ORC file formats supported by pyspark.

# Parquet:
# Parquet is a columnar storage format developed by the Apache Software Foundation for use in the Hadoop ecosystem.
# It is optimized for query performance and efficient storage of data.
# Parquet stores data in a columnar format, meaning that values from the same column are stored together, allowing for better compression ("snappy", "gzip", "lzo", "brotli", or "zstd") and faster query processing, especially when dealing with selective column projections.
# It supports nested data structures, making it suitable for handling complex data types like arrays and structs.
# Parquet files are splittable, meaning they can be read in parallel, which is crucial for distributed processing frameworks like Apache Spark.

# Exaple to save DataFrame to Parquet with Snappy compression:
# df.write.parquet("/path/to/save/location", compression="snappy")

# Avro:
# Avro is a row-based data serialization system developed within the Apache Hadoop project.
# It is designed to be compact, fast, and suitable for both serialization and data exchange between systems.
# Avro supports schema evolution, meaning that you can change the schema of your data without needing to modify the entire dataset.
# Avro supports rich data structures, including nested records and complex types like arrays and maps.
# It provides features like data compression ( "snappy", "gzip", "deflate", "bzip2", or "zstandard") and schema resolution, making it efficient for use cases like data serialization and data exchange between different systems.
# Avro is self-describing, meaning that the schema is stored along with the data, making it easier to read and process without prior knowledge of the schema.

# Example to save DataFrame to Avro with Snappy compression:
# df.write.format("avro").option("compression", "snappy").save("/path/to/save/location")



# ORC (Optimized Row Columnar):
# ORC is a columnar storage format developed for use in the Apache Hive data warehouse system.
# It is optimized for both read and write performance, especially in the context of analytics and data warehousing workloads.
# ORC provides features like predicate pushdown and column projections, allowing for efficient query processing by reading only the necessary columns and rows.
# Similar to Parquet, ORC files are splittable and support compression ("snappy", "zlib", "lzo", "lz4", or "zstd"), making them suitable for use in distributed processing frameworks like Apache Spark.
# ORC also supports schema evolution, enabling you to evolve your data schema over time without requiring a full data rewrite.

# Example to save DataFrame to ORC with Snappy compression:
# df.write.orc("/path/to/save/location", compression="snappy")

In [None]:
# Common spark interview questions that might help for cracking interviews as well as for better learning!!

# Another rarely asked question is UDF in pyspark:

# PySpark UDF is a User Defined Function that is used to create a reusable function in Spark.

# CREATION OF UDF IN PYSPARK:
# i. Creating UDF ,first we write python function :

# def upperCase(str):
#  return str.upper()

# ii. Then Converting function to UDF using below code:

# upperCaseUDF = udf(lambda z: upperCase(z))

# iii. Then directly using it with dataframes as below:

# df.withColumn("Cureated Name", upperCaseUDF(col("Name"))) \
#  .show(truncate=False)

# CREATION OF UDF USING SPARK SQL:
# i. In order to use above defined upperCase() function on PySpark SQL, you need to register the function with PySpark by using spark.udf.register().

# spark.udf.register("upperCaseUDF", upperCase,StringType())
# df.createOrReplaceTempView("NAME_TABLE")
# spark.sql("select Seqno, upperCaseUDF(Name) as Name from NAME_TABLE").show(truncate=False)

# CREATION OF UDF USING ANNOTATIONS:
# i. Create it with just a single step by using annotations.

# @udf(returnType=StringType()) 
# def upperCase(str):
#  return str.upper()

# df.withColumn("Cureated Name", upperCase(col("Name"))) \
# .show(truncate=False)


In [None]:
# Common spark interview questions that might help for cracking interviews as well as for better learning!!

# Picking up a practical question related to spark i.e. :

# How to dynamically rename multiple columns in PySpark dataFrame ?

# Methods to dynamically rename multiple columns in PySpark data frame:
# i. Using loops :
# First obtain all the columns in the list using the columns function: 

# total_columns=data_frame.columns

# Further, run a loop to dynamically rename multiple columns in Pyspark data frame using prefix, suffix or doing any other changes:

# for i in range(len(total_columns)):
#  data_frame=data_frame.withColumnRenamed(total_columns[i], 'class_'+ total_columns[i])
#  hashtag#or
#  data_frame=data_frame.withColumnRenamed(total_columns[i], total_columns[i].replace('_','__'))

# ii. Using reduce() function :
# Dynamically rename multiple columns in PySpark data frame using prefix, suffix or doing any other changes using reduce() function:

# import functools
# data_frame = functools.reduce(lambda data_frame,
#  idx: data_frame.withColumnRenamed(list(data_frame.schema.names)[idx],
#  list(data_frame.schema.names)[idx] + '_suffix'),
#  range(len(list(data_frame.schema.names))), data_frame)

# Later on, dynamically rename multiple columns in PySpark data frame by replacing some characters using replace and reduce() function:

# data_frame = functools.reduce(lambda data_frame,
#  idx: data_frame.withColumnRenamed(list(data_frame.schema.names)[idx],
#  list(data_frame.schema.names)[idx].replace('hashtag#character','hashtag#other-character')),
#  range(len(list(data_frame.schema.names))), data_frame)

# iii. Using the alias() function:
# Dynamically rename multiple columns in PySpark data frame using prefix, suffix or doing any other changes using alias:

# updated_columns = [col(col_name).alias("prefix_" + col_name + "_suffix") for col_name in data_frame.columns]

#  v. Using toDF() function:
# Define the new column names which you want to give to all the columns:

# columns=['new_column_name_1','new_column_name_2','new_column_name_3']

# Finally, use the function toDF() and assign the names to the data frame and display it:

# data_frame.toDF(*columns).show()

In [None]:
# PySpark Data Engineer Interview experience at Big 4 - KPMG India Deloitte EY PwC (4 years of experience)

# 𝐈𝐧𝐭𝐫𝐨𝐝𝐮𝐜𝐭𝐢𝐨𝐧:
# 1. Can you provide an overview of your experience working with PySpark and big data processing?
# 2. What motivated you to specialize in PySpark, and how have you applied it in your previous roles?

# 𝐏𝐲𝐒𝐩𝐚𝐫𝐤 𝐁𝐚𝐬𝐢𝐜𝐬:
# 3. Explain the basic architecture of PySpark.
# 4. How does PySpark relate to Apache Spark, and what advantages does it offer in distributed data processing?

# 𝐃𝐚𝐭𝐚𝐅𝐫𝐚𝐦𝐞 𝐎𝐩𝐞𝐫𝐚𝐭𝐢𝐨𝐧𝐬:
# 5. Describe the difference between a DataFrame and an RDD in PySpark.
# 6. Can you explain transformations and actions in PySpark DataFrames?
# 7. Provide examples of PySpark DataFrame operations you frequently use.

# 𝐎𝐩𝐭𝐢𝐦𝐢𝐳𝐢𝐧𝐠 𝐏𝐲𝐒𝐩𝐚𝐫𝐤 𝐉𝐨𝐛𝐬:
# 8. How do you optimize the performance of PySpark jobs?
# 9. Can you discuss techniques for handling skewed data in PySpark?

# 𝐃𝐚𝐭𝐚 𝐒𝐞𝐫𝐢𝐚𝐥𝐢𝐳𝐚𝐭𝐢𝐨𝐧 𝐚𝐧𝐝 𝐂𝐨𝐦𝐩𝐫𝐞𝐬𝐬𝐢𝐨𝐧:
# 10. Explain how data serialization works in PySpark.
# 11. Discuss the significance of choosing the right compression codec for your PySpark applications.

# 𝐇𝐚𝐧𝐝𝐥𝐢𝐧𝐠 𝐌𝐢𝐬𝐬𝐢𝐧𝐠 𝐃𝐚𝐭𝐚:
# 12. How do you deal with missing or null values in PySpark DataFrames?
# 13. Are there any specific strategies or functions you prefer for handling missing data?

# 𝐖𝐨𝐫𝐤𝐢𝐧𝐠 𝐰𝐢𝐭𝐡 𝐏𝐲𝐒𝐩𝐚𝐫𝐤 𝐒𝐐𝐋:
# 14. Describe your experience with PySpark SQL.
# 15. How do you execute SQL queries on PySpark DataFrames?

# 𝐁𝐫𝐨𝐚𝐝𝐜𝐚𝐬𝐭𝐢𝐧𝐠 𝐢𝐧 𝐏𝐲𝐒𝐩𝐚𝐫𝐤:
# 16. What is broadcasting, and how is it useful in PySpark?
# 17. Provide an example scenario where broadcasting can significantly improve performance.

# 𝐏𝐲𝐒𝐩𝐚𝐫𝐤 𝐌𝐚𝐜𝐡𝐢𝐧𝐞 𝐋𝐞𝐚𝐫𝐧𝐢𝐧𝐠:
# 18. Discuss your experience with PySpark's MLlib.
# 19. Can you give examples of machine learning algorithms you've implemented using PySpark?

# 𝐉𝐨𝐛 𝐌𝐨𝐧𝐢𝐭𝐨𝐫𝐢𝐧𝐠 𝐚𝐧𝐝 𝐋𝐨𝐠𝐠𝐢𝐧𝐠:
# 20. How do you monitor and troubleshoot PySpark jobs?
# 21. Describe the importance of logging in PySpark applications.

# 𝐈𝐧𝐭𝐞𝐠𝐫𝐚𝐭𝐢𝐨𝐧 𝐰𝐢𝐭𝐡 𝐎𝐭𝐡𝐞𝐫 𝐓𝐞𝐜𝐡𝐧𝐨𝐥𝐨𝐠𝐢𝐞𝐬:
# 22. Have you integrated PySpark with other big data technologies or databases? If so, please provide examples.
# 23. How do you handle data transfer between PySpark and external systems?

# 𝐑𝐞𝐚𝐥-𝐰𝐨𝐫𝐥𝐝 𝐏𝐫𝐨𝐣𝐞𝐜𝐭 𝐒𝐜𝐞𝐧𝐚𝐫𝐢𝐨:
# 24. Explain the project that you worked on in your previous organizations.
# 25. Describe a challenging PySpark project you've worked on. What were the key challenges, and how did you overcome them?

# 𝐂𝐥𝐮𝐬𝐭𝐞𝐫 𝐌𝐚𝐧𝐚𝐠𝐞𝐦𝐞𝐧𝐭:
# 26. Explain your experience with cluster management in PySpark.
# 27. How do you scale PySpark applications in a cluster environment?

# 𝐏𝐲𝐒𝐩𝐚𝐫𝐤 𝐄𝐜𝐨𝐬𝐲𝐬𝐭𝐞𝐦:
# 28. Can you name and briefly describe some popular libraries or tools in the PySpark ecosystem, apart from the core PySpark functionality?

In [None]:
# 𝐑𝐞𝐚𝐥-𝐓𝐢𝐦𝐞 𝐒𝐩𝐚𝐫𝐤 𝐒𝐜𝐞𝐧𝐚𝐫𝐢𝐨 𝐈𝐧𝐭𝐞𝐫𝐯𝐢𝐞𝐰 𝐐𝐮𝐞𝐬𝐭𝐢𝐨𝐧:

# 𝐐𝐮𝐞𝐬𝐭𝐢𝐨𝐧:
# You have a clickstream table consisting of >10 billion rows, you wanted to write a sample query to fetch details of customers who clicked on a recently launched campaign. As the data is really huge its advised to run the query on the sample data first, how do you handle this situation and extract only a small percent of data to run your query.

# 𝐒𝐨𝐥𝐮𝐭𝐢𝐨𝐧:
# TABLESAMPLE x PERCENT 
# It is used to randomly select a sampling of data from a table. "x" specifies the percentage of data that will be included in the sample.

# It does not guarantee an exact percentage of rows to be selected as a sample. This is because the sample is selected randomly, so the actual percentage of selected rows may vary slightly.

# 𝐒𝐚𝐦𝐩𝐥𝐞. 𝐐𝐮𝐞𝐫𝐲:
# SELECT 
#  customer_id, click_time, product_id
# FROM clickstream_table TABLESAMPLE (1 PERCENT)
# WHERE campaign_id = 12345

In [None]:
# #Interview_Questions_for_Data_Engineer
# **Question 26: What are some best practices for tuning Spark applications for optimal performance?**

# Tuning Spark applications for optimal performance is essential to make the most of your cluster's resources. Here are some best practices for achieving this and an example code sample for each practice:
# 🔹Memory Management:
# Optimize memory allocation: Set the appropriate memory configurations, such as spark.driver.memory, spark.executor.memory, and spark.memory.fraction, to efficiently use available resources.

# from pyspark.sql import SparkSession

# spark = SparkSession.builder \
#  .appName("MemoryManagementExample") \
#  .config("spark.driver.memory", "2g") \
#  .config("spark.executor.memory", "4g") \
#  .getOrCreate()

# spark.stop()

# 🔹Data Serialization:
# Choose efficient serialization: Use a more efficient serialization format like Avro or Parquet, which can reduce the amount of data transfer and storage.

# spark.conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
# spark.conf.set("spark.kryo.registrator", "your.package.YourKryoRegistrator")

# 🔹Data Partitioning:
# Optimize data partitioning: Repartition your data using repartition or coalesce to balance data distribution and avoid data skew.

# df = df.repartition(100)

# 🔹Caching and Persistence:
# Cache and persist data: Use cache() or persist() to store frequently used data in memory for faster access.

# df = df.cache()

# 🔹Shuffling Optimization:
# Minimize shuffling: Avoid unnecessary shuffling by using operations like reduceByKey instead of groupByKey.

# rdd.reduceByKey(lambda a, b: a + b)

# 🔹Broadcasting:
# Broadcast small data: Broadcast small DataFrames to all nodes to avoid unnecessary data transfer.

# broadcast_small_df = spark.sparkContext.broadcast(small_df)

# 🔹Optimized Joins:
# Use optimized join strategies: Utilize broadcast join, bucketed join, or sort-merge join based on your data distribution.

# joined_df = large_df.join(broadcast(small_df), "id", "inner")

# 🔹Dynamic Resource Allocation:
# Enable dynamic allocation: Allow Spark to allocate and release resources dynamically based on the workload.
# spark.conf.set("spark.dynamicAllocation.enabled", "true")

In [None]:
# https://github.com/DataExpert-io/data-engineer-handbook

In [None]:
# Apache Spark has a distributed and modular internal architecture that allows it to efficiently process large volumes of data. Below, I'll provide an overview of Spark's internal architecture:

# 1. **Driver Program:**
#  - The Driver Program is the entry point of any Spark application. It runs the `main()` function and creates a SparkSession or SparkContext, which is used to coordinate the execution of Spark tasks.
#  - The Driver Program is responsible for dividing the application into tasks, scheduling these tasks on the cluster, and managing their execution.

# 2. **Cluster Manager:**
#  - Spark can run on various cluster managers, including Apache Hadoop YARN, Apache Mesos, and its standalone cluster manager.
#  - The cluster manager allocates resources (CPU and memory) to Spark applications and monitors their progress.

# 3. **Cluster Nodes (Executors):**
#  - Executors are worker nodes in the Spark cluster where the actual computation takes place.
#  - Each executor runs tasks in separate JVM processes. Executors communicate with the Driver Program and with each other for data shuffling and task coordination.

# 4. **Task:**
#  - A task is the smallest unit of work in Spark and represents a computation to be executed on a single partition of data.
#  - Tasks are spawned by the Driver Program and run on the Executors.

# 5. **Distributed Data Storage:**
#  - Spark can use distributed data storage systems like Hadoop Distributed File System (HDFS) and Apache HBase to store and access data.
#  - Data is divided into partitions and distributed across the cluster, enabling parallel processing.

# 6. **Resilient Distributed Dataset (RDD):**
#  - RDD is Spark's fundamental data structure. It represents distributed collections of data that can be processed in parallel.
#  - RDDs are immutable and fault-tolerant, meaning they can recover from node failures by recomputing lost data.

# 7. **Directed Acyclic Graph (DAG):**
#  - Spark uses a DAG to represent a logical execution plan of tasks and transformations.
#  - Each stage in the DAG corresponds to a set of transformations that can be executed in parallel.
#  - The DAG scheduler optimizes the execution plan and schedules tasks based on data dependencies.

# 8. **Transformations and Actions:**
#  - Transformations are operations applied to RDDs to create new RDDs (e.g., `map`, `filter`, `reduceByKey`).
#  - Actions are operations that trigger the execution of transformations and return results (e.g., `collect`, `count`, `saveAsTextFile`).

# 9. **Shuffling:**
#  - Shuffling is the process of redistributing data across partitions or nodes in the cluster.
#  - It's a costly operation and can impact performance, so minimizing shuffling is important for Spark optimization.

# 10. **Caching:**
#  - Spark allows you to cache (persist) RDDs or DataFrames in memory, improving performance for iterative algorithms and interactive data analysis.

# #apachespark #pyspark #dataengineering #dataengineer #bigdataengineer #bigdatadeveloper

In [None]:
# 10 trending questions asked in Apache Spark interviews

# 1. how are initial number of partitions calculated in a dataframe

# 2. what happens internally when you execute spark-submit

# 3. what is a partition skew and how to tackle it

# 4. what are the spark optimization techniques you have used

# 5. what is a broadcast join, how does it work internally

# 6. how do you optimize 2 large table joins

# 7. please explain about memory management in apache spark

# 8. what is caching in spark, and when do you consider caching a dataframe

# 9. how do you handle out of memory errors in spark

# 10. what is the difference between partitioning and bucketing, please explain with a usecase

# Do mention you answers in the comments!

# what other questions in apache spark are trending?

# P.S ~ I teach big data and my students are leading big data teams in top companies. My new batch is starting on coming Saturday.
# visit my website https://lnkd.in/gt_jpCyE to know more.

# #bigdata #dataengineering #apachespark 

In [None]:
# How to choose the number of Executor cores in Apache Spark?

# We struggle to decide on the Spark configurations and mostly rely on default ones. I agree that in 95 % of scenarios the default configuration is the most optimized one. But sometimes we also need to tune the configuration parameters for performance and so it should not be a black box for us. 

# Let's try to understand how to decide on the number of executor cores and get a framework for it. Before that keep the following things in mind:

# - Spark application requires daemon processes to run in the background. So set aside 1 core per node for that.
# - YARN Application Manager is responsible for negotiating resources with Resource manager. Set aside one executor in the cluster for that.
# - A lot of concurrent threads (More than 5) degrade the HDFS throughput.

# Cluster Configuration: 10 nodes, 16 cores, 64 GB

# There are two extreme ways to configure executors:

# 1. Tiny Executors: 1 core per executor, number of executor per node=16, executor-memory=64/16=4GB, total number of executors=10*16=160

# Problems: 1. Multithreading is not possible since only one core per executor. 2. When using shared variables like broadcast or accumulator it has to be replicated in all the 16 executors per node- A lot of shuffling


# 1. Fat Executors: 16 cores per executor, number of executor per node=1, executor-memory=64/1=64GB, total number of executors=1*10=10

# Problems: 1. HDFS throughput suffers because of a lot concurrent threading 2. Garbage Collection takes a lot time since memory per executor is very high

# Balanced Approach: Let's take 5 cores per executors. Why 5? Because beyond 5 HDFS throughput will suffer and we get good multithreading also.
# Leave 1 core per node for daemon processes, Number of cores available per node =16-1=15
# Total number of available cores=15*10=150
# Number of executors=150/5=30
# Leave 1 executor for AM, num-executors=30-1=29
# number of executors per node=30/10=3
# Memory per executor= 64/3 ~ 21GB
# Counting the 7% off heap memory actual Executor memory = 21*93% ~ 19GB

# Optimized Config: 29 Executors, 19GB memory, 5 cores each

# In practice one size does not fit all. You need to keep tuning as per cluster configuration. But in general the number of executor cores should be 2-5.

# I have created a document for the same. Do have a look.

# If my posts help you somehow, please follow Shuvajit Hazra and share the post.

# #Analytics #DataEngineering #ApacheSpark

In [None]:
# Spark Architecture Quick Overview

# Cluster 
# ======
# A Cluster is a group of JVMs (nodes) connected by the network, each of which runs Spark, either in Driver or Worker roles.

# Driver
# ======
# The Driver is one of the nodes in the Cluster.

# The driver does not run computations (filter, map, reduce, etc).
# It plays the role of a master node in the Spark cluster.

# When you call collect() on an RDD or Dataset, the whole data is sent to the Driver. This is why you should be careful when calling collect().

# Executor
# ========
# Executors are JVMs that run on Worker nodes. These are the JVMs that actually run Tasks on data Partitions.

# Application
# =========
# An application comprises of several jobs. A job is created, whenever you execute an action function like write().

# Job
# ====
# A job comprises of several stages. When Spark encounters a function that requires a shuffle it creates a new stage. 

# Transformation functions like reduceByKey(), Join() etc will trigger a shuffle and will result in a new stage. 

# Stage
# =====
# A stage comprises of several tasks and every task in the stage executes the same set of instructions.

# Task
# =====
# Task is the smallest execution unit in Spark. It is a single operation applied to single partition.
 
# Tasks are executed inside an executor. 

# A Spark application can have many jobs. 
# A job can have many stages. 
# A stage can have many tasks. 
# A task executes a series of instructions.
