<a href="https://colab.research.google.com/github/chetanpatil4160/Pyspark/blob/main/Spark_Coding_Question_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("interview").getOrCreate()

In [3]:
df = spark.read \
.format("csv") \
.option("header","true") \
.option("inferSchema","true") \
.load("/content/Employee_sample_data1.csv")

In [4]:
df.show()

+-----------+----------------+--------------------+-----------+--------------------+------+---------+---+----------+--------+-------+-------------+---------+---------+
|Employee_id|       full_name|                 job| department|       Business_unit|Gender|Ethnicity|Age| Hire Date|  Salary|Bonus %|      Country|     City|Exit Date|
+-----------+----------------+--------------------+-----------+--------------------+------+---------+---+----------+--------+-------+-------------+---------+---------+
|     E02002|          Kai Le|   Controls Engineer|Engineering|       Manufacturing|  Male|    Asian| 47|02-05-2022| 92368.0|     0%|United States| Columbus|     NULL|
|     E02003|    Robert Patel|             Analyst|      Sales|           Corporate|  Male|    Asian| 58|10/23/2013| 45703.0|     0%|United States|  Chicago|     NULL|
|     E02004|      Cameron Lo|Network Administr...|         IT|Research & Develo...|  Male|    Asian| 34| 3/24/2019| 83576.0|     0%|        China| Shanghai|   

**find higest salary by each department**

In [13]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, dense_rank

# Define the window specification
window_spec = Window.partitionBy("department").orderBy(col("salary").desc())

# Select the top-paid employee in each department
top_paid_employees = (
    df.withColumn("dense_rank", dense_rank().over(window_spec))
      .filter(col("dense_rank") == 1)
      .select("Employee_id", "full_name", "job", "department")
)

# Show the results
top_paid_employees.show()

+-----------+---------------+--------------+---------------+
|Employee_id|      full_name|           job|     department|
+-----------+---------------+--------------+---------------+
|     E02861|  Anna Guerrero|          NULL|           NULL|
|     E02756|  Roman Mendoza|Vice President|     Accounting|
|     E02557|  Robert Rogers|Vice President|    Engineering|
|     E02840|       Mia Vang|          NULL|        Finance|
|     E02949|  Kinsley Huynh|Vice President|Human Resources|
|     E02593|   Samantha Zhu|Vice President|             IT|
|     E02318|     Carter Luu|Vice President|      Marketing|
|     E02256|Eloise Williams|Vice President|          Sales|
|     E02256|Eloise Williams|Vice President|          Sales|
+-----------+---------------+--------------+---------------+



**Salary Greater than Avg Salary of each departnment**

In [36]:
from pyspark.sql.functions import avg, round, col

# Calculate average salary per department
avg_salary = df.groupBy("department") \
               .agg(round(avg("salary"), 2).alias("avg_salary"))

# Define the columns to select
selected_columns = ["Employee_id", "full_name", "job", "department", "salary", "avg_salary"]

# Perform the join and filter employees earning above or equal to the department's average salary
result_df = (df.join(avg_salary, on="department", how="inner")
               .filter(col("salary") >= col("avg_salary"))
               .select(selected_columns))

# Show the first 10 records
result_df.show(10)



# 2nd Approach
from pyspark.sql.window import Window
from pyspark.sql.functions import avg, round, col

# Define the window specification partitioned by department
window_spec = Window.partitionBy("department")

# Define the columns to select
selected_columns = ["Employee_id", "full_name", "job", "department", "salary", "avg_salary"]

# Calculate the average salary per department and filter employees earning above or equal to the department's average salary
result_df = (df.withColumn("avg_salary", round(avg("salary").over(window_spec), 2))
               .select(selected_columns)
               .filter(col("salary") >= col("avg_salary")))

# Show the result
result_df.show(10)

+-----------+----------------+--------------------+-----------+--------+----------+
|Employee_id|       full_name|                 job| department|  salary|avg_salary|
+-----------+----------------+--------------------+-----------+--------+----------+
|     E02005| Harper Castillo|IT Systems Architect|         IT| 98062.0|  91708.55|
|     E02006|Harper Dominguez|            Director|Engineering|175391.0| 110982.18|
|     E02010|   Gianna Holmes|System Administra...|         IT| 97630.0|  91708.55|
|     E02014|       Jose Wong|            Director|         IT|150558.0|  91708.55|
|     E02016|     Jacob Moore|         Sr. Manager|  Marketing|131422.0| 126982.61|
|     E02018|      Bella Tran|      Vice President|Engineering|254486.0| 110982.18|
|     E02020|    Jordan Kumar|Service Desk Analyst|         IT| 95729.0|  91708.55|
|     E02025|      Parker Lai|      Vice President| Accounting|246400.0| 127548.87|
|     E02026| Charles Simmons|             Manager|      Sales|113525.0| 102

**For Given Department Find Out the Employee Who have been Working for 2 years**

In [53]:
from pyspark.sql.functions import col, to_date, coalesce, months_between, round, current_date

df = df.withColumn(
    "hire_date",
    coalesce(
        to_date(col("Hire Date"), "dd-MM-yyyy"),
        to_date(col("Hire Date"), "M/dd/yyyy")
    )
).withColumn(
    "no_of_years",
    round(months_between(current_date(), col("hire_date")) / 12, 1)
).filter(col("hire_date").isNotNull()).filter(col("no_of_years") >= 2)  # Apply filter condition

df.show()

+-----------+----------------+--------------------+-----------+--------------------+------+---------+---+----------+--------+-------+-------------+---------+---------+----------+-----------+
|Employee_id|       full_name|                 job| department|       Business_unit|Gender|Ethnicity|Age| Hire Date|  Salary|Bonus %|      Country|     City|Exit Date| hire_date|no_of_years|
+-----------+----------------+--------------------+-----------+--------------------+------+---------+---+----------+--------+-------+-------------+---------+---------+----------+-----------+
|     E02002|          Kai Le|   Controls Engineer|Engineering|       Manufacturing|  Male|    Asian| 47|02-05-2022| 92368.0|     0%|United States| Columbus|     NULL|2022-05-02|        2.8|
|     E02003|    Robert Patel|             Analyst|      Sales|           Corporate|  Male|    Asian| 58|10/23/2013| 45703.0|     0%|United States|  Chicago|     NULL|2013-10-23|       11.3|
|     E02004|      Cameron Lo|Network Adminis

In [39]:
df.select("Hire Date").distinct().show()

+----------+
| Hire Date|
+----------+
|01-11-2005|
|01-11-2015|
|05-04-2020|
|09-12-2022|
| 2/20/2010|
|09-11-2006|
|06-02-2017|
|11/28/2020|
|05-04-2021|
|01-12-2010|
| 2/21/2014|
|03-11-2011|
|04-07-2018|
| 4/28/2000|
|11/26/2015|
|10-02-2020|
|08-01-2020|
| 1/25/2007|
| 5/19/2008|
| 1/24/2014|
+----------+
only showing top 20 rows

