## Create a table from your CSV file

In [0]:
# Load the employees CSV into a DataFrame. The first line of the file supplies
# the column names ("header") and Spark samples the data to pick column types
# ("inferSchema").
# NOTE(review): the data appears to use "-" as a missing-value marker, which is
# presumably why COMMISSION_PCT and MANAGER_ID are inferred as string rather
# than numeric -- confirm, and consider a "nullValue" reader option if so.
employees_csv_path = "dbfs:/Volumes/firstworkspace/default/files/employees.csv"
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(employees_csv_path)
)

# Register the DataFrame under a session-scoped name so later cells can reach
# it from SQL; an existing view with the same name is replaced.
df.createOrReplaceTempView("employees_view")

# Sanity check: pull every row back through the SQL view and render it.
all_employees = spark.sql("SELECT * FROM employees_view")
display(all_employees)

EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID
198,Donald,OConnell,DOCONNEL,650.507.9833,21-JUN-07,SH_CLERK,2600,-,124,50
199,Douglas,Grant,DGRANT,650.507.9844,13-JAN-08,SH_CLERK,2600,-,124,50
200,Jennifer,Whalen,JWHALEN,515.123.4444,17-SEP-03,AD_ASST,4400,-,101,10
201,Michael,Hartstein,MHARTSTE,515.123.5555,17-FEB-04,MK_MAN,13000,-,100,20
202,Pat,Fay,PFAY,603.123.6666,17-AUG-05,MK_REP,6000,-,201,20
203,Susan,Mavris,SMAVRIS,515.123.7777,07-JUN-02,HR_REP,6500,-,101,40
204,Hermann,Baer,HBAER,515.123.8888,07-JUN-02,PR_REP,10000,-,101,70
205,Shelley,Higgins,SHIGGINS,515.123.8080,07-JUN-02,AC_MGR,12008,-,101,110
206,William,Gietz,WGIETZ,515.123.8181,07-JUN-02,AC_ACCOUNT,8300,-,205,110
100,Steven,King,SKING,515.123.4567,17-JUN-03,AD_PRES,24000,-,-,90


In [0]:
# Display the schema to verify column types.
# Prints one line per column (name, type, nullability) for the DataFrame
# loaded from the CSV; with inferSchema enabled these types are Spark's
# guesses, so this is the place to spot columns that came through as string.
df.printSchema()

# Show sample data.
# NOTE(review): `display` is not defined in this file -- presumably the
# notebook environment's built-in table renderer; confirm before running
# this outside a notebook.
display(df)

root
 |-- EMPLOYEE_ID: integer (nullable = true)
 |-- FIRST_NAME: string (nullable = true)
 |-- LAST_NAME: string (nullable = true)
 |-- EMAIL: string (nullable = true)
 |-- PHONE_NUMBER: string (nullable = true)
 |-- HIRE_DATE: string (nullable = true)
 |-- JOB_ID: string (nullable = true)
 |-- SALARY: integer (nullable = true)
 |-- COMMISSION_PCT: string (nullable = true)
 |-- MANAGER_ID: string (nullable = true)
 |-- DEPARTMENT_ID: integer (nullable = true)



EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID
198,Donald,OConnell,DOCONNEL,650.507.9833,21-JUN-07,SH_CLERK,2600,-,124,50
199,Douglas,Grant,DGRANT,650.507.9844,13-JAN-08,SH_CLERK,2600,-,124,50
200,Jennifer,Whalen,JWHALEN,515.123.4444,17-SEP-03,AD_ASST,4400,-,101,10
201,Michael,Hartstein,MHARTSTE,515.123.5555,17-FEB-04,MK_MAN,13000,-,100,20
202,Pat,Fay,PFAY,603.123.6666,17-AUG-05,MK_REP,6000,-,201,20
203,Susan,Mavris,SMAVRIS,515.123.7777,07-JUN-02,HR_REP,6500,-,101,40
204,Hermann,Baer,HBAER,515.123.8888,07-JUN-02,PR_REP,10000,-,101,70
205,Shelley,Higgins,SHIGGINS,515.123.8080,07-JUN-02,AC_MGR,12008,-,101,110
206,William,Gietz,WGIETZ,515.123.8181,07-JUN-02,AC_ACCOUNT,8300,-,205,110
100,Steven,King,SKING,515.123.4567,17-JUN-03,AD_PRES,24000,-,-,90


In [0]:
# Quick profiling of employees_view. Each query is rendered as its own result
# table, in the order listed below.

# How many rows the view holds.
headcount_query = "SELECT COUNT(*) AS total_employees FROM employees_view"

# The distinct JOB_ID codes, alphabetically.
job_ids_query = "SELECT DISTINCT JOB_ID FROM employees_view ORDER BY JOB_ID"

# Lowest, highest, mean and summed SALARY across all rows.
salary_stats_query = """
  SELECT 
    MIN(SALARY) AS min_salary,
    MAX(SALARY) AS max_salary,
    AVG(SALARY) AS avg_salary,
    SUM(SALARY) AS total_payroll
  FROM employees_view
"""

for summary_query in (headcount_query, job_ids_query, salary_stats_query):
    display(spark.sql(summary_query))

total_employees
50


JOB_ID
AC_ACCOUNT
AC_MGR
AD_ASST
AD_PRES
AD_VP
FI_ACCOUNT
FI_MGR
HR_REP
IT_PROG
MK_MAN


min_salary,max_salary,avg_salary,total_payroll
2100,24000,6182.32,309116


In [0]:
# Define two derived temp views on top of employees_view.

# department_stats: one row per non-null DEPARTMENT_ID with its head count,
# average salary (rounded to 2 decimals) and salary total.
department_stats_ddl = """
CREATE OR REPLACE TEMP VIEW department_stats AS
SELECT 
  DEPARTMENT_ID,
  COUNT(*) AS employee_count,
  ROUND(AVG(SALARY), 2) AS avg_salary,
  SUM(SALARY) AS total_salary
FROM employees_view
WHERE DEPARTMENT_ID IS NOT NULL
GROUP BY DEPARTMENT_ID
ORDER BY DEPARTMENT_ID
"""

# employee_directory: contact-style listing with FIRST_NAME and LAST_NAME
# joined into a single full_name column.
employee_directory_ddl = """
CREATE OR REPLACE TEMP VIEW employee_directory AS
SELECT 
  EMPLOYEE_ID,
  CONCAT(FIRST_NAME, ' ', LAST_NAME) AS full_name,
  EMAIL,
  PHONE_NUMBER,
  JOB_ID,
  SALARY,
  DEPARTMENT_ID
FROM employees_view
ORDER BY LAST_NAME, FIRST_NAME
"""

spark.sql(department_stats_ddl)

# Left as a bare trailing expression so the cell echoes its result,
# exactly as before.
spark.sql(employee_directory_ddl)

DataFrame[]

In [0]:
# Exercise the two derived views. Results are rendered one table per query,
# in the order listed below.

# Every row of the per-department aggregates.
department_stats_query = "SELECT * FROM department_stats"

# Directory entries for department 50, best paid first.
department_50_query = """
  SELECT * FROM employee_directory 
  WHERE DEPARTMENT_ID = 50
  ORDER BY SALARY DESC
"""

# Directory entries with SALARY above 10000, best paid first.
high_earners_query = """
  SELECT * FROM employee_directory
  WHERE SALARY > 10000
  ORDER BY SALARY DESC
"""

for view_query in (department_stats_query, department_50_query, high_earners_query):
    display(spark.sql(view_query))

DEPARTMENT_ID,employee_count,avg_salary,total_salary
10,1,4400.0,4400
20,2,9500.0,19000
30,6,4150.0,24900
40,1,6500.0,6500
50,23,3721.74,85600
60,5,5760.0,28800
70,1,10000.0,10000
90,3,19333.33,58000
100,6,8601.33,51608
110,2,10154.0,20308


EMPLOYEE_ID,full_name,EMAIL,PHONE_NUMBER,JOB_ID,SALARY,DEPARTMENT_ID
121,Adam Fripp,AFRIPP,650.123.2234,ST_MAN,8200,50
120,Matthew Weiss,MWEISS,650.123.1234,ST_MAN,8000,50
122,Payam Kaufling,PKAUFLIN,650.123.3234,ST_MAN,7900,50
123,Shanta Vollman,SVOLLMAN,650.123.4234,ST_MAN,6500,50
124,Kevin Mourgos,KMOURGOS,650.123.5234,ST_MAN,5800,50
137,Renske Ladwig,RLADWIG,650.121.1234,ST_CLERK,3600,50
133,Jason Mallin,JMALLIN,650.127.1934,ST_CLERK,3300,50
129,Laura Bissot,LBISSOT,650.124.5234,ST_CLERK,3300,50
138,Stephen Stiles,SSTILES,650.121.2034,ST_CLERK,3200,50
125,Julia Nayer,JNAYER,650.124.1214,ST_CLERK,3200,50


EMPLOYEE_ID,full_name,EMAIL,PHONE_NUMBER,JOB_ID,SALARY,DEPARTMENT_ID
100,Steven King,SKING,515.123.4567,AD_PRES,24000,90
102,Lex De Haan,LDEHAAN,515.123.4569,AD_VP,17000,90
101,Neena Kochhar,NKOCHHAR,515.123.4568,AD_VP,17000,90
201,Michael Hartstein,MHARTSTE,515.123.5555,MK_MAN,13000,20
205,Shelley Higgins,SHIGGINS,515.123.8080,AC_MGR,12008,110
108,Nancy Greenberg,NGREENBE,515.124.4569,FI_MGR,12008,100
114,Den Raphaely,DRAPHEAL,515.127.4561,PU_MAN,11000,30


In [0]:
# Persist the DataFrame in two places. Both writes run every time this cell
# executes, and each uses "overwrite" mode, so whatever already exists at the
# target is replaced; remove whichever output you do not need.

# 1) A table named employees_delta, stored in Delta format.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("employees_delta")
)

# 2) Parquet output under the same Volume directory as the source CSV.
processed_parquet_path = "dbfs:/Volumes/firstworkspace/default/files/employees_processed.parquet"
df.write.mode("overwrite").parquet(processed_parquet_path)