In [0]:
from pyspark.sql.types import StructType, StructField, StringType

personData = [
    (("James", None, "Smith"), "OH", "M"),
    (("Anna", "Rose", ""), "NY", "F"),
    (("Julia", "", "Williams"), "OH", "F"),
    (("Maria", "Anne", "Jones"), "NY", "M"),
    (("Jen", "Mary", "Brown"), "NY", "M"),
    (("Mike", "Mary", "Williams"), "OH", "M")
]

personSchema=StructType([
    StructField("Name", StructType([
        StructField("firstName", StringType(), True),
        StructField("middleName", StringType(), True),
        StructField("lastName", StringType(), True)
    ])),
    StructField("state", StringType(), True),
    StructField("gender", StringType(), True)
])

person_df=spark.createDataFrame(data=personData, schema=personSchema)
person_df.printSchema()

root
 |-- Name: struct (nullable = true)
 |    |-- firstName: string (nullable = true)
 |    |-- middleName: string (nullable = true)
 |    |-- lastName: string (nullable = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)



In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql.functions import lit

schema = StructType([
  StructField("OrderId", StringType(),True),
  StructField("Product", StringType(),True),
  StructField("Quantity", IntegerType(),True),
  StructField("Price", FloatType(),True),
  StructField("Date", StringType(),True)
]
)

data = [
    ("1", "Laptop", 2, 1500.0, "2024-07-01 00:00:00"),
    ("2", "Mouse", 5, 25.0, "2024-07-02 00:00:00"),
    ("3", "Monitor", 3, 300.0, "2024-07-03 00:00:00"),
    ("4", "Keyboard", 4, 75.0, "2024-07-04 00:00:00"),
    ("5", "Mouse ", 2, 25.0, None),
    ("6", " Monitor ", 5, 300.0, "2024-07-07 00:00:00"),
    ("7", "Laptop", 3, 1500.0, "2024-07-09 00:00:00"),
    ("8", " Mouse", 7, None, "2024-07-10 00:00:00"),
    ("1", "Laptop", 3, 1500.0, "2024-07-01 00:00:00"),
    ("2", "Mouse", 5, 25.0, "2024-07-02 00:00:00")
]

#Dataframe Creation
df = spark.createDataFrame(data, schema)
df.show()

df.withColumn("discount",lit(0.1)).show()

+-------+---------+--------+------+-------------------+
|OrderId|  Product|Quantity| Price|               Date|
+-------+---------+--------+------+-------------------+
|      1|   Laptop|       2|1500.0|2024-07-01 00:00:00|
|      2|    Mouse|       5|  25.0|2024-07-02 00:00:00|
|      3|  Monitor|       3| 300.0|2024-07-03 00:00:00|
|      4| Keyboard|       4|  75.0|2024-07-04 00:00:00|
|      5|   Mouse |       2|  25.0|               NULL|
|      6| Monitor |       5| 300.0|2024-07-07 00:00:00|
|      7|   Laptop|       3|1500.0|2024-07-09 00:00:00|
|      8|    Mouse|       7|  NULL|2024-07-10 00:00:00|
|      1|   Laptop|       3|1500.0|2024-07-01 00:00:00|
|      2|    Mouse|       5|  25.0|2024-07-02 00:00:00|
+-------+---------+--------+------+-------------------+

+-------+---------+--------+------+-------------------+--------+
|OrderId|  Product|Quantity| Price|               Date|discount|
+-------+---------+--------+------+-------------------+--------+
|      1|   Laptop| 

In [0]:
from pyspark.sql.functions import col, sum,when
#display(df.withColumn("total value",col("Quantity")*col("price")))

display(df.groupBy("Product").agg(sum(col("Quantity")*col("price"))))

display(df.withColumn("discountedprice",when(col("price")>=25,col("price")*0.9).otherwise(col("price"))))

display(df.filter(col("price").isNull()))

Product,sum((Quantity * price))
Laptop,12000.0
Mouse,250.0
Keyboard,300.0
Monitor,900.0
Monitor,1500.0
Mouse,50.0
Mouse,


OrderId,Product,Quantity,Price,Date,discountedprice
1,Laptop,2,1500.0,2024-07-01 00:00:00,1350.0
2,Mouse,5,25.0,2024-07-02 00:00:00,22.5
3,Monitor,3,300.0,2024-07-03 00:00:00,270.0
4,Keyboard,4,75.0,2024-07-04 00:00:00,67.5
5,Mouse,2,25.0,,22.5
6,Monitor,5,300.0,2024-07-07 00:00:00,270.0
7,Laptop,3,1500.0,2024-07-09 00:00:00,1350.0
8,Mouse,7,,2024-07-10 00:00:00,
1,Laptop,3,1500.0,2024-07-01 00:00:00,1350.0
2,Mouse,5,25.0,2024-07-02 00:00:00,22.5


OrderId,Product,Quantity,Price,Date
8,Mouse,7,,2024-07-10 00:00:00


In [0]:
#regular expressions
from pyspark.sql.functions import regexp_extract, col


# Sample DataFrame with email addresses
data = [(1,"john1.doe@gmail.com",), (2,"jane2_smith@yahoo.com",), (3,"alice.joy3@example.com",)]
df = spark.createDataFrame(data, ["id","email"])




In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

schema = StructType([
  StructField("OrderId", StringType(),True),
  StructField("Product", StringType(),True),
  StructField("Quantity", IntegerType(),True),
  StructField("Price", FloatType(),True),
  StructField("Date", StringType(),True)
]
)

data = [
    ("1", "Laptop", 2, 1500.0, "2024-07-01 00:00:00"),
    ("2", "Mouse", 5, 25.0, "2024-07-02 00:00:00"),
    ("3", "Monitor", 3, 300.0, "2024-07-03 00:00:00"),
    ("4", "Keyboard", 4, 75.0, "2024-07-04 00:00:00"),
    ("5", "Laptop", 1, 1500.0, "2024-07-05 00:00:00"),
    ("6", "Mouse ", 2, 25.0, None),
    ("7", " Monitor ", 5, 300.0, "2024-07-07 00:00:00"),
    ("8", "Keyboard", 10, 75.0, "2024-07-08 00:00:00"),
    ("9", "Laptop", 3, 1500.0, "2024-07-09 00:00:00"),
    ("10", " Mouse", 7, None, "2024-07-10 00:00:00")
]

#Dataframe Creation
df = spark.createDataFrame(data, schema)

#show(): Display the DataFrame
df.show()

#printSchema(): Print the schema of the DataFrame
df.printSchema()


from pyspark.sql.functions import current_date
display(df.withColumn("columndate",current_date()))


from pyspark.sql.functions import to_date
display(df.withColumn("Datess",to_date(col("Date"),"yyyy-MM-DD HH:mm:ss")))

+-------+---------+--------+------+-------------------+
|OrderId|  Product|Quantity| Price|               Date|
+-------+---------+--------+------+-------------------+
|      1|   Laptop|       2|1500.0|2024-07-01 00:00:00|
|      2|    Mouse|       5|  25.0|2024-07-02 00:00:00|
|      3|  Monitor|       3| 300.0|2024-07-03 00:00:00|
|      4| Keyboard|       4|  75.0|2024-07-04 00:00:00|
|      5|   Laptop|       1|1500.0|2024-07-05 00:00:00|
|      6|   Mouse |       2|  25.0|               NULL|
|      7| Monitor |       5| 300.0|2024-07-07 00:00:00|
|      8| Keyboard|      10|  75.0|2024-07-08 00:00:00|
|      9|   Laptop|       3|1500.0|2024-07-09 00:00:00|
|     10|    Mouse|       7|  NULL|2024-07-10 00:00:00|
+-------+---------+--------+------+-------------------+

root
 |-- OrderId: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: float (nullable = true)
 |-- Date: string (nullable = true)



OrderId,Product,Quantity,Price,Date,columndate
1,Laptop,2,1500.0,2024-07-01 00:00:00,2025-11-02
2,Mouse,5,25.0,2024-07-02 00:00:00,2025-11-02
3,Monitor,3,300.0,2024-07-03 00:00:00,2025-11-02
4,Keyboard,4,75.0,2024-07-04 00:00:00,2025-11-02
5,Laptop,1,1500.0,2024-07-05 00:00:00,2025-11-02
6,Mouse,2,25.0,,2025-11-02
7,Monitor,5,300.0,2024-07-07 00:00:00,2025-11-02
8,Keyboard,10,75.0,2024-07-08 00:00:00,2025-11-02
9,Laptop,3,1500.0,2024-07-09 00:00:00,2025-11-02
10,Mouse,7,,2024-07-10 00:00:00,2025-11-02


OrderId,Product,Quantity,Price,Date,Datess
1,Laptop,2,1500.0,2024-07-01 00:00:00,
2,Mouse,5,25.0,2024-07-02 00:00:00,
3,Monitor,3,300.0,2024-07-03 00:00:00,
4,Keyboard,4,75.0,2024-07-04 00:00:00,
5,Laptop,1,1500.0,2024-07-05 00:00:00,
6,Mouse,2,25.0,,
7,Monitor,5,300.0,2024-07-07 00:00:00,
8,Keyboard,10,75.0,2024-07-08 00:00:00,
9,Laptop,3,1500.0,2024-07-09 00:00:00,
10,Mouse,7,,2024-07-10 00:00:00,


In [0]:
from pyspark.sql.functions import date_add

display(df.select("Date").withColumn("Datesadd",date_add(col("Date"),5)))


Date,Datesadd
2024-07-01 00:00:00,2024-07-06
2024-07-02 00:00:00,2024-07-07
2024-07-03 00:00:00,2024-07-08
2024-07-04 00:00:00,2024-07-09
2024-07-05 00:00:00,2024-07-10
,
2024-07-07 00:00:00,2024-07-12
2024-07-08 00:00:00,2024-07-13
2024-07-09 00:00:00,2024-07-14
2024-07-10 00:00:00,2024-07-15


In [0]:
display(df.select("Date").withColumn("Datesadd",date_add(col("Date"),5)))


#convert to unixtimestamp
from pyspark.sql.functions import unix_timestamp
display(df.select("Date").withColumn("unixtimestamp",unix_timestamp(col("Date"),"yyyy-MM-dd HH:mm:ss"))


[0;36m  File [0;32m<command-8895425144103781>, line 6[0;36m[0m
[0;31m    display(df.select("Date").withColumn("unixtimestamp",unix_timestamp(col("Date"),"yyyy-MM-dd HH:mm:ss"))[0m
[0m                                                                                                           ^[0m
[0;31mSyntaxError[0m[0;31m:[0m incomplete input


In [0]:
display(df.select("Date").withColumn("Datesadd",date_add(col("Date"),5)))


#convert to unixtimestamp
from pyspark.sql.functions import unix_timestamp
display(df.select("Date").withColumn("unixtimestamp",unix_timestamp(col("Date"),"yyyy-MM-dd HH:mm:ss")))

#from unixtimestamp to date using from_unixtimestamp.


Date,Datesadd
2024-07-01 00:00:00,2024-07-06
2024-07-02 00:00:00,2024-07-07
2024-07-03 00:00:00,2024-07-08
2024-07-04 00:00:00,2024-07-09
2024-07-05 00:00:00,2024-07-10
,
2024-07-07 00:00:00,2024-07-12
2024-07-08 00:00:00,2024-07-13
2024-07-09 00:00:00,2024-07-14
2024-07-10 00:00:00,2024-07-15


Date,unixtimestamp
2024-07-01 00:00:00,1719792000.0
2024-07-02 00:00:00,1719878400.0
2024-07-03 00:00:00,1719964800.0
2024-07-04 00:00:00,1720051200.0
2024-07-05 00:00:00,1720137600.0
,
2024-07-07 00:00:00,1720310400.0
2024-07-08 00:00:00,1720396800.0
2024-07-09 00:00:00,1720483200.0
2024-07-10 00:00:00,1720569600.0


In [0]:
# Creating the DataFrame for applying Pivot and Unpivot fuctions
data = [
    ('Electronics', 'Laptop', 1000),
    ('Electronics', 'Mouse', 50),
    ('Furniture', 'Desk', 300),
    ('Furniture', 'Chair', 150)
]

columns = ['Category', 'Product', 'Sales']
pivot = spark.createDataFrame(data, columns)

pivot.show()

#pivot

#from pyspark.sql.functions import pivot
display(pivot.groupBy("Category").agg(sum("Sales")))
# display

+-----------+-------+-----+
|   Category|Product|Sales|
+-----------+-------+-----+
|Electronics| Laptop| 1000|
|Electronics|  Mouse|   50|
|  Furniture|   Desk|  300|
|  Furniture|  Chair|  150|
+-----------+-------+-----+



Category,sum(Sales)
Electronics,1050
Furniture,450


In [0]:
#unpivot


data = [('IT',8,5),('Payroll',3,6),('HR',7,7)]

unpivot = spark.createDataFrame(data,['dept','male','female'])
unpivot.show()

from pyspark.sql.functions import stack, col, expr
unpvt=unpivot.select("dept",expr("stack(2, 'M', male, 'F', female) as (gender, count)"))
unpvt.show()

+-------+----+------+
|   dept|male|female|
+-------+----+------+
|     IT|   8|     5|
|Payroll|   3|     6|
|     HR|   7|     7|
+-------+----+------+

+-------+------+-----+
|   dept|gender|count|
+-------+------+-----+
|     IT|     M|    8|
|     IT|     F|    5|
|Payroll|     M|    3|
|Payroll|     F|    6|
|     HR|     M|    7|
|     HR|     F|    7|
+-------+------+-----+



In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType


# Define schema
schema = StructType([
    StructField("OrderID", IntegerType(), True),
    StructField("Product", StringType(), True),
    StructField("Quantity", StringType(), True),
    StructField("Price", DoubleType(), True),
    StructField("Date", StringType(), True)
])

data=spark.read.format("csv").option("header", "true").option("","PERMISSIVE").schema(schema).load('dbfs:/FileStore/mithlesh123adf/employee.csv')

df=spark.createDataFrame(data, schema)
df.show()

from pyspark.sql.functions import col, sum

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-8895425144104403>, line 13[0m
[1;32m      4[0m [38;5;66;03m# Define schema[39;00m
[1;32m      5[0m schema [38;5;241m=[39m StructType([
[1;32m      6[0m     StructField([38;5;124m"[39m[38;5;124mOrderID[39m[38;5;124m"[39m, IntegerType(), [38;5;28;01mTrue[39;00m),
[1;32m      7[0m     StructField([38;5;124m"[39m[38;5;124mProduct[39m[38;5;124m"[39m, StringType(), [38;5;28;01mTrue[39;00m),
[0;32m   (...)[0m
[1;32m     10[0m     StructField([38;5;124m"[39m[38;5;124mDate[39m[38;5;124m"[39m, StringType(), [38;5;28;01mTrue[39;00m)
[1;32m     11[0m ])
[0;32m---> 13[0m data[38;5;241m=[39mspark[38;5;241m.[39mread[38;5;241m.[39mformat([38;5;124m"[39m[38;5;124mcsv[39m[38;5;124m"[39m)[38;5;241m.[39moption([38;5;124m"[39m[38;5;124mheader[39m[38;5;

In [0]:
# Use /dbfs/ prefix for pandas to access files uploaded to DBFS
excel_path = "/dbfs/FileStore/mithlesh123adf/sample_orders_with_issues.xlsx"
csv_path = "/dbfs/FileStore/mithlesh123adf/orders_temp.csv"

import pandas as pd

%pip install openpyxl pandas

pd.read_excel(excel_path).to_csv(csv_path, index=False)


# Use dbfs:/ prefix for Spark to access the same file
df = spark.read.csv(
    "dbfs:/FileStore/mithlesh123adf/orders_temp.csv",
    header=True,
    inferSchema=True
)
# display(df)


df1 = spark.read.csv(
    "dbfs:/FileStore/mithlesh123adf/orders_temp.csv",
    header=True,
    inferSchema=True,
    mode="DROPMALFORMED"
)
display(df1)




[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


OrderID,Product,Quantity,Price,Date,Unnamed: 5
1,Laptop,2,1500,2024-07-01,
2,Mouse,5,25,2024-07-02,
3,Monitor,3,300,2024-07-03,
4,Keyboard,4,75,2024-07-04,
5,Laptop,1,1500,2024-07-05,
6,Headphones,2,120,2024-07-06,
7,Webcam,3,80,2024-07-07,
8,Printer,1,450,2024-07-08,
9,Mousepad,6,10,2024-07-09,
10,Speaker,2,200,2024-07-10,


In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

data = [
    (1, "John Doe", 50000.0),
    (2, "Jane Smith", 55000.0),
    (3, "Sam Brown", 60000.0),
    (4, "Lisa Ray", 52000.0),
    (5, "Tom White", 48000.0),
    (6, "Nancy Green", 62000.0),
    (7, "Gary Black", 45000.0),
    (8, "Monica Blue", 47000.0),
    (9, "Harry Pink", 53000.0),
    (10, "Megan Yellow", 58000.0),
    (11, "Chris Red", 61000.0),
    (12, "Katie Purple", 57000.0)
]

# Define the column names
columns = ["emp_id", "emp_name", "salary"]
    
# Create the DataFrame
df = spark.createDataFrame(data, columns)

# Show the DataFrame
# df.show(truncate=False)

df.rdd.getNumPartitions()

# df.rdd.glom().collect()

df2=df.repartition(2)
df2.rdd.getNumPartitions()
df2.rdd.glom().collect()

# df3=df.coalesce(2)
# df3.rdd.glom().collect()


[[Row(emp_id=1, emp_name='John Doe', salary=50000.0),
  Row(emp_id=4, emp_name='Lisa Ray', salary=52000.0),
  Row(emp_id=7, emp_name='Gary Black', salary=45000.0),
  Row(emp_id=9, emp_name='Harry Pink', salary=53000.0),
  Row(emp_id=12, emp_name='Katie Purple', salary=57000.0),
  Row(emp_id=10, emp_name='Megan Yellow', salary=58000.0)],
 [Row(emp_id=2, emp_name='Jane Smith', salary=55000.0),
  Row(emp_id=3, emp_name='Sam Brown', salary=60000.0),
  Row(emp_id=6, emp_name='Nancy Green', salary=62000.0),
  Row(emp_id=5, emp_name='Tom White', salary=48000.0),
  Row(emp_id=8, emp_name='Monica Blue', salary=47000.0),
  Row(emp_id=11, emp_name='Chris Red', salary=61000.0)]]

In [0]:
# jooins

employee_data = [
    (101, "Arun", "Manager", 1),
    (103, "Ravi", "Analyst", None),
    (104, "Neha", "Manager", 4),
    (105, "Kiran", "Developer", 2),
    (106, "Priya", "Analyst", 3),
    (107, "Priya", "Analyst", 6)
]

# Sample data for dept_df
dept_data = [
    (1, "HR"),
    (2, "IT"),
    (3, "Finance"),
    (4, "Marketing"),
    (5, "Sales")
]

# Define schema for DataFrames
employee_columns = ["emp_id", "name", "designation", "dept_id"]
dept_columns = ["dept_id", "dept_name"]

# Create DataFrames
a = spark.createDataFrame(employee_data, employee_columns)
b = spark.createDataFrame(dept_data, dept_columns)

from pyspark.sql.functions import col
# Show the DataFrames
# employee_df.show()
# dept_df.show()
empdf=a.join(b,a["dept_id"]==b["dept_id"],"fullouter")

#inner join vs left semi---left semi dont bring RECORDS from right table
display(empdf)

empdf1=a.join(b,a["dept_id"]==b["dept_id"],"left_semi")
display(empdf1)


empdf12=a.join(b,a["dept_id"]==b["dept_id"],"left_anti") #BRING DATA FROM left that does not have matches to right
display(empdf12)


emp_id,name,designation,dept_id,dept_id.1,dept_name
103.0,Ravi,Analyst,,,
101.0,Arun,Manager,1.0,1.0,HR
105.0,Kiran,Developer,2.0,2.0,IT
106.0,Priya,Analyst,3.0,3.0,Finance
104.0,Neha,Manager,4.0,4.0,Marketing
,,,,5.0,Sales
107.0,Priya,Analyst,6.0,,


emp_id,name,designation,dept_id
101,Arun,Manager,1
105,Kiran,Developer,2
106,Priya,Analyst,3
104,Neha,Manager,4


emp_id,name,designation,dept_id
103,Ravi,Analyst,
107,Priya,Analyst,6.0


In [0]:
target_data = [
    (1, 'Alice Smith', 'Marketing', 6800),
    (2, 'Carol White', 'HR', 8000),
    (3, 'Bob Brown', 'IT', 7800)
]
schema = ["employee_id", "name", "department", "salary"]
target_df = spark.createDataFrame(target_data,schema)
target_df.show()


source_data = [
    (1, 'Alice Smith', 'Marketing', 6800),
    (4, 'David Green', 'Sales', 7200),
]
schema = ["employee_id", "name", "department", "salary"]
source_df = spark.createDataFrame(source_data,schema)
source_df.show()

+-----------+-----------+----------+------+
|employee_id|       name|department|salary|
+-----------+-----------+----------+------+
|          1|Alice Smith| Marketing|  6800|
|          2|Carol White|        HR|  8000|
|          3|  Bob Brown|        IT|  7800|
+-----------+-----------+----------+------+

+-----------+-----------+----------+------+
|employee_id|       name|department|salary|
+-----------+-----------+----------+------+
|          1|Alice Smith| Marketing|  6800|
|          4|David Green|     Sales|  7200|
+-----------+-----------+----------+------+



In [0]:
records_upsert = source_df.join(target_df, on=["employee_id"],how="left_anti")
records_upsert.show(truncate=False)
# final_df = records_upsert.drop("hash64")
# final_df.show(truncate=False)

+-----------+-----------+----------+------+
|employee_id|name       |department|salary|
+-----------+-----------+----------+------+
|4          |David Green|Sales     |7200  |
+-----------+-----------+----------+------+



In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

# existing data in the target table
data_target = [
    (1, "Alice Smith", "Marketing", 6800),
    (2, "Carol White", "HR", 7000),
    (3, "Bob Brown", "IT", 7800)
]
cols = ["employee_id", "name", "department", "salary"]

df_target = spark.createDataFrame(data_target, cols)
df_target.show()


data_source = [
    (1, "Alice Smith", "Marketing", 6800),  # same
    (2, "Carol White", "HR", 8000),         # salary changed
    (3, "Bob Brown", "IT", 7800),           # same
    (4, "David Green", "Sales", 7200)       # new record
]

df_source = spark.createDataFrame(data_source, cols)
# df_source.show()

leftantidf=df_source.join(df_target, on=["employee_id"],how="left_anti")
# leftantidf.show(truncate=False)

#lets use hashfunction
df_src=df_source.withColumn("hash64", F.hash(F.concat(F.col("employee_id"), F.col("name")), F.col("department"), F.col("salary")))

df_tgt=df_target.withColumn("hash64", F.hash(F.concat(F.col("employee_id"), F.col("name")), F.col("department"), F.col("salary")))

df_src.show(truncate=False)
df_tgt.show(truncate=False)

df_src.join(df_tgt, on=["hash64"],how="left_anti").show(truncate=False)
#without hashing we mis carolwhite as left anti on employee_id ignores carlowhite


+-----------+-----------+----------+------+
|employee_id|       name|department|salary|
+-----------+-----------+----------+------+
|          1|Alice Smith| Marketing|  6800|
|          2|Carol White|        HR|  7000|
|          3|  Bob Brown|        IT|  7800|
+-----------+-----------+----------+------+

+-----------+-----------+----------+------+----------+
|employee_id|name       |department|salary|hash64    |
+-----------+-----------+----------+------+----------+
|1          |Alice Smith|Marketing |6800  |95159712  |
|2          |Carol White|HR        |8000  |-792810495|
|3          |Bob Brown  |IT        |7800  |717746182 |
|4          |David Green|Sales     |7200  |-471694084|
+-----------+-----------+----------+------+----------+

+-----------+-----------+----------+------+-----------+
|employee_id|name       |department|salary|hash64     |
+-----------+-----------+----------+------+-----------+
|1          |Alice Smith|Marketing |6800  |95159712   |
|2          |Carol White|H