In [0]:
read_csv_data = spark.read.format('csv')\
    .option('header', 'true')\
        .option('inferSchema', 'true')\
            .option('mode', 'permissive')\
                .load('/FileStore/tables/flight_data.csv')

read_csv_data.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



In [0]:
%fs
ls /FileStore/tables/

path,name,size,modificationTime
dbfs:/FileStore/tables/corrupted_json.json,corrupted_json.json,214,1745986381000
dbfs:/FileStore/tables/disk_part.csv,disk_part.csv,429,1745986381000
dbfs:/FileStore/tables/employee.csv,employee.csv,225,1745986369000
dbfs:/FileStore/tables/flight_data.csv,flight_data.csv,7120,1745986369000
dbfs:/FileStore/tables/line_deli_extra_fields.json,line_deli_extra_fields.json,228,1745986382000
dbfs:/FileStore/tables/line_delimited_json.json,line_delimited_json.json,215,1745986382000
dbfs:/FileStore/tables/multi_line_incorrect.json,multi_line_incorrect.json,328,1745986382000
dbfs:/FileStore/tables/multiline_correct.json,multiline_correct.json,388,1745986382000
dbfs:/FileStore/tables/resturant_json_data.json,resturant_json_data.json,669503,1745986404000


In [0]:
read_csv_data.columns

Out[7]: ['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']

In [0]:
read_csv_data.show(10)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
|    United States|          Singapore|   25|
|    United States|            Grenada|   54|
|       Costa Rica|      United States|  477|
|          Senegal|      United States|   29|
|    United States|   Marshall Islands|   44|
+-----------------+-------------------+-----+
only showing top 10 rows



In [0]:
from datetime import datetime

data1 = datetime.now()        # today
print(data1)

month1 = data1.month
print(month1)

m1 = data1.strftime('%B')
print(m1)

day_name = data1.strftime('%A')
print(day_name)

2025-04-30 04:14:28.061290
4
April
Wednesday


In [0]:
from pyspark.sql.functions import *

df = spark.createDataFrame(data=[(1, )], schema=['id']).withColumn('current_time', current_timestamp())
df.show(truncate=False)

+---+-----------------------+
|id |current_time           |
+---+-----------------------+
|1  |2025-04-30 04:14:28.454|
+---+-----------------------+



In [0]:
from pyspark.sql.functions import date_format

df2 = df.withColumn("month", month("current_time")) \
    .withColumn("month_name", date_format("current_time", "MMMM")) \
        .withColumn("day_name", date_format("current_time", "EEEE"))

display(df2)

id,current_time,month,month_name,day_name
1,2025-04-30T04:14:31.288+0000,4,April,Wednesday


In [0]:
from pyspark.sql.functions import current_date    # "*"

df1 = spark.createDataFrame(data=[(1, )], schema=['id']).withColumn('today', current_date())
# display(df1)

df_m_y = df1.withColumn('month_name', date_format('today', 'MMMM'))\
    .withColumn('year', year('today'))\
        .withColumn('month', month('today'))
display(df_m_y)

id,today,month_name,year,month
1,2025-04-30,April,2025,4


In [0]:
%pip install requests

Python interpreter will be restarted.
Python interpreter will be restarted.


In [0]:
import requests
from pyspark.sql import SparkSession

# spark = SparkSession.builder.appName("API_Post").getOrCreate()   # already available in Databricks
BEARER_TOKEN = "dc4cff04-06d4-4549-ac19-b456b49c5586"
HEADERS = {
    "Authorization": f"Bearer {BEARER_TOKEN}",
    "Accept": "application/json",
}

response = requests.get('https://fabricate.mockaroo.com/api/v1/databases/report/api/users', headers=HEADERS)

res1 = requests.get("https://fabricate.mockaroo.com/api/v1/databases/report/api/products", headers=HEADERS)

data = response.json() 
data1 = res1.json() 

In [0]:
user_df = spark.createDataFrame(data)
user_df.show(5)

+--------------------+------+---+--------------+----------------+
|               email|gender| id|      location|            name|
+--------------------+------+---+--------------+----------------+
|arabele.jeskin@ho...|Female|  1|      Staxigoe|  Arabele Jeskin|
|saul.pantone@aol.com|  Male|  2|   Cluj-Napoca|    Saul Pantone|
|brand.baudic@plan...|  Male|  3|  Yeraganahlli|    Brand Baudic|
|annemarie.jedrys@...|Female|  4|     Lynnfield|Annemarie Jedrys|
|nelia.spafford@gm...|Female|  5|Mount Pleasant|  Nelia Spafford|
+--------------------+------+---+--------------+----------------+
only showing top 5 rows



In [0]:
product_df = spark.createDataFrame(data1)
product_df.show(5)

+---+----------------+--------------------+--------+-------+
| id|product_category|        product_name|quantity|user_id|
+---+----------------+--------------------+--------+-------+
|  1|         Fitness|Stainless Steel W...|       1|     17|
|  2|    Food - Dairy|Feta Cheese Crumbles|       2|     49|
|  3|         Kitchen|      Coffee Grinder|       3|     84|
|  4|    Food - Meats|Savory Breakfast ...|       4|     70|
|  5|         Kitchen|Stainless Steel C...|       5|     73|
+---+----------------+--------------------+--------+-------+
only showing top 5 rows



In [0]:
user_df.join(product_df, user_df['id']==product_df['user_id'], 'inner').show(5)

+--------------------+------+---+--------------------+------------------+---+--------------------+--------------------+--------+-------+
|               email|gender| id|            location|              name| id|    product_category|        product_name|quantity|user_id|
+--------------------+------+---+--------------------+------------------+---+--------------------+--------------------+--------+-------+
|shandie.giacovett...|Female|  7|         Ptaszkowice|Shandie Giacovetti| 26|             Outdoor|   Portable Campfire|      26|      7|
|merilee.alger@yah...|Female| 11|Воля (Новоселівська)|     Merilee Alger| 32|                Pets|Cat Tree with Scr...|      32|     11|
|brand.baudic@plan...|  Male|  3|        Yeraganahlli|      Brand Baudic| 53|                Pets|Portable Pet Wate...|      53|      3|
|sasha.nyssens@liv...|  Male|  8|     Wisconsin Dells|     Sasha Nyssens| 91| Food - Baking Goods|Sweetened Condens...|      91|      8|
|nelia.spafford@gm...|Female|  5|      Mo

In [0]:
customer_data = [(1,'manish','patna',"30-05-2022"),
(2,'vikash','kolkata',"12-03-2023"),
(3,'nikita','delhi',"25-06-2023"),
(4,'rahul','ranchi',"24-03-2023"),
(5,'mahesh','jaipur',"22-03-2023"),
(6,'prantosh','kolkata',"18-10-2022"),
(7,'raman','patna',"30-12-2022"),
(8,'prakash','ranchi',"24-02-2023"),
(9,'ragini','kolkata',"03-03-2023"),
(10,'raushan','jaipur',"05-02-2023")]

customer_schema=['customer_id','customer_name','address','date_of_joining']


sales_data = [(1,22,10,"01-06-2022"),
(1,27,5,"03-02-2023"),
(2,5,3,"01-06-2023"),
(5,22,1,"22-03-2023"),
(7,22,4,"03-02-2023"),
(9,5,6,"03-03-2023"),
(2,1,12,"15-06-2023"),
(1,56,2,"25-06-2023"),
(5,12,5,"15-04-2023"),
(11,12,76,"12-03-2023")]

sales_schema=['customer_id','product_id','quantity','date_of_purchase']

product_data = [(1, 'fanta',20),
(2, 'dew',22),
(5, 'sprite',40),
(7, 'redbull',100),
(12,'mazza',45),
(22,'coke',27),
(25,'limca',21),
(27,'pepsi',14),
(56,'sting',10)]

product_schema=['id','name','price']

cust_df = spark.createDataFrame(data=customer_data, schema=customer_schema)

sales_df = spark.createDataFrame(data=sales_data, schema=sales_schema)

product_df = spark.createDataFrame(data=product_data, schema=product_schema)

In [0]:
cust_df.join(sales_df, cust_df['customer_id']==sales_df['customer_id'], 'inner').select(sales_df['customer_id']).show()

+-----------+
|customer_id|
+-----------+
|          1|
|          1|
|          1|
|          2|
|          2|
|          5|
|          5|
|          7|
|          9|
+-----------+



In [0]:
cust_df.join(sales_df, cust_df['customer_id']==sales_df['customer_id'], 'outer').show()

+-----------+-------------+-------+---------------+-----------+----------+--------+----------------+
|customer_id|customer_name|address|date_of_joining|customer_id|product_id|quantity|date_of_purchase|
+-----------+-------------+-------+---------------+-----------+----------+--------+----------------+
|          1|       manish|  patna|     30-05-2022|          1|        22|      10|      01-06-2022|
|          1|       manish|  patna|     30-05-2022|          1|        27|       5|      03-02-2023|
|          1|       manish|  patna|     30-05-2022|          1|        56|       2|      25-06-2023|
|          2|       vikash|kolkata|     12-03-2023|          2|         5|       3|      01-06-2023|
|          2|       vikash|kolkata|     12-03-2023|          2|         1|      12|      15-06-2023|
|          3|       nikita|  delhi|     25-06-2023|       null|      null|    null|            null|
|          4|        rahul| ranchi|     24-03-2023|       null|      null|    null|        

In [0]:
cust_df.join(sales_df, cust_df['customer_id']==sales_df['customer_id'], 'left').where(sales_df['product_id'].isNull()).show()

+-----------+-------------+-------+---------------+-----------+----------+--------+----------------+
|customer_id|customer_name|address|date_of_joining|customer_id|product_id|quantity|date_of_purchase|
+-----------+-------------+-------+---------------+-----------+----------+--------+----------------+
|          3|       nikita|  delhi|     25-06-2023|       null|      null|    null|            null|
|          4|        rahul| ranchi|     24-03-2023|       null|      null|    null|            null|
|          6|     prantosh|kolkata|     18-10-2022|       null|      null|    null|            null|
|          8|      prakash| ranchi|     24-02-2023|       null|      null|    null|            null|
|         10|      raushan| jaipur|     05-02-2023|       null|      null|    null|            null|
+-----------+-------------+-------+---------------+-----------+----------+--------+----------------+



In [0]:
cust_df.join(sales_df, cust_df['customer_id']==sales_df['customer_id'], 'left_semi').show()

+-----------+-------------+-------+---------------+
|customer_id|customer_name|address|date_of_joining|
+-----------+-------------+-------+---------------+
|          1|       manish|  patna|     30-05-2022|
|          2|       vikash|kolkata|     12-03-2023|
|          5|       mahesh| jaipur|     22-03-2023|
|          7|        raman|  patna|     30-12-2022|
|          9|       ragini|kolkata|     03-03-2023|
+-----------+-------------+-------+---------------+



In [0]:
cust_df.join(sales_df, cust_df['customer_id']==sales_df['customer_id'], 'left_anti').show()

+-----------+-------------+-------+---------------+
|customer_id|customer_name|address|date_of_joining|
+-----------+-------------+-------+---------------+
|          3|       nikita|  delhi|     25-06-2023|
|          4|        rahul| ranchi|     24-03-2023|
|          6|     prantosh|kolkata|     18-10-2022|
|          8|      prakash| ranchi|     24-02-2023|
|         10|      raushan| jaipur|     05-02-2023|
+-----------+-------------+-------+---------------+



In [0]:
sales_df.crossJoin(product_df).count()

Out[11]: 90

In [0]:
emp_data = [(1,'manish',50000,'IT','m'),
(2,'vikash',60000,'sales','m'),
(3,'raushan',70000,'marketing','m'),
(4,'mukesh',80000,'IT','m'),
(5,'priti',90000,'sales','f'),
(6,'nikita',45000,'marketing','f'),
(7,'ragini',55000,'marketing','f'),
(8,'rashi',100000,'IT','f'),
(9,'aditya',65000,'IT','m'),
(10,'rahul',50000,'marketing','m'),
(11,'rakhi',50000,'IT','f'),
(12,'akhilesh',90000,'sales','m')]

emp_schema = ['id', 'name', 'salary', 'dept', 'gender']

emp_df = spark.createDataFrame(data=emp_data, schema=emp_schema)
emp_df.show()

+---+--------+------+---------+------+
| id|    name|salary|     dept|gender|
+---+--------+------+---------+------+
|  1|  manish| 50000|       IT|     m|
|  2|  vikash| 60000|    sales|     m|
|  3| raushan| 70000|marketing|     m|
|  4|  mukesh| 80000|       IT|     m|
|  5|   priti| 90000|    sales|     f|
|  6|  nikita| 45000|marketing|     f|
|  7|  ragini| 55000|marketing|     f|
|  8|   rashi|100000|       IT|     f|
|  9|  aditya| 65000|       IT|     m|
| 10|   rahul| 50000|marketing|     m|
| 11|   rakhi| 50000|       IT|     f|
| 12|akhilesh| 90000|    sales|     m|
+---+--------+------+---------+------+



In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

window1 = Window.partitionBy('dept')

emp_df.withColumn('total_sal', sum(col('salary')).over(window1)).show()

+---+--------+------+---------+------+---------+
| id|    name|salary|     dept|gender|total_sal|
+---+--------+------+---------+------+---------+
|  1|  manish| 50000|       IT|     m|   345000|
|  4|  mukesh| 80000|       IT|     m|   345000|
|  8|   rashi|100000|       IT|     f|   345000|
|  9|  aditya| 65000|       IT|     m|   345000|
| 11|   rakhi| 50000|       IT|     f|   345000|
|  3| raushan| 70000|marketing|     m|   220000|
|  6|  nikita| 45000|marketing|     f|   220000|
|  7|  ragini| 55000|marketing|     f|   220000|
| 10|   rahul| 50000|marketing|     m|   220000|
|  2|  vikash| 60000|    sales|     m|   240000|
|  5|   priti| 90000|    sales|     f|   240000|
| 12|akhilesh| 90000|    sales|     m|   240000|
+---+--------+------+---------+------+---------+



In [0]:
window1 = Window.partitionBy('dept', 'gender').orderBy('salary')

emp_df.withColumn('row_no', row_number().over(window1)).show()

+---+--------+------+---------+------+------+
| id|    name|salary|     dept|gender|row_no|
+---+--------+------+---------+------+------+
| 11|   rakhi| 50000|       IT|     f|     1|
|  8|   rashi|100000|       IT|     f|     2|
|  1|  manish| 50000|       IT|     m|     1|
|  9|  aditya| 65000|       IT|     m|     2|
|  4|  mukesh| 80000|       IT|     m|     3|
|  6|  nikita| 45000|marketing|     f|     1|
|  7|  ragini| 55000|marketing|     f|     2|
| 10|   rahul| 50000|marketing|     m|     1|
|  3| raushan| 70000|marketing|     m|     2|
|  5|   priti| 90000|    sales|     f|     1|
|  2|  vikash| 60000|    sales|     m|     1|
| 12|akhilesh| 90000|    sales|     m|     2|
+---+--------+------+---------+------+------+



In [0]:
window1 = Window.partitionBy('dept', 'gender').orderBy('salary')

emp_df.withColumn('row_no', row_number().over(window1))\
    .withColumn('rank', rank().over(window1))\
        .withColumn('dense_rank', dense_rank().over(window1))\
            .show()

+---+--------+------+---------+------+------+----+----------+
| id|    name|salary|     dept|gender|row_no|rank|dense_rank|
+---+--------+------+---------+------+------+----+----------+
| 11|   rakhi| 50000|       IT|     f|     1|   1|         1|
|  8|   rashi|100000|       IT|     f|     2|   2|         2|
|  1|  manish| 50000|       IT|     m|     1|   1|         1|
|  9|  aditya| 65000|       IT|     m|     2|   2|         2|
|  4|  mukesh| 80000|       IT|     m|     3|   3|         3|
|  6|  nikita| 45000|marketing|     f|     1|   1|         1|
|  7|  ragini| 55000|marketing|     f|     2|   2|         2|
| 10|   rahul| 50000|marketing|     m|     1|   1|         1|
|  3| raushan| 70000|marketing|     m|     2|   2|         2|
|  5|   priti| 90000|    sales|     f|     1|   1|         1|
|  2|  vikash| 60000|    sales|     m|     1|   1|         1|
| 12|akhilesh| 90000|    sales|     m|     2|   2|         2|
+---+--------+------+---------+------+------+----+----------+



In [0]:
# want higher salary employees (gendewise) with dense_rank <= 2

window1 = Window.partitionBy('dept', 'gender').orderBy(desc('salary'))

emp_df.withColumn('row_no', row_number().over(window1))\
    .withColumn('rank', rank().over(window1))\
    .withColumn('dense_rank', dense_rank().over(window1)).filter(col('dense_rank')<=2)\
    .show() 

+---+--------+------+---------+------+------+----+----------+
| id|    name|salary|     dept|gender|row_no|rank|dense_rank|
+---+--------+------+---------+------+------+----+----------+
|  8|   rashi|100000|       IT|     f|     1|   1|         1|
| 11|   rakhi| 50000|       IT|     f|     2|   2|         2|
|  4|  mukesh| 80000|       IT|     m|     1|   1|         1|
|  9|  aditya| 65000|       IT|     m|     2|   2|         2|
|  7|  ragini| 55000|marketing|     f|     1|   1|         1|
|  6|  nikita| 45000|marketing|     f|     2|   2|         2|
|  3| raushan| 70000|marketing|     m|     1|   1|         1|
| 10|   rahul| 50000|marketing|     m|     2|   2|         2|
|  5|   priti| 90000|    sales|     f|     1|   1|         1|
| 12|akhilesh| 90000|    sales|     m|     1|   1|         1|
|  2|  vikash| 60000|    sales|     m|     2|   2|         2|
+---+--------+------+---------+------+------+----+----------+



In [0]:
product_data1 = [
(1,"iphone","01-01-2023",1500000),
(2,"samsung","01-01-2023",1100000),
(3,"oneplus","01-01-2023",1100000),
(1,"iphone","01-02-2023",1300000),
(2,"samsung","01-02-2023",1120000),
(3,"oneplus","01-02-2023",1120000),
(1,"iphone","01-03-2023",1600000),
(2,"samsung","01-03-2023",1080000),
(3,"oneplus","01-03-2023",1160000),
(1,"iphone","01-04-2023",1700000),
(2,"samsung","01-04-2023",1800000),
(3,"oneplus","01-04-2023",1170000),
(1,"iphone","01-05-2023",1200000),
(2,"samsung","01-05-2023",980000),
(3,"oneplus","01-05-2023",1175000),
(1,"iphone","01-06-2023",1100000),
(2,"samsung","01-06-2023",1100000),
(3,"oneplus","01-06-2023",1200000)
]

prod_schema1 = ['product_id', 'product_name', 'sales_date', 'sales']

pr_df = spark.createDataFrame(data=product_data1, schema=prod_schema1)
pr_df.show()

+----------+------------+----------+-------+
|product_id|product_name|sales_date|  sales|
+----------+------------+----------+-------+
|         1|      iphone|01-01-2023|1500000|
|         2|     samsung|01-01-2023|1100000|
|         3|     oneplus|01-01-2023|1100000|
|         1|      iphone|01-02-2023|1300000|
|         2|     samsung|01-02-2023|1120000|
|         3|     oneplus|01-02-2023|1120000|
|         1|      iphone|01-03-2023|1600000|
|         2|     samsung|01-03-2023|1080000|
|         3|     oneplus|01-03-2023|1160000|
|         1|      iphone|01-04-2023|1700000|
|         2|     samsung|01-04-2023|1800000|
|         3|     oneplus|01-04-2023|1170000|
|         1|      iphone|01-05-2023|1200000|
|         2|     samsung|01-05-2023| 980000|
|         3|     oneplus|01-05-2023|1175000|
|         1|      iphone|01-06-2023|1100000|
|         2|     samsung|01-06-2023|1100000|
|         3|     oneplus|01-06-2023|1200000|
+----------+------------+----------+-------+



In [0]:
win1 = Window.partitionBy('product_id').orderBy('sales_date')

prev_df = pr_df.withColumn('prev_sale', lag(col('sales')).over(win1))
prev_df.show(10)

+----------+------------+----------+-------+---------+
|product_id|product_name|sales_date|  sales|prev_sale|
+----------+------------+----------+-------+---------+
|         1|      iphone|01-01-2023|1500000|     null|
|         1|      iphone|01-02-2023|1300000|  1500000|
|         1|      iphone|01-03-2023|1600000|  1300000|
|         1|      iphone|01-04-2023|1700000|  1600000|
|         1|      iphone|01-05-2023|1200000|  1700000|
|         1|      iphone|01-06-2023|1100000|  1200000|
|         2|     samsung|01-01-2023|1100000|     null|
|         2|     samsung|01-02-2023|1120000|  1100000|
|         2|     samsung|01-03-2023|1080000|  1120000|
|         2|     samsung|01-04-2023|1800000|  1080000|
+----------+------------+----------+-------+---------+
only showing top 10 rows



In [0]:
win1 = Window.partitionBy('product_id').orderBy('sales_date')

pr_df.withColumn('next_sale', lead(col('sales'), 2, 0).over(win1)).show()

+----------+------------+----------+-------+---------+
|product_id|product_name|sales_date|  sales|next_sale|
+----------+------------+----------+-------+---------+
|         1|      iphone|01-01-2023|1500000|  1600000|
|         1|      iphone|01-02-2023|1300000|  1700000|
|         1|      iphone|01-03-2023|1600000|  1200000|
|         1|      iphone|01-04-2023|1700000|  1100000|
|         1|      iphone|01-05-2023|1200000|        0|
|         1|      iphone|01-06-2023|1100000|        0|
|         2|     samsung|01-01-2023|1100000|  1080000|
|         2|     samsung|01-02-2023|1120000|  1800000|
|         2|     samsung|01-03-2023|1080000|   980000|
|         2|     samsung|01-04-2023|1800000|  1100000|
|         2|     samsung|01-05-2023| 980000|        0|
|         2|     samsung|01-06-2023|1100000|        0|
|         3|     oneplus|01-01-2023|1100000|  1160000|
|         3|     oneplus|01-02-2023|1120000|  1170000|
|         3|     oneplus|01-03-2023|1160000|  1175000|
|         

In [0]:
prev_df.withColumn('losses', (col('sales')-col('prev_sale'))).show()  

+----------+------------+----------+-------+---------+-------+
|product_id|product_name|sales_date|  sales|prev_sale| losses|
+----------+------------+----------+-------+---------+-------+
|         1|      iphone|01-01-2023|1500000|     null|   null|
|         1|      iphone|01-02-2023|1300000|  1500000|-200000|
|         1|      iphone|01-03-2023|1600000|  1300000| 300000|
|         1|      iphone|01-04-2023|1700000|  1600000| 100000|
|         1|      iphone|01-05-2023|1200000|  1700000|-500000|
|         1|      iphone|01-06-2023|1100000|  1200000|-100000|
|         2|     samsung|01-01-2023|1100000|     null|   null|
|         2|     samsung|01-02-2023|1120000|  1100000|  20000|
|         2|     samsung|01-03-2023|1080000|  1120000| -40000|
|         2|     samsung|01-04-2023|1800000|  1080000| 720000|
|         2|     samsung|01-05-2023| 980000|  1800000|-820000|
|         2|     samsung|01-06-2023|1100000|   980000| 120000|
|         3|     oneplus|01-01-2023|1100000|     null| 

In [0]:
# percentage loss/gain based on previous month sales

prev_df.withColumn('percent_losses', (col('sales')-col('prev_sale'))/ col('sales') *100).show()  

+----------+------------+----------+-------+---------+-------------------+
|product_id|product_name|sales_date|  sales|prev_sale|     percent_losses|
+----------+------------+----------+-------+---------+-------------------+
|         1|      iphone|01-01-2023|1500000|     null|               null|
|         1|      iphone|01-02-2023|1300000|  1500000|-15.384615384615385|
|         1|      iphone|01-03-2023|1600000|  1300000|              18.75|
|         1|      iphone|01-04-2023|1700000|  1600000|   5.88235294117647|
|         1|      iphone|01-05-2023|1200000|  1700000| -41.66666666666667|
|         1|      iphone|01-06-2023|1100000|  1200000| -9.090909090909092|
|         2|     samsung|01-01-2023|1100000|     null|               null|
|         2|     samsung|01-02-2023|1120000|  1100000| 1.7857142857142856|
|         2|     samsung|01-03-2023|1080000|  1120000|-3.7037037037037033|
|         2|     samsung|01-04-2023|1800000|  1080000|               40.0|
|         2|     samsung|

In [0]:
# percentage loss/gain based on previous month sales

prev_df.withColumn('percent_losses', round((col('sales')-col('prev_sale'))/ col('sales') *100, 2)).show(10)  

+----------+------------+----------+-------+---------+--------------+
|product_id|product_name|sales_date|  sales|prev_sale|percent_losses|
+----------+------------+----------+-------+---------+--------------+
|         1|      iphone|01-01-2023|1500000|     null|          null|
|         1|      iphone|01-02-2023|1300000|  1500000|        -15.38|
|         1|      iphone|01-03-2023|1600000|  1300000|         18.75|
|         1|      iphone|01-04-2023|1700000|  1600000|          5.88|
|         1|      iphone|01-05-2023|1200000|  1700000|        -41.67|
|         1|      iphone|01-06-2023|1100000|  1200000|         -9.09|
|         2|     samsung|01-01-2023|1100000|     null|          null|
|         2|     samsung|01-02-2023|1120000|  1100000|          1.79|
|         2|     samsung|01-03-2023|1080000|  1120000|          -3.7|
|         2|     samsung|01-04-2023|1800000|  1080000|          40.0|
+----------+------------+----------+-------+---------+--------------+
only showing top 10 

In [0]:
res_json_data = spark.read.format('json').option('header', 'true').option('inferSchema', 'true').option('mode', 'permissive')\
    .load('/FileStore/tables/resturant_json_data.json')

res_json_data.columns

Out[34]: ['code',
 'message',
 'restaurants',
 'results_found',
 'results_shown',
 'results_start',
 'status']

In [0]:
re1 = res_json_data
re1.printSchema()

root
 |-- code: long (nullable = true)
 |-- message: string (nullable = true)
 |-- restaurants: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- restaurant: struct (nullable = true)
 |    |    |    |-- R: struct (nullable = true)
 |    |    |    |    |-- res_id: long (nullable = true)
 |    |    |    |-- apikey: string (nullable = true)
 |    |    |    |-- average_cost_for_two: long (nullable = true)
 |    |    |    |-- cuisines: string (nullable = true)
 |    |    |    |-- currency: string (nullable = true)
 |    |    |    |-- deeplink: string (nullable = true)
 |    |    |    |-- establishment_types: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |-- events_url: string (nullable = true)
 |    |    |    |-- featured_image: string (nullable = true)
 |    |    |    |-- has_online_delivery: long (nullable = true)
 |    |    |    |-- has_table_booking: long (nullable = true)
 |    |    |    |-- i

In [0]:
ex_data = re1.select('*', explode('restaurants').alias('new_restaurants')).drop('restaurants')
ex_data.printSchema()

root
 |-- code: long (nullable = true)
 |-- message: string (nullable = true)
 |-- results_found: long (nullable = true)
 |-- results_shown: long (nullable = true)
 |-- results_start: string (nullable = true)
 |-- status: string (nullable = true)
 |-- new_restaurants: struct (nullable = true)
 |    |-- restaurant: struct (nullable = true)
 |    |    |-- R: struct (nullable = true)
 |    |    |    |-- res_id: long (nullable = true)
 |    |    |-- apikey: string (nullable = true)
 |    |    |-- average_cost_for_two: long (nullable = true)
 |    |    |-- cuisines: string (nullable = true)
 |    |    |-- currency: string (nullable = true)
 |    |    |-- deeplink: string (nullable = true)
 |    |    |-- establishment_types: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- events_url: string (nullable = true)
 |    |    |-- featured_image: string (nullable = true)
 |    |    |-- has_online_delivery: long (nullable = true)
 |    |    |-- has_tab

In [0]:
ex1 = ex_data.select('*', 'new_restaurants.restaurant.R.res_id', 'new_restaurants.restaurant.name', explode_outer('new_restaurants.restaurant.establishment_types').alias('new_establishment_types')).drop('new_restaurants').drop('message', 'results_found')
ex1.printSchema()

root
 |-- code: long (nullable = true)
 |-- results_shown: long (nullable = true)
 |-- results_start: string (nullable = true)
 |-- status: string (nullable = true)
 |-- res_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- new_establishment_types: string (nullable = true)



In [0]:
ex1.select('res_id').show(6)

+--------+
|  res_id|
+--------+
|17066603|
|17059541|
|17064405|
|17057797|
|17057591|
|17064266|
+--------+
only showing top 6 rows



In [0]:
ex1.filter((col('res_id')>= 17057797) & (col('results_shown') >= 20) | (col('code').isNotNull()) ).show(6)

+----+-------------+-------------+------+--------+--------------------+-----------------------+
|code|results_shown|results_start|status|  res_id|                name|new_establishment_types|
+----+-------------+-------------+------+--------+--------------------+-----------------------+
|null|           20|            1|  null|17066603|            The Coop|                   null|
|null|           20|            1|  null|17059541|Maggiano's Little...|                   null|
|null|           20|            1|  null|17064405|Tako Cheena by Po...|                   null|
|null|           20|            1|  null|17057797|Bosphorous Turkis...|                   null|
|null|           20|            1|  null|17064266|Hawkers Asian Str...|                   null|
|null|           20|            1|  null|17060516|Seasons 52 Fresh ...|                   null|
+----+-------------+-------------+------+--------+--------------------+-----------------------+
only showing top 6 rows



In [0]:
csv_df = spark.read.format("csv").option("header", 'true').option("inferSchema", 'true').option("mode", 'PERMISSIVE').load("/FileStore/tables/disk_part.csv")

csv_df.show()

+---+--------+---+------+-------+------+
| id|    name|age|salary|address|gender|
+---+--------+---+------+-------+------+
|  1|  Manish| 26| 75000|  INDIA|     m|
|  2|  Nikita| 23|100000|    USA|     f|
|  3|  Pritam| 22|150000|  INDIA|     m|
|  4|Prantosh| 17|200000|  JAPAN|     m|
|  5|  Vikash| 31|300000|    USA|     m|
|  6|   Rahul| 55|300000|  INDIA|     m|
|  7|    Raju| 67|540000|    USA|     m|
|  8| Praveen| 28| 70000|  JAPAN|     m|
|  9|     Dev| 32|150000|  JAPAN|     m|
| 10|  Sherin| 16| 25000| RUSSIA|     f|
| 11|    Ragu| 12| 35000|  INDIA|     f|
| 12|   Sweta| 43|200000|  INDIA|     f|
| 13| Raushan| 48|650000|    USA|     m|
| 14|  Mukesh| 36| 95000| RUSSIA|     m|
| 15| Prakash| 52|750000|  INDIA|     m|
+---+--------+---+------+-------+------+



In [0]:
csv_df.write.format('csv')\
    .option('header', 'true')\
    .option('inferschhema', 'true')\
    .option('mode', 'overwrite')\
    .partitionBy('address', 'gender')\
    .option('path', '/FileStore/tables/address_gender_partition')\
    .save()

In [0]:
dbutils.fs.ls('/FileStore/tables/address_gender_partition')  

Out[41]: [FileInfo(path='dbfs:/FileStore/tables/address_gender_partition/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1745986997000),
 FileInfo(path='dbfs:/FileStore/tables/address_gender_partition/address=INDIA/', name='address=INDIA/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/address_gender_partition/address=JAPAN/', name='address=JAPAN/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/address_gender_partition/address=RUSSIA/', name='address=RUSSIA/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/address_gender_partition/address=USA/', name='address=USA/', size=0, modificationTime=0)]

In [0]:
dbutils.fs.ls('dbfs:/FileStore/tables/address_gender_partition/address=INDIA/')  # inside address search for gender

Out[42]: [FileInfo(path='dbfs:/FileStore/tables/address_gender_partition/address=INDIA/gender=f/', name='gender=f/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/address_gender_partition/address=INDIA/gender=m/', name='gender=m/', size=0, modificationTime=0)]

In [0]:
csv_df.write.format('csv')\
    .option('header', 'true')\
    .option('inferschhema', 'true')\
    .option('mode', 'overwrite')\
    .bucketBy(4, 'id')\
    .option('path', '/FileStore/tables/bucket_id')\
    .saveAsTable('bucket_id_table')

In [0]:
dbutils.fs.ls('/FileStore/tables/bucket_id')  

Out[44]: [FileInfo(path='dbfs:/FileStore/tables/bucket_id/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1745987059000),
 FileInfo(path='dbfs:/FileStore/tables/bucket_id/_committed_2027114794156935381', name='_committed_2027114794156935381', size=404, modificationTime=1745987059000),
 FileInfo(path='dbfs:/FileStore/tables/bucket_id/_started_2027114794156935381', name='_started_2027114794156935381', size=0, modificationTime=1745987058000),
 FileInfo(path='dbfs:/FileStore/tables/bucket_id/part-00000-tid-2027114794156935381-d19cf7a8-e4b5-428d-8819-0777a6ad7ac0-406-1_00000.c000.csv', name='part-00000-tid-2027114794156935381-d19cf7a8-e4b5-428d-8819-0777a6ad7ac0-406-1_00000.c000.csv', size=116, modificationTime=1745987058000),
 FileInfo(path='dbfs:/FileStore/tables/bucket_id/part-00000-tid-2027114794156935381-d19cf7a8-e4b5-428d-8819-0777a6ad7ac0-406-2_00001.c000.csv', name='part-00000-tid-2027114794156935381-d19cf7a8-e4b5-428d-8819-0777a6ad7ac0-406-2_00001.c000.csv', size=113, modifica

In [0]:
emp_df = spark.read.format('csv').option('header', 'true').option('inferschema', 'true').option('mode', 'PERMISSIVE')\
.load("/FileStore/tables/employee.csv")

emp_df.show()

+---+--------+---+------+------------+--------+
| id|    name|age|salary|     address| nominee|
+---+--------+---+------+------------+--------+
|  1|  Manish| 26| 75000|       bihar|nominee1|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|
|  3|  Pritam| 22|150000|   Bangalore|   India|
|  4|Prantosh| 17|200000|     Kolkata|   India|
|  5|  Vikash| 31|300000|        null|nominee5|
+---+--------+---+------+------------+--------+



In [0]:
emp_df.select("id", col("name"), emp_df["salary"], emp_df.address).show()     

+---+--------+------+------------+
| id|    name|salary|     address|
+---+--------+------+------------+
|  1|  Manish| 75000|       bihar|
|  2|  Nikita|100000|uttarpradesh|
|  3|  Pritam|150000|   Bangalore|
|  4|Prantosh|200000|     Kolkata|
|  5|  Vikash|300000|        null|
+---+--------+------+------------+



In [0]:
### use of expression => expr  =>   expr("")

emp_df.select(expr("id +5")).show()

+--------+
|(id + 5)|
+--------+
|       6|
|       7|
|       8|
|       9|
|      10|
+--------+



In [0]:
emp_df.select(expr("id as emp_id"), expr("name as emp_name"), expr("concat(name, address)")).show()

+------+--------+---------------------+
|emp_id|emp_name|concat(name, address)|
+------+--------+---------------------+
|     1|  Manish|          Manishbihar|
|     2|  Nikita|   Nikitauttarpradesh|
|     3|  Pritam|      PritamBangalore|
|     4|Prantosh|      PrantoshKolkata|
|     5|  Vikash|                 null|
+------+--------+---------------------+



In [0]:
emp_df.select(col('id').alias('empl_id'), 'name', 'age').show()

+-------+--------+---+
|empl_id|    name|age|
+-------+--------+---+
|      1|  Manish| 26|
|      2|  Nikita| 23|
|      3|  Pritam| 22|
|      4|Prantosh| 17|
|      5|  Vikash| 31|
+-------+--------+---+



In [0]:
emp_df.where((col('salary') > 150000) | (col('age') <18)).show()

+---+--------+---+------+-------+--------+
| id|    name|age|salary|address| nominee|
+---+--------+---+------+-------+--------+
|  4|Prantosh| 17|200000|Kolkata|   India|
|  5|  Vikash| 31|300000|   null|nominee5|
+---+--------+---+------+-------+--------+



In [0]:
emp_df.select("*", lit("kumar")).show()

+---+--------+---+------+------------+--------+-----+
| id|    name|age|salary|     address| nominee|kumar|
+---+--------+---+------+------------+--------+-----+
|  1|  Manish| 26| 75000|       bihar|nominee1|kumar|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|kumar|
|  3|  Pritam| 22|150000|   Bangalore|   India|kumar|
|  4|Prantosh| 17|200000|     Kolkata|   India|kumar|
|  5|  Vikash| 31|300000|        null|nominee5|kumar|
+---+--------+---+------+------------+--------+-----+



In [0]:
emp_df.select("*", lit("kumar").alias('last_name')).show()

+---+--------+---+------+------------+--------+---------+
| id|    name|age|salary|     address| nominee|last_name|
+---+--------+---+------+------------+--------+---------+
|  1|  Manish| 26| 75000|       bihar|nominee1|    kumar|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|    kumar|
|  3|  Pritam| 22|150000|   Bangalore|   India|    kumar|
|  4|Prantosh| 17|200000|     Kolkata|   India|    kumar|
|  5|  Vikash| 31|300000|        null|nominee5|    kumar|
+---+--------+---+------+------------+--------+---------+



In [0]:
emp_df.withColumn('surname', lit('singh')).show()

+---+--------+---+------+------------+--------+-------+
| id|    name|age|salary|     address| nominee|surname|
+---+--------+---+------+------------+--------+-------+
|  1|  Manish| 26| 75000|       bihar|nominee1|  singh|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|  singh|
|  3|  Pritam| 22|150000|   Bangalore|   India|  singh|
|  4|Prantosh| 17|200000|     Kolkata|   India|  singh|
|  5|  Vikash| 31|300000|        null|nominee5|  singh|
+---+--------+---+------+------------+--------+-------+



In [0]:
new_emp_df = emp_df.withColumnRenamed('id', 'emp_id')
new_emp_df.show()

+------+--------+---+------+------------+--------+
|emp_id|    name|age|salary|     address| nominee|
+------+--------+---+------+------------+--------+
|     1|  Manish| 26| 75000|       bihar|nominee1|
|     2|  Nikita| 23|100000|uttarpradesh|nominee2|
|     3|  Pritam| 22|150000|   Bangalore|   India|
|     4|Prantosh| 17|200000|     Kolkata|   India|
|     5|  Vikash| 31|300000|        null|nominee5|
+------+--------+---+------+------------+--------+



In [0]:
new_emp_df.printSchema()

root
 |-- emp_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee: string (nullable = true)



In [0]:
## cast data type

new_emp_df.withColumn('emp_id', col('emp_id').cast('string'))\
    .withColumn('name', col('name').cast('integer'))\
    .printSchema()

root
 |-- emp_id: string (nullable = true)
 |-- name: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee: string (nullable = true)



In [0]:
new_emp_df.drop('emp_id', col('salary')).show()

+--------+---+------------+--------+
|    name|age|     address| nominee|
+--------+---+------------+--------+
|  Manish| 26|       bihar|nominee1|
|  Nikita| 23|uttarpradesh|nominee2|
|  Pritam| 22|   Bangalore|   India|
|Prantosh| 17|     Kolkata|   India|
|  Vikash| 31|        null|nominee5|
+--------+---+------------+--------+



In [0]:
# ⁠add n number of months and n number of days to ficen conditional date
# from pyspark.sql.functions import col, to_date, add_months, date_add
# from pyspark.sql import SparkSession

data = [("2025-04-28",), ("2024-11-10",)]
df = spark.createDataFrame(data, ["input_date"])

# Convert string to DateType
df = df.withColumn("input_date", to_date("input_date"))

# Step 1: Add months
df_months_added = df.withColumn("after_months", add_months(col("input_date"), 2))

# Step 2: Add days on top of the already modified date
df_final = df_months_added.withColumn("final_date", date_add(col("after_months"), 10))

# Display result
display(df_final)


input_date,after_months,final_date
2025-04-28,2025-06-28,2025-07-08
2024-11-10,2025-01-10,2025-01-20


In [0]:
data1 = [('2020-05-20',), ('2018-01-12',), ('2022-08-23',)]
df1 = spark.createDataFrame(data1, ['input_date'])
# display(df1)

df_m = df1.withColumn('added_months', add_months(col('input_date'), 4))
# df_m.show()

df_d = df_m.withColumn('added_days', date_add('added_months', 15))
df_d.show()


+----------+------------+----------+
|input_date|added_months|added_days|
+----------+------------+----------+
|2020-05-20|  2020-09-20|2020-10-05|
|2018-01-12|  2018-05-12|2018-05-27|
|2022-08-23|  2022-12-23|2023-01-07|
+----------+------------+----------+



In [0]:
data = [(1, 1000, 1200), (2, 500, 750)]
columns = ["id", "year1_value", "year2_value"]

df = spark.createDataFrame(data, columns)

df_percentage = df.withColumn("percentage_diff", ((col("year2_value") - col("year1_value")) / col("year1_value")) * 100)

df_percentage.show()

+---+-----------+-----------+---------------+
| id|year1_value|year2_value|percentage_diff|
+---+-----------+-----------+---------------+
|  1|       1000|       1200|           20.0|
|  2|        500|        750|           50.0|
+---+-----------+-----------+---------------+



In [0]:
df_percentage.withColumn("round_per", round('percentage_diff', 2)).show()


+---+-----------+-----------+---------------+---------+
| id|year1_value|year2_value|percentage_diff|round_per|
+---+-----------+-----------+---------------+---------+
|  1|       1000|       1200|           20.0|     20.0|
|  2|        500|        750|           50.0|     50.0|
+---+-----------+-----------+---------------+---------+



In [0]:
my_data=[(10 ,'Anil',50000, 18),
(11 ,'Vikas',75000,  16),
(12 ,'Nisha',40000,  18),
(13 ,'Nidhi',60000,  17),
(14 ,'Priya',80000,  18),
(15 ,'Mohit',45000,  18),
(16 ,'Rajesh',90000, 10),
(17 ,'Raman',55000, 16),
(18 ,'Sam',65000,   17)]

my_schema = ['id', 'name', 'salary', 'manager_id']

man_df = spark.createDataFrame(my_data, my_schema)
# man_df.show()
dup_my_data = [(18 ,'Sam',65000, 17),
               (18 ,'Sam',65000, 18),
               (19 ,'Sohan',50000, 18),
(20 ,'Sima',75000,  17)]

dup_man_df = spark.createDataFrame(dup_my_data, my_schema)
dup_man_df.show()

+---+-----+------+----------+
| id| name|salary|manager_id|
+---+-----+------+----------+
| 18|  Sam| 65000|        17|
| 18|  Sam| 65000|        18|
| 19|Sohan| 50000|        18|
| 20| Sima| 75000|        17|
+---+-----+------+----------+



In [0]:
man_df.union(dup_man_df).count()

Out[77]: 13

In [0]:
man_df.unionAll(dup_man_df).count()

Out[78]: 13

In [0]:
wrong_column_data=[(19 ,50000, 18,'Sohan'),
(20 ,75000,  17,'Sima')]

wrong_schema = ['id', 'salary', 'manager_id', 'name']

wrong_df = spark.createDataFrame(data=wrong_column_data, schema=wrong_schema)
wrong_df.show()

+---+------+----------+-----+
| id|salary|manager_id| name|
+---+------+----------+-----+
| 19| 50000|        18|Sohan|
| 20| 75000|        17| Sima|
+---+------+----------+-----+



In [0]:
man_df.union(wrong_df).show()       # didnt give error just adds data

+---+------+------+----------+
| id|  name|salary|manager_id|
+---+------+------+----------+
| 10|  Anil| 50000|        18|
| 11| Vikas| 75000|        16|
| 12| Nisha| 40000|        18|
| 13| Nidhi| 60000|        17|
| 14| Priya| 80000|        18|
| 15| Mohit| 45000|        18|
| 16|Rajesh| 90000|        10|
| 17| Raman| 55000|        16|
| 18|   Sam| 65000|        17|
| 19| 50000|    18|     Sohan|
| 20| 75000|    17|      Sima|
+---+------+------+----------+



In [0]:
man_df.unionByName(wrong_df).show()       # didnt give error just adds data

+---+------+------+----------+
| id|  name|salary|manager_id|
+---+------+------+----------+
| 10|  Anil| 50000|        18|
| 11| Vikas| 75000|        16|
| 12| Nisha| 40000|        18|
| 13| Nidhi| 60000|        17|
| 14| Priya| 80000|        18|
| 15| Mohit| 45000|        18|
| 16|Rajesh| 90000|        10|
| 17| Raman| 55000|        16|
| 18|   Sam| 65000|        17|
| 19| Sohan| 50000|        18|
| 20|  Sima| 75000|        17|
+---+------+------+----------+



In [0]:
man_df.sort(col('salary').desc()).show()

+---+------+------+----------+
| id|  name|salary|manager_id|
+---+------+------+----------+
| 16|Rajesh| 90000|        10|
| 14| Priya| 80000|        18|
| 11| Vikas| 75000|        16|
| 18|   Sam| 65000|        17|
| 13| Nidhi| 60000|        17|
| 17| Raman| 55000|        16|
| 10|  Anil| 50000|        18|
| 15| Mohit| 45000|        18|
| 12| Nisha| 40000|        18|
+---+------+------+----------+



In [0]:
man_df.select('salary', 'name').distinct().show()

+------+------+
|salary|  name|
+------+------+
| 50000|  Anil|
| 75000| Vikas|
| 40000| Nisha|
| 60000| Nidhi|
| 80000| Priya|
| 45000| Mohit|
| 90000|Rajesh|
| 55000| Raman|
| 65000|   Sam|
+------+------+



In [0]:
man_df.drop_duplicates('name', 'salary').show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
File [0;32m<command-2623574814266193>:1[0m
[0;32m----> 1[0m [43mman_df[49m[38;5;241;43m.[39;49m[43mdrop_duplicates[49m[43m([49m[38;5;124;43m'[39;49m[38;5;124;43mname[39;49m[38;5;124;43m'[39;49m[43m,[49m[43m [49m[38;5;124;43m'[39;49m[38;5;124;43msalary[39;49m[38;5;124;43m'[39;49m[43m)[49m[38;5;241m.[39mshow()

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     47[0m [38;5;28;01mtry[39;00m:
[0;32m---> 48[0m     res [38;5;241m=[39m [43mfunc[49m[43m([49m[38;5;241;43m*[39;49m[43margs[49m[43m,[49m[43m [49m[38;5;241;43m*[39;49m[38;5;241;43m*[39;49m[43mkwargs[49m[43m)[49m
[1;32m 

In [0]:
man_df.drop_duplicates(['name', 'salary']).show()

+---+------+------+----------+
| id|  name|salary|manager_id|
+---+------+------+----------+
| 10|  Anil| 50000|        18|
| 11| Vikas| 75000|        16|
| 12| Nisha| 40000|        18|
| 13| Nidhi| 60000|        17|
| 14| Priya| 80000|        18|
| 15| Mohit| 45000|        18|
| 16|Rajesh| 90000|        10|
| 17| Raman| 55000|        16|
| 18|   Sam| 65000|        17|
+---+------+------+----------+



In [0]:
emp_d = [
(1,'manish',26,20000,'india','IT'),
(2,'rahul',None,40000,'germany','engineering'),
(3,'pawan',12,60000,'india','sales'),
(4,'roshini',44,None,'uk','engineering'),
(5,'raushan',35,70000,'india','sales'),
(6,None,29,200000,'uk','IT'),
(7,'adam',37,65000,'us','IT'),
(8,'chris',16,40000,'us','sales'),
(None,None,None,None,None,None),
(7,'adam',37,65000,'us','IT')
]

sch1 = ['id', 'name', 'age', 'salary', 'country', 'dept']

dept_df = spark.createDataFrame(data=emp_d, schema=sch1)
dept_df.show()

+----+-------+----+------+-------+-----------+
|  id|   name| age|salary|country|       dept|
+----+-------+----+------+-------+-----------+
|   1| manish|  26| 20000|  india|         IT|
|   2|  rahul|null| 40000|germany|engineering|
|   3|  pawan|  12| 60000|  india|      sales|
|   4|roshini|  44|  null|     uk|engineering|
|   5|raushan|  35| 70000|  india|      sales|
|   6|   null|  29|200000|     uk|         IT|
|   7|   adam|  37| 65000|     us|         IT|
|   8|  chris|  16| 40000|     us|      sales|
|null|   null|null|  null|   null|       null|
|   7|   adam|  37| 65000|     us|         IT|
+----+-------+----+------+-------+-----------+



In [0]:
dept_df.select('*').count()

Out[94]: 10

In [0]:
dept_df.select(count('name')).show()               # null excluded

+-----------+
|count(name)|
+-----------+
|          8|
+-----------+



In [0]:
dept_df.select(count('salary'), avg('salary').cast('integer'), min('age').cast('float').alias('f_age'), sum('age')).show()

+-------------+------------------------+-----+--------+
|count(salary)|CAST(avg(salary) AS INT)|f_age|sum(age)|
+-------------+------------------------+-----+--------+
|            8|                   70000| 12.0|     236|
+-------------+------------------------+-----+--------+



In [0]:
dept_df.select(max('age').alias('max_age'), sum('age').alias('sum_age')).show()

+-------+-------+
|max_age|sum_age|
+-------+-------+
|     44|    236|
+-------+-------+



In [0]:
dept_df.groupBy('dept').sum('salary').show()     # or => dept_df.groupBy('dept').agg(sum('salary')).show()

+-----------+-----------+
|       dept|sum(salary)|
+-----------+-----------+
|         IT|     350000|
|engineering|      40000|
|      sales|     170000|
|       null|       null|
+-----------+-----------+



In [0]:
dept_df.show(5)

+---+-------+----+------+-------+-----------+
| id|   name| age|salary|country|       dept|
+---+-------+----+------+-------+-----------+
|  1| manish|  26| 20000|  india|         IT|
|  2|  rahul|null| 40000|germany|engineering|
|  3|  pawan|  12| 60000|  india|      sales|
|  4|roshini|  44|  null|     uk|engineering|
|  5|raushan|  35| 70000|  india|      sales|
+---+-------+----+------+-------+-----------+
only showing top 5 rows



In [0]:
dept_df.withColumn('age', when(col('age').isNull(), lit(50)).otherwise(col('age')))\
    .withColumn('adult', when(col('age')<18, 'minor')
        .when((col('age')>18) & (col('age')<30), 'mid_level')
            .otherwise('major')).show()

+----+-------+---+------+-------+-----------+---------+
|  id|   name|age|salary|country|       dept|    adult|
+----+-------+---+------+-------+-----------+---------+
|   1| manish| 26| 20000|  india|         IT|mid_level|
|   2|  rahul| 50| 40000|germany|engineering|    major|
|   3|  pawan| 12| 60000|  india|      sales|    minor|
|   4|roshini| 44|  null|     uk|engineering|    major|
|   5|raushan| 35| 70000|  india|      sales|    major|
|   6|   null| 29|200000|     uk|         IT|mid_level|
|   7|   adam| 37| 65000|     us|         IT|    major|
|   8|  chris| 16| 40000|     us|      sales|    minor|
|null|   null| 50|  null|   null|       null|    major|
|   7|   adam| 37| 65000|     us|         IT|    major|
+----+-------+---+------+-------+-----------+---------+



In [0]:
window1 = Window.partitionBy('dept')
dept_df.withColumn('first_val', first('salary').over(window1)).show(5)

+----+------+----+------+-------+----+---------+
|  id|  name| age|salary|country|dept|first_val|
+----+------+----+------+-------+----+---------+
|null|  null|null|  null|   null|null|     null|
|   1|manish|  26| 20000|  india|  IT|    20000|
|   6|  null|  29|200000|     uk|  IT|    20000|
|   7|  adam|  37| 65000|     us|  IT|    20000|
|   7|  adam|  37| 65000|     us|  IT|    20000|
+----+------+----+------+-------+----+---------+
only showing top 5 rows



In [0]:
window1 = Window.partitionBy('dept').orderBy('salary').rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

fl_df = dept_df.withColumn('first_val', first('salary').over(window1))\
    .withColumn('last_val', last('salary').over(window1))
fl_df.show()

+----+-------+----+------+-------+-----------+---------+--------+
|  id|   name| age|salary|country|       dept|first_val|last_val|
+----+-------+----+------+-------+-----------+---------+--------+
|null|   null|null|  null|   null|       null|     null|    null|
|   1| manish|  26| 20000|  india|         IT|    20000|  200000|
|   7|   adam|  37| 65000|     us|         IT|    20000|  200000|
|   7|   adam|  37| 65000|     us|         IT|    20000|  200000|
|   6|   null|  29|200000|     uk|         IT|    20000|  200000|
|   4|roshini|  44|  null|     uk|engineering|     null|   40000|
|   2|  rahul|null| 40000|germany|engineering|     null|   40000|
|   8|  chris|  16| 40000|     us|      sales|    40000|   70000|
|   3|  pawan|  12| 60000|  india|      sales|    40000|   70000|
|   5|raushan|  35| 70000|  india|      sales|    40000|   70000|
+----+-------+----+------+-------+-----------+---------+--------+



In [0]:
emp_data1 = [
(1,'manish',26,20000,'india'),
(2,'rahul',None,40000,'germany'),
(3,'pawan',12,60000,'india'),
(4,'roshini',44,None,'uk')]

ss_schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('name', StringType(), True),
    StructField('age', IntegerType(), True),
    StructField('salary', IntegerType(), True),
    StructField('country', StringType(), True)
])

ssdf = spark.createDataFrame(emp_data1, ss_schema)
ssdf.show()

+---+-------+----+------+-------+
| id|   name| age|salary|country|
+---+-------+----+------+-------+
|  1| manish|  26| 20000|  india|
|  2|  rahul|null| 40000|germany|
|  3|  pawan|  12| 60000|  india|
|  4|roshini|  44|  null|     uk|
+---+-------+----+------+-------+



In [0]:
data = [("A", "2022", 100),
    ("A", "2023", 150),
    ("B", "2022", 80),
    ("B", "2023", 120)]

df = spark.createDataFrame(data, ["category", "year", "sales"])
df.show()
pivot_df = df.groupBy("category").pivot("year").sum("sales")
pivot_df.show()

+--------+----+-----+
|category|year|sales|
+--------+----+-----+
|       A|2022|  100|
|       A|2023|  150|
|       B|2022|   80|
|       B|2023|  120|
+--------+----+-----+

+--------+----+----+
|category|2022|2023|
+--------+----+----+
|       B|  80| 120|
|       A| 100| 150|
+--------+----+----+



In [0]:
data_1 = [(1, '2024-05-10'),
(2, '2024-06-15'),
(3, '2024-07-01')]

dtdf = spark.createDataFrame(data_1, ['id', 'date'])
dtdf.show()

+---+----------+
| id|      date|
+---+----------+
|  1|2024-05-10|
|  2|2024-06-15|
|  3|2024-07-01|
+---+----------+



In [0]:
dtdf.withColumn("datetime", to_timestamp('date', 'yyyy-MM-dd'))\
    .withColumn('next4days', date_add('datetime', 4))\
        .withColumn("months", add_months('next4days', 2))\
            .withColumn("back3days", date_sub('datetime', 3))\
                .withColumn('day_no', dayofweek('datetime'))\
                    .withColumn('difffromtoday', datediff(current_date(), 'date'))\
                        .withColumn('months_diff', round(months_between(current_date(), 'date'), 2))\
                            .withColumn('dayname', date_format('date', 'EEEE'))\
                                .withColumn('MONTHNAME', date_format('date', 'MMMM'))\
                                    .withColumn('month_no', date_format('date', 'MM'))\
                                        .withColumn('det', date_format('date', 'dd'))\
                                            .show()
                                            # .withColumn('year', date_format('date', 'yyyy'))\          

+---+----------+-------------------+----------+----------+----------+------+-------------+-----------+--------+---------+--------+---+
| id|      date|           datetime| next4days|    months| back3days|day_no|difffromtoday|months_diff| dayname|MONTHNAME|month_no|det|
+---+----------+-------------------+----------+----------+----------+------+-------------+-----------+--------+---------+--------+---+
|  1|2024-05-10|2024-05-10 00:00:00|2024-05-14|2024-07-14|2024-05-07|     6|          355|      11.65|  Friday|      May|      05| 10|
|  2|2024-06-15|2024-06-15 00:00:00|2024-06-19|2024-08-19|2024-06-12|     7|          319|      10.48|Saturday|     June|      06| 15|
|  3|2024-07-01|2024-07-01 00:00:00|2024-07-05|2024-09-05|2024-06-28|     2|          303|       9.94|  Monday|     July|      07| 01|
+---+----------+-------------------+----------+----------+----------+------+-------------+-----------+--------+---------+--------+---+



In [0]:
emp_data = [(1,"manish","11-07-2023","10:20"),
        (1,"manish","11-07-2023","11:20"),
        (2,"rajesh","11-07-2023","11:20"),
        (1,"manish","11-07-2023","11:50"),
        (2,"rajesh","11-07-2023","13:20"),
        (1,"manish","11-07-2023","19:20"),
        (2,"rajesh","11-07-2023","17:20"),
        (1,"manish","12-07-2023","10:32"),
        (1,"manish","12-07-2023","12:20"),
        (3,"vikash","12-07-2023","09:12"),
        (1,"manish","12-07-2023","16:23"),
        (3,"vikash","12-07-2023","18:08")]

emp_schema = ["id", "name", "date", "time"]
dt_df = spark.createDataFrame(data=emp_data, schema=emp_schema)

d1 = dt_df.withColumn('timestamp', from_unixtime(unix_timestamp(expr('concat(date, " ", time)'), 'dd-MM-yyyy HH:mm')))
d1.show(5)

+---+------+----------+-----+-------------------+
| id|  name|      date| time|          timestamp|
+---+------+----------+-----+-------------------+
|  1|manish|11-07-2023|10:20|2023-07-11 10:20:00|
|  1|manish|11-07-2023|11:20|2023-07-11 11:20:00|
|  2|rajesh|11-07-2023|11:20|2023-07-11 11:20:00|
|  1|manish|11-07-2023|11:50|2023-07-11 11:50:00|
|  2|rajesh|11-07-2023|13:20|2023-07-11 13:20:00|
+---+------+----------+-----+-------------------+
only showing top 5 rows



In [0]:
%run /Users/gpc1194@gmail.com/write_file

+---+------+-----+
| id|  Name|Value|
+---+------+-----+
|  0|User_0|    0|
|  1|User_1|   10|
|  2|User_2|   20|
|  3|User_3|   30|
|  4|User_4|   40|
+---+------+-----+
only showing top 5 rows



path,name,size,modificationTime
dbfs:/FileStore/tables/large_csv/_committed_2822074481379389151,_committed_2822074481379389151,1463,1746012521000
dbfs:/FileStore/tables/large_csv/_committed_3501629668587049354,_committed_3501629668587049354,1463,1746013446000
dbfs:/FileStore/tables/large_csv/_committed_3961489446787236986,_committed_3961489446787236986,1463,1746012744000
dbfs:/FileStore/tables/large_csv/_committed_4336225920058035292,_committed_4336225920058035292,1463,1746013517000
dbfs:/FileStore/tables/large_csv/_committed_5005893022092808214,_committed_5005893022092808214,1474,1746011538000
dbfs:/FileStore/tables/large_csv/_committed_6156657614402819135,_committed_6156657614402819135,1463,1746012401000
dbfs:/FileStore/tables/large_csv/_committed_6825651536842820972,_committed_6825651536842820972,744,1746009951000
dbfs:/FileStore/tables/large_csv/_committed_772670762137022899,_committed_772670762137022899,1455,1746013610000
dbfs:/FileStore/tables/large_csv/_committed_8185455039847280034,_committed_8185455039847280034,1463,1746013580000
dbfs:/FileStore/tables/large_csv/_committed_8354869218294077964,_committed_8354869218294077964,1463,1746013411000


In [0]:
fun1() 
print(var1) 

Hello World
42


In [0]:
large_df.printSchema()
large_df.show(6)

root
 |-- id: long (nullable = false)
 |-- Name: string (nullable = false)
 |-- Value: long (nullable = false)

+---+------+-----+
| id|  Name|Value|
+---+------+-----+
|  0|User_0|    0|
|  1|User_1|   10|
|  2|User_2|   20|
|  3|User_3|   30|
|  4|User_4|   40|
|  5|User_5|   50|
+---+------+-----+
only showing top 6 rows

