In [0]:
# Load the flight-data CSV from DBFS into a DataFrame.
#   header=true       -> the first row supplies the column names
#   inferSchema=true  -> column types are inferred from the data
#   mode=permissive   -> parse mode for malformed rows; the alternatives are
#                        'failfast' and 'dropmalformed'
# The chain is wrapped in parentheses instead of using trailing backslashes:
# a comment placed after a line-continuation backslash is a SyntaxError.
read_csv_data = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .option('mode', 'permissive')
    .load('/FileStore/tables/flight_data.csv')
)

read_csv_data.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



In [0]:
# List the contents of the DBFS tables directory; this is the dbutils call
# that the `%fs ls` shorthand stands for.
display(dbutils.fs.ls('/FileStore/tables/'))

path,name,size,modificationTime
dbfs:/FileStore/tables/address_gender_partition/,address_gender_partition/,0,0
dbfs:/FileStore/tables/address_partition/,address_partition/,0,0
dbfs:/FileStore/tables/bad_records/,bad_records/,0,0
dbfs:/FileStore/tables/bucket_id/,bucket_id/,0,0
dbfs:/FileStore/tables/corrupted_json.json,corrupted_json.json,214,1745302675000
dbfs:/FileStore/tables/csv_write/,csv_write/,0,0
dbfs:/FileStore/tables/csv_write_par/,csv_write_par/,0,0
dbfs:/FileStore/tables/disk_part-1.csv,disk_part-1.csv,429,1745314617000
dbfs:/FileStore/tables/disk_part.csv,disk_part.csv,726,1745312118000
dbfs:/FileStore/tables/employee.csv,employee.csv,225,1745235607000


In [0]:
# Show the column names of the loaded flight-data DataFrame as a Python list.
read_csv_data.columns

Out[10]: ['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']

In [0]:
# Preview the first ten rows of the flight data.
read_csv_data.show(n=10)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
|    United States|          Singapore|   25|
|    United States|            Grenada|   54|
|       Costa Rica|      United States|  477|
|          Senegal|      United States|   29|
|    United States|   Marshall Islands|   44|
+-----------------+-------------------+-----+
only showing top 10 rows



In [0]:
from datetime import datetime

# Capture the current local timestamp once, then derive every field from it.
data1 = datetime.now()
month1 = data1.month                # month as a number
m1 = data1.strftime('%B')           # full month name
day_name = data1.strftime('%A')     # full weekday name

# One value per line: timestamp, month number, month name, weekday name.
print(data1, month1, m1, day_name, sep='\n')

2025-04-28 11:53:36.996544
4
April
Monday


In [0]:
# NOTE(review): this wildcard import is what brings `month` and `year` into
# scope for later cells, and it can shadow Python builtins of the same name;
# consider explicit imports.
from pyspark.sql.functions import *

# Single-row DataFrame (id = 1) stamped with the current timestamp.
df = (
    spark.createDataFrame(data=[(1, )], schema=['id'])
    .withColumn('current_time', current_timestamp())
)
df.show(truncate=False)

+---+-----------------------+
|id |current_time           |
+---+-----------------------+
|1  |2025-04-28 12:11:07.628|
+---+-----------------------+



In [0]:
# Import every function this cell calls. `month` was previously available only
# through the wildcard import in an earlier cell.
from pyspark.sql.functions import date_format, month

# Derive calendar fields from the `current_time` timestamp column:
#   month      -> month number
#   month_name -> full month name   (pattern "MMMM")
#   day_name   -> full weekday name (pattern "EEEE")
df2 = (
    df
    .withColumn("month", month("current_time"))
    .withColumn("month_name", date_format("current_time", "MMMM"))
    .withColumn("day_name", date_format("current_time", "EEEE"))
)

display(df2)

id,current_time,month,month_name,day_name
1,2025-04-28T12:13:04.203+0000,4,April,Monday


In [0]:
# Import every function this cell calls. `date_format`, `month` and `year`
# were previously available only through imports made in earlier cells.
from pyspark.sql.functions import current_date, date_format, month, year

# Single-row DataFrame (id = 1) carrying today's date.
df1 = spark.createDataFrame(data=[(1, )], schema=['id']).withColumn('today', current_date())

# Break the date into its full month name, year and month number.
df_m_y = (
    df1
    .withColumn('month_name', date_format('today', 'MMMM'))
    .withColumn('year', year('today'))
    .withColumn('month', month('today'))
)
display(df_m_y)

id,today,month_name,year,month
1,2025-04-28,April,2025,4


In [0]:
%pip install requests

Python interpreter will be restarted.
Python interpreter will be restarted.


In [0]:
import os

import requests
from pyspark.sql import SparkSession

# spark = SparkSession.builder.appName("API_Post").getOrCreate()   # already available in Databricks

# SECURITY: the bearer token must not be stored in the notebook. It is read
# from the environment instead (on Databricks this can be a cluster environment
# variable backed by a secret). The token that was previously committed here
# should be treated as leaked and rotated.
BEARER_TOKEN = os.environ.get("MOCKAROO_BEARER_TOKEN")
if not BEARER_TOKEN:
    raise RuntimeError(
        "MOCKAROO_BEARER_TOKEN is not set; provide the API bearer token via "
        "the environment instead of hardcoding it in the notebook."
    )

HEADERS = {
    "Authorization": f"Bearer {BEARER_TOKEN}",
    "Accept": "application/json",
}

# Without a timeout, requests can wait indefinitely on an unresponsive server.
REQUEST_TIMEOUT_SECONDS = 30

response = requests.get(
    'https://fabricate.mockaroo.com/api/v1/databases/report/api/users',
    headers=HEADERS,
    timeout=REQUEST_TIMEOUT_SECONDS,
)
# Fail here on a 4xx/5xx response rather than passing an error payload
# downstream as if it were data.
response.raise_for_status()

res1 = requests.get(
    "https://fabricate.mockaroo.com/api/v1/databases/report/api/products",
    headers=HEADERS,
    timeout=REQUEST_TIMEOUT_SECONDS,
)
res1.raise_for_status()

# Decoded JSON payloads: `data` holds the users response, `data1` the products response.
data = response.json()
data1 = res1.json()

In [0]:
# Build a DataFrame from the decoded users payload (schema is inferred) and
# preview the first five rows.
user_df = spark.createDataFrame(data)
user_df.show(n=5)

+--------------------+------+---+--------------+----------------+
|               email|gender| id|      location|            name|
+--------------------+------+---+--------------+----------------+
|arabele.jeskin@ho...|Female|  1|      Staxigoe|  Arabele Jeskin|
|saul.pantone@aol.com|  Male|  2|   Cluj-Napoca|    Saul Pantone|
|brand.baudic@plan...|  Male|  3|  Yeraganahlli|    Brand Baudic|
|annemarie.jedrys@...|Female|  4|     Lynnfield|Annemarie Jedrys|
|nelia.spafford@gm...|Female|  5|Mount Pleasant|  Nelia Spafford|
+--------------------+------+---+--------------+----------------+
only showing top 5 rows



In [0]:
# Build a DataFrame from the decoded products payload (schema is inferred) and
# preview the first five rows.
product_df = spark.createDataFrame(data1)
product_df.show(n=5)

+---+----------------+--------------------+--------+-------+
| id|product_category|        product_name|quantity|user_id|
+---+----------------+--------------------+--------+-------+
|  1|         Fitness|Stainless Steel W...|       1|     17|
|  2|    Food - Dairy|Feta Cheese Crumbles|       2|     49|
|  3|         Kitchen|      Coffee Grinder|       3|     84|
|  4|    Food - Meats|Savory Breakfast ...|       4|     70|
|  5|         Kitchen|Stainless Steel C...|       5|     73|
+---+----------------+--------------------+--------+-------+
only showing top 5 rows



In [0]:
# Inner join: keep only users that have at least one matching product row.
# NOTE(review): both inputs have an `id` column, so the result carries two
# columns named `id`.
user_df.join(
    product_df,
    on=user_df['id'] == product_df['user_id'],
    how='inner',
).show(n=5)

+--------------------+------+---+--------------+----------------+---+--------------------+--------------------+--------+-------+
|               email|gender| id|      location|            name| id|    product_category|        product_name|quantity|user_id|
+--------------------+------+---+--------------+----------------+---+--------------------+--------------------+--------+-------+
|arabele.jeskin@ho...|Female|  1|      Staxigoe|  Arabele Jeskin| 63|   Food - Condiments|Vegan Caesar Dres...|      63|      1|
|saul.pantone@aol.com|  Male|  2|   Cluj-Napoca|    Saul Pantone| 92|   Food - Condiments|         Salsa Verde|      92|      2|
|brand.baudic@plan...|  Male|  3|  Yeraganahlli|    Brand Baudic| 53|                Pets|Portable Pet Wate...|      53|      3|
|annemarie.jedrys@...|Female|  4|     Lynnfield|Annemarie Jedrys| 70|              Health|Deep Tissue Massa...|      70|      4|
|nelia.spafford@gm...|Female|  5|Mount Pleasant|  Nelia Spafford| 34|Food - Prepared F...|   Chic

In [0]:
# Left join: keep every user, with product columns left null where no product
# row matches.
# NOTE(review): both inputs have an `id` column, so the result carries two
# columns named `id`.
user_df.join(
    product_df,
    on=user_df['id'] == product_df['user_id'],
    how='left',
).show(n=5)

+--------------------+------+---+--------------+----------------+---+--------------------+--------------------+--------+-------+
|               email|gender| id|      location|            name| id|    product_category|        product_name|quantity|user_id|
+--------------------+------+---+--------------+----------------+---+--------------------+--------------------+--------+-------+
|arabele.jeskin@ho...|Female|  1|      Staxigoe|  Arabele Jeskin| 63|   Food - Condiments|Vegan Caesar Dres...|      63|      1|
|saul.pantone@aol.com|  Male|  2|   Cluj-Napoca|    Saul Pantone| 92|   Food - Condiments|         Salsa Verde|      92|      2|
|brand.baudic@plan...|  Male|  3|  Yeraganahlli|    Brand Baudic| 53|                Pets|Portable Pet Wate...|      53|      3|
|annemarie.jedrys@...|Female|  4|     Lynnfield|Annemarie Jedrys| 70|              Health|Deep Tissue Massa...|      70|      4|
|nelia.spafford@gm...|Female|  5|Mount Pleasant|  Nelia Spafford| 34|Food - Prepared F...|   Chic