In [0]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

n1 = datetime.now()
print(n1)
d1 = n1.strftime('%B')
print(d1)

backd = n1 - relativedelta(years=2, months=3, days=12)
print(backd.year)

t1 = n1 - timedelta(weeks=3)
print(t1)
# 

2025-05-05 11:08:18.766257
%
2023
2025-05-05 11:08:15.766257


In [0]:
data1 = [
    (1, 101,     10, 2.5, '2024-01-15'),
(2, 105,    5,  2.5, '2024-01-28'),
(3, 102,  7,  1.2, '2024-01-10'),
(4, 106,   4,  1.2, '2024-02-01'),
(5, 103,   3,  3.0, '2024-02-15'),
(6, 104,    8,  2.5, '2024-02-20'),
(7, 107,   6,  3.0, '2024-03-05')
]

sche1 = ['sale_id', 'product_id', 'quantity', 'price', 'sale_date']

df1 = spark.createDataFrame(data1, sche1)
df1.show()

+-------+----------+--------+-----+----------+
|sale_id|product_id|quantity|price| sale_date|
+-------+----------+--------+-----+----------+
|      1|       101|      10|  2.5|2024-01-15|
|      2|       105|       5|  2.5|2024-01-28|
|      3|       102|       7|  1.2|2024-01-10|
|      4|       106|       4|  1.2|2024-02-01|
|      5|       103|       3|  3.0|2024-02-15|
|      6|       104|       8|  2.5|2024-02-20|
|      7|       107|       6|  3.0|2024-03-05|
+-------+----------+--------+-----+----------+



In [0]:
%sql
create procedure getdata()
begin

select prdo_id, date_format(sale_date, '%Y-%m') as sale_month,
sum(quantity) as total_qu, sum(price) as total_price from t1
groupby prdo_id, date_format(sale_date, '%Y-%m') orderby prdo_id, sale_month;

end;

In [0]:
%sql

-- SQL engines (like MySQL) don’t allow using an alias in the GROUP BY clause directly...

CREATE PROCEDURE calculate_monthly_sales()
BEGIN
    SELECT
        product_id,
        DATE_FORMAT(sale_date, '%Y-%m') AS sale_month,
        SUM(quantity) AS total_quantity,
        SUM(quantity * price) AS total_sales
    FROM sales_data
    GROUP BY product_id, DATE_FORMAT(sale_date, '%Y-%m')
    ORDER BY product_id, sale_month;
END;


In [0]:
from pyspark.sql.functions import *

def calculate_monthly_sales(df):
    # Create a new column 'sale_month' for month grouping
    df_monthly = df.withColumn("sale_month", date_format("sale_date", "yyyy-MM"))
    # print(df_monthly)
    # Group by product_id, product_name, and sale_month
    result = df_monthly.groupBy("product_id", "sale_month") \
        .agg(
            sum("quantity").alias("total_quantity"),
            sum(col("quantity") * col("price")).alias("total_sales")
        ) \
        .orderBy("product_id", "sale_month")
    
    return result

monthly_sales_summary = calculate_monthly_sales(df1)
monthly_sales_summary.show()


+----------+----------+--------------+-----------+
|product_id|sale_month|total_quantity|total_sales|
+----------+----------+--------------+-----------+
|       101|   2024-01|            10|       25.0|
|       102|   2024-01|             7|        8.4|
|       103|   2024-02|             3|        9.0|
|       104|   2024-02|             8|       20.0|
|       105|   2024-01|             5|       12.5|
|       106|   2024-02|             4|        4.8|
|       107|   2024-03|             6|       18.0|
+----------+----------+--------------+-----------+



In [0]:
import pandas as pd

url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv'
    
df1 = pd.read_csv(url, sep = '\t')
df1.head(10)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",$10.98
6,3,1,Side of Chips,,$1.69
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",$11.75
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",$9.25
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",$9.25


In [0]:
# Solution 1

df1.shape[0]  # entries <= 4622 observations

Out[30]: 4622

In [0]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4622 entries, 0 to 4621
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   order_id            4622 non-null   int64 
 1   quantity            4622 non-null   int64 
 2   item_name           4622 non-null   object
 3   choice_description  3376 non-null   object
 4   item_price          4622 non-null   object
dtypes: int64(2), object(3)
memory usage: 180.7+ KB


In [0]:
df1.columns

Out[32]: Index(['order_id', 'quantity', 'item_name', 'choice_description',
       'item_price'],
      dtype='object')

In [0]:
# For the most-ordered item, how many items were ordered?

df1 = df1.groupby('item_name')
df1 = df1.sum()
df1 = df1.sort_values(['quantity'], ascending=False)
df1.head(1)

Unnamed: 0_level_0,order_id,quantity
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Chicken Bowl,713926,761


In [0]:
# How many items were orderd in total?

t1 = df1.quantity.sum()
t1

Out[37]: 4972

In [0]:
rev = (df1['quantity'] * df1['item_price']).sum()
rev

[0;31m---------------------------------------------------------------------------[0m
[0;31mKeyError[0m                                  Traceback (most recent call last)
File [0;32m/databricks/python/lib/python3.9/site-packages/pandas/core/indexes/base.py:3621[0m, in [0;36mIndex.get_loc[0;34m(self, key, method, tolerance)[0m
[1;32m   3620[0m [38;5;28;01mtry[39;00m:
[0;32m-> 3621[0m     [38;5;28;01mreturn[39;00m [38;5;28;43mself[39;49m[38;5;241;43m.[39;49m[43m_engine[49m[38;5;241;43m.[39;49m[43mget_loc[49m[43m([49m[43mcasted_key[49m[43m)[49m
[1;32m   3622[0m [38;5;28;01mexcept[39;00m [38;5;167;01mKeyError[39;00m [38;5;28;01mas[39;00m err:

File [0;32m/databricks/python/lib/python3.9/site-packages/pandas/_libs/index.pyx:136[0m, in [0;36mpandas._libs.index.IndexEngine.get_loc[0;34m()[0m

File [0;32m/databricks/python/lib/python3.9/site-packages/pandas/_libs/index.pyx:163[0m, in [0;36mpandas._libs.index.IndexEngine.get_loc[0;34m()[0m

File

In [0]:
# How many orders were made in the period?

os1 = df1.order_id.value_counts().count()
os1

Out[41]: 50

In [0]:
# How many different items are sold?

s01 = df1.item_name.value_counts().count()
s01

[0;31m---------------------------------------------------------------------------[0m
[0;31mAttributeError[0m                            Traceback (most recent call last)
File [0;32m<command-290789362603183>:3[0m
[1;32m      1[0m [38;5;66;03m# How many different items are sold?[39;00m
[0;32m----> 3[0m s01 [38;5;241m=[39m df1[38;5;241m.[39mitem_name[38;5;241m.[39mvalue_counts()[38;5;241m.[39mcount()
[1;32m      4[0m s01

File [0;32m/databricks/python/lib/python3.9/site-packages/pandas/core/generic.py:5575[0m, in [0;36mNDFrame.__getattr__[0;34m(self, name)[0m
[1;32m   5568[0m [38;5;28;01mif[39;00m (
[1;32m   5569[0m     name [38;5;129;01mnot[39;00m [38;5;129;01min[39;00m [38;5;28mself[39m[38;5;241m.[39m_internal_names_set
[1;32m   5570[0m     [38;5;129;01mand[39;00m name [38;5;129;01mnot[39;00m [38;5;129;01min[39;00m [38;5;28mself[39m[38;5;241m.[39m_metadata
[1;32m   5571[0m     [38;5;129;01mand[39;00m name [38;5;129;01mnot[39;00m

In [0]:
# Sort by the name of the item

df1.item_name.sort_values()
 
# OR

df1.sort_values(by = "item_name")

[0;31m---------------------------------------------------------------------------[0m
[0;31mAttributeError[0m                            Traceback (most recent call last)
File [0;32m<command-290789362603189>:3[0m
[1;32m      1[0m [38;5;66;03m# Sort by the name of the item[39;00m
[0;32m----> 3[0m df1[38;5;241m.[39mitem_name[38;5;241m.[39msort_values()
[1;32m      5[0m [38;5;66;03m# OR[39;00m
[1;32m      7[0m df1[38;5;241m.[39msort_values(by [38;5;241m=[39m [38;5;124m"[39m[38;5;124mitem_name[39m[38;5;124m"[39m)

File [0;32m/databricks/python/lib/python3.9/site-packages/pandas/core/generic.py:5575[0m, in [0;36mNDFrame.__getattr__[0;34m(self, name)[0m
[1;32m   5568[0m [38;5;28;01mif[39;00m (
[1;32m   5569[0m     name [38;5;129;01mnot[39;00m [38;5;129;01min[39;00m [38;5;28mself[39m[38;5;241m.[39m_internal_names_set
[1;32m   5570[0m     [38;5;129;01mand[39;00m name [38;5;129;01mnot[39;00m [38;5;129;01min[39;00m [38;5;28mself[39m[38