In [0]:
%run /Users/075bei014.gokarna@pcampus.edu.np/MaintainingLog

# Dimension Date

In [0]:
%sql
-- Calendar dimension, filled by load_dim_date_table with one row per calendar date.
-- IF NOT EXISTS leaves an existing table and its rows untouched when this cell is re-run.
CREATE TABLE IF NOT EXISTS dim_date (
    date_key INT,               -- the date as a yyyymmdd integer, e.g. 20200131
    full_date DATE,
    day_of_week VARCHAR(20),    -- full weekday name (strftime %A)
    day_of_month INT,
    day_of_year INT,
    week_of_year INT,           -- strftime %U week number (weeks start on Sunday, first partial week is 0)
    month INT,
    month_name VARCHAR(20),     -- full month name (strftime %B)
    quarter INT,                -- 1 to 4, derived from month
    year INT
);

## Dimension Date Load Function

In [0]:
from datetime import datetime
import pandas as pd
from pyspark.sql.types import IntegerType, DateType

def load_dim_date_table(start_date, end_date):
    """Append the calendar dates that are missing from the ``dim_date`` table.

    Generates one row per day from ``start_date`` to ``end_date`` (both ends
    included, which is the ``pd.date_range`` default), skips every day whose
    ``date_key`` is already stored in ``dim_date``, and appends the remaining
    rows to the Delta table. ``LogTable.load`` is called with 'STARTED' before
    the write and with 'COMPLETED' after it. When no rows are missing nothing
    is written or logged.

    Args:
        start_date: First day to generate; any value ``pd.date_range``
            accepts (this notebook passes 'YYYY-MM-DD' strings).
        end_date: Last day to generate, inclusive.

    Returns:
        None. The outcome is reported with ``print``.
    """
    date_range = pd.date_range(start_date, end_date)

    # Keys already stored in the table. A set keeps the per-date membership
    # test below O(1) instead of scanning a list for every generated date.
    existing_keys = set(
        spark.table('dim_date').select('date_key').rdd.flatMap(lambda x: x).collect()
    )

    rows = []
    for date in date_range:
        date_key = int(date.strftime('%Y%m%d'))
        if date_key in existing_keys:
            continue
        rows.append({
            'date_key': date_key,
            'full_date': date.strftime('%Y-%m-%d'),
            'day_of_week': date.strftime('%A'),
            'day_of_month': date.day,
            'day_of_year': date.timetuple().tm_yday,
            # %U numbers weeks starting on Sunday; days before the first
            # Sunday of the year fall in week 0.
            'week_of_year': int(date.strftime('%U')),
            'month': date.month,
            'month_name': date.strftime('%B'),
            'quarter': (date.month - 1) // 3 + 1,
            'year': date.year,
        })

    if not rows:
        print("Dimension table 'dim_date' is up to date.")
        return

    LogTable.load('DIM', 'STARTED', 'dim_date')

    new_df = spark.createDataFrame(rows)

    # Cast the numeric columns to IntegerType so they match the INT columns
    # declared for dim_date.
    for column in ('date_key', 'day_of_month', 'day_of_year', 'week_of_year', 'quarter', 'year', 'month'):
        new_df = new_df.withColumn(column, new_df[column].cast(IntegerType()))

    # full_date is built as a 'YYYY-MM-DD' string; convert it to a real date.
    new_df = new_df.withColumn('full_date', new_df['full_date'].cast(DateType()))

    new_df.write.format('delta').mode('append').option("mergeSchema", "true").saveAsTable('dim_date')
    LogTable.load('DIM', 'COMPLETED', 'dim_date')

    print("Dimension table 'dim_date' has been successfully loaded.")

## Loading Dim Date

In [0]:
# Fill dim_date for 2020-01-01 through 2025-12-31 (both ends included); dates already in the table are skipped.
load_dim_date_table('2020-01-01', '2025-12-31')

Dimension table 'dim_date' is up to date.


# Dimension Time

In [0]:
%sql
-- Hour-of-day dimension. The table is dropped and recreated every time this cell runs,
-- and is then filled by load_dim_time_table.
DROP TABLE IF EXISTS dim_time;
CREATE TABLE IF NOT EXISTS dim_time
(
    time_key INT,            -- hour of day, 0 to 23 (same value as hour)
    start_time VARCHAR(8),   -- start of the hour, formatted HH:00:00
    end_time VARCHAR(8),     -- start of the following hour, HH:00:00 (hour 23 wraps to 00:00:00)
    hour INT,
    minute INT,              -- loaded as 0
    second INT               -- loaded as 0
);

## Loading Function

In [0]:
def load_dim_time_table():
    """Rebuild the ``dim_time`` Delta table with one row per hour of the day.

    Creates 24 rows (``time_key`` 0 to 23). Each row carries the hour's start
    and end as 'HH:00:00' strings, with ``minute`` and ``second`` fixed at 0.
    The rows are written with mode 'overwrite', so the previous contents of
    ``dim_time`` are replaced on every call.

    Returns:
        None. The outcome is reported with ``print``.
    """
    hours = [
        {
            'time_key': hour,
            'start_time': f'{hour:02}:00:00',
            # The end wraps around midnight: hour 23 ends at 00:00:00.
            'end_time': f'{((hour + 1)%24):02}:00:00',
            'hour': hour,
            'minute': 0,
            'second': 0,
        }
        for hour in range(24)
    ]

    new_df = spark.createDataFrame(hours)

    # Cast the numeric columns to IntegerType so they match the INT columns
    # declared for dim_time.
    for column in ('hour', 'time_key', 'minute', 'second'):
        new_df = new_df.withColumn(column, new_df[column].cast(IntegerType()))

    new_df.write.format('delta').mode('overwrite').option("mergeSchema", "true").saveAsTable('dim_time')

    print("Dimension table 'dim_time' has been successfully loaded.")
    

## Loading Dim Time

In [0]:
# Write the 24 hourly rows into dim_time, replacing whatever the table held before.
load_dim_time_table()

Dimension table 'dim_time' has been successfully loaded.


# Dimension City

In [0]:
# Read the Cities dataset from DBFS. No format is given, so Spark reads it with the
# session's default data source (spark.sql.sources.default).
df = spark.read.load("dbfs:/FileStore/tables/Cities")

In [0]:
# Persist the cities DataFrame as the Delta table dim_city, replacing any existing contents.
(
    df.write
    .format('delta')
    .mode('overwrite')
    .option("mergeSchema", "true")
    .saveAsTable('dim_city')
)

In [0]:
# Confirmation message, worded like the dim_date and dim_time loaders and naming the table.
print("Dimension table 'dim_city' has been successfully loaded.")

Table dimesnsion table successfully loaded
