# Setup and import

In [1]:
import pandas as pd

# Data Import

In [2]:
df = pd.read_csv("./Store_Sensors_data.csv")

In [3]:
df.head()

Unnamed: 0,Date,Hour,StoreNo,StoreName,EntranceName,InCount,OutCount
0,2021-12-11,0:0:0,30,My_Store,1,0,0
1,2021-12-11,0:0:0,30,My_Store,2,0,0
2,2021-12-11,0:0:0,30,My_Store,3,0,0
3,2021-12-11,0:0:0,30,My_Store,4,0,0
4,2021-12-11,0:15:0,30,My_Store,1,0,0


# Dim Tables

## my_store

In [4]:
my_store = df[['StoreName', 'StoreNo']].drop_duplicates()
my_store

Unnamed: 0,StoreName,StoreNo
0,My_Store,30
1056,Our_Store,31
1322,Your_Store,32
1588,Their_Store,34
1854,mine_Their_Store,35
2120,Their_Our_Store,36
2386,Their_My_Store,38


In [5]:
my_store.reset_index(inplace=True, drop=True)
if 'StoreID' not in my_store.columns:
    my_store.reset_index(inplace=True)
    my_store.rename(columns={'index':'StoreID'}, inplace=True)
my_store

Unnamed: 0,StoreID,StoreName,StoreNo
0,0,My_Store,30
1,1,Our_Store,31
2,2,Your_Store,32
3,3,Their_Store,34
4,4,mine_Their_Store,35
5,5,Their_Our_Store,36
6,6,Their_My_Store,38


In [6]:
my_store.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   StoreID    7 non-null      int64 
 1   StoreName  7 non-null      object
 2   StoreNo    7 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 296.0+ bytes


## my_sensor

In [7]:
my_sensor = df[['StoreNo', 'EntranceName']].drop_duplicates()
my_sensor

Unnamed: 0,StoreNo,EntranceName
0,30,001
1,30,002
2,30,003
3,30,004
1056,31,Main Entrance
1322,32,Main
1588,34,Main
1854,35,Main
2120,36,Main
2386,38,Main


In [8]:
my_sensor.reset_index(inplace=True, drop=True)
my_sensor

Unnamed: 0,StoreNo,EntranceName
0,30,001
1,30,002
2,30,003
3,30,004
4,31,Main Entrance
5,32,Main
6,34,Main
7,35,Main
8,36,Main
9,38,Main


In [9]:
my_sensor = pd.merge(my_sensor, my_store, on='StoreNo').copy()
my_sensor = my_sensor[['StoreID', 'EntranceName']]

In [10]:
my_sensor.reset_index(inplace=True)
my_sensor.rename(columns={'index':'SensorID'}, inplace=True)
my_sensor

Unnamed: 0,SensorID,StoreID,EntranceName
0,0,0,001
1,1,0,002
2,2,0,003
3,3,0,004
4,4,1,Main Entrance
5,5,2,Main
6,6,3,Main
7,7,4,Main
8,8,5,Main
9,9,6,Main


In [11]:
my_sensor.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   SensorID      10 non-null     int64 
 1   StoreID       10 non-null     int64 
 2   EntranceName  10 non-null     object
dtypes: int64(2), object(1)
memory usage: 368.0+ bytes


## dim_date

In [12]:
def get_dim_date_df(start_date: str, end_date: str, upsert_dataframe: pd.DataFrame = None) -> pd.DataFrame:
    """
    Generate a dimension date data frame with date-related information between the specified start and end dates.

    Args:
        start_date (str): The start date in the format 'YYYY-MM-DD'.
        end_date (str): The end date in the format 'YYYY-MM-DD'.
        upsert_dataframe (pd.DataFrame, optional): An existing dataframe to perform an upsert (update or insert) operation. 
                                                   If provided, new dates will be appended to the dataframe based on the 'Date_Key' column.
                                                   Defaults to None.

    Returns:
        pd.DataFrame: The generated dimension table with the following columns:

            - Date_Key: A unique key representing the date in the format 'YYYYMMDD'.
            - Full_Date: The full date in the format 'YYYY-MM-DD'.
            - Day_of_Week: The day of the week as an integer (1 for Monday, 2 for Tuesday, etc.).
            - Day_of_Month: The day of the month as an integer.
            - Day_of_Year: The day of the year as an integer.
            - Day_Name: The name of the day of the week.
            - Week_of_month: The week of the month as an integer.
            - Week_of_year: The week of the year as an integer.
            - Month_Of_year: The month of the year as an integer.
            - Days_in_Month: The number of days in the month.
            - Month_Name: The name of the month.
            - Year: The year as an integer.

    Raises:
        KeyError: Raised if the 'Date_Key' column is not present in the upsert_dataframe when performing an upsert operation.
    """
    dim_date_dict = {
        'Date_Key': [],
        'Full_Date': [],
        'Day_of_Week': [],
        'Day_of_Month': [],
        'Day_of_Year': [],
        'Day_Name': [],
        'Week_of_month': [],
        'Week_of_year': [],
        'Month_Of_year': [],
        'Days_in_Month': [],
        'Month_Name': [],
        'Year': []
    }

    for date_ in pd.date_range(start_date, end_date):
        full_date = str(date_)[:-9].strip()
        date_key = int(full_date.replace('-', ''))

        if upsert_dataframe is not None:
            if 'Date_Key' not in upsert_dataframe.columns:
                raise KeyError("No 'Date_Key' column found in the upsert dataframe.")
            if date_key in upsert_dataframe['Date_Key'].to_list():
                continue

        dim_date_dict['Date_Key'].append(date_key)
        dim_date_dict['Full_Date'].append(full_date)
        dim_date_dict['Day_of_Week'].append(date_.day_of_week + 1)
        dim_date_dict['Day_of_Month'].append(date_.day)
        dim_date_dict['Day_of_Year'].append(date_.day_of_year)
        dim_date_dict['Day_Name'].append(date_.day_name())
        dim_date_dict['Week_of_month'].append(date_.weekday() + 1)
        dim_date_dict['Week_of_year'].append(date_.weekofyear)
        dim_date_dict['Month_Of_year'].append(date_.month)
        dim_date_dict['Days_in_Month'].append(date_.days_in_month)
        dim_date_dict['Month_Name'].append(date_.month_name())
        dim_date_dict['Year'].append(date_.year)
    
    res = pd.DataFrame(dim_date_dict)
    if upsert_dataframe is not None:
        res = res.merge(upsert_dataframe, how='outer')

    return res


In [30]:
# get_dim_date_df('2021-01-01', '2021-12-30').to_csv("dim_date.csv", index=False)

## dim_time

In [14]:
def get_dim_time_df(frequency: str = 'H') -> pd.DataFrame:
    """
    Generate a pandas DataFrame with start and end times based on a specified frequency.

    Args:
        frequency (str): Frequency of the time intervals. Defaults to 'H' (hourly).

    Returns:
        pd.DataFrame: DataFrame with columns 'StartTime' and 'EndTime'.

    """
    startTime = []
    timekey = []
    for date_ in pd.date_range(start='2023-01-01', end='2023-01-02', freq=frequency):
        time = date_.time()
        startTime.append(time)
        timekey.append(int(str(time).replace(':', '')))
    startTime = startTime[:-1]
    timekey = timekey[:-1]
    endTime = startTime[1:] + startTime[0:1]

    return pd.DataFrame({'Time_Key':timekey,'Start_Time': startTime, 'End_Time': endTime})


In [32]:
get_dim_time_df()
# get_dim_time_df('900S').to_csv("dim_quarter_hourly_time.csv", index=False)

# Fact Tables

## Initial General DF

In [16]:
dim_date = get_dim_date_df('2019-01-01', '2025-01-01')

In [17]:
raw_data_with_date_time = df.merge(my_store, on=['StoreNo', 'StoreName'])
raw_data_with_date_time = raw_data_with_date_time.merge(my_sensor, on=['StoreID', 'EntranceName'])
raw_data_with_date_time = raw_data_with_date_time.merge(dim_date, left_on='Date', right_on='Full_Date')[['Date_Key', 'Hour', 'StoreID', 'SensorID', 'InCount', 'OutCount']]
raw_data_with_date_time

Unnamed: 0,Date_Key,Hour,StoreID,SensorID,InCount,OutCount
0,20211211,0:0:0,0,0,0,0
1,20211211,0:15:0,0,0,0,0
2,20211211,0:30:0,0,0,0,0
3,20211211,0:45:0,0,0,0,0
4,20211211,1:0:0,0,0,0,0
...,...,...,...,...,...,...
2647,20211213,17:15:0,6,9,1,0
2648,20211213,17:30:0,6,9,2,3
2649,20211213,17:45:0,6,9,0,0
2650,20211213,18:0:0,6,9,4,4


## Fact_Hourly_Data

In [18]:
dim_time_hourly = get_dim_time_df()
dim_time_hourly.count()

Time_Key      24
Start_Time    24
End_Time      24
dtype: int64

### IMORTANT CONCEPTS

In [19]:
fact_houtly_data = raw_data_with_date_time.copy()

fact_houtly_data['Hour'] = pd.to_datetime(fact_houtly_data['Hour'], format='%H:%M:%S')

# ROUND TO HOUR
fact_houtly_data['Hour'] = fact_houtly_data['Hour'].dt.round('H')


# MAKE DATE TIME TIME
fact_houtly_data['Hour'] = fact_houtly_data['Hour'].dt.time


# fact_houtly_data.rename(columns={'Hour':'Time_Key'})
fact_houtly_data.head()

Unnamed: 0,Date_Key,Hour,StoreID,SensorID,InCount,OutCount
0,20211211,00:00:00,0,0,0,0
1,20211211,00:00:00,0,0,0,0
2,20211211,00:00:00,0,0,0,0
3,20211211,01:00:00,0,0,0,0
4,20211211,01:00:00,0,0,0,0


In [20]:
dim_time_hourly.head()

Unnamed: 0,Time_Key,Start_Time,End_Time
0,0,00:00:00,01:00:00
1,10000,01:00:00,02:00:00
2,20000,02:00:00,03:00:00
3,30000,03:00:00,04:00:00
4,40000,04:00:00,05:00:00


In [21]:
# Adding time key from dim_time

fact_houtly_data = fact_houtly_data.merge(dim_time_hourly[['Start_Time', 'Time_Key']], left_on='Hour', right_on='Start_Time')
fact_houtly_data.drop(['Start_Time', 'Hour'], inplace=True, axis=1)

In [22]:
fact_houtly_data

Unnamed: 0,Date_Key,StoreID,SensorID,InCount,OutCount,Time_Key
0,20211211,0,0,0,0,0
1,20211211,0,0,0,0,0
2,20211211,0,0,0,0,0
3,20211211,0,0,0,0,0
4,20211211,0,0,0,0,0
...,...,...,...,...,...,...
2647,20211212,5,8,0,0,230000
2648,20211212,5,8,0,0,230000
2649,20211212,6,9,0,0,230000
2650,20211212,6,9,0,0,230000


In [23]:
fact_houtly_data = fact_houtly_data.groupby(['Date_Key', 'Time_Key', 'StoreID', 'SensorID']).sum().reset_index()
fact_houtly_data

Unnamed: 0,Date_Key,Time_Key,StoreID,SensorID,InCount,OutCount
0,20211211,0,0,0,0,0
1,20211211,0,0,1,0,0
2,20211211,0,0,2,0,0
3,20211211,0,0,3,0,0
4,20211211,0,1,4,0,0
...,...,...,...,...,...,...
665,20211213,180000,2,5,17,22
666,20211213,180000,3,6,14,14
667,20211213,180000,4,7,10,9
668,20211213,180000,5,8,27,26


In [24]:
fact_houtly_data.count()

Date_Key    670
Time_Key    670
StoreID     670
SensorID    670
InCount     670
OutCount    670
dtype: int64

## Quarter Hourly Data -> Fact

In [25]:
dim_time_quarter_hourly = get_dim_time_df('900S')
dim_time_quarter_hourly

Unnamed: 0,Time_Key,Start_Time,End_Time
0,0,00:00:00,00:15:00
1,1500,00:15:00,00:30:00
2,3000,00:30:00,00:45:00
3,4500,00:45:00,01:00:00
4,10000,01:00:00,01:15:00
...,...,...,...
91,224500,22:45:00,23:00:00
92,230000,23:00:00,23:15:00
93,231500,23:15:00,23:30:00
94,233000,23:30:00,23:45:00


In [26]:
fact_quarter_hourly_data = raw_data_with_date_time.copy()

fact_quarter_hourly_data['Hour'] = pd.to_datetime(fact_quarter_hourly_data['Hour'], format='%H:%M:%S')
fact_quarter_hourly_data['Hour'] = fact_quarter_hourly_data['Hour'].dt.time

fact_quarter_hourly_data = fact_quarter_hourly_data.merge(dim_time_quarter_hourly[['Time_Key', 'Start_Time']], left_on='Hour', right_on='Start_Time')
fact_quarter_hourly_data.drop(['Hour', 'Start_Time'], axis=1, inplace=True)
fact_quarter_hourly_data

Unnamed: 0,Date_Key,StoreID,SensorID,InCount,OutCount,Time_Key
0,20211211,0,0,0,0,0
1,20211211,0,1,0,0,0
2,20211211,0,2,0,0,0
3,20211211,0,3,0,0,0
4,20211211,1,4,0,0,0
...,...,...,...,...,...,...
2647,20211212,2,5,0,0,234500
2648,20211212,3,6,0,0,234500
2649,20211212,4,7,0,0,234500
2650,20211212,5,8,0,0,234500


In [27]:
fact_quarter_hourly_data = fact_quarter_hourly_data[['Date_Key', 'Time_Key', 'SensorID', 'StoreID', 'InCount', 'OutCount']]
fact_quarter_hourly_data

Unnamed: 0,Date_Key,Time_Key,SensorID,StoreID,InCount,OutCount
0,20211211,0,0,0,0,0
1,20211211,0,1,0,0,0
2,20211211,0,2,0,0,0
3,20211211,0,3,0,0,0
4,20211211,0,4,1,0,0
...,...,...,...,...,...,...
2647,20211212,234500,5,2,0,0
2648,20211212,234500,6,3,0,0
2649,20211212,234500,7,4,0,0
2650,20211212,234500,8,5,0,0


## Daily Data

In [28]:
fact_daily_data = raw_data_with_date_time.copy()
fact_daily_data = fact_daily_data.groupby(['Date_Key', 'StoreID', 'SensorID'], as_index=False).sum()

  fact_daily_data = fact_daily_data.groupby(['Date_Key', 'StoreID', 'SensorID'], as_index=False).sum()


In [29]:
fact_daily_data

Unnamed: 0,Date_Key,StoreID,SensorID,InCount,OutCount
0,20211211,0,0,60,70
1,20211211,0,1,25,43
2,20211211,0,2,156,130
3,20211211,0,3,189,170
4,20211211,1,4,725,750
5,20211211,2,5,936,935
6,20211211,3,6,1171,1162
7,20211211,4,7,424,419
8,20211211,5,8,1394,1398
9,20211211,6,9,421,416
