In [None]:
# Import necessary libraries
import pandas as pd 
import os 


In [63]:
# Input file locations (relative paths): the sales CSV and the supplementary parquet file.
# These names are passed as the two arguments of extract() further down.
store_sales = "grocery_sales.csv"
extra_data = "extra_data.parquet"

In [64]:
# Extract step: load both input files and join them into one DataFrame
def extract(store_sales, extra_data):
    """
    Load the store sales CSV and the extra-data parquet file, then join them.

    The two frames are combined on their shared 'index' column using the
    default join type of pandas' merge.

    Parameters:
    store_sales (str): Path to the store sales CSV file.
    extra_data (str): Path to the extra data parquet file.

    Returns:
    pd.DataFrame: The sales rows joined with the matching extra-data rows.
    """
    # Load the CSV first, then the parquet file
    sales_frame = pd.read_csv(store_sales)
    extras_frame = pd.read_parquet(extra_data)

    # Join on the shared 'index' column and hand the result straight back
    return pd.merge(sales_frame, extras_frame, on="index")

In [62]:
# Run the extract step on the two input paths and keep the joined result for the later cells
merged_data = extract(store_sales, extra_data)
print(merged_data.head())  # Preview the first five rows of the merged DataFrame

   index  Store_ID        Date  Dept  Weekly_Sales  IsHoliday  Temperature  \
0      0         1  2010-02-05     1      24924.50          0        42.31   
1      1         1  2010-02-05    26      11737.12          0        42.31   
2      2         1  2010-02-05    17      13223.76          0        42.31   
3      3         1  2010-02-05    45         37.44          0        42.31   
4      4         1  2010-02-05    28       1085.29          0        42.31   

   Fuel_Price  MarkDown1  MarkDown2  MarkDown3  MarkDown4  MarkDown5  \
0       2.572        0.0        0.0        0.0        0.0        0.0   
1       2.572        0.0        0.0        0.0        0.0        0.0   
2       2.572        0.0        0.0        0.0        0.0        0.0   
3       2.572        0.0        0.0        0.0        0.0        0.0   
4       2.572        0.0        0.0        0.0        0.0        0.0   

          CPI  Unemployment  Type      Size  
0  211.096358         8.106   3.0  151315.0  
1  211

In [51]:
# Count the missing values in each column of the merged DataFrame (one total per column)
print(merged_data.isnull().sum())

index            0
Store_ID         0
Date            39
Dept             0
Weekly_Sales    38
IsHoliday        0
Temperature      0
Fuel_Price       0
MarkDown1        0
MarkDown2        0
MarkDown3        0
MarkDown4        1
MarkDown5        1
CPI             47
Unemployment    37
Type             1
Size             1
dtype: int64


In [52]:
# Transform step: clean the merged data without modifying the caller's DataFrame
def transform(raw_data):
    """
    Clean the merged sales data and keep only high-sales rows and analysis columns.

    Steps, in order:
    1. Fill missing 'CPI', 'Weekly_Sales' and 'Unemployment' values with the
       mean of the respective column.
    2. Parse 'Date' (format YYYY-MM-DD) and derive a 'Month' column from it.
       Rows with a missing 'Date' get a missing 'Month', which makes the
       column float-typed.
    3. Keep only rows whose 'Weekly_Sales' is greater than 10,000.
    4. Drop the columns that are not needed afterwards (including 'Date').

    The input DataFrame is left untouched; a new DataFrame is returned.

    Parameters:
    raw_data (pd.DataFrame): Merged data containing at least the columns
        used and dropped by this function.

    Returns:
    pd.DataFrame: The filtered rows (original row labels preserved) with the
        remaining columns plus 'Month'.

    Raises:
    KeyError: If one of the referenced columns is absent from raw_data.
    """
    mean_filled_columns = ["CPI", "Weekly_Sales", "Unemployment"]
    min_weekly_sales = 10000
    unused_columns = [
        "index", "Temperature", "Fuel_Price",
        "MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5",
        "Type", "Size", "Date",
    ]

    # fillna without inplace returns a new frame, so every later assignment
    # lands on that new object and never on the caller's DataFrame
    fill_values = {column: raw_data[column].mean() for column in mean_filled_columns}
    data = raw_data.fillna(fill_values)

    # Convert Date to datetime and extract the month number from it
    data["Date"] = pd.to_datetime(data["Date"], format="%Y-%m-%d")
    data["Month"] = data["Date"].dt.month

    # Keep only rows above the weekly sales threshold
    data = data.loc[data["Weekly_Sales"] > min_weekly_sales, :]

    # Remove the columns that are not part of the cleaned output
    return data.drop(columns=unused_columns)

In [54]:
# Run the transform step on the merged data and keep the cleaned result for the later cells
clean_data = transform(merged_data)
# Preview the first five rows of the transformed DataFrame
print(clean_data.head())  

   Store_ID  Dept  Weekly_Sales  IsHoliday         CPI  Unemployment  Month
0         1     1      24924.50          0  211.096358      8.106000    2.0
1         1    26      11737.12          0  211.096358      8.106000    2.0
2         1    17      13223.76          0  211.096358      8.106000    2.0
5         1    79      46729.77          0  211.096358      7.500052    2.0
6         1    55      21249.31          0  211.096358      7.500052    2.0


In [55]:
# Aggregation step: average weekly sales for each month of the cleaned data
def avg_weekly_sales_per_month(clean_data):
    """
    Compute the mean of 'Weekly_Sales' for every 'Month' value.

    Parameters:
    clean_data (pd.DataFrame): Data containing 'Month' and 'Weekly_Sales' columns.

    Returns:
    pd.DataFrame: One row per month with the columns 'Month' and 'Avg_Sales',
        with values rounded to two decimals.
    """
    # Only the two columns involved in the aggregation are needed
    monthly_groups = clean_data.loc[:, ["Month", "Weekly_Sales"]].groupby("Month")

    # Named aggregation yields the 'Avg_Sales' column, indexed by month
    averages = monthly_groups.agg(Avg_Sales=("Weekly_Sales", "mean"))

    # Turn the month index back into a regular column, then round
    return averages.reset_index().round(2)

In [56]:
# Aggregate the cleaned data into one average-sales row per month
agg_data = avg_weekly_sales_per_month(clean_data)

# Preview the first five rows of the aggregated DataFrame
print(agg_data.head())

   Month  Avg_Sales
0    1.0   33174.18
1    2.0   34333.33
2    3.0   33220.89
3    4.0   33392.37
4    5.0   33339.89


In [57]:
# Load step: persist the cleaned and the aggregated DataFrames as CSV files
def load(full_data, full_data_file_path, agg_data, agg_data_file_path):
    """
    Write both DataFrames to CSV, omitting the row index.

    Parameters:
    full_data (pd.DataFrame): The cleaned DataFrame.
    full_data_file_path (str): Destination path for the cleaned data CSV.
    agg_data (pd.DataFrame): The aggregated DataFrame.
    agg_data_file_path (str): Destination path for the aggregated data CSV.

    Returns:
    None
    """
    outputs = (
        (full_data, full_data_file_path),
        (agg_data, agg_data_file_path),
    )
    # Cleaned data is written first, aggregated data second
    for frame, destination in outputs:
        frame.to_csv(destination, index=False)

In [58]:
# Write the cleaned data to 'clean_data.csv' and the aggregated data to 'agg_data.csv'
load(clean_data, 'clean_data.csv', agg_data, 'agg_data.csv')

In [59]:
# Validation step: confirm that an output file was actually written
def validation(file_path):
    """
    Check that a regular file exists at the given path.

    Parameters:
    file_path (str): Path of the file expected to exist.

    Returns:
    None

    Raises:
    FileNotFoundError: If nothing exists at file_path, or if the path is not
        a regular file (for example, a directory). FileNotFoundError is a
        subclass of Exception, so callers catching Exception still work.
    """
    # isfile (rather than exists) so that a directory at the path does not count as the file
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f'There is no file at the path {file_path}')

In [60]:
# Verify both output files written by the load step: first the cleaned data, then the aggregated data.
# validation() raises if a path does not exist, so reaching the end of this cell means both were found.
validation('clean_data.csv')
validation('agg_data.csv')