# Validate Project Hypothesis
## Objectives

- Hypothesis 1:
Markdown events before holidays have a significant positive effect on sales.
   - H0: Markdown events before holidays do not have a significant positive effect on sales.
   - H1: Markdown events before holidays have a significant positive effect on sales.

### Import libraries

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

### Load the dataset

In [None]:
# Read the prepared training data from disk and preview its first five rows.
df_train = pd.read_csv(filepath_or_buffer="outputs/TrainData.csv")
df_train.head(n=5)

### Convert "Date" column to datetime

In [None]:
# Replace the "Date" column with its parsed datetime equivalent so later
# cells can compare and subtract dates.
df_train['Date'] = pd.to_datetime(arg=df_train['Date'])

## Hypothesis 1

### Filter data from November 2011 onward (dates on or after 2011-11-01)

In [None]:
# Keep only rows dated 2011-11-01 or later and renumber the rows from 0;
# the trailing expression displays the resulting (rows, columns) shape.
df_train_filtered = df_train.loc[df_train['Date'] >= '2011-11-01'].reset_index(drop=True)
df_train_filtered.shape

### Calculate the distances (in days) between consecutive holiday dates (rows where "IsHoliday" is True)

In [None]:
# Collect, across every store/department series, the gaps in days between
# consecutive holiday dates. The result is a flat list of day counts.
filtered_data_list = []

# Loop through each store (1 to 45) and each department (1 to 99)
for store_num in range(1, 46):
    for dept_num in range(1, 100):
        # Names of the indicator columns for this store and department
        store_column = f'Store_{store_num}'
        dept_column = f'Dept_{dept_num}'

        # Skip combinations whose indicator columns are absent from the frame
        if store_column in df_train_filtered.columns and dept_column in df_train_filtered.columns:
            # Rows belonging to this store/department, in chronological order.
            # sort_values returns a new frame here: sorting the filtered slice
            # in place can trigger pandas' SettingWithCopyWarning.
            filtered_df = df_train_filtered[
                (df_train_filtered[store_column] == 1) & (df_train_filtered[dept_column] == 1)
            ].sort_values(by='Date', ascending=True)

            # Day gaps between consecutive holiday rows of this series; the
            # first holiday has no predecessor, so its NaN gap is dropped.
            holiday_gaps = (
                filtered_df.loc[filtered_df['IsHoliday'] == True, 'Date']
                .diff()
                .dt.days
                .dropna()
            )

            # extend() appends in place instead of rebuilding the whole list
            # on every iteration.
            filtered_data_list.extend(holiday_gaps)

In [None]:
# Keep only the strictly positive gaps, then display the smallest of them.
filtered_data_list_p = [gap for gap in filtered_data_list if gap > 0]
min(filtered_data_list_p)

In [None]:
# Number of observations before each holiday to include in the pre-holiday
# window. Chosen by hand with the minimum holiday distance in mind; the
# original note suggests one observation corresponds to one week
# (TODO confirm against the data's sampling interval).
step = 3

### Separate "Holiday_Sales" and "Normal_Sales"

In [None]:
# Separate Holiday_Sales (each holiday date plus the `step` observation dates
# before it) from Normal_Sales (everything else).
#
# The window is built on the date axis rather than on row positions: the frame
# holds many store/department series, so the rows physically preceding a
# holiday row are not guaranteed to be the preceding dates of the same series.

# Sorted calendar of every distinct observation date in the filtered data
observed_dates = pd.DatetimeIndex(df_train_filtered['Date'].unique()).sort_values()

# Distinct dates flagged as holidays
holiday_dates = pd.DatetimeIndex(
    df_train_filtered.loc[df_train_filtered['IsHoliday'] == True, 'Date'].unique()
)

# Calendar positions of each holiday and of the `step` dates before it,
# clamped at the start of the calendar
window_positions = set()
for pos in observed_dates.get_indexer(holiday_dates):
    window_positions.update(range(max(0, pos - step), pos + 1))

window_dates = observed_dates[sorted(window_positions)]
in_holiday_window = df_train_filtered['Date'].isin(window_dates)

# Row labels of the holiday-window observations (kept for later cells)
holiday_sales_indices = df_train_filtered.index[in_holiday_window].tolist()

# Create Holiday_Sales and Normal_Sales sets
holiday_sales = df_train_filtered.loc[in_holiday_window, 'Weekly_Sales']
normal_sales = df_train_filtered.loc[~in_holiday_window, 'Weekly_Sales']

### Conduct a t-test to check if Holiday_Sales are significantly greater than Normal_Sales

In [None]:
# Welch's t-test (unequal variances) of Holiday_Sales against Normal_Sales.
# H1 is directional ("positive effect"), so the test is one-sided:
# alternative='greater' tests whether the mean of holiday_sales exceeds the
# mean of normal_sales. With the default two-sided test, a significantly
# *lower* holiday mean would also have been reported as a positive effect.
# NOTE: the `alternative` argument requires SciPy >= 1.6.
t_stat, p_value = stats.ttest_ind(
    holiday_sales, normal_sales, equal_var=False, alternative='greater'
)

# Output the t-test results
print(f"T-statistic: {t_stat}, P-value: {p_value}")
if p_value < 0.05:
    print("We reject the null hypothesis. Markdown events before holidays have a significant positive effect on sales.")
else:
    print("We fail to reject the null hypothesis. No significant positive effect was found.")