# Import data, EDA

In [None]:
import pandas as pd

# Importing data into df
file_path = 'sales.csv'

# Load the CSV file into a DataFrame
ironkaggle_df = pd.read_csv(file_path)

# Display the first few rows of the DataFrame
# print(ironkaggle_df.head())

print(ironkaggle_df.dtypes)
# print(ironkaggle_df.describe())
print(ironkaggle_df.columns)


In [None]:
# <!-- | **Variable**           | **Type**         | **Details**                              |
# |-------------------------|------------------|------------------------------------------|
# | `store_ID`             | Categorical     | Nominal, unique store identifiers        |
# | `day_of_week`          | Categorical     | Represents days of the week   |
# | `date`                 | Ordinal         | Nominal (date) |
# | `nb_customers_on_day`  | Numerical       | number of customers          |
# | `open`                 | Categorical     | Binary, open or closed         |
# | `promotion`            | Categorical     | Binary, promotion status                 |
# | `state_holiday`        | Categorical     | Binary, holiday status                  |
# | `school_holiday`       | Categorical     | Binary, holiday status                   |
# | `sales`                | Numerical       | Continuous, sales revenue                | 

categorical_var = ['store_ID', 'day_of_week', 'open', 'promotion', 'state_holiday', 'school_holiday']
date_var = ['date']
numerical_var = ['nb_customers_on_day', 'sales']

# I will drop the untitled column, bc I don't know what it is. 
ironkaggle_df = ironkaggle_df.drop(columns=['Unnamed: 0'])

# Converting the 'date' column to datetime
ironkaggle_df['date'] = pd.to_datetime(ironkaggle_df['date'])

# Count the number of unique values in a column
unique_values = ironkaggle_df['state_holiday'].unique()
print(f"Unique values in the column: {unique_values}")
 
# Unique values in the column: ['0' 'a' 'c' 'b']
# Converting 'state_holiday' ['0' '1' '2 '3']. 

# Define the mapping
mapping = {'0': '0', 'a': '1', 'c': '2', 'b': '3'}

# Apply the mapping to the column
ironkaggle_df['state_holiday'] = ironkaggle_df['state_holiday'].map(mapping)
# converting to int
ironkaggle_df['state_holiday'] = ironkaggle_df['state_holiday'].astype('int64')




## EDA

In [None]:
# Categorical var

for col in categorical_var:
    unique_count = ironkaggle_df[col].nunique()
    print(f"The column '{col}' has {unique_count} unique values.")

In [5]:
# Filter rows where 'open' is 0
closed_sales = ironkaggle_df[ironkaggle_df['open'] == 0]['sales']

# Summary statistics for 'sales' when 'open' is 0
closed_sales_summary = closed_sales.describe()
# print(closed_sales_summary)

# There are no sales when stores are closed. These rows do not provide insights for sales analysis
# I will remove them from the dataset. 

# Remove rows where 'open' is 0 and 'sales' is also 0
ironkaggle_df_filtered = ironkaggle_df[~((ironkaggle_df['open'] == 0) & (ironkaggle_df['sales'] == 0))]
# Drop the entire 'open' feature
ironkaggle_df_filtered = ironkaggle_df_filtered.drop(columns=['open'])
categorical_var.remove('open')



In [6]:
# For the numerical var, what are their descriptive statistics?

import pandas as pd
import numpy as np
from tabulate import tabulate

def calculate_descriptive_stats(df, column):
    """
    Calculates descriptive statistics for a given column of a DataFrame.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the data.
        column (str): The column name for which to calculate statistics.

    Returns:
        pd.DataFrame: A DataFrame containing the calculated statistics.
    """
    # Calculate statistics
    stats = {
        "Statistic": [
            "Mean",
            "Median",
            "Mode",
            "Range",
            "Variance",
            "Standard Deviation",
            "Interquartile Range (IQR)",
            "Skewness",
            "Kurtosis",
            "Minimum",
            "Maximum",
            "Sum",
            "Count",
            "25th Percentile",
            "75th Percentile",
        ],
        "Value": [
            df[column].mean(),
            df[column].median(),
            df[column].mode()[0] if not df[column].mode().empty else np.nan,
            df[column].max() - df[column].min(),
            df[column].var(),
            df[column].std(),
            df[column].quantile(0.75) - df[column].quantile(0.25),
            df[column].skew(),
            df[column].kurt(),
            df[column].min(),
            df[column].max(),
            df[column].sum(),
            df[column].count(),
            df[column].quantile(0.25),
            df[column].quantile(0.75),
        ],
    }

    # Create a DataFrame
    stats_df = pd.DataFrame(stats)

    # Print neatly using tabulate
    # print(tabulate(stats_df, headers='keys', tablefmt='grid', showindex=False))

    return stats_df

def calculate_stats_for_multiple_columns(df, columns):
    """
    Calculates descriptive statistics for multiple columns and stores them in a dictionary.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the data.
        columns (list): List of column names to calculate statistics for.

    Returns:
        dict: A dictionary where keys are column names and values are DataFrames of statistics.
    """
    results = {}
    for column in columns:
        # print(f"\nDescriptive Statistics for {column}:")
        stats_df = calculate_descriptive_stats(df, column)
        results[column] = stats_df
    return results

# Example Usage
# Assuming `df` is your DataFrame and `numerical_var` is a list of numerical columns
numerical_var = ['nb_customers_on_day', 'sales']
results = calculate_stats_for_multiple_columns(ironkaggle_df_filtered, numerical_var)


In [7]:
# grouping by store

# Group 'sales' by 'store_ID' and compute aggregate statistics
grouped_sales = ironkaggle_df_filtered.groupby('store_ID')['sales'].agg(['sum', 'mean', 'median', 'count', 'max', 'min']).reset_index()

# Display the grouped DataFrame
# print(grouped_sales)


In [None]:
import matplotlib.pyplot as plt

# Plot sum and means to see distributions
# Extracting data for plotting
store_count = len(grouped_sales["store_ID"])
sums = grouped_sales["sum"]
mean = grouped_sales["mean"]

# Convert sums to millions and means to thousands
sums_in_millions = [x / 1_000_000 for x in sums]
means_in_thousands = [x / 1_000 for x in mean]

# Create a single figure for all 4 plots
plt.figure(figsize=(14, 10))

# Histogram for sum of sales
plt.subplot(2, 2, 1)
n, bins, patches = plt.hist(sums_in_millions, bins=10, alpha=0.8, edgecolor='black')
for i in range(len(n)):
    plt.text((bins[i] + bins[i + 1]) / 2, n[i], str(int(n[i])), ha='center', va='bottom')
plt.title("Histogram of Sum of Sales (in Millions)")
plt.xlabel("Sum of Sales (in Millions)")
plt.ylabel("Store Count")
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Histogram for mean sales
plt.subplot(2, 2, 2)
n, bins, patches = plt.hist(means_in_thousands, bins=10, alpha=0.8, edgecolor='black', color='orange')
for i in range(len(n)):
    plt.text((bins[i] + bins[i + 1]) / 2, n[i], str(int(n[i])), ha='center', va='bottom')
plt.title("Histogram of Mean Sales (in Thousands)")
plt.xlabel("Mean Sales (in Thousands)")
plt.ylabel("Store Count")
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Box plot for sum of sales
plt.subplot(2, 2, 3)
plt.boxplot(sums, vert=False, patch_artist=True, boxprops=dict(facecolor='lightblue'))
plt.title("Box Plot of Sum of Sales")
plt.xlabel("Sum of Sales (in Dollars)")

# Box plot for mean of sales
plt.subplot(2, 2, 4)
plt.boxplot(mean, vert=False, patch_artist=True, boxprops=dict(facecolor='lightgreen'))
plt.title("Box Plot of Mean Sales")
plt.xlabel("Mean Sales (in Dollars)")

plt.tight_layout()
plt.show()

In [None]:
# Categorical var

for col in categorical_var:
    unique_count = ironkaggle_df_filtered[col].nunique()
    print(f"The column '{col}' has {unique_count} unique values.")

### Correlation Matrix

In [None]:
import seaborn as sns


correlation_matrix = ironkaggle_df_filtered.select_dtypes(include=['float64', 'int64']).corr()

# SVisualize the correlation matrix using a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,        # Display correlation values on the heatmap
    cmap="coolwarm",   # Color map for visualization
    fmt=".2f",         # Format to 2 decimal places
    linewidths=0.5     # Add space between cells
)
plt.title("Correlation Matrix Heatmap for numerical var")
plt.show()



High Correlation Between nb_customers_on_day and sales (0.82): strong positive linear relationship. It suggests that an increase in the number of customers on a day is strongly associated with an increase in sales. 
Moderate Correlation Between promotion and sales (0.37): positive relationship, albeit weaker than with customer numbers. This implies that promotions have a noticeable, but not overwhelming, impact on sales.

Weak Correlations with school_holiday: school_holiday shows very weak correlations with other variables, including sales (0.04). This suggests that school holidays might not significantly impact sales or customer behavior in this dataset.

Negative Correlation Between day_of_week and promotion/sales (-0.29 and -0.18):  negative correlation with day_of_week suggests that certain days of the week might have fewer promotions or lower sales.

In [11]:
# Droppng 'school_holiday' bc very weak corr.
ironkaggle_df_filtered = ironkaggle_df_filtered.drop(columns=['school_holiday'])


# Modelling time!

In [None]:
target_variable = 'sales'

charges_mean = round(ironkaggle_df_filtered[target_variable].mean(), 2)
print(f"The mean of our target variable '{target_variable}' is {charges_mean}")

# Establishing a naive baseline in regression provides a simple benchmark for model performance. 
# It helps identify if a model is adding value by exagerating trivial predictions, showing overfitting or underfitting. 
# Without a baseline, it's difficult to evaluate MSE or R2 and determine if a model is effective or shoudl be improved.

# Also, I will convert date to numerical features.
ironkaggle_df_filtered['year'] = ironkaggle_df_filtered['date'].dt.year
ironkaggle_df_filtered['month'] = ironkaggle_df_filtered['date'].dt.month



## Initial Modelling Without GridSearch or Pipeline

Let's build a simple linear regression model without any feature engineering, grid search, or pipeline. This will serve as our initial baseline for comparison.

### Task:
- Split the data into training and test sets
- Train a simple linear regression model
- Evaluate its performance using regression metrics
- Write it down as a markdown below so you can keep track. This is a scientific experiment


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt


### LOADING INFORMATION ###

# Creating a copy of the DataFrame
ironkaggle_df_filtered_copy = ironkaggle_df_filtered.copy()

# # One-hot encoding for categorical variables
# insurance_df_copy_encoded = pd.get_dummies(insurance_df_copy, drop_first=True)

################################################################

# Initialize a list to store the results
results = []

# Loop through all columns in the DataFrame except the target variable and 'date'
for feature in ironkaggle_df_filtered_copy.columns:
    # Skip the target variable or 'date' as in the datetime date
    if feature == target_variable or feature == 'date':
        continue
    
    print(f"\nPerforming Linear Regression for Feature: {feature}")
    
    # Define the feature (X) as a single column
    X = ironkaggle_df_filtered_copy[[feature]]
    y = ironkaggle_df_filtered_copy[[target_variable]]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Initialize and train the model
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Append the results for the current feature
    results.append([feature, mse, r2])
    
    # Visualize the regression line
    plt.figure(figsize=(8, 6))
    plt.scatter(X, y, color="blue", label="Actual Data")
    plt.plot(X, model.predict(X), color="red", label="Regression Line")
    plt.title(f"Linear Regression: {target_variable} vs {feature}")
    plt.xlabel(feature)
    plt.ylabel(target_variable)
    plt.legend()
    plt.grid(linestyle="--", alpha=0.7)
    plt.show()

# Create and print a table with the results
headers = ["Feature", "Mean Squared Error (MSE)", "R-squared (R2)"]
print("\nSummary of Results:")
print(tabulate(results, headers=headers, tablefmt="grid"))

nb_customers_on_day has the lowest MSE (3.09e+06), meaning it's the best-performing feature.
Features like store_ID, state_holiday, and year have high MSEs (close to 9.6e+06), indicating they contribute little to reducing prediction error.

In [None]:
# dropping 'store_ID' and 'state_holiday' to improve model performance and rerunning the model

# Creating a copy of the DataFrame
ironkaggle_df_re_filtered_copy = ironkaggle_df_filtered.copy()

ironkaggle_df_re_filtered_copy = ironkaggle_df_re_filtered_copy.drop(columns=['store_ID', 'state_holiday'])

### LOADING INFORMATION ###


################################################################

# Initialize a list to store the results
results = []

# Loop through all columns in the DataFrame except the target variable and 'date'
for feature in ironkaggle_df_re_filtered_copy.columns:
    # Skip the target variable or 'date' as in the datetime date
    if feature == target_variable or feature == 'date':
        continue
    
    print(f"\nPerforming Linear Regression for Feature: {feature}")
    
    # Define the feature (X) as a single column
    X = ironkaggle_df_re_filtered_copy[[feature]]
    y = ironkaggle_df_re_filtered_copy[[target_variable]]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Initialize and train the model
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Append the results for the current feature
    results.append([feature, mse, r2])
    
    # Visualize the regression line
    plt.figure(figsize=(8, 6))
    plt.scatter(X, y, color="blue", label="Actual Data")
    plt.plot(X, model.predict(X), color="red", label="Regression Line")
    plt.title(f"Linear Regression: {target_variable} vs {feature}")
    plt.xlabel(feature)
    plt.ylabel(target_variable)
    plt.legend()
    plt.grid(linestyle="--", alpha=0.7)
    plt.show()

# Create and print a table with the results
headers = ["Feature", "Mean Squared Error (MSE)", "R-squared (R2)"]
print("\nSummary of Results:")
print(tabulate(results, headers=headers, tablefmt="grid"))

Feature	Analysis
day_of_week	Weak predictor (R² = 0.032). There may be small weekly trends, but it adds minimal value.
nb_customers_on_day	Strongest predictor (R² = 0.678). Explains most of the variance in the target. This feature is essential.
promotion	Moderate predictor (R² = 0.137). Indicates that promotions positively impact the target variable (likely sales).
year	Very weak predictor (R² = 0.0013). Likely irrelevant unless there are long-term trends.
month	Weak predictor (R² = 0.0059). Minimal seasonal or monthly impact.


## Modelling with Pipeline and Grid Search

Now, let's see how using pipelines can simplify our workflow and prevent data leakage. We'll also use GridSearchCV to find the best hyperparameters.

### Task:
- Create a pipeline that includes scaling and linear regression
- Define a parameter grid for hyperparameter tuning
- Use GridSearchCV to find the best parameters and evaluate the model performance


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


# Define the features (X) and target (y)
X = ironkaggle_df_re_filtered_copy.drop(columns=[target_variable, 'date'])
y = ironkaggle_df_re_filtered_copy[target_variable]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),       # Scale features
    ('regressor', LinearRegression())   # Apply Linear Regression
])

# Train the pipeline
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"LinearRegression, Mean Squared Error (MSE): {mse:.2f}")
print(f"LinearRegression, R-squared (R2): {r2:.2f}")



## Trying Another Model with Pipeline

Let's try using a Gradient Boosting Regressor to see if it performs better.

### Task:
- Create and use a pipeline for Gradient Boosting Regressor
- Define a parameter grid for grid search
- Use GridSearchCV to find the best parameters and evaluate the model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Define the features (X) and target (y)
X = ironkaggle_df_re_filtered_copy.drop(columns=[target_variable, 'date'])
y = ironkaggle_df_re_filtered_copy[target_variable]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),                 # Scale features
    ('regressor', GradientBoostingRegressor(      # Gradient Boosting
        n_estimators=100,                         # Hyperparameter: Number of boosting stages
        learning_rate=0.1,                        # Hyperparameter: Learning rate
        max_depth=3,                              # Hyperparameter: Maximum depth of trees
        random_state=42                           # For reproducibility
    ))
])

# Train the pipeline
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Gradient Boosting Regressor, Mean Squared Error (MSE): {mse:.2f}")
print(f"Gradient Boosting Regressor, R-squared (R2): {r2:.2f}")

In [None]:
### Wasn't able to do this, was like 20 minutes. 



# from sklearn.model_selection import GridSearchCV, train_test_split
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LinearRegression
# from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
# from sklearn.metrics import mean_squared_error, r2_score


# # Define the features (X) and target (y)
# X = ironkaggle_df_re_filtered_copy.drop(columns=[target_variable, 'date'])
# y = ironkaggle_df_re_filtered_copy[target_variable]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Define models and their parameter grids
# models_and_parameters = [
#     {
#         'model': [LinearRegression()],
#         'model__fit_intercept': [True, False]
#     },
#     {
#         'model': [GradientBoostingRegressor()],
#         'model__n_estimators': [100, 200],
#         'model__learning_rate': [0.05, 0.1],
#         'model__max_depth': [3, 5]
#     },
#     {
#         'model': [RandomForestRegressor()],
#         'model__n_estimators': [100, 200],
#         'model__max_depth': [None, 10],
#         'model__min_samples_split': [2, 5]
#     }
# ]

# # Create a pipeline
# pipeline = Pipeline([
#     ('scaler', StandardScaler()),  # Step 1: Standardize the data
#     ('model', LinearRegression())  
# ])

# # Perform GridSearchCV
# grid_search = GridSearchCV(
#     estimator=pipeline, 
#     param_grid=models_and_parameters, 
#     cv=5,  # n-fold cross-validation
#     scoring='r2', 
#     n_jobs=-1 
# )

# # Fit the grid search
# grid_search.fit(X_train, y_train)

# # Get the best model and parameters
# best_model = grid_search.best_estimator_
# best_params = grid_search.best_params_

# # Evaluate the best model on the test set
# y_pred = best_model.predict(X_test)
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# print("Best Model:", best_model)
# print("Best Parameters:", best_params)
# print(f"Test Set Mean Squared Error (MSE): {mse:.2f}")
# print(f"Test Set R-squared (R2): {r2:.2f}")

In [None]:
# Using Xgboost

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score


# Define the features (X) and target (y)
X = ironkaggle_df_re_filtered_copy.drop(columns=[target_variable, 'date'])
y = ironkaggle_df_re_filtered_copy[target_variable]


# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optional: Standardize the features for better model performance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the XGBoost regressor
model = xgb.XGBRegressor(
    n_estimators=100,      # Number of trees
    learning_rate=0.1,     # Step size shrinkage
    max_depth=6,           # Maximum tree depth
    random_state=42,       # Reproducibility
    objective='reg:squarederror'  # Regression loss function
)

# Train the model
model.fit(X_train, y_train)


# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate model performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R²): {r2:.2f}")




# Prediction

In [49]:
# use for prediction

import pandas as pd

# Importing data into df
file_path = 'REAL_DATA_PROCESSED.csv'

# Load the CSV file into a DataFrame
ironkaggle_real_df = pd.read_csv(file_path)


categorical_var = ['store_ID', 'day_of_week', 'open', 'promotion', 'state_holiday', 'school_holiday']
date_var = ['date']
numerical_var = ['nb_customers_on_day', 'sales']

# I will drop the untitled column, bc I don't know what it is. 
ironkaggle_real_df = ironkaggle_real_df.drop(columns=['index'])

# Converting the 'date' column to datetime
ironkaggle_real_df['date'] = pd.to_datetime(ironkaggle_real_df['date'], dayfirst=True)

# Count the number of unique values in a column
unique_values = ironkaggle_real_df['state_holiday'].unique()
# print(f"Unique values in the column: {unique_values}")
 
# Unique values in the column: ['0' 'a' 'c' 'b']
# Converting 'state_holiday' ['0' '1' '2 '3']. 

# Define the mapping
mapping = {'0': '0', 'a': '1', 'c': '2', 'b': '3'}

ironkaggle_real_df['year'] = ironkaggle_real_df['date'].dt.year
ironkaggle_real_df['month'] = ironkaggle_real_df['date'].dt.month
ironkaggle_real_df['day'] = ironkaggle_real_df['date'].dt.day

ironkaggle_real_df = ironkaggle_real_df.drop(columns=['date'])

# Apply the mapping to the column
ironkaggle_real_df['state_holiday'] = ironkaggle_real_df['state_holiday'].map(mapping)

# converting to int
ironkaggle_real_df['state_holiday'] = ironkaggle_real_df['state_holiday'].astype('int64')

# Remove rows where 'open' is 0 
ironkaggle_real_df = ironkaggle_real_df[((ironkaggle_real_df['open'] == 0)) ]
                                       
# Drop the entire 'open' feature
ironkaggle_real_df = ironkaggle_real_df.drop(columns=['open'])
categorical_var.remove('open')

# Drop the entire 'state_holiday' feature
ironkaggle_real_df = ironkaggle_real_df.drop(columns=['state_holiday'])
categorical_var.remove('state_holiday')

# Drop the entire 'school_holiday' feature
ironkaggle_real_df = ironkaggle_real_df.drop(columns=['school_holiday'])
categorical_var.remove('school_holiday')

# # Drop the entire 'day' feature
# ironkaggle_real_df = ironkaggle_real_df.drop(columns=['day'])
# categorical_var.remove('day')

print(ironkaggle_real_df.dtypes)
print(ironkaggle_real_df.columns.tolist())

sales_predictions = model.predict(ironkaggle_real_df)


store_ID               int64
day_of_week            int64
nb_customers_on_day    int64
promotion              int64
year                   int32
month                  int32
day                    int32
dtype: object
['store_ID', 'day_of_week', 'nb_customers_on_day', 'promotion', 'year', 'month', 'day']


ValueError: Feature shape mismatch, expected: 5, got 7

## Build Neural Network

To build the neural network, you can refer to your own codes you wrote while following the [Deep Learning with Python, TensorFlow, and Keras tutorial](https://www.youtube.com/watch?v=wQ8BIBpya2k) in the lesson. It's pretty similar to what you will be doing in this lab.

1. Split the training and test data.
1. Create a `Sequential` model.
1. Add several layers to your model. Make sure you use ReLU as the activation function for the middle layers. Use Softmax for the output layer because each output has a single lable and all the label probabilities add up to 1.
1. Compile the model using `adam` as the optimizer and `sparse_categorical_crossentropy` as the loss function. For metrics, use `accuracy` for now.
1. Fit the training data.
1. Evaluate your neural network model with the test data.

In [None]:
# from sklearn.model_selection import train_test_split
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# from tabulate import tabulate
# from tensorflow.keras.callbacks import EarlyStopping
# from sklearn.preprocessing import StandardScaler



# # Define the features (X) and target (y)
# X = ironkaggle_df_re_filtered_copy.drop(columns=[target_variable, 'date'])
# y = ironkaggle_df_re_filtered_copy[target_variable]


# # Split the training and test data.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # convert to float32 for efficiency
# X_train = X_train.astype('float32')
# X_test = X_test.astype('float32')

# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# print("X_train shape:", X_train.shape)
# print("y_train shape:", y_train.shape)

# #  print("Unique values in y_train:", set(y_train))

# # Create a `Sequential` model
# model = Sequential()

# # Add several layers to your model. Make sure you use ReLU as the activation function for the middle layers. 
# # Use Softmax for the output layer because each output has a single lable and all the label probabilities add up to 1.

# # model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))     # Input and first hidden layer
# # model.add(Dense(32, activation='relu'))                                 # Second hidden layer
# # model.add(Dense(16, activation='relu'))                                 # Third hidden layer
# # model.add(Dense(len(set(y)), activation='softmax'))                      # Output layer with Softmax

# model = Sequential()
# model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
# model.add(Dense(32, activation='relu'))
# # model.add(Dense(len(set(y)), activation='softmax'))
# model.add(Dense(1, activation='linear'))  # Single neuron for regression


# model.compile(optimizer='adam', 
#               loss='mean_squared_error', 
#               metrics=['mae'])


# # Fit the training data.
# early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# model.fit(X_train, y_train, epochs=30, batch_size=8, validation_split=0.2, callbacks=[early_stopping])


# # # Evaluate your neural network model with the test data.
# test_loss, test_accuracy = model.evaluate(X_test, y_test)

# # Prepare data for tabulate
# evaluation_data = [
#     ["Metric", "Value"],
#     ["Test Accuracy", f"{test_accuracy:.2%}"],  # Convert to percentage format
#     ["Test Loss", f"{test_loss:.4f}"]
# ]

# # Print evaluation results as a table
# print(tabulate(evaluation_data, headers="firstrow", tablefmt="grid"))

