# ARIMA 

## Data loading and filtering

In [None]:


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import mlflow
from statsmodels.tsa.stattools import pacf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import  pickle




# Load the raw pricing data (read with latin1 encoding).
df = pd.read_csv("./ElectronicsProductsPricingData.csv", encoding='latin1')

# Parse the observation timestamp; values that cannot be parsed become NaT
# (errors='coerce') and therefore fall out of both yearly subsets below.
df["prices.dateSeen"] = pd.to_datetime(df["prices.dateSeen"], errors='coerce')


# compute a percent-discount column from the price fields
# discount_percent = ((regular_price - sale_price) / regular_price) * 100
# assume amountMax is the non-sale price and amountMin the current price;
# guard against division by zero (rows with amountMax <= 0 or NaN get 0).
df['discount_percent'] = np.where(
    df['prices.amountMax'] > 0,
    100 * (df['prices.amountMax'] - df['prices.amountMin']) / df['prices.amountMax'],
    0
)

# Build the year masks once and reuse them for every subset.
seen_year = df["prices.dateSeen"].dt.year
in_2017 = seen_year == 2017
in_2018 = seen_year == 2018

# Take explicit copies: later cells add columns to these frames, and writing
# into a plain slice of `df` triggers pandas' SettingWithCopyWarning.
train = df[in_2017].copy()
test = df[in_2018].copy()

df_2018 = df[in_2018].copy()
df = df[in_2017].copy()

# Close any run left open by a previous execution, then point MLflow at the
# local tracking server.
mlflow.end_run()
mlflow.set_tracking_uri('http://localhost:5050')


df[["discount_percent","prices.amountMax","prices.dateSeen","prices.amountMin"]].head()

## 1. Data preparation

### 1.1 Engineer time features 

In [None]:
# Derive the same calendar features on the 2017 frame (df) and the 2018 frame
# (df_2018) from the observation timestamp.
for frame in (df, df_2018):
    seen = frame['prices.dateSeen']
    frame['day_of_week'] = seen.dt.dayofweek
    frame['month'] = seen.dt.month
    frame['week_of_year'] = seen.dt.isocalendar().week.astype(int)
    # Binary flag for Thursday updates (dayofweek == 3, Monday being 0)
    frame['is_thursday'] = (frame['day_of_week'] == 3).astype(int)

# Preview the engineered features
df[['prices.dateSeen', 'day_of_week', 'week_of_year', 'is_thursday']].head()


In [None]:

# Seed NumPy's global RNG — presumably so seaborn's bootstrapped error bands
# are reproducible between runs (TODO confirm; nothing else here is random).
np.random.seed(42)

# Style
sns.set_style("whitegrid")

# 3x2 grid of exploratory plots. On every axis the 2017 data (df) is drawn
# first and the 2018 data (df_2018) second.
fig, axes = plt.subplots(
    nrows=3,
    ncols=2,
    figsize=(20, 18)   # ← control width here
)

# Flatten axes for easier indexing
axes = axes.flatten()

# 1 Scatter: discount over time
sns.scatterplot(data=df, x="prices.dateSeen", y="discount_percent", ax=axes[0])
sns.scatterplot(data=df_2018, x="prices.dateSeen", y="discount_percent", ax=axes[0])

# 2 Line: mean discount per calendar month
sns.lineplot(data=df, x="month", y="discount_percent", ax=axes[1])
sns.lineplot(data=df_2018, x="month", y="discount_percent", ax=axes[1])

# 3 Bar: mean discount per weekday
weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
sns.barplot(data=df, x="day_of_week", y="discount_percent", ax=axes[2], order=range(7))
sns.barplot(data=df_2018, x="day_of_week", y="discount_percent", ax=axes[2], order=range(7), alpha=0.5)
# Pin the tick positions before relabelling so the labels cannot drift from the bars.
axes[2].set_xticks(list(range(7)))
axes[2].set_xticklabels(weekdays)

# 4 Line: mean discount per ISO week of the year
sns.lineplot(data=df, x="week_of_year", y="discount_percent", ax=axes[3])
sns.lineplot(data=df_2018, x="week_of_year", y="discount_percent", ax=axes[3])

# 5 Rolling average over a 4-week window. The window is time-based, so it has
# to be 28 days ('28D'); the previous '4D' averaged over 4 days only.
rolling_window = '28D'
df_rolling_4w = (
    df.set_index('prices.dateSeen').sort_index()["discount_percent"]
    .rolling(rolling_window).mean()
)
df_2018_rolling_4w = (
    df_2018.set_index('prices.dateSeen').sort_index()["discount_percent"]
    .rolling(rolling_window).mean()
)
sns.lineplot(x=df_rolling_4w.index, y=df_rolling_4w, ax=axes[4])
sns.lineplot(x=df_2018_rolling_4w.index, y=df_2018_rolling_4w, ax=axes[4])

# df_2018 itself is never re-indexed above (set_index returns a new frame), so
# no reset_index is needed; calling it would only add a stray "index" column.

# 6 Bar: Thursday vs. every other day
sns.barplot(data=df, x="is_thursday", y="discount_percent", ax=axes[5])
sns.barplot(data=df_2018, x="is_thursday", y="discount_percent", ax=axes[5], alpha=0.5)

# Improve spacing
plt.tight_layout()

plt.show()

#### 1.1.1 Temporal Feature Extraction
As per the Phase 2 requirements, we focus on high-resolution time features: day_of_week, month, week_of_year, and is_thursday. Justification: Thursdays are often identified as key re-pricing days in retail as stores prepare for weekend traffic. Utility: These features provide the categorical "hooks" for the Stage 1 Classification model.

### 1.2 Create lag features 

### ADF test

In [None]:
# Put both yearly frames in chronological order and index them by timestamp.
df = df.sort_values("prices.dateSeen").set_index("prices.dateSeen")
df_2018 = df_2018.sort_values("prices.dateSeen").set_index("prices.dateSeen")


## Resample to daily frequency and take the mean discount percentage per day.
## The original data may have multiple entries per day, and the goal is to
## analyse the overall daily trend; days without any observation are dropped.
daily_series = df["discount_percent"].resample("D").mean().dropna()

print("Length:", len(daily_series))


# Augmented Dickey-Fuller test on the daily means; the first two entries of
# the result are the test statistic and the p-value.
result = adfuller(daily_series)
adf_statistic, adf_p_value = result[0], result[1]

print("ADF Statistic:", adf_statistic)
print("p-value:", adf_p_value)

In [None]:



# Draw the ACF on an explicitly created axes. Without `ax=`, plot_acf opens its
# own figure, which left `figure1` (and its figsize) empty and meant that
# plt.close(figure1) never closed the figure that was actually drawn.
# NOTE(review): this uses the 2018 frame while the PACF cell uses the 2017
# frame — confirm which year is intended here.
figure1, ax_acf = plt.subplots(figsize=(12, 6))
plot_acf(df_2018["discount_percent"].dropna(), lags=60, ax=ax_acf)
ax_acf.set_title("Autocorrelation Function (ACF)")
figure1.savefig("./images/acf.png")
mlflow.log_artifact("./images/acf.png")
plt.close(figure1)

#### Partial autocorrelation
 -- Here lag 1 has the highest correlation; lag 3 is also notable, but after lag 6 all lags fall below the confidence level.

In [None]:
# Draw the PACF on an explicitly created axes. Without `ax=`, plot_pacf opens
# its own figure, so `fig_pacf` stayed empty and the drawn figure was never closed.
fig_pacf, ax_pacf = plt.subplots()
plot_pacf(df["discount_percent"].dropna(), lags=60, ax=ax_pacf)
fig_pacf.savefig("./images/pacf.png")
mlflow.log_artifact("./images/pacf.png")
plt.close(fig_pacf)

#### lag 1 and lag 3 

In [7]:
# Build a frame holding the current discount next to lagged copies of itself.
# NOTE(review): shift(k) moves values by k rows, not k calendar days; `series`
# is the row-level data, so "lag_14"/"lag_30" are only 14/30 days if there is
# exactly one row per day — confirm, or shift a daily-resampled series instead.
series = df["discount_percent"]
lag_steps = (1, 2, 14, 30)

lag_df = pd.DataFrame(
    {"current": series, **{f"lag_{step}": series.shift(step) for step in lag_steps}}
)

In [8]:
# Keep only rows where the current value and every lag column are present.
lag_df = lag_df.dropna(axis="index", how="any")

In [None]:
# Lag-1 scatter: previous value against current value.
figure_lag_1, ax_lag_1 = plt.subplots()
ax_lag_1.scatter(lag_df["lag_1"], lag_df["current"], alpha=0.5)
ax_lag_1.set_xlabel("Lag 1 (t-1)")
ax_lag_1.set_ylabel("Current (t)")
ax_lag_1.set_title("Lag-1 Relationship")
figure_lag_1.savefig("./images/lag_1.png")
mlflow.log_artifact("./images/lag_1.png")
plt.close(figure_lag_1)


# Lag-2 scatter. Create exactly one 6x6 figure: the previous code opened a
# default figure and then a second (6, 6) one, closed the empty first figure
# and left the drawn one open.
figure_lag_2, ax_lag_2 = plt.subplots(figsize=(6, 6))
ax_lag_2.scatter(lag_df["lag_2"], lag_df["current"], alpha=0.5)
ax_lag_2.set_xlabel("Lag 2 (t-2)")
ax_lag_2.set_ylabel("Current (t)")
ax_lag_2.set_title("Lag-2 Relationship")
figure_lag_2.savefig("./images/lag_2.png")
mlflow.log_artifact("./images/lag_2.png")
plt.close(figure_lag_2)


#### Monthly and periodic lag analysis



In [None]:

    # ── Monthly average discount ──────────────────────────────────────────────
# Month-end means of the discount for each year.
monthly_series = df["discount_percent"].resample("ME").mean()
monthly_series_2018 = df_2018["discount_percent"].resample("ME").mean()

fig, axes = plt.subplots(2, 1, figsize=(12, 8))
trend_ax, box_ax = axes

# Top panel: monthly mean, 2017 in blue and 2018 in orange.
for monthly, line_color in (
    (monthly_series, 'steelblue'),
    (monthly_series_2018, 'orange'),
):
    trend_ax.plot(monthly.index, monthly.values, marker='o', linewidth=2, color=line_color)
trend_ax.set_title("Monthly Average Discount %")
trend_ax.set_xlabel("Month")
trend_ax.set_ylabel("Avg Discount %")
trend_ax.grid(True, alpha=0.3)
trend_ax.tick_params(axis='x', rotation=45)

# Bottom panel: month-of-year box plot (seasonality) for the 2017 frame.
month_order = ["Jan","Feb","Mar","Apr","May","Jun",
            "Jul","Aug","Sep","Oct","Nov","Dec"]
monthly_df = df["discount_percent"].rename_axis("date").reset_index()
monthly_df["month"] = monthly_df["date"].dt.month
# Ordered categorical so the boxes appear in calendar order, not alphabetically.
monthly_df["month_name"] = pd.Categorical(
    monthly_df["date"].dt.strftime("%b"), categories=month_order, ordered=True
)
monthly_df_clean = monthly_df.dropna(subset=["month_name"])
monthly_df_clean.boxplot(column="discount_percent", by="month_name",
                        ax=box_ax, grid=False)
box_ax.set_title("Discount % Distribution by Month")
box_ax.set_xlabel("Month")
box_ax.set_ylabel("Discount %")

# Blank the figure-level title, then tidy the layout.
fig.suptitle("")
fig.tight_layout()
plt.show()

In [None]:

    # ── Lag-14 and Lag-30 scatter plots (weekly / monthly lags) ───────────────
# One scatter panel per lag column: (column, x-axis label, title, colour).
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
lag_panels = (
    ("lag_14", "Lag 14 (t-14)", "Lag-14 Relationship (~2 weeks)", 'steelblue'),
    ("lag_30", "Lag 30 (t-30)", "Lag-30 Relationship (~1 month)", 'darkorange'),
)
for panel_ax, (lag_column, x_label, panel_title, dot_color) in zip(axes, lag_panels):
    panel_ax.scatter(lag_df[lag_column], lag_df["current"], alpha=0.4, color=dot_color)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel("Current (t)")
    panel_ax.set_title(panel_title)
    panel_ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
# ── Lag correlation summary table ─────────────────────────────────────────
# lag_df already had its rows with missing lags dropped; add a 7-row lag and
# drop the NaNs that this new column introduces.

lag_df["lag_7"] = lag_df["current"].shift(7)
lag_df_clean = lag_df.dropna()

# Pearson correlation of each lag column with the current value.
summary_lags = [1, 2, 7, 14, 30]
lag_corr = pd.DataFrame({
    "lag": summary_lags,
    "pearson_r": [
        lag_df_clean[f"lag_{k}"].corr(lag_df_clean["current"])
        for k in summary_lags
    ],
})

fig, ax = plt.subplots(figsize=(7, 4))
# Highlight lags whose correlation exceeds the 0.1 threshold.
colors = ['steelblue' if r > 0.1 else 'lightgray' for r in lag_corr["pearson_r"]]
bars = ax.bar([f"Lag {l}" for l in lag_corr["lag"]], lag_corr["pearson_r"], color=colors)
ax.axhline(0.1, color='red', linestyle='--', linewidth=1, label='0.1 threshold')
ax.set_title("Pearson Correlation: Lag vs Current Discount %")
ax.set_ylabel("Pearson r")
ax.set_xlabel("Lag")
ax.legend()
ax.grid(True, axis='y', alpha=0.3)
for bar, val in zip(bars, lag_corr["pearson_r"]):
    # Put the label just outside the bar: above positive bars, below negative ones.
    outward = 1 if val >= 0 else -1
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + outward * 0.002,
            f"{val:.3f}", ha='center', va='bottom' if outward > 0 else 'top', fontsize=9)

# Apply the layout BEFORE saving so the PNG matches what is shown on screen.
plt.tight_layout()
plt.savefig("./images/Pearson_Correlation(Lag_vs_Current_Discount).png")
mlflow.log_artifact("./images/Pearson_Correlation(Lag_vs_Current_Discount).png")
plt.show()

print(lag_corr.to_string(index=False))


### 1.3 Handle 0% discounts - build a two-stage model

In [13]:
# Recompute the percent-discount column from the price fields:
#   discount_percent = ((regular_price - sale_price) / regular_price) * 100
# amountMax is assumed to be the non-sale price and amountMin the current
# price; rows where amountMax is not positive get 0 instead of a division.
max_price = df['prices.amountMax']
min_price = df['prices.amountMin']
df['discount_percent'] = np.where(
    max_price > 0,
    100 * (max_price - min_price) / max_price,
    0
)

# Stage 1 target (binary classification):
# 1 = Discounted (Sale), 0 = Not Discounted (Regular Price)
df['is_discounted'] = df['discount_percent'].gt(0).astype(int)

# Stage 2 target (regression): only the actual discount values, so that the
# many zeros do not bias the model.
discounted_rows = df['is_discounted'].eq(1)
actual_discounts = df.loc[discounted_rows, 'discount_percent']

#### 1.2.1 Handling 0% Discounts: Two-Stage Methodology
To address the high frequency of zero-discount days, we implement a two-stage approach: 1. Classification (Stage 1): Predicting the probability of a sale occurring based on time features. 2. Regression (Stage 2): Estimating the depth of the discount once a sale event is confirmed. Benefit: This architectural decision minimizes the error (MAE/RMSE) caused by zero-inflated target variables.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of how many observations are discounted (1) vs. not (0).
plt.figure(figsize=(8, 5))
# seaborn 0.13 deprecates `palette` without `hue`; mapping the x variable to
# hue and hiding the legend gives the same colouring without the warning.
sns.countplot(x='is_discounted', hue='is_discounted', data=df, palette='Set2', legend=False)
plt.title('Distribution of Discount Events (Stage 1 Target)')
plt.xlabel('Is there a Discount? (0 = No, 1 = Yes)')
plt.ylabel('Number of Observations')
plt.show()

#### what is two-stage model ?
Since there are a lot of zeroes in the data, a single model is confused by them and tends to predict small values like 2-3% instead of true zeroes or real discounts. We will split the problem into two stages:

- Stage 1 (classifier): "Will there be a discount at all?"
- Stage 2 (regression): "If yes, how much will the discount be?" - predict the %

### 1.4 Train/test split: 2017 = train, 2018 = test

 - We will split the data series into two parts:
 - **2017** - training data
 - **2018** - testing the model

In [None]:
# Stage 2 data: keep ONLY the rows that carry an actual discount.
df_discounted = df.loc[df['is_discounted'] == 1]

# Histogram (with KDE overlay) of the non-zero discount magnitudes.
fig_discounts, ax_discounts = plt.subplots(figsize=(10, 6))
sns.histplot(
    df_discounted['discount_percent'],
    bins=20,
    kde=True,
    color='darkorange',
    ax=ax_discounts,
)
ax_discounts.set_title('Stage 2: Magnitude of Actual Discounts (Excluding 0%)')
ax_discounts.set_xlabel('Discount Percentage (%)')
ax_discounts.set_ylabel('Frequency')
plt.show()

## ARIMA model
The **ARIMA** model is defined by three parameters: p (autoregressive), d (differencing), and q (moving average).
 - **Autoregressive** - the relationship between the current value and the past values of the same variable.
 - **Integration** - the degree of differencing applied to the data to make it stationary.
 - **Moving Average** - the relationship between the current value and the past errors of the model.

From the data filtering above, the candidate values for p are 1, 2, 3 or 4. As for d and q: from the ADF test above we get a p-value less than 0.05, which means the series is stationary, so d is 0; if the p-value had been greater than 0.05, we would have to difference the series (d ≥ 1) until it becomes stationary. The q value is chosen by the grid search below.

In [None]:



# Order the train (2017) and test (2018) frames chronologically and index
# them by the observation timestamp.
train = train.sort_values("prices.dateSeen").set_index("prices.dateSeen")
test = test.sort_values("prices.dateSeen").set_index("prices.dateSeen")

train.head()

In [None]:
# Close any run that is still open, then group every run below under one experiment.
mlflow.end_run()
mlflow.set_experiment("ARIMA_Discount_Model")

best_rmse = float("inf")
best_aic = float("inf")
best_model = None
best_params = None

# Every (p, d, q) combination with p in 1..3, d in 0..1 and q in 0..2.
candidate_orders = [
    (p, d, q)
    for p in range(1, 4)
    for d in range(2)
    for q in range(3)
]

# NOTE(review): the order is selected by RMSE on `test`, so the 2018 data
# influences the model choice — confirm this is intended.
for p, d, q in candidate_orders:
    # One MLflow run per candidate order.
    with mlflow.start_run():
        mlflow.log_params({"p": p, "d": d, "q": q})

        model = ARIMA(train["discount_percent"], order=(p, d, q))
        model_fit = model.fit()

        # Forecast as many steps as there are test rows and score them.
        forecast = model_fit.forecast(len(test))
        mse = mean_squared_error(test["discount_percent"], forecast)
        rmse = np.sqrt(mse)
        aic = model_fit.aic

        # Metrics are logged for every candidate, not just the best one.
        mlflow.log_metrics({"RMSE": rmse, "AIC": aic})

        # Lower RMSE wins; AIC breaks exact RMSE ties.
        if (rmse, aic) < (best_rmse, best_aic):
            best_rmse = rmse
            best_aic = aic
            best_model = model_fit
            best_params = (p, d, q)

In [18]:
import pickle

# Refuse to write a pickle of None if the grid search never selected a model.
if best_model is None:
    raise RuntimeError("No ARIMA model was selected; run the grid search first.")

# Persist the best fitted model to disk.
with open("best_arima_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

# The pickle is NOT logged to MLflow here: no run is active at this point, so
# log_artifact would implicitly open a new, otherwise empty run. The artifact
# is logged inside the dedicated "Best_Model" run instead.

In [None]:
# Close any run that is still open, then record the winning configuration in
# a dedicated run.
mlflow.end_run()
with mlflow.start_run(run_name="Best_Model"):
    best_p, best_d, best_q = best_params

    mlflow.log_params({"best_p": best_p, "best_d": best_d, "best_q": best_q})
    mlflow.log_metrics({"best_RMSE": best_rmse, "best_AIC": best_aic})

    # Attach the pickled model written by the earlier cell.
    mlflow.log_artifact("best_arima_model.pkl")