In [1]:
import os
import pandas as pd
from pathlib import Path
import numpy as np
from scipy.stats import boxcox, skew
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller, acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
base_dir = Path(os.getenv("BASE_DIRECTORY"))
file_path = base_dir / "data/commodities/commodities_price_data-2024.csv"

In [3]:
data = pd.read_csv(file_path, parse_dates=['Date'], date_format='%d/%m/%Y')
data

Unnamed: 0,Date,Item Name,Low Price,Average Price,High Price
0,01-01-2024,કપાસ બી. ટી.,1000,1151,1461
1,01-01-2024,ઘઉં લોકવન,450,550,581
2,01-01-2024,ઘઉં ટુકડા,550,608,722
3,01-01-2024,મગફળી જીણી,911,1286,1375
4,01-01-2024,સિંગ ફાડીયા,1050,1461,1701
...,...,...,...,...,...
9193,31-12-2024,વાલ,391,901,1171
9194,31-12-2024,ચોળા / ચોળી,801,1701,2651
9195,31-12-2024,સોયાબીન,601,821,841
9196,31-12-2024,ગોગળી,721,871,881


In [6]:
# Pivot table: Rows = Date, Columns = Products, Values = Prices
price_pivot = data.pivot_table(index="Date", columns="Item Name", values="Average Price")

# Compute correlation matrix
correlation_matrix = price_pivot.corr()

# Define correlation threshold
correlation_threshold = 0.75

# Step 1: Build Graph Representation
graph = {}  # Dictionary where key = product, value = set of correlated products

for product in correlation_matrix.columns:
    related = correlation_matrix[product][correlation_matrix[product] >= correlation_threshold].index.tolist()
    # related.remove(product)  # Remove self-correlation

    if product not in graph:
        graph[product] = set()

    for item in related:
        graph[product].add(item)
        if item not in graph:
            graph[item] = set()
        graph[item].add(product)  # Ensure bidirectional linkage

# Step 2: Find Connected Components (DFS)
def find_clusters(graph):
    visited = set()
    clusters = []

    def dfs(node, cluster):
        """Depth-First Search to group connected nodes"""
        if node in visited:
            return
        visited.add(node)
        cluster.add(node)
        for neighbor in graph[node]:
            dfs(neighbor, cluster)

    for product in graph:
        if product not in visited:
            cluster = set()
            dfs(product, cluster)
            clusters.append(cluster)

    return clusters

# Step 3: Get the clustered product groups
product_clusters = find_clusters(graph)

# Step 4: Display Clusters
for i, cluster in enumerate(product_clusters):
    print(f"Cluster {i+1}: {', '.join(cluster)}")


Cluster 1: ડુંગળી લાલ, વાલ પાપડી, તલ લાલ, ધાણા, લસણ સુકું, કારીજીરી, રાજગરો, અજમાં, તલ - તલી, મરચા સૂકા પટ્ટો, સોયાબીન, બાજરો, સિંગદાણા જાડા, તલ કાળા, મગફળી જાડી, ચોળા / ચોળી, તુવેર, મગફળી 66, સુવાદાણા, મરચા સૂકા ઘોલર, નવું જીરૂ, કપાસ બી. ટી., અરીઠા, રજકાનું બી, ઇસબગુલ, મગફળી જીણી, ધાણી, મેથી, નવા ધાણા, કપાસ નવો, એરંડા / એરંડી, સિંગ ફાડીયા, નવી ધાણી, કળથી, ગુવાર બી, ડુંગળી સફેદ, અડદ, નવું લસણ, ગોગળી, સફેદ ચણા
Cluster 2: કાંગ
Cluster 3: ક્લંજી
Cluster 4: ઘઉં ટુકડા
Cluster 5: ઘઉં લોકવન
Cluster 6: ચણા
Cluster 7: જીરૂ
Cluster 8: જુવાર
Cluster 9: નવા ચણા
Cluster 10: મકાઇ
Cluster 11: મગ
Cluster 12: મગફળી નવી
Cluster 13: મઠ
Cluster 14: મરચા
Cluster 15: રાય
Cluster 16: રાયડો
Cluster 17: વટાણા
Cluster 18: વરીયાળી
Cluster 19: વાલ
Cluster 20: સુરજમુખી


In [None]:
def check_stationarity(series, size):
    if len(series) == size:
        return True
    result = adfuller(series)
    print("\nADF Test Results:")
    print(f"ADF Statistic: {result[0]:.4f}")
    print(f"p-value: {result[1]:.4f}")
    for key, value in result[4].items():
        print(f"Critical Value {key}: {value:.4f}")
    if result[1] < 0.05:
        print("The series is stationary (no further differencing needed).")
        return True
    else:
        print("The series is not stationary (differencing is required).")
        return False
    

# Function to count significant lags
def count_significant_spikes(values, confint):
    # The significant lags are those outside the confidence interval
    significant_lags = np.where(np.abs(values) > confint[:, 1])[0]
    return len(significant_lags), significant_lags


models = {}
for i, cluster in enumerate(product_clusters):
    for product in cluster:
        product_name = product 
        product_data = data[data['Item Name'] == product_name]
        size_product_data = product_data.shape[0]
        print(product_name, "size: ", size_product_data)
        
        if((size_product_data//2)-1 <= 0):
            continue
        
        # Check if any data exists for the selected product
        if product_data.empty:
            print(f"No data found for the product: {product_name}")

        else:
            # Extract the 'Average Price' column for the selected product
            price_data = product_data['Average Price']
            
            
        variance_ratio = price_data.std() / price_data.mean()
        skewness = skew(price_data)

        if variance_ratio > 0.1 and skewness > 0.5:
            apply_boxcox = True
        else:
            apply_boxcox = False
            
        if product_name == 'મેથી':
            apply_boxcox = True
        print(apply_boxcox)
        
        #-------------------------------------------------------------------------------------------------------------------
        print(f"Checking stationarity for the raw price data of {product_name}:")
        d = 0
        price_diff = price_data
        
        if apply_boxcox:
            price_diff, lam = boxcox(price_diff)
            price_diff = pd.Series(price_diff)
        
        while True:
            if check_stationarity(price_diff, (size_product_data//2)+1):
                # price_diff = price_data
                print(f"Data is stationary with d={d}.")
                break
            else:
                price_diff = price_diff.diff().dropna()
                d += 1
                
        
        
        print("\nChecking stationarity for the differenced data:")
        check_stationarity(price_diff, (size_product_data//2)+1)
        
        #--------------------------------------------------------------------------------------------------------
        acf_values, acf_confint = acf(price_diff, alpha=0.05)
        pacf_values, pacf_confint = pacf(price_diff, alpha=0.05)


        # Count significant spikes for ACF (q)
        q, significant_acf_lags = count_significant_spikes(acf_values, acf_confint)

        # Count significant spikes for PACF (p)
        p, significant_pacf_lags = count_significant_spikes(pacf_values, pacf_confint)

        print(f"Significant spikes in ACF (q): {q} at lags {significant_acf_lags}")
        print(f"Significant spikes in PACF (p): {p} at lags {significant_pacf_lags}")
        
        
        # ACF and PACF Plots
        try:
            plt.figure(figsize=(12, 6))
            plot_acf(price_diff, lags=(size_product_data//2)-1, title="Autocorrelation Function (ACF)")
            plt.show()

            plt.figure(figsize=(12, 6))
            plot_pacf(price_diff, lags=(size_product_data//2)-1, title="Partial Autocorrelation Function (PACF)")
            plt.show()
        
        except Exception as e:
            print(f"Can't plot the acf/pacf plots for {product_name} due to {e}")
        
        
        if p > 5:
            p = min(5, q)
        if q > 5: 
            q = min(5, p)
        if d > 3:
            d = 1

        fit_successful = False

        while not fit_successful and q >= 0:
            try:
                print(f"\nFitting ARIMA model with order ({p}, {d}, {q})...")
                model = ARIMA(price_data, order=(p, d, q))
                model_fit = model.fit()
                fit_successful = True  
                print("ARIMA model fitted successfully!")
                
            except np.linalg.LinAlgError as err:
                print(f"Error encountered: {err}")
                if q > 0: 
                    q -= 1
                    print(f"Reducing q to {q} and trying again...")
                else:
                    print("Unable to fit model after reducing q multiple times. Exiting loop.")
                    break
                
                
        # Step 5: Analyze Model Summary
        print("\nARIMA Model Summary:")
        print(model_fit.summary())  

40
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
