In [107]:
import pandas as pd
import numpy as np
import yfinance as yf
import os
import json
import requests
from statsmodels.tsa.arima.model import ARIMA
from pmdarima import auto_arima
import matplotlib.pyplot as plt
from dotenv import load_dotenv

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

load_dotenv()

# Create necessary directories
DATA_DIR = "data"
MODEL_OUTPUT_DIR = "model_outputs"
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)

In [128]:

eia_api_key = os.getenv("EIA_API_KEY")
eia_url = f"https://api.eia.gov/v2/petroleum/stoc/wstk/data/?api_key={eia_api_key}&frequency=monthly&data[0]=value"

response = requests.get(eia_url)

if response.status_code == 200:
    data = response.json()

    # Extract relevant time series
    inventory_data = [
        [entry["period"], entry["value"]] for entry in data["response"]["data"]
    ]
    inventory_df = pd.DataFrame(inventory_data, columns=["Date", "Crude_Inventory"])

    inventory_df["Date"] = pd.to_datetime(inventory_df["Date"])
    inventory_df.set_index("Date", inplace=True)

In [127]:
inventory_df

Unnamed: 0_level_0,Crude_Inventory
Date,Unnamed: 1_level_1
1990-07-20,12958
1991-01-18,12572
1991-03-22,13016
1991-04-26,13848
1991-11-08,12097
...,...
2020-07-31,330
2020-11-06,274
2021-01-15,293
2021-04-30,193


In [136]:
import requests
import os
import pandas as pd

def fetch_eia_crude_inventory():
    eia_api_key = os.getenv("EIA_API_KEY")

    # 🔥 Updated API URL (ensuring it fetches recent data)
    eia_url = f"https://api.eia.gov/v2/petroleum/stoc/wstk/data/?api_key={eia_api_key}&frequency=weekly&data[0]=value&start=2000-01-01"

    response = requests.get(eia_url)

    if response.status_code == 200:
        data = response.json()

        # Extract relevant time series
        inventory_data = [
            [entry["period"], entry["value"]] for entry in data["response"]["data"]
        ]
        inventory_df = pd.DataFrame(inventory_data, columns=["Date", "Crude_Inventory"])

        # Convert Date column to datetime
        inventory_df["Date"] = pd.to_datetime(inventory_df["Date"])
        inventory_df.set_index("Date", inplace=True)

        inventory_df = inventory_df[~inventory_df.index.duplicated(keep="last")]

        inventory_df = inventory_df.sort_index()

        return inventory_df
    else:
        print(f"Error fetching data: {response.json()}")
        return None



In [138]:
data = fetch_eia_crude_inventory()

In [139]:
data.tail()

Unnamed: 0_level_0,Crude_Inventory
Date,Unnamed: 1_level_1
2025-01-17,0
2025-01-24,32499
2025-01-31,31587
2025-02-07,0
2025-02-14,25


In [203]:
def fetch_data():
    ticker = "CL=F"  # WTI Crude Oil Futures
    df = yf.download(ticker, start="2000-01-01", end="2025-01-01")
    df = df[['Close']].rename(columns={'Close': 'WTI_Price'})
    
    # Fetch U.S. Dollar Index (DXY)
    # dxy = yf.download("DX-Y.NYB", start="2000-01-01", end="2025-01-01")[['Close']]
    # dxy.rename(columns={"Close": "DXY"}, inplace=True)

    # Fetch Crude Inventory using new EIA API
    # inventory_df = fetch_eia_crude_inventory()
    
    # Merge datasets
    df.index.name = "Date"
    # merged_df = df.join([dxy], how="left")
    # merged_df.to_csv(os.path.join(DATA_DIR, "oil_data_with_features.csv"))
    
    return df

In [204]:
data = fetch_data()

[*********************100%***********************]  1 of 1 completed


In [209]:
def preprocess_data():
    df = pd.read_csv(os.path.join(DATA_DIR, "oil_data_with_features.csv"), skiprows=2, parse_dates=["Date"], index_col="Date")
    
    # Ensure column names are correct
    df = df.rename(columns={"Unnamed: 1":"WTI_Price", "Unnamed: 2":"DXY"})
    df.columns = [col.strip() for col in df.columns]

    df.fillna(method="ffill", inplace=True)
    df = df.asfreq('B')  # Ensure business day frequency
    df.dropna(inplace=True)
    
    return df

def train_arima_model(df):
    df = df.asfreq('B')  # Ensure business day frequency
    
    y = df["WTI_Price"].dropna()  # Target variable (oil price)
    
    # Infer frequency instead of setting it manually to avoid ValueError
    y.index = pd.DatetimeIndex(y.index).to_period('B')
    y_diff = y.diff().dropna()
    
    model = auto_arima(y_diff, seasonal=False, stepwise=True, trace=True, suppress_warnings=True)
    best_order = model.order
    print(f"Best ARIMA Order: {best_order}")
    
    # Fit ARIMA Model
    arima_model = ARIMA(y, order=best_order).fit()
    arima_model.save(os.path.join(MODEL_OUTPUT_DIR, "arima_model.pkl"))
    
    return arima_model

def forecast_prices(model, df, steps=30):
    # Forecast future prices
    forecast = model.forecast(steps=steps)
    forecast_dates = pd.date_range(df.index[-1], periods=steps+1, freq='B')[1:]
    forecast_df = pd.DataFrame({'Date': forecast_dates, 'Forecasted_Price': forecast})
    forecast_df.set_index("Date", inplace=True)
    forecast_df.to_csv(os.path.join(MODEL_OUTPUT_DIR, "forecast_prices.csv"))
    return forecast_df


In [210]:
print("Preprocessing data...")
processed_data = preprocess_data()

Preprocessing data...


In [211]:
print("Training ARIMA model...")
arima_model = train_arima_model(processed_data)

Training ARIMA model...
Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0]             : AIC=24414.494, Time=0.55 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=24553.519, Time=0.02 sec
 ARIMA(1,0,0)(0,0,0)[0]             : AIC=24429.467, Time=0.03 sec
 ARIMA(0,0,1)(0,0,0)[0]             : AIC=24417.869, Time=0.07 sec
 ARIMA(1,0,2)(0,0,0)[0]             : AIC=24414.813, Time=0.45 sec
 ARIMA(2,0,1)(0,0,0)[0]             : AIC=24414.812, Time=0.33 sec
 ARIMA(3,0,2)(0,0,0)[0]             : AIC=24415.519, Time=0.68 sec
 ARIMA(2,0,3)(0,0,0)[0]             : AIC=24415.245, Time=0.65 sec
 ARIMA(1,0,1)(0,0,0)[0]             : AIC=24412.846, Time=0.15 sec
 ARIMA(0,0,2)(0,0,0)[0]             : AIC=24413.401, Time=0.12 sec
 ARIMA(2,0,0)(0,0,0)[0]             : AIC=24416.028, Time=0.10 sec
 ARIMA(1,0,1)(0,0,0)[0] intercept   : AIC=24414.722, Time=0.31 sec

Best model:  ARIMA(1,0,1)(0,0,0)[0]          
Total fit time: 3.466 seconds
Best ARIMA Order: (1, 0, 1)


In [212]:
print("Forecasting future prices...")
forecast_data = forecast_prices(arima_model, processed_data)
print(forecast_data.head())


Forecasting future prices...
            Forecasted_Price
Date                        
2025-01-01         71.578132
2025-01-02         71.564552
2025-01-03         71.550998
2025-01-06         71.537470
2025-01-07         71.523969
