In [1]:
# Dependencies
import json
import requests
import pandas as pd
from scipy import stats
from api_keys import api_key_eia,api_key_av

## Preparing Montly Crude Oil WTI Spot Price Dataframe

In [54]:
# Read the original CSV file of crude oil WTI futures price (daily basis)
oil_daily = pd.read_csv("Resources/oil_futures_daily.csv").rename(columns = {"date" : "Date","value" : "oil_spot_price"})
oil_daily.head(11)

Unnamed: 0,Date,oil_spot_price
0,2023-09-25,89.68
1,2023-09-22,90.0
2,2023-09-21,89.56
3,2023-09-20,89.2
4,2023-09-19,91.16
5,2023-09-18,91.47
6,2023-09-15,90.83
7,2023-09-14,90.13
8,2023-09-13,88.59
9,2023-09-12,88.87


In [55]:
# Replace non-numeric values with NaN in the 'oil_spot_price' column
oil_daily['oil_spot_price'] = oil_daily['oil_spot_price'].apply(pd.to_numeric, errors='coerce')

# Drop rows with NaN values in the 'oil_spot_price' column
oil_daily.dropna(subset=['oil_spot_price'], inplace=True)

In [56]:
# Review the data frame
oil_daily.head(11)

Unnamed: 0,Date,oil_spot_price
0,2023-09-25,89.68
1,2023-09-22,90.0
2,2023-09-21,89.56
3,2023-09-20,89.2
4,2023-09-19,91.16
5,2023-09-18,91.47
6,2023-09-15,90.83
7,2023-09-14,90.13
8,2023-09-13,88.59
9,2023-09-12,88.87


In [60]:
# Check the data type
oil_daily.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9506 entries, 0 to 9842
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            9506 non-null   object 
 1   oil_spot_price  9506 non-null   float64
dtypes: float64(1), object(1)
memory usage: 222.8+ KB


In [61]:
# Convert the "date" column to datetime data type
oil_daily['Date'] = pd.to_datetime(oil_daily['Date'])

In [62]:
# Check the data type again
oil_daily.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9506 entries, 0 to 9842
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Date            9506 non-null   datetime64[ns]
 1   oil_spot_price  9506 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 222.8 KB


In [63]:
# Group by month and sum the monthly oil spot prices and then take its mean
result = oil_daily.groupby(oil_daily['Date'].dt.to_period("M"))['oil_spot_price'].mean()
result

Date
1986-01    22.925455
1986-02    15.454737
1986-03    12.612500
1986-04    12.843636
1986-05    15.377619
             ...    
2023-05    71.578182
2023-06    70.248095
2023-07    76.069500
2023-08    81.386087
2023-09    88.811250
Freq: M, Name: oil_spot_price, Length: 453, dtype: float64

In [65]:
# Reset the index to have "date" as a column
oil_montly_df = result.reset_index().rename(columns={'oil_spot_price':'montly_value'})
oil_montly_df.head()

Unnamed: 0,Date,montly_value
0,1986-01,22.925455
1,1986-02,15.454737
2,1986-03,12.6125
3,1986-04,12.843636
4,1986-05,15.377619


In [67]:
# Filter the results to include only dates from January 2020 to May 2023
f_oil_monthly_df = oil_montly_df[(oil_montly_df['Date'] >= '2020-01') &
                                 (oil_montly_df['Date'] <= '2023-05')].reset_index(drop=True)
f_oil_monthly_df.tail()

Unnamed: 0,Date,montly_value
36,2023-01,78.123
37,2023-02,76.832632
38,2023-03,73.277826
39,2023-04,79.446316
40,2023-05,71.578182


## Collecting crude oil supply/demand data from U.S. Energy Information Administration
---
### 1. Crude Oil Exporting Data

In [14]:
# Set up API query for Crude Oil exporting from USA
api_url = f"https://api.eia.gov/v2/petroleum/move/exp/data/?frequency=monthly&data[0]=value&facets[product][]=EP00&facets[product][]=EPC0&start=2020-01&end=2023-06&sort[0][column]=period&sort[0][direction]=desc&offset=0&length=5000&api_key={api_key_eia}"
r = requests.get(api_url)
export_data = r.json()

In [15]:
print(export_data)

{'response': {'total': 1064, 'dateFormat': 'YYYY-MM', 'frequency': 'monthly', 'data': [{'period': '2023-05', 'duoarea': 'R20-Z00', 'area-name': 'PADD 2', 'product': 'EP00', 'product-name': 'Crude Oil and Petroleum Products', 'process': 'EEX', 'process-name': 'Exports', 'series': 'MTTEXP21', 'series-description': 'Midwest (PADD 2) Exports of Crude Oil and Petroleum Products (Thousand Barrels)', 'value': 12547, 'units': 'MBBL'}, {'period': '2023-05', 'duoarea': 'R40-Z00', 'area-name': 'PADD 4', 'product': 'EP00', 'product-name': 'Crude Oil and Petroleum Products', 'process': 'EEX', 'process-name': 'Exports', 'series': 'MTTEXP42', 'series-description': 'Rocky Mountain (PADD 4) Exports of Crude Oil and Petroleum Products (Thousand Barrels per Day)', 'value': 4, 'units': 'MBBL/D'}, {'period': '2023-05', 'duoarea': 'R40-Z00', 'area-name': 'PADD 4', 'product': 'EP00', 'product-name': 'Crude Oil and Petroleum Products', 'process': 'EEX', 'process-name': 'Exports', 'series': 'MTTEXP41', 'series

In [16]:
# Convert the data into DataFrame
export_df = pd.DataFrame(export_data['response']['data'])
export_df.head()

Unnamed: 0,period,duoarea,area-name,product,product-name,process,process-name,series,series-description,value,units
0,2023-05,R20-Z00,PADD 2,EP00,Crude Oil and Petroleum Products,EEX,Exports,MTTEXP21,Midwest (PADD 2) Exports of Crude Oil and Petr...,12547,MBBL
1,2023-05,R40-Z00,PADD 4,EP00,Crude Oil and Petroleum Products,EEX,Exports,MTTEXP42,Rocky Mountain (PADD 4) Exports of Crude Oil a...,4,MBBL/D
2,2023-05,R40-Z00,PADD 4,EP00,Crude Oil and Petroleum Products,EEX,Exports,MTTEXP41,Rocky Mountain (PADD 4) Exports of Crude Oil a...,121,MBBL
3,2023-05,R20-Z00,PADD 2,EP00,Crude Oil and Petroleum Products,EEX,Exports,MTTEXP22,Midwest (PADD 2) Exports of Crude Oil and Petr...,405,MBBL/D
4,2023-05,R50-Z00,PADD 5,EP00,Crude Oil and Petroleum Products,EEX,Exports,MTTEXP51,West Coast (PADD 5) Exports of Crude Oil and P...,10039,MBBL


In [17]:
# Group by "period" column and calcuate the total amount of crude oil export
sum_export = export_df.groupby(export_df['period'])['value'].sum().reset_index()
sum_export.head()

Unnamed: 0,period,value
0,2020-01,512114
1,2020-02,499910
2,2020-03,536736
3,2020-04,437670
4,2020-05,430916


### 2. Crude Oil Importing Data

##### WTI product overview
- light,sweet crude oil
- low density
- low sulfur content
- used to make gasoline & diesel fuel

In [18]:
# Set up API query for Crude Oil importing to USA
api_url = f"https://api.eia.gov/v2/crude-oil-imports/data/?frequency=monthly&data[0]=quantity&facets[destinationId][]=US&start=2020-01&end=2023-06&sort[0][column]=period&sort[0][direction]=desc&offset=0&length=5000&api_key={api_key_eia}"
r = requests.get(api_url)
import_data = r.json()

In [19]:
# Convert the data into DataFrame
import_df = pd.DataFrame(import_data['response']['data'])
import_df.head()

Unnamed: 0,period,originId,originName,originType,originTypeName,destinationId,destinationName,destinationType,destinationTypeName,gradeId,gradeName,quantity,quantity-units
0,2023-06,OPN_N,Non-OPEC,OPN,OPEC/non-OPEC,US,United States,US,United States,HSO,Heavy Sour,105310,thousand barrels
1,2023-06,REG_OA,Other Americas,REG,Region,US,United States,US,United States,MED,Medium,18042,thousand barrels
2,2023-06,REG_OA,Other Americas,REG,Region,US,United States,US,United States,LSW,Light Sweet,192,thousand barrels
3,2023-06,REG_OA,Other Americas,REG,Region,US,United States,US,United States,LSO,Light Sour,1091,thousand barrels
4,2023-06,REG_OA,Other Americas,REG,Region,US,United States,US,United States,HSW,Heavy Sweet,841,thousand barrels


In [20]:
# Print out the unique value from column "gradeName"
import_df["gradeName"].unique()

array(['Heavy Sour', 'Medium', 'Light Sweet', 'Light Sour', 'Heavy Sweet'],
      dtype=object)

In [21]:
# Filter the dataframe to include "light sweet" grade only
filtered_import_df = import_df[import_df["gradeName"]== 'Light Sweet']
filtered_import_df.head()

Unnamed: 0,period,originId,originName,originType,originTypeName,destinationId,destinationName,destinationType,destinationTypeName,gradeId,gradeName,quantity,quantity-units
2,2023-06,REG_OA,Other Americas,REG,Region,US,United States,US,United States,LSW,Light Sweet,192,thousand barrels
9,2023-06,REG_EU,Europe,REG,Region,US,United States,US,United States,LSW,Light Sweet,1795,thousand barrels
10,2023-06,CTY_VM,Vietnam,CTY,Country,US,United States,US,United States,LSW,Light Sweet,285,thousand barrels
12,2023-06,CTY_UK,United Kingdom,CTY,Country,US,United States,US,United States,LSW,Light Sweet,1795,thousand barrels
17,2023-06,CTY_NI,Nigeria,CTY,Country,US,United States,US,United States,LSW,Light Sweet,4044,thousand barrels


In [22]:
# Group by "period" column and calcuate the total amount of crude oil export
sum_import = filtered_import_df.groupby(filtered_import_df['period'])['quantity'].sum().reset_index()
sum_import.tail()

Unnamed: 0,period,quantity
37,2023-02,39620
38,2023-03,40232
39,2023-04,47896
40,2023-05,46108
41,2023-06,48080


In [23]:
# Drop the last row to filter out the date range (from January 2020 to May 2023)
sum_import = sum_import.iloc[:-1]
sum_import.tail()

Unnamed: 0,period,quantity
36,2023-01,46496
37,2023-02,39620
38,2023-03,40232
39,2023-04,47896
40,2023-05,46108


### 3. Crude Oil Stock
- Due to limitation of original API, the following crude oil stock data is only include its primary delivery point(Cushing, Oklahoma/PADD2) for WTI crude oil futures and main US storage crude oil storage place (Gulf Coast/PADD3)

In [24]:
# Set up API query for Crude Oil Stock in Midwest USA
api_url = f"https://api.eia.gov/v2/petroleum/stoc/typ/data/?frequency=monthly&data[0]=value&facets[product][]=EP00&facets[product][]=EPC0&facets[series][]=MCESTP21&facets[series][]=MCESTP31&facets[series][]=MCRRSP21&facets[series][]=MCRRSP31&facets[series][]=MCRSFP21&facets[series][]=MCRSFP31&facets[series][]=MCRSLP21&facets[series][]=MCRSLP31&facets[series][]=MCRSSP31&facets[series][]=MCRSTP21&facets[series][]=MCRSTP31&facets[series][]=MTTRSP21&facets[series][]=MTTRSP31&facets[series][]=MTTSTP21&facets[series][]=MTTSTP31&facets[series][]=M_EP00_SAS_R30_MBBL&facets[series][]=M_EP00_SKB_R20_MBBL&facets[series][]=M_EP00_SKB_R30_MBBL&facets[series][]=M_EP00_SKL_R20_MBBL&facets[series][]=M_EP00_SKL_R30_MBBL&facets[series][]=M_EP00_SKN_R20_MBBL&facets[series][]=M_EP00_SKN_R30_MBBL&facets[series][]=M_EP00_SKP_R20_MBBL&facets[series][]=M_EP00_SKP_R30_MBBL&facets[series][]=M_EP00_STT_R20_MBBL&facets[series][]=M_EP00_STT_R30_MBBL&facets[series][]=M_EPC0_SAXL_R20_MBBL&facets[series][]=M_EPC0_SAXL_R30_MBBL&start=2020-01&end=2023-06&sort[0][column]=period&sort[0][direction]=desc&offset=0&length=5000&api_key={api_key_eia}"
r = requests.get(api_url)
stock_data = r.json()

In [25]:
# Convert the data into DataFrame
stock_df = pd.DataFrame(stock_data['response']['data'])
stock_df.head(10)

Unnamed: 0,period,duoarea,area-name,product,product-name,process,process-name,series,series-description,value,units
0,2023-05,R20,PADD 2,EP00,Crude Oil and Petroleum Products,SKR,Stocks at Refineries,MTTRSP21,Midwest (PADD 2) Crude Oil and Petroleum Produ...,62245,MBBL
1,2023-05,R20,PADD 2,EP00,Crude Oil and Petroleum Products,STT,Stocks at Tank Farms,M_EP00_STT_R20_MBBL,Midwest (PADD 2) Stocks at Tank Farms of Crude...,112116,MBBL
2,2023-05,R30,PADD 3,EP00,Crude Oil and Petroleum Products,SKP,Stocks in Pipelines,M_EP00_SKP_R30_MBBL,Gulf Coast (PADD 3) Crude Oil and Petroleum Pr...,59534,MBBL
3,2023-05,R20,PADD 2,EP00,Crude Oil and Petroleum Products,SKB,Stocks at Bulk Terminals,M_EP00_SKB_R20_MBBL,Midwest (PADD 2) Crude Oil and Petroleum Produ...,82737,MBBL
4,2023-05,R20,PADD 2,EP00,Crude Oil and Petroleum Products,SKP,Stocks in Pipelines,M_EP00_SKP_R20_MBBL,Midwest (PADD 2) Crude Oil and Petroleum Produ...,35646,MBBL
5,2023-05,R30,PADD 3,EPC0,Crude Oil,SAX,Ending Stocks Excluding SPR,MCESTP31,Gulf Coast (PADD 3) Ending Stocks excluding SP...,249779,MBBL
6,2023-05,R20,PADD 2,EPC0,Crude Oil,STT,Stocks at Tank Farms,MCRSFP21,Midwest (PADD 2) Crude Oil Stocks at Tank Farm...,112116,MBBL
7,2023-05,R30,PADD 3,EP00,Crude Oil and Petroleum Products,STT,Stocks at Tank Farms,M_EP00_STT_R30_MBBL,Gulf Coast (PADD 3) Stocks at Tank Farms of Cr...,205433,MBBL
8,2023-05,R30,PADD 3,EPC0,Crude Oil,SAS,Ending Stocks SPR,MCRSSP31,Gulf Coast (PADD 3) Ending Stocks of Crude Oil...,354366,MBBL
9,2023-05,R20,PADD 2,EPC0,Crude Oil,SKR,Stocks at Refineries,MCRRSP21,Midwest (PADD 2) Crude Oil Stocks at Refinerie...,13104,MBBL


In [26]:
# Group by "period" column and calcuate the total amount of crude oil export
sum_stock = stock_df.groupby(stock_df['period'])['value'].sum().reset_index()
sum_stock.tail()

Unnamed: 0,period,value
36,2023-01,4467045
37,2023-02,4497009
38,2023-03,4441135
39,2023-04,4393584
40,2023-05,4389598


### 4. Crude Oil Production

In [27]:
# Set up API query for Crude Oil Production in USA
api_url = f"https://api.eia.gov/v2/petroleum/crd/crpdn/data/?frequency=monthly&data[0]=value&start=2020-01&end=2023-06&sort[0][column]=period&sort[0][direction]=desc&offset=0&length=5000&api_key={api_key_eia}"
r = requests.get(api_url)
production_data = r.json()

In [28]:
# Convert the data into DataFrame
production_df = pd.DataFrame(production_data['response']['data'])
production_df.head(10)

Unnamed: 0,period,duoarea,area-name,product,product-name,process,process-name,series,series-description,value,units
0,2023-05,SMT,USA-MT,EPC0,Crude Oil,FPF,Field Production,MCRFPMT1,Montana Field Production of Crude Oil (Thousan...,1965,MBBL
1,2023-05,SAR,USA-AR,EPC0,Crude Oil,FPF,Field Production,MCRFPAR2,Arkansas Field Production of Crude Oil (Thousa...,12,MBBL/D
2,2023-05,SUT,USA-UT,EPC0,Crude Oil,FPF,Field Production,MCRFPUT1,Utah Field Production of Crude Oil (Thousand B...,4785,MBBL
3,2023-05,R40,PADD 4,EPC0,Crude Oil,FPF,Field Production,MCRFPP41,Rocky Mountain (PADD 4) Field Production of Cr...,28943,MBBL
4,2023-05,R10,PADD 1,EPC0,Crude Oil,FPF,Field Production,MCRFPP11,East Coast (PADD 1) Field Production of Crude ...,2209,MBBL
5,2023-05,SMS,USA-MS,EPC0,Crude Oil,FPF,Field Production,MCRFPMS1,Mississippi Field Production of Crude Oil (Tho...,1072,MBBL
6,2023-05,SCA,CALIFORNIA,EPC0,Crude Oil,FPF,Field Production,MCRFPCA1,California Field Production of Crude Oil (Thou...,9624,MBBL
7,2023-05,SAK,USA-AK,EPCANS,ANS Crude Oil,FPF,Field Production,MANFPAK1,Alaska North Slope Crude Oil Production (Thous...,13069,MBBL
8,2023-05,STX,TEXAS,EPC0,Crude Oil,FPF,Field Production,MCRFPTX2,Texas Field Production of Crude Oil (Thousand ...,5500,MBBL/D
9,2023-05,SMT,USA-MT,EPC0,Crude Oil,FPF,Field Production,MCRFPMT2,Montana Field Production of Crude Oil (Thousan...,63,MBBL/D


In [29]:
# Group by "period" column and calcuate the total amount of crude oil export
sum_production = production_df.groupby(production_df['period'])['value'].sum().reset_index()
sum_production.tail()

Unnamed: 0,period,value
36,2023-01,818731
37,2023-02,739817
38,2023-03,831218
39,2023-04,797765
40,2023-05,826179


In [68]:
# Create a new dataframe for training and testing machine model
oil_model_df = f_oil_monthly_df.copy()
oil_model_df.head()

Unnamed: 0,Date,montly_value
0,2020-01,57.519048
1,2020-02,50.542632
2,2020-03,29.207727
3,2020-04,16.547619
4,2020-05,28.5625


## Collecting CPI & Federal Funds(Interest)Rate from Alpha Vantage API
---
- Note: Consumer Price Index for All Urban Consumers: All Items in U.S. City Average

In [31]:
# Query CPI data
url = f'https://www.alphavantage.co/query?function=CPI&interval=monthly&apikey={api_key_av}'
r = requests.get(url)
data = r.json()

In [32]:
print(data)

{'name': 'Consumer Price Index for all Urban Consumers', 'interval': 'monthly', 'unit': 'index 1982-1984=100', 'data': [{'date': '2023-08-01', 'value': '307.026'}, {'date': '2023-07-01', 'value': '305.691'}, {'date': '2023-06-01', 'value': '305.109'}, {'date': '2023-05-01', 'value': '304.127'}, {'date': '2023-04-01', 'value': '303.363'}, {'date': '2023-03-01', 'value': '301.836'}, {'date': '2023-02-01', 'value': '300.840'}, {'date': '2023-01-01', 'value': '299.170'}, {'date': '2022-12-01', 'value': '296.797'}, {'date': '2022-11-01', 'value': '297.711'}, {'date': '2022-10-01', 'value': '298.012'}, {'date': '2022-09-01', 'value': '296.808'}, {'date': '2022-08-01', 'value': '296.171'}, {'date': '2022-07-01', 'value': '296.276'}, {'date': '2022-06-01', 'value': '296.311'}, {'date': '2022-05-01', 'value': '292.296'}, {'date': '2022-04-01', 'value': '289.109'}, {'date': '2022-03-01', 'value': '287.504'}, {'date': '2022-02-01', 'value': '283.716'}, {'date': '2022-01-01', 'value': '281.148'}, 

In [33]:
# Convert the data into DataFrame
cpi_montly_df = pd.DataFrame(data['data']).rename(columns = {"value" : "cpi"})
cpi_montly_df.head()

Unnamed: 0,date,cpi
0,2023-08-01,307.026
1,2023-07-01,305.691
2,2023-06-01,305.109
3,2023-05-01,304.127
4,2023-04-01,303.363


In [34]:
# Filter the data by date range (from 2020 Jan - 2023 May)
f_cpi_montly_df = cpi_montly_df[(cpi_montly_df['date'] >= '2020-01-01') &
                                (cpi_montly_df['date'] < '2023-06-01')]

# # Sort the dataframe by column "date" in ascending order
f_cpi_montly_df = f_cpi_montly_df.sort_values(by='date', ascending=True).reset_index(drop=True)
f_cpi_montly_df.head()

Unnamed: 0,date,cpi
0,2020-01-01,257.971
1,2020-02-01,258.678
2,2020-03-01,258.115
3,2020-04-01,256.389
4,2020-05-01,256.394


In [35]:
# Query Federal Fund Interest Rate data
url = f'https://www.alphavantage.co/query?function=FEDERAL_FUNDS_RATE&interval=monthly&apikey={api_key_av}'
r = requests.get(url)
interest_data = r.json()

In [36]:
print(interest_data)

{'name': 'Effective Federal Funds Rate', 'interval': 'monthly', 'unit': 'percent', 'data': [{'date': '2023-08-01', 'value': '5.33'}, {'date': '2023-07-01', 'value': '5.12'}, {'date': '2023-06-01', 'value': '5.08'}, {'date': '2023-05-01', 'value': '5.06'}, {'date': '2023-04-01', 'value': '4.83'}, {'date': '2023-03-01', 'value': '4.65'}, {'date': '2023-02-01', 'value': '4.57'}, {'date': '2023-01-01', 'value': '4.33'}, {'date': '2022-12-01', 'value': '4.10'}, {'date': '2022-11-01', 'value': '3.78'}, {'date': '2022-10-01', 'value': '3.08'}, {'date': '2022-09-01', 'value': '2.56'}, {'date': '2022-08-01', 'value': '2.33'}, {'date': '2022-07-01', 'value': '1.68'}, {'date': '2022-06-01', 'value': '1.21'}, {'date': '2022-05-01', 'value': '0.77'}, {'date': '2022-04-01', 'value': '0.33'}, {'date': '2022-03-01', 'value': '0.20'}, {'date': '2022-02-01', 'value': '0.08'}, {'date': '2022-01-01', 'value': '0.08'}, {'date': '2021-12-01', 'value': '0.08'}, {'date': '2021-11-01', 'value': '0.08'}, {'date

In [37]:
# Convert the data into DataFrame
interest_montly_df = pd.DataFrame(interest_data['data']).rename(columns = {"value" : "interest"})
interest_montly_df.head()

Unnamed: 0,date,interest
0,2023-08-01,5.33
1,2023-07-01,5.12
2,2023-06-01,5.08
3,2023-05-01,5.06
4,2023-04-01,4.83


In [38]:
# Filter the data by date range (from 2020 Jan - 2023 May)
f_interest_montly_df = interest_montly_df[(interest_montly_df['date'] >= '2020-01-01') &
                                          (interest_montly_df['date'] < '2023-06-01')]

# # Sort the dataframe by column "date" in ascending order
f_interest_montly_df = f_interest_montly_df.sort_values(by='date', ascending=True).reset_index(drop=True)
f_interest_montly_df.head()

Unnamed: 0,date,interest
0,2020-01-01,1.55
1,2020-02-01,1.58
2,2020-03-01,0.65
3,2020-04-01,0.05
4,2020-05-01,0.05


## Read all other features from CSV files

In [39]:
# Read the original CSV file
natural_gas_df = pd.read_csv("Resources/Henry_Hub_Natural_Gas_Spot_Price_montly.csv") # Henry Hub Natural Gas Spot Price(montly)
brent_df = pd.read_csv("Resources/brent_monthly.csv") # crude oil brent spot price (montly)
vehicles_df = pd.read_csv("Resources/total_vehicle_ussales.csv") # total US vehicle sales (montly)
income_df = pd.read_csv("Resources/us_personal_income.csv") # us personal income (montly)
export_index_df = pd.read_csv("Resources/export_price_index_petro_coal.csv") # Export Price Index (NAICS): Petroleum and Coal Products Manufacturing
import_index_df = pd.read_csv("Resources/import_price_index_petro_coal.csv") # Import Price Index (NAICS): Petroleum and Coal Products Manufacturing

In [40]:
# Review natural gas dataframe
natural_gas_df.head()

Unnamed: 0,Month,Henry Hub Natural Gas Spot Price Dollars per Million Btu
0,Aug-23,2.58
1,Jul-23,2.55
2,Jun-23,2.18
3,May-23,2.15
4,Apr-23,2.16


In [41]:
# Filter the data by date range (from 2020 Jan - 2023 May)
f_natural_gas_df = natural_gas_df.iloc[3:44]

# Sort the dataframe by column "date" in ascending order
f_natural_gas_df = f_natural_gas_df.sort_index(ascending=False).reset_index(drop=True)
f_natural_gas_df.tail()

Unnamed: 0,Month,Henry Hub Natural Gas Spot Price Dollars per Million Btu
36,Jan-23,3.27
37,Feb-23,2.38
38,Mar-23,2.31
39,Apr-23,2.16
40,May-23,2.15


In [42]:
# Review crude oil Brent dataframe
brent_df.head()

Unnamed: 0,date,value
0,2023-08-01,86.15
1,2023-07-01,80.11
2,2023-06-01,74.84
3,2023-05-01,75.47
4,2023-04-01,84.64


In [43]:
# Filter the data by date range (from 2020 Jan - 2023 May)
f_brent_df = brent_df[(brent_df['date'] >= '2020-01-01') & (brent_df['date'] < '2023-06-01')]

# Sort the dataframe by column "date" in ascending order
f_brent_df = f_brent_df.sort_values(by='date', ascending=True).reset_index(drop=True)
f_brent_df.tail()

Unnamed: 0,date,value
36,2023-01-01,82.5
37,2023-02-01,82.59
38,2023-03-01,78.43
39,2023-04-01,84.64
40,2023-05-01,75.47


In [44]:
# Review vehicles sales dataframe
vehicles_df.head()

Unnamed: 0,DATE,TOTALSA
0,2018-01-01,17.508
1,2018-02-01,17.461
2,2018-03-01,17.628
3,2018-04-01,17.651
4,2018-05-01,17.558


In [45]:
# Filter the data by date range (from 2020 Jan - 2023 May)
f_vehicles_df = vehicles_df[(vehicles_df['DATE'] >= '2020-01-01') & (vehicles_df['DATE'] < '2023-06-01')]

# Sort the dataframe by column "date" in ascending order
f_vehicles_df = f_vehicles_df.sort_values(by='DATE', ascending=True).reset_index(drop=True)
f_vehicles_df.tail()

Unnamed: 0,DATE,TOTALSA
36,2023-01-01,16.455
37,2023-02-01,15.438
38,2023-03-01,15.404
39,2023-04-01,16.587
40,2023-05-01,15.63


In [46]:
# Review income dataframe
income_df.head()

Unnamed: 0,DATE,PI
0,2018-01-01,17294.8
1,2018-02-01,17354.1
2,2018-03-01,17416.0
3,2018-04-01,17472.7
4,2018-05-01,17547.6


In [47]:
# Filter the data by date range (from 2020 Jan - 2023 May)
f_income_df = income_df[(income_df['DATE'] >= '2020-01-01') & (income_df['DATE'] < '2023-06-01')]

# Sort the dataframe by column "date" in ascending order
f_income_df = f_income_df.sort_values(by='DATE', ascending=True).reset_index(drop=True)
f_income_df.tail()

Unnamed: 0,DATE,PI
36,2023-01-01,22432.0
37,2023-02-01,22520.6
38,2023-03-01,22605.1
39,2023-04-01,22669.7
40,2023-05-01,22761.5


In [48]:
# Review Export Price Index (NAICS; Petroleum and Coal Products Manufacturing) Dataframe
export_index_df.head()

Unnamed: 0,DATE,IY324
0,2005-12-01,100.0
1,2006-01-01,105.9
2,2006-02-01,104.2
3,2006-03-01,108.5
4,2006-04-01,117.7


In [49]:
# Filter the data by date range (from 2020 Jan - 2023 May)
f_export_index_df = export_index_df[(export_index_df['DATE'] >= '2020-01-01') & (export_index_df['DATE'] < '2023-06-01')]

# Sort the dataframe by column "date" in ascending order
f_export_index_df = f_export_index_df.sort_values(by='DATE', ascending=True).reset_index(drop=True)
f_export_index_df.tail()

Unnamed: 0,DATE,IY324
36,2023-01-01,163.0
37,2023-02-01,163.1
38,2023-03-01,164.6
39,2023-04-01,161.6
40,2023-05-01,141.0


In [50]:
# Review Import Price Index (NAICS; Petroleum and Coal Products Manufacturing) Dataframe
import_index_df.head()

Unnamed: 0,DATE,IZ324
0,2005-12-01,100.0
1,2006-01-01,102.7
2,2006-02-01,102.5
3,2006-03-01,99.9
4,2006-04-01,112.5


In [51]:
# Filter the data by date range (from 2020 Jan - 2023 May)
f_import_index_df = import_index_df[(import_index_df['DATE'] >= '2020-01-01') & (import_index_df['DATE'] < '2023-06-01')]

# Sort the dataframe by column "date" in ascending order
f_import_index_df = f_import_index_df.sort_values(by='DATE', ascending=True).reset_index(drop=True)
f_import_index_df.tail()

Unnamed: 0,DATE,IZ324
36,2023-01-01,145.2
37,2023-02-01,145.1
38,2023-03-01,137.6
39,2023-04-01,142.0
40,2023-05-01,132.6


## Combine all features and target variable into one DataFrame

In [69]:
# Copy all features to new dataframe

oil_model_df['sum_export'] = sum_export['value']
oil_model_df['sum_import'] = sum_import['quantity']
oil_model_df['sum_stock'] = sum_stock['value']
oil_model_df['sum_production'] = sum_production['value']
oil_model_df['cpi'] = f_cpi_montly_df['cpi']
oil_model_df['interest'] = f_interest_montly_df['interest']

oil_model_df['gas_Henry'] = f_natural_gas_df['Henry Hub Natural Gas Spot Price Dollars per Million Btu']
oil_model_df['crude_brent'] = f_brent_df['value']
oil_model_df['vehicle_sales'] = f_vehicles_df['TOTALSA']
oil_model_df['personal_income'] = f_income_df['PI']
oil_model_df['export_index'] = f_export_index_df['IY324']
oil_model_df['import_index'] = f_import_index_df['IZ324']

#Review Dataframe
oil_model_df.head()

Unnamed: 0,Date,montly_value,sum_export,sum_import,sum_stock,sum_production,cpi,interest,gas_Henry,crude_brent,vehicle_sales,personal_income,export_index,import_index
0,2020-01,57.519048,512114,33860,5471999,837842,257.971,1.55,2.02,63.65,17.314,19065.2,110.0,101.3
1,2020-02,50.542632,499910,25852,5494183,784968,258.678,1.58,1.91,55.66,17.061,19197.0,97.4,96.6
2,2020-03,29.207727,536736,17012,5609773,833917,258.115,0.65,1.79,32.01,11.68,18838.8,93.3,84.1
3,2020-04,16.547619,437670,12052,5881927,752800,256.389,0.05,1.74,18.38,8.923,21050.3,61.3,61.5
4,2020-05,28.5625,430916,26004,5920453,634626,256.394,0.05,1.75,29.38,12.328,20216.6,59.4,67.1


In [70]:
# Write the DataFrame to a CSV file
oil_model_df.to_csv("Outputs/oil_model.csv", index=False)