# Data Cleaning

We start with three datasets, one for each oil future. We want them to be in a combined dataset, with all the date indices, and we keep the columns representing the absolute price (Price), the percentage change in price (Change %), and the volume traded (Vol.)

These columns are renamed to Change_{Future}, Price_{Future}, and Vol{Future}, with Future representing the name of the oil future. 

In [8]:
import pandas as pd

# Load datasets
DATA_WTI = pd.read_csv("../Datasets/Futures_Oil_WTI.csv", header=0)
DATA_BRENT = pd.read_csv("../Datasets/Futures_Oil_Brent.csv", header=0)
DATA_DUBAI = pd.read_csv("../Datasets/Futures_Oil_Dubai.csv", header=0)

# Ensure the Date columns are parsed correctly
DATA_WTI['Date'] = pd.to_datetime(DATA_WTI['Date'], errors='coerce')
DATA_BRENT['Date'] = pd.to_datetime(DATA_BRENT['Date'], errors='coerce')
DATA_DUBAI['Date'] = pd.to_datetime(DATA_DUBAI['Date'], errors='coerce')

# Perform full outer join on the Date column
combined_data = (
    DATA_WTI[['Date', 'Change %', 'Price', 'Vol.']]  # Include Price and Vol for WTI
    .rename(columns={'Change %': 'Change_WTI', 'Price': 'Price_WTI', 'Vol.': 'Vol_WTI'})
    .merge(
        DATA_BRENT[['Date', 'Change %', 'Price', 'Vol.']]
        .rename(columns={'Change %': 'Change_Brent', 'Price': 'Price_Brent', 'Vol': 'Vol_Brent'}),
        on='Date',
        how='outer'
    )
    .merge(
        DATA_DUBAI[['Date', 'Change %', 'Price', 'Vol.']]
        .rename(columns={'Change %': 'Change_Dubai', 'Price': 'Price_Dubai', 'Vol': 'Vol_Dubai'}),
        on='Date',
        how='outer'
    )
)

# Remove '%' from Change % columns and convert to numeric
for column in ['Change_WTI', 'Change_Brent', 'Change_Dubai']:
    combined_data[column] = (
        combined_data[column]
        .str.replace('%', '', regex=False)  # Remove '%' symbol
        .str.strip()  # Remove leading/trailing spaces
        .astype(float)  # Convert to numeric
    )

# Sort by Date
combined_data = combined_data.sort_values(by='Date')

# Display the first few rows
display(combined_data)

# Save combined data to CSV if needed
combined_data.to_csv("../Datasets/Futures_Oil_Combined.csv", index=False)


  DATA_WTI['Date'] = pd.to_datetime(DATA_WTI['Date'], errors='coerce')
  DATA_BRENT['Date'] = pd.to_datetime(DATA_BRENT['Date'], errors='coerce')
  DATA_DUBAI['Date'] = pd.to_datetime(DATA_DUBAI['Date'], errors='coerce')


Unnamed: 0,Date,Change_WTI,Price_WTI,Vol_WTI,Change_Brent,Price_Brent,Vol._x,Change_Dubai,Price_Dubai,Vol._y
0,2000-01-02,2.10,28.22,76.41K,1.62,26.39,40.36K,,,
1,2000-01-03,4.40,31.77,73.33K,3.45,29.06,42.11K,,,
2,2000-01-05,0.51,25.87,29.44K,,,,,,
3,2000-01-06,3.90,30.14,67.23K,3.11,29.19,28.67K,,,
4,2000-01-08,1.31,27.79,62.86K,0.78,27.14,25.00K,,,
...,...,...,...,...,...,...,...,...,...,...
6773,2024-12-06,0.77,78.50,322.29K,0.75,82.15,260.33K,,,
6774,2024-12-07,-0.50,82.21,370.06K,-0.38,84.10,274.92K,,,
6775,2024-12-08,4.19,80.06,376.75K,3.12,81.38,245.73K,,,
6776,2024-12-09,2.47,68.97,351.98K,1.80,71.43,364.75K,,,
