In [None]:
%cd /app

import os
import pandas as pd
import datetime
import numpy as np
from pandas import IndexSlice as idx

In [None]:
# Read one ICE COT history CSV per year (2011 through 2025) and stack them.
ice_yearly_frames = []
for year in range(2011, 2026):
    print(year)  # progress indicator while the files load
    ice_yearly_frames.append(pd.read_csv(f"/app/data/ice_cot_data/COTHist{year}.csv"))

# Each file keeps its own default row labels, so the stacked index is not unique.
ice_cot_df = pd.concat(ice_yearly_frames)

In [None]:
def ltrim(n):
    """Strip a limited number of leading zeros from ``n``.

    A leading ``'00'`` pair is removed first (if present); afterwards a single
    further leading ``'0'`` is removed (if present). At most three leading
    zeros are therefore stripped; any zeros beyond that are kept.

    Args:
        n: The string to trim.

    Returns:
        The trimmed string.
    """
    without_pair = n[2:] if n[:2] == '00' else n
    return without_pair[1:] if without_pair[:1] == '0' else without_pair

# Read the "XLS" sheet of each yearly CFTC workbook (f_15.xls through f_24.xls)
# and stack the sheets into a single frame.
dfs = []
for year_suffix in range(15, 25):
    print(year_suffix)  # progress indicator while the workbooks load
    workbook_path = f"data/cftc_data/f_{year_suffix}.xls"
    dfs.append(pd.read_excel(workbook_path, sheet_name="XLS"))
cftc_cot_df = pd.concat(dfs)

In [None]:
# Keep the futures-only rows for ICE commodity codes 'G' and 'B', restricted to
# the identifying columns plus the managed-money position and trader counts.
is_futures_only = ice_cot_df['FutOnly_or_Combined'] == "FutOnly"
is_selected_code = ice_cot_df['CFTC_Commodity_Code'].isin(['G', 'B'])
filters = is_futures_only & is_selected_code

selected_columns = [
    'Market_and_Exchange_Names',
    'As_of_Date_Form_MM/DD/YYYY',
    'CFTC_Commodity_Code',
    "M_Money_Positions_Long_All",
    "M_Money_Positions_Short_All",
    "M_Money_Positions_Spread_All",
    'Traders_M_Money_Short_All',
    'Traders_M_Money_Long_All',
    'Traders_M_Money_Spread_All',
]

# One .loc call selects rows and columns together; .copy() detaches the subset
# from the raw frame so later column assignments do not touch it.
ice_cot_sub_df = ice_cot_df.loc[filters, selected_columns].copy()

In [None]:
## CFTC contract market codes selected below
# WTI: 067411
# RBOB: 111659
# Gasoil: (no CFTC code listed)
# Heating Oil: 022651
# Brent (financial NYMEX): 06765J

cftc_codes = ['067411', '111659', '022651', '06765J']
is_futures_only = cftc_cot_df['FutOnly_or_Combined'] == "FutOnly"
is_selected_code = cftc_cot_df['CFTC_Contract_Market_Code'].isin(cftc_codes)
filters = is_futures_only & is_selected_code

# Note the position columns end in "_ALL" here while the trader columns end in "_All".
selected_columns = [
    'Market_and_Exchange_Names',
    'Report_Date_as_MM_DD_YYYY',
    'CFTC_Contract_Market_Code',
    "M_Money_Positions_Long_ALL",
    "M_Money_Positions_Short_ALL",
    "M_Money_Positions_Spread_ALL",
    'Traders_M_Money_Short_All',
    'Traders_M_Money_Long_All',
    'Traders_M_Money_Spread_All',
]

# Row and column selection in one step; .copy() gives an independent frame.
cftc_cot_sub_df = cftc_cot_df.loc[filters, selected_columns].copy()

In [None]:
# Bring both subsets onto one schema: lower-case headers, shared column names
# and ISO-formatted (YYYY-MM-DD) report-date strings.
#
# The headers are lower-cased FIRST and every later step uses the lower-case
# names. This keeps the cell re-runnable: once the headers have been
# lower-cased, a lookup by the original mixed-case name would raise KeyError.
for df in [ice_cot_sub_df, cftc_cot_sub_df]:
    df.columns = df.columns.str.lower()

# Align the ICE column names with the CFTC ones (a no-op when already renamed).
ice_cot_sub_df.rename(
    columns={
        "as_of_date_form_mm/dd/yyyy": "report_date_as_mm_dd_yyyy",
        "cftc_commodity_code": "cftc_contract_market_code",
    },
    inplace=True,
)

for df in [ice_cot_sub_df, cftc_cot_sub_df]:
    report_dates = pd.to_datetime(df['report_date_as_mm_dd_yyyy'])
    df['report_date_as_mm_dd_yyyy'] = report_dates.dt.strftime('%Y-%m-%d')

# Exchange/contract code -> short product name.
# NOTE(review): '06765J' (Brent financial, NYMEX) is selected by the CFTC filter
# but has no entry here, so mapping codes through this dict leaves those rows
# with a missing product name -- confirm whether that is intended.
rename = {
    'B': 'brent',
    'G': 'gasoil',
    '022651': 'heating_oil',
    '067411': 'wti',
    '111659': 'rbob'
}


In [None]:
# Stack the ICE and CFTC subsets, translate the contract codes to product
# names, turn the report date into datetime.date objects and shorten the
# key column names.
combined_cot_df = (
    pd.concat([ice_cot_sub_df, cftc_cot_sub_df])
    .assign(
        # Strip surrounding whitespace before the lookup; codes that are not
        # keys of `rename` come out as NaN.
        cftc_contract_market_code=lambda frame: (
            frame['cftc_contract_market_code'].str.strip().map(rename)
        ),
        report_date_as_mm_dd_yyyy=lambda frame: (
            pd.to_datetime(frame['report_date_as_mm_dd_yyyy']).dt.date
        ),
    )
    .drop(columns=['market_and_exchange_names'])
    .rename(columns={'report_date_as_mm_dd_yyyy': 'report_date',
                     'cftc_contract_market_code': 'code'})
)
## create a MultiIndex with report_date and code (currently disabled)
#combined_cot_df.set_index(['code', 'report_date'], inplace=True)
#combined_cot_df.to_csv("data/mm_positioning.csv", index=True)

In [None]:
# Keep only reports dated on or after 1 January 2011. A plain boolean mask on
# the 'report_date' column is used, so the index is neither sliced nor reset.
first_report_date = datetime.date(2011, 1, 1)
is_recent_enough = combined_cot_df['report_date'] >= first_report_date
combined_cot_df = combined_cot_df.loc[is_recent_enough].copy()

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns standardised separately within each product code.
numeric_columns = ['m_money_positions_long_all', 'm_money_positions_short_all',
                  'm_money_positions_spread_all', 'traders_m_money_short_all',
                  'traders_m_money_long_all', 'traders_m_money_spread_all']

# Make the cell safe to re-run: if combined_cot_df already carries the
# *_scaled columns from an earlier execution, the merge at the bottom would
# generate clashing column names. Start from the frame without them
# (errors='ignore' makes this a no-op on the first run).
scaled_column_names = [f'{column}_scaled' for column in numeric_columns]
unscaled_cot_df = combined_cot_df.drop(columns=scaled_column_names, errors='ignore')

# One scaled frame per product code.
scaled_dfs = []

# groupby skips rows whose 'code' is missing, so those rows get no scaled
# values and are removed by the inner merge below.
for name, group in unscaled_cot_df.groupby('code'):
    # A fresh scaler per code, so mean and variance are estimated per product.
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(group[numeric_columns])

    # Reuse the group's row labels so the key columns can be attached directly.
    scaled_df = pd.DataFrame(scaled_data, columns=numeric_columns, index=group.index)

    # Add back the merge keys.
    scaled_df['code'] = name
    scaled_df['report_date'] = group['report_date']

    scaled_dfs.append(scaled_df)

# Combine all scaled dataframes.
scaled_combined_cot_df = pd.concat(scaled_dfs)

# Attach the scaled values next to the raw ones: the raw columns keep their
# names, the scaled copies get the '_scaled' suffix.
# NOTE(review): the join assumes each (code, report_date) pair occurs once;
# repeated pairs would be multiplied by the merge -- confirm against the data.
combined_cot_df = unscaled_cot_df.merge(
    scaled_combined_cot_df,
    on=['code', 'report_date'],
    suffixes=('', '_scaled'),
)

In [None]:
def special_multiplication(arr1, arr2):
    """Sign-aware element-wise product of two arrays.

    For each position:
      * both values strictly positive -> their product (positive),
      * both values strictly negative -> the negated product (negative),
      * anything else (opposite signs, a zero, or NaN) -> 0.

    The result uses the dtype of ``arr1 * arr2``, so mixing an integer array
    with a float array no longer truncates the products to integers.

    Args:
        arr1: Array-like of numbers. Converted with ``np.asarray``, so elements
            are matched by position.
        arr2: Array-like of numbers, broadcast-compatible with ``arr1``.

    Returns:
        numpy.ndarray holding the sign-aware products.

    Raises:
        ValueError: If the two inputs cannot be broadcast against each other.
    """
    left = np.asarray(arr1)
    right = np.asarray(arr2)
    product = left * right

    both_positive = (left > 0) & (right > 0)
    both_negative = (left < 0) & (right < 0)

    # zeros_like(product) keeps the fallback branch in the product's dtype.
    return np.where(
        both_positive,
        product,
        np.where(both_negative, -product, np.zeros_like(product)),
    )

# Example usage
arr1 = np.array([1, -2, 3, -4, 5])
arr2 = np.array([-1, -2, 3, 4, -5])
print(special_multiplication(arr1, arr2))

In [None]:
# For each side, the "extension" score is the SUM of the standardised position
# size and the standardised trader count.
# (An earlier variant, now disabled, combined the unscaled columns with
# special_multiplication instead.)
for side in ('short', 'long', 'spread'):
    scaled_positions = combined_cot_df[f'm_money_positions_{side}_all_scaled']
    scaled_traders = combined_cot_df[f'traders_m_money_{side}_all_scaled']
    combined_cot_df[f'm_money_{side}_extension'] = scaled_positions + scaled_traders

In [None]:
# Write the final table (raw, scaled and extension columns) to CSV without the row index.
output_csv_path = "data/mm_positioning.csv"
combined_cot_df.to_csv(output_csv_path, index=False)