### Data Preprocessing
Here we prepare the dataset for our reinforcement learning algorithms by performing:
- Log transformation for volume
- Min-Max normalization of features
- Splitting the dataset into training (2010-2018) and testing (2019-2021) sets
- Sorting by date and ticker

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load dataset
file_path = 'trading.csv'  
df = pd.read_csv(file_path)

# Convert datadate to datetime (for sorting and splitting)
df['datadate'] = pd.to_datetime(df['datadate'], format='%Y%m%d')

# Log transformation for volume
# Supposedly standard for financial data b/c trading volume varies a lot (millions vs hundreds).
df['log_volume'] = np.log1p(df['volume'])

# Normalize features excluding date and ticker
# Creates consistent value ranges so model doesn't overfit to features just b/c they're more volatile.
features_to_scale = ['adjcp', 'open', 'high', 'low', 'macd', 'rsi', 'cci', 'adx', 'turbulence', 'log_volume']
scaler = MinMaxScaler()
df[features_to_scale] = scaler.fit_transform(df[features_to_scale])

# Sort by date and ticker symbol
df = df.sort_values(by=['datadate', 'tic'])

# Split dataset into training (2010-2018) and testing (2019-2021)
# Don't randomly split b/c time series data, supposed to to preserve order.
train_df = df[(df['datadate'] >= '2010-01-01') & (df['datadate'] < '2019-01-01')]
test_df = df[(df['datadate'] >= '2019-01-01') & (df['datadate'] < '2022-01-01')]

# Save datasets
train_df.to_csv('train_data.csv', index=False)
test_df.to_csv('test_data.csv', index=False)

df.head()

Unnamed: 0.1,Unnamed: 0,datadate,tic,adjcp,open,high,low,volume,macd,rsi,cci,adx,turbulence,log_volume
0,0,2009-01-02,AAPL,0.007375,0.005558,0.006572,0.005839,26641980.0,0.714382,1.0,0.572197,1.0,0.0,0.587188
1,1,2009-01-02,AXP,0.021509,0.019422,0.02091,0.019808,10955620.0,0.714382,1.0,0.572197,1.0,0.0,0.470422
2,2,2009-01-02,BA,0.079058,0.072733,0.078225,0.074436,7010171.0,0.714382,1.0,0.572197,1.0,0.0,0.411754
3,3,2009-01-02,CAT,0.082744,0.077375,0.081351,0.07876,7116726.0,0.714382,0.0,0.572197,1.0,0.0,0.413736
4,4,2009-01-02,CSCO,0.016247,0.01467,0.015363,0.01499,40977480.0,0.714382,1.0,0.572197,1.0,0.0,0.643759
