### Improved Model Development - Random Forest Model

### Import Libraries

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE, SMOTENC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler

### Load the Dataset

In [3]:
# Read the modeling dataset from a path relative to this notebook's directory.
df = pd.read_csv('../data/modeling_dataset.csv')

In [4]:
# Show column names, non-null counts and dtypes as loaded from the CSV.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15500 entries, 0 to 15499
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ticker           15500 non-null  object 
 1   date             15500 non-null  object 
 2   sector           15500 non-null  object 
 3   close            15500 non-null  float64
 4   sma_200          15500 non-null  float64
 5   ema_12           15500 non-null  float64
 6   volume_ratio     15500 non-null  float64
 7   volume_sma_20    15500 non-null  float64
 8   rsi_14           15500 non-null  float64
 9   macd_histogram   15500 non-null  float64
 10  price_to_sma_50  15500 non-null  float64
 11  momentum_10      15500 non-null  float64
 12  momentum_20      15500 non-null  float64
 13  volatility_20    15500 non-null  float64
 14  atr_14           15500 non-null  float64
 15  true_range       15500 non-null  float64
 16  trend_label      15500 non-null  object 
dtypes: float64(1

In [6]:
# Convert the `date` column from strings to pandas datetime so it can be
# compared and used for the chronological train/test split.
df['date'] = pd.to_datetime(df['date'])

In [7]:
# Re-check the schema to confirm `date` is now a datetime column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15500 entries, 0 to 15499
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   ticker           15500 non-null  object        
 1   date             15500 non-null  datetime64[ns]
 2   sector           15500 non-null  object        
 3   close            15500 non-null  float64       
 4   sma_200          15500 non-null  float64       
 5   ema_12           15500 non-null  float64       
 6   volume_ratio     15500 non-null  float64       
 7   volume_sma_20    15500 non-null  float64       
 8   rsi_14           15500 non-null  float64       
 9   macd_histogram   15500 non-null  float64       
 10  price_to_sma_50  15500 non-null  float64       
 11  momentum_10      15500 non-null  float64       
 12  momentum_20      15500 non-null  float64       
 13  volatility_20    15500 non-null  float64       
 14  atr_14           15500 non-null  float

In [8]:
# Preview the first five rows.
df.head()

Unnamed: 0,ticker,date,sector,close,sma_200,ema_12,volume_ratio,volume_sma_20,rsi_14,macd_histogram,price_to_sma_50,momentum_10,momentum_20,volatility_20,atr_14,true_range,trend_label
0,STK001,2021-01-04,Technology,160.11,160.11,160.11,1.0,962644.0,100.0,0.0,0.0,-0.008557,-0.045594,0.012463,2.88,2.88,Uptrend
1,STK001,2021-01-05,Technology,162.36,161.235,160.456154,1.153842,1137664.5,100.0,0.14359,0.006977,-0.008557,-0.045594,0.012463,4.135,5.39,Sideways
2,STK001,2021-01-06,Technology,161.78,161.416667,160.659822,1.167277,1241502.0,79.5053,0.18872,0.002251,-0.008557,-0.045594,0.012463,3.28,1.57,Sideways
3,STK001,2021-01-07,Technology,167.07,162.83,161.646004,1.16732,1314834.75,92.857143,0.543952,0.026039,-0.008557,-0.045594,0.018138,3.99,6.12,Downtrend
4,STK001,2021-01-08,Technology,165.68,163.4,162.266618,0.694431,1221520.0,79.284963,0.647386,0.013953,-0.008557,-0.045594,0.018663,4.008,4.08,Downtrend


### Define Target and Feature Variables

In [12]:
tar_col = 'trend_label'

# Every column except the identifiers (`ticker`, `date`) and the target
# itself is treated as a model feature; column order follows `df`.
excluded_cols = {'ticker', 'date', tar_col}
feat_cols = [name for name in df.columns if name not in excluded_cols]

print("Number of features:", len(feat_cols))
print("Sample features:", feat_cols[:10])

Number of features: 14
Sample features: ['sector', 'close', 'sma_200', 'ema_12', 'volume_ratio', 'volume_sma_20', 'rsi_14', 'macd_histogram', 'price_to_sma_50', 'momentum_10']


### Train-Test Splitting

In [None]:
# Chronological split: rows dated on or before the 70th-percentile date form
# the training set and all later rows form the test set. Each side is copied
# so later column assignments do not write back into `df`.
split_date = df['date'].quantile(0.7)
train_df = df.loc[df['date'].le(split_date)].copy()
test_df = df.loc[df['date'].gt(split_date)].copy()

print('Training Data Shape:', train_df.shape)
print('Testing Data Shape:', test_df.shape)
print('Split Date Threshold:', split_date)

Training Data Shape: (10860, 17)
Testing Data Shape: (4640, 17)
Split Date Threshold: 2023-02-01 00:00:00


### Train-Test Split Distribution

In [27]:
# First and last date covered by each split, reported with the row counts.
train_start, train_end = train_df['date'].agg(['min', 'max'])
test_start, test_end = test_df['date'].agg(['min', 'max'])

print(f"Train period: {train_start.date()} - {train_end.date()}  ({len(train_df)} records)")
print(f"Test period:  {test_start.date()} - {test_end.date()}  ({len(test_df)} records)")

# Relative frequency of each trend label, train first and then test.
for heading, split_frame in (
    ("\nTrain class distribution:", train_df),
    ("\nTest class distribution:", test_df),
):
    print(heading)
    print(split_frame['trend_label'].value_counts(normalize=True).round(3))


Train period: 2021-01-04 - 2023-02-01  (10860 records)
Test period:  2023-02-02 - 2023-12-22  (4640 records)

Train class distribution:
trend_label
2    0.352
0    0.338
1    0.310
Name: proportion, dtype: float64

Test class distribution:
trend_label
2    0.368
1    0.340
0    0.292
Name: proportion, dtype: float64


### Imbalance ratio

In [None]:
# Imbalance ratio of the training split: size of the most frequent class
# divided by the size of the least frequent class (1.0 = perfectly balanced).
train_counts = train_df['trend_label'].value_counts()
train_majority, train_minority = train_counts.max(), train_counts.min()

imbalance_ratio = train_majority / train_minority
imbalance_ratio

1.1364041604754829

In [34]:
# Same majority/minority ratio for the test split.
# NOTE: this rebinds `imbalance_ratio`, replacing the training-split value.
test_counts = test_df['trend_label'].value_counts()
test_majority, test_minority = test_counts.max(), test_counts.min()

imbalance_ratio = test_majority / test_minority
imbalance_ratio

1.260709010339734

### Encode Categorical variables

In [17]:
# Encode the categorical feature (`sector`) and the target (`trend_label`) as
# integer codes. Each encoder is fitted on the training rows only and then
# applied to the test rows; `transform` raises ValueError if the test split
# contains a value that was not seen during fitting.
#
# The source strings are read from the untouched `df` rather than from
# `train_df` / `test_df`, which are boolean-mask copies of `df` and share its
# index. Reading from the split frames would make this cell non-idempotent:
# on a second run the encoders would be re-fitted on the integer codes, so
# `classes_` would hold 0..n-1 instead of the original names, breaking
# `inverse_transform` and any report that uses the class names.
s_encode = LabelEncoder()
train_df['sector'] = s_encode.fit_transform(df.loc[train_df.index, 'sector'])
test_df['sector'] = s_encode.transform(df.loc[test_df.index, 'sector'])

t_encode = LabelEncoder()
train_df['trend_label'] = t_encode.fit_transform(df.loc[train_df.index, 'trend_label'])
test_df['trend_label'] = t_encode.transform(df.loc[test_df.index, 'trend_label'])

### Scale Numerical features

In [22]:

# Numeric columns to scale. They are selected from `df`, where `sector` and
# `trend_label` are still object (string) columns, so neither the category
# codes nor the target end up in this list.
num_cols = df.select_dtypes(include='number').columns

In [26]:
# Fit the RobustScaler on the training split only, then apply that same
# fitted transform to both splits so no test-set statistics leak into training.
scaler = RobustScaler().fit(train_df[num_cols])
train_df[num_cols] = scaler.transform(train_df[num_cols])
test_df[num_cols] = scaler.transform(test_df[num_cols])

### Separate Features and Target

In [29]:
# Separate the feature matrix (X) from the target vector (y) for each split.
X_train, y_train = train_df[feat_cols], train_df[tar_col]
X_test, y_test = test_df[feat_cols], test_df[tar_col]

### Handle Imbalance using SMOTE

In [None]:
# Oversample the minority classes of the TRAINING split only; the test split
# is left untouched so evaluation reflects the real class distribution.
#
# `sector` holds integer category codes, not a continuous quantity. Plain
# SMOTE interpolates every column between neighbouring samples, which would
# create fractional, meaningless sector codes. SMOTENC interpolates only the
# continuous columns and gives each synthetic row the most frequent category
# among its nearest neighbours.
cat_feature_idx = [X_train.columns.get_loc('sector')]
sm = SMOTENC(categorical_features=cat_feature_idx, random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)