## import statements, data imports

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
np.random.seed(0)

from sklearn.preprocessing import StandardScaler

In [None]:
# Build the raw-data path from its components so the notebook also runs on
# non-Windows systems (a hard-coded '\\' separator only resolves on Windows).
main_filename = os.path.join("..", "..", "data", "raw", "all_stocks_5yr.csv")
df = pd.read_csv(main_filename)
# Last expression of the cell: shows summary statistics of the loaded frame.
df.describe()

## remove nulls

In [None]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how="any")

## clean / prep cols


In [None]:
# Parse the 'date' column into pandas datetime values.
df = df.assign(date=pd.to_datetime(df["date"]))

In [None]:
# df = pd.get_dummies(df, columns=['Name']) # one hot encoding Name

## train test validation split


In [None]:
# Features are every column except the closing price; the closing price is the label.
X, y = df.drop(columns="close"), df["close"]

In [None]:
# Restrict the data to a single ticker and order it chronologically so the
# shift/rolling features below follow time order.
ticker_mask = df['Name'] == "AAPL"
aapl_count = ticker_mask.sum()
# A bare expression in the middle of a cell is silently discarded, so print
# the row count explicitly.
print(f"AAPL rows: {aapl_count}")
if aapl_count == 0:
    # Fail fast with a clear message instead of producing empty splits.
    raise ValueError("no rows found for ticker 'AAPL'")

df = df[ticker_mask].copy()
df = df.sort_values(by="date")

# lag features to help model memory: close of the previous 1, 2 and 3 rows
for lag in (1, 2, 3):
    df[f'close_lag_{lag}'] = df['close'].shift(lag)

# rolling mean / standard deviation of the close over 5- and 10-row windows
df['close_ma_5'] = df['close'].rolling(window=5).mean()
df['close_ma_10'] = df['close'].rolling(window=10).mean()
df['close_std_5'] = df['close'].rolling(window=5).std()

# prediction target: the close of the following row
df["target"] = df["close"].shift(-1)
# the shifts and rolling windows leave NaNs at the edges of the series; drop those rows
df = df.dropna()
df = df.drop(columns=["Name"])

# time-based train/val/test split (no shuffling): 60% / 20% / 20% in date order
n_rows = len(df)
split_1 = int(n_rows * 0.6)
split_2 = int(n_rows * 0.8)

train = df.iloc[:split_1]
val = df.iloc[split_1:split_2]
test = df.iloc[split_2:]

## feature select: find highly correlated features (candidates to drop)

In [None]:
# Absolute pairwise correlations between the float/int columns of the training split.
numeric_X = train.select_dtypes(include=[float, int])
corr_matrix = numeric_X.corr().abs()

# Render the matrix as an annotated heatmap.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Columns that are not model inputs: the raw close, the label itself and the date.
non_feature_cols = ["close", "target", "date"]

# Feature frame and label series for each split.
X_train, y_train = train.drop(columns=non_feature_cols), train["target"]
X_val, y_val = val.drop(columns=non_feature_cols), val["target"]
X_test, y_test = test.drop(columns=non_feature_cols), test["target"]

## data transform: scale data

In [None]:
# StandardScaler standardizes each feature by removing the mean and scaling to unit variance.
# The mean and standard deviation are learned from the training features only;
# the validation and test features are transformed with those same training statistics.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

## save data to local

In [None]:
# Absolute path of the data/processed folder located two directory levels up.
processed_path = os.path.abspath(
    os.path.join(os.pardir, os.pardir, "data", "processed")
)
# Create the folder (and any missing parents); no error if it already exists.
os.makedirs(processed_path, exist_ok=True)

In [None]:
# Wrap the scaled feature matrices in DataFrames, reusing the original column names.
X_train_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_val_df = pd.DataFrame(X_val_scaled, columns=X_val.columns)
X_test_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# Write one CSV per split, without the index column.
for file_name, frame in (
    ("X_train.csv", X_train_df),
    ("X_val.csv", X_val_df),
    ("X_test.csv", X_test_df),
):
    frame.to_csv(os.path.join(processed_path, file_name), index=False)

In [None]:
# Turn each label series into a single-column DataFrame named 'target'.
y_train_df = y_train.to_frame(name='target')
y_val_df = y_val.to_frame(name='target')
y_test_df = y_test.to_frame(name='target')

# Write one CSV per split, without the index column.
for file_name, frame in (
    ("y_train.csv", y_train_df),
    ("y_val.csv", y_val_df),
    ("y_test.csv", y_test_df),
):
    frame.to_csv(os.path.join(processed_path, file_name), index=False)