# Preprocessing

In [47]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yfinance as yf

%matplotlib inline


In [48]:
ticker = "AAPL"  # Any ticker symbol available on Yahoo Finance

# Download daily price data for the ticker. auto_adjust is passed explicitly
# because its default differs between yfinance releases; True matches the
# recorded output of this notebook (no separate "Adj Close" column).
df = yf.download(ticker, start="2017-01-01", end="2025-01-01", auto_adjust=True)

# A failed or empty download comes back as an empty frame rather than an
# exception, so stop here instead of letting later cells write empty CSVs.
if df.empty:
    raise ValueError(f"No price data downloaded for {ticker!r}")
df

[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2017-01-03,26.862425,26.904054,26.540955,26.781479,115127600
2017-01-04,26.832355,26.945681,26.769912,26.793039,84472400
2017-01-05,26.968815,27.026633,26.783795,26.809235,88774400
2017-01-06,27.269468,27.327287,26.936434,27.008128,127007600
2017-01-09,27.519241,27.621002,27.276405,27.278717,134247600
...,...,...,...,...,...
2024-12-24,257.916443,257.926411,255.009620,255.209412,23234700
2024-12-26,258.735504,259.814335,257.347047,257.906429,27237100
2024-12-27,255.309296,258.415896,252.782075,257.546826,42355300
2024-12-30,251.923019,253.221595,250.474615,251.952985,35557500


In [49]:
# yfinance returns two-level (Price, Ticker) column labels; keep only the
# price-field level. The guard makes the cell safe to re-run and safe for
# already-flat columns: indexing a plain string label with [0] would otherwise
# shorten every column name to its first character.
if isinstance(df.columns, pd.MultiIndex):
    df.columns = list(df.columns.get_level_values(0))

# Move the Date index into a regular column. Skipped when Date is already a
# column, because a second reset_index would add a spurious 'index' column.
if 'Date' not in df.columns:
    df = df.reset_index()
df

Unnamed: 0,Date,Close,High,Low,Open,Volume
0,2017-01-03,26.862425,26.904054,26.540955,26.781479,115127600
1,2017-01-04,26.832355,26.945681,26.769912,26.793039,84472400
2,2017-01-05,26.968815,27.026633,26.783795,26.809235,88774400
3,2017-01-06,27.269468,27.327287,26.936434,27.008128,127007600
4,2017-01-09,27.519241,27.621002,27.276405,27.278717,134247600
...,...,...,...,...,...,...
2007,2024-12-24,257.916443,257.926411,255.009620,255.209412,23234700
2008,2024-12-26,258.735504,259.814335,257.347047,257.906429,27237100
2009,2024-12-27,255.309296,258.415896,252.782075,257.546826,42355300
2010,2024-12-30,251.923019,253.221595,250.474615,251.952985,35557500


In [50]:
# Summarise column dtypes and non-null counts for the flattened frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2012 entries, 0 to 2011
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    2012 non-null   datetime64[ns]
 1   Close   2012 non-null   float64       
 2   High    2012 non-null   float64       
 3   Low     2012 non-null   float64       
 4   Open    2012 non-null   float64       
 5   Volume  2012 non-null   int64         
dtypes: datetime64[ns](1), float64(4), int64(1)
memory usage: 94.4 KB


# Clean Data

In [51]:
# Normalise the frame in one chained pass (each step returns a new frame, so
# the input is never modified in place):
#   1. parse Date as datetime
#   2. order rows chronologically
#   3. keep a single row per date
df = (
    df.assign(Date=pd.to_datetime(df['Date']))
    .sort_values('Date')
    .drop_duplicates(subset=['Date'])
)

# Per-column count of missing values
print(df.isna().sum())

Date      0
Close     0
High      0
Low       0
Open      0
Volume    0
dtype: int64


# Create Target Variable for Baseline Model

In [52]:
# Target is the next row's Close: shift(-1) moves every value up one row,
# which leaves NaN in the final row.
df = df.assign(Target=df['Close'].shift(-1))

# Drop Rows With Missing Targets
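The last row now has `NaN` in `Target` because there is no following trading day whose close could be shifted into it. Note that the `dropna` call below removes a row if any of its columns is missing, not only `Target`.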

In [53]:
# Remove every row that has a missing value in any column (defaults spelled out)
df = df.dropna(axis=0, how="any")

# Save Data without Splitting

In [54]:
# Snapshot the cleaned, target-labelled frame before it is split
full_df = df.copy()

# Create the output directory first: to_csv raises if the parent directory
# does not exist, which breaks a run on a fresh checkout.
processed_dir = Path('data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

# Save as CSV (the positional index is not written)
full_df.to_csv(processed_dir / f'{ticker}_processed_data_full.csv', index=False)

# Split 80% Train / 20% Test

In [55]:
# Positional 80/20 split with no shuffling: the first 80% of rows (as currently
# ordered in df) form the training set and the remaining rows the test set.
total_rows = df.shape[0]
split_index = int(0.8 * total_rows)

train_df, test_df = df.iloc[:split_index], df.iloc[split_index:]


# Export CSVs

In [56]:
# Create the output directory first: to_csv raises if the parent directory
# does not exist, which breaks a run on a fresh checkout.
processed_dir = Path('data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

# Write the train and test splits as CSV (the positional index is not written)
train_df.to_csv(processed_dir / f'{ticker}_processed_data_train.csv', index=False)
test_df.to_csv(processed_dir / f'{ticker}_processed_data_test.csv', index=False)