# Data Preprocessing for Stock Price Prediction
### This notebook loads stock price data and performs cleaning, feature engineering, scaling, and sequence creation for LSTM model training.
### The processed data is saved for use in the LSTM training script.

## 1. Load and Inspect Data

### Load the stock price data from a CSV file and perform initial inspection and cleaning.

In [None]:
import pandas as pd
# Load the raw price history. `data` keeps the frame exactly as read from
# disk; `df` is an explicit, independent copy that the cleaning steps modify.
data = pd.read_csv("../data/Google.csv")
df = data.copy()
print(df.head())

# The timestamp column is expected under the label 'Unnamed: 0'; give it a
# real name, then parse it into datetimes.
df.rename(columns={'Unnamed: 0': 'timestamp'}, inplace=True)
print(df.isnull().sum())
df['timestamp'] = pd.to_datetime(df['timestamp'])

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
df.info()
df.set_index('timestamp', inplace=True)

# Keep only the first row for every repeated timestamp.
duplicate_mask = df.index.duplicated(keep='first')
if duplicate_mask.any():
    print(f"\nFound {duplicate_mask.sum()} duplicate timestamps. Removing duplicates...")
    df = df[~duplicate_mask]
    print("Duplicates removed. DataFrame head:")
else:
    print("\nNo duplicate timestamps found.")
# Printed once for both branches (the head was previously shown twice after
# a de-duplication).
print(df.head())


## 2. Create Dummy Data

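### For testing purposes, create a reproducible dummy dataset (fixed random seed). Note that the following cells rebuild `df` from this dummy data, replacing the data loaded above; in a real scenario, skip this section and use the actual loaded data instead.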

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
# Length of the synthetic series. Defined once so that the date index and
# every generated column are guaranteed to have the same number of rows.
n_periods = 200
date_range = pd.date_range(start='2016-01-01', periods=n_periods, freq='B')  # business days

# Fixed seed: the generated series is identical on every run.
np.random.seed(42)

# Each price column is the cumulative sum of uniform(30, 50) steps plus a
# starting offset. The columns are generated in this exact order so the
# random draws match the seed-42 sequence column for column.
price_offsets = {'open': 100, 'high': 101, 'low': 99, 'close': 100, 'adjclose': 100}
dummy_data = {
    column: np.random.uniform(30, 50, n_periods).cumsum() + offset
    for column, offset in price_offsets.items()
}
dummy_data['volume'] = np.random.randint(10_000_000, 100_000_000, n_periods)
dummy_data['ticker'] = ['GOOG'] * n_periods

### Adjust high, low relative to open/close for realism

In [None]:
# Take the row count from the generated data itself instead of repeating a
# literal, so this cell stays correct if the series length changes.
n_rows = len(dummy_data['open'])

# Force high above and low below both open and close by a random margin in
# [0.5, 2). The high margin is drawn before the low margin.
upper_bound = np.maximum(dummy_data['open'], dummy_data['close'])
lower_bound = np.minimum(dummy_data['open'], dummy_data['close'])
dummy_data['high'] = upper_bound + np.random.uniform(0.5, 2, n_rows)
dummy_data['low'] = lower_bound - np.random.uniform(0.5, 2, n_rows)

# Assemble the frame on the business-day index; this rebinds `df`.
df = pd.DataFrame(dummy_data, index=date_range)
df.index.name = 'timestamp'

# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would add a stray "None" line).
df.info()

## 3. Feature Engineering

### Add technical indicators like daily returns, moving averages, and volatility to enrich the dataset.

In [None]:
# Column that every indicator below is derived from.
target_column = 'close'
price = df[target_column]

# 1. Daily returns: percentage change from the previous row.
df['daily_return'] = price.pct_change()

# 2. Simple moving averages (SMA) over 20 and 50 rows.
df['SMA_20'] = price.rolling(window=20).mean()
df['SMA_50'] = price.rolling(window=50).mean()

# 3. Exponential moving averages (EMA), which weight recent rows more heavily.
df['EMA_20'] = price.ewm(span=20, adjust=False).mean()
df['EMA_50'] = price.ewm(span=50, adjust=False).mean()

# 4. Volatility: rolling standard deviation of the daily returns over 20 rows.
df['volatility_20'] = df['daily_return'].rolling(window=20).std()

# Drop every row that contains a NaN in any column. The rolling windows leave
# their warm-up rows empty (the 50-row SMA is undefined for the first 49
# rows), so at least those rows are removed.
df.dropna(inplace=True)

# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would add a stray "None" line).
df.info()

## 4. Visualize Data

### Plot the close price, moving averages, volume, and daily returns to understand the data.

In [None]:
# Close price with both simple moving averages drawn on one axes.
fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(df.index, df['close'], label='Close Price', color='blue')
ax.plot(df.index, df['SMA_20'], label='SMA 20', color='green', linestyle='--')
ax.plot(df.index, df['SMA_50'], label='SMA 50', color='red', linestyle='--')

# The ticker shown in the title is read from the first row of the frame.
ax.set_title(f'Historical {df["ticker"].iloc[0]} Close Price with Moving Averages')
ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Daily trading volume over time.
fig, ax = plt.subplots(figsize=(16, 6))
ax.plot(df.index, df['volume'], label='Volume', color='purple')

# The ticker shown in the title is read from the first row of the frame.
ax.set_title(f'{df["ticker"].iloc[0]} Daily Trading Volume')
ax.set_xlabel('Date')
ax.set_ylabel('Volume')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Daily returns over time, with a dashed zero line as a visual reference.
fig, ax = plt.subplots(figsize=(16, 6))
ax.plot(df.index, df['daily_return'], label='Daily Return', color='orange')

# The ticker shown in the title is read from the first row of the frame.
ax.set_title(f'{df["ticker"].iloc[0]} Daily Returns')
ax.set_xlabel('Date')
ax.set_ylabel('Return')
ax.axhline(0, color='gray', linestyle='--')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

## 5. Scale Features


### Scale numerical features to the range [0, 1] using MinMaxScaler.

In [None]:
# Every column except the ticker label is scaled.
features_to_scale = [name for name in df.columns if name != 'ticker']

# NOTE(review): the scaler is fitted on every row of `df`, including rows that
# end up in the test portion of the later chronological split, so the min/max
# statistics include test-period values. Consider fitting on the training
# rows only.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(df[features_to_scale])
scaled_data = scaler.transform(df[features_to_scale])

# Wrap the scaled array back into a frame with the original labels and index.
scaled_df = pd.DataFrame(scaled_data, index=df.index, columns=features_to_scale)

print("\nScaled DataFrame head (only showing first 5 rows and selected columns for brevity):")
print(scaled_df.head())
print("\nDescriptive statistics of scaled data:")
print(scaled_df.describe())

## 6. Create Sequences for LSTM



### Create sequences of past `timesteps` days to predict the next day's close price.

In [None]:
# Number of consecutive past rows that form one input sequence.
timesteps = 60
target_feature_index = scaled_df.columns.get_loc(target_column)

# Convert to a NumPy array once instead of slicing the DataFrame on every
# loop iteration.
scaled_values = scaled_df.to_numpy()
n_rows = len(scaled_values)

# Without at least one full window plus one target row no sequence can be
# built; fail with a clear message instead of producing empty arrays.
if n_rows <= timesteps:
    raise ValueError(
        f"Need more than {timesteps} rows to build sequences, got {n_rows}."
    )

# X[k] holds rows k .. k+timesteps-1 (all features); y[k] is the target
# column of the row immediately after that window.
X = np.stack([scaled_values[end - timesteps:end] for end in range(timesteps, n_rows)])
y = scaled_values[timesteps:, target_feature_index].copy()

print(f"\nShape of X (sequences): {X.shape}")
print(f"Shape of y (target values): {y.shape}")

## 7. Train-Test Split

### Split the sequences into training and testing sets chronologically.

In [None]:
# Chronological split: the first 80% of the sequences are used for training
# and the remainder for testing. Nothing is shuffled.
train_split_ratio = 0.8
train_size = int(len(X) * train_split_ratio)

train_rows = slice(None, train_size)
test_rows = slice(train_size, None)
X_train, y_train = X[train_rows], y[train_rows]
X_test, y_test = X[test_rows], y[test_rows]

# Report the resulting shapes, one per line.
print(
    f"\nShape of X_train: {X_train.shape}",
    f"Shape of y_train: {y_train.shape}",
    f"Shape of X_test: {X_test.shape}",
    f"Shape of y_test: {y_test.shape}",
    sep="\n",
)

print(
    "\n--- Preprocessing and Data Preparation Complete ---",
    "You now have X_train, y_train, X_test, y_test ready for LSTM model training.",
    "The 'scaler' object is also preserved for inverse transforming predictions later.",
    sep="\n",
)

## 8. Save Processed Data

### Save the processed data and scaler for use in the LSTM training script.

In [None]:
import joblib
# Write the four split arrays to .npy files in the current working directory.
array_files = {
    'X_train.npy': X_train,
    'X_test.npy': X_test,
    'y_train.npy': y_train,
    'y_test.npy': y_test,
}
for file_name, array in array_files.items():
    np.save(file_name, array)

# Persist the fitted scaler and the list of scaled column names with joblib.
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(features_to_scale, 'features_to_scale.pkl')
print("\nSaved X_train, X_test, y_train, y_test, scaler, and features_to_scale.")