In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Define correct column names
column_names = ["Date", "Open", "High", "Low", "Close", "Volume"]

# Load CSV, skipping first two rows, and setting column names
df = pd.read_csv("../data/bitcoin_prices.csv", skiprows=2, names=column_names)

# Convert "Date" to datetime format
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")

# Drop any rows where "Date" is missing
df.dropna(subset=["Date"], inplace=True)

# Set "Date" as the index
df.set_index("Date", inplace=True)

# Normalize the "Close" price
scaler = MinMaxScaler(feature_range=(0, 1))
df["Close_Scaled"] = scaler.fit_transform(df[["Close"]])

# Create a target column (shifted Close price for prediction)
df["Prediction"] = df["Close_Scaled"].shift(-30)

# Drop the last 30 rows since they have no target values
df.dropna(inplace=True)

# Save the cleaned & processed data
df.to_csv("../data/processed_bitcoin_prices.csv")

# Display final dataset
df.head()


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Close_Scaled,Prediction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-02,32127.267578,33155.117188,29091.181641,29376.455078,67865420765,0.150436,0.191803
2021-01-03,32782.023438,34608.558594,32052.316406,32129.408203,78665235202,0.180901,0.196436
2021-01-04,31971.914062,33440.21875,28722.755859,32810.949219,81163475344,0.188443,0.21832
2021-01-05,33992.429688,34437.589844,30221.1875,31977.041016,67547324782,0.179215,0.240058
2021-01-06,36824.363281,36879.699219,33514.035156,34013.613281,75289433811,0.201752,0.234042
