In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Define correct column names
column_names = ["Date", "Open", "High", "Low", "Close", "Volume"]

# Load CSV, skipping first two rows, and setting column names
df = pd.read_csv("../data/bitcoin_prices.csv", skiprows=2, names=column_names)

# Convert "Date" to datetime format
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")

# Drop any rows where "Date" is missing
df.dropna(subset=["Date"], inplace=True)

# Set "Date" as the index
df.set_index("Date", inplace=True)

# Normalize the "Close" price
scaler = MinMaxScaler(feature_range=(0, 1))
df["Close_Scaled"] = scaler.fit_transform(df[["Close"]])

# Create a target column (shifted Close price for prediction)
df["Prediction"] = df["Close_Scaled"].shift(-30)

# Drop the last 30 rows since they have no target values
df.dropna(inplace=True)

# Save the cleaned & processed data
df.to_csv("../data/processed_bitcoin_prices.csv")

# Display final dataset
df.head()


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Close_Scaled,Prediction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-02,6985.470215,7212.155273,6935.27002,7202.55127,20802083465,0.035173,0.069448
2020-01-03,7344.884277,7413.715332,6914.996094,6984.428711,28111481032,0.031686,0.070143
2020-01-04,7410.656738,7427.385742,7309.51416,7345.375488,18444271275,0.037456,0.069421
2020-01-05,7411.317383,7544.49707,7400.535645,7410.45166,19725074095,0.038497,0.068592
2020-01-06,7769.219238,7781.867188,7409.292969,7410.452148,23276261598,0.038497,0.066843
