In [38]:
# Imported utility functions
from dataprep_utils import *

# Real-Time Bitcoin Forecasting with DataPrep

**Author**: Dhanush Sambasivam


This notebook demonstrates the use of `Dataprep` for data cleaning, exploration, and visualization of real-time Bitcoin price data. It includes a time series analysis using ARIMA and Auto-ARIMA forecasting models.

- Based on: `dataprep.example.md`
- Reference APIs: CoinGecko, statsmodels, pmdarima
- Style: [Jupyter notebook guidelines](https://github.com/causify-ai/helpers/blob/master/docs/coding/all.jupyter_notebook.how_to_guide.md)


In [39]:

import logging
# Send INFO-and-above records to the notebook output, prefixed with time and level.
# NOTE(review): basicConfig() does nothing when the root logger already has handlers
# (e.g. configured by an earlier import) — confirm the INFO lines actually appear.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


## Load and Preview Data


In [40]:
# %load_ext autoreload
# %autoreload 2
# %matplotlib inline

import logging
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA


### Step 1: Load Data

In [None]:
import pandas as pd

# Inputs to merge and the file the merged result is written to.
source_files = [
    "bitcoin_real_time_data.csv",
    "bitcoin_real_time_data1.csv",
    "bitcoin_real_time_data2.csv",
]
output_file = "bitcoin_combined_data.csv"

# Step 1-2: Load every CSV and stack them into a single frame
df_combined = pd.concat([pd.read_csv(path) for path in source_files], ignore_index=True)

# Step 3: Parse timestamps and sort chronologically.
# errors='coerce' turns unparseable values into NaT; report how many so they are
# not lost silently (sort_values places NaT rows last).
df_combined['timestamp'] = pd.to_datetime(df_combined['timestamp'], errors='coerce')
n_unparsed = int(df_combined['timestamp'].isna().sum())
if n_unparsed:
    logging.warning("%d row(s) have a missing or unparseable timestamp.", n_unparsed)
df_combined = df_combined.sort_values(by='timestamp')

# Step 4: Reset index and add a fresh S.No.
# DataFrame.insert raises ValueError when the column already exists, so discard any
# serial-number column carried over from the input files first.
df_combined = df_combined.reset_index(drop=True).drop(columns='S.No', errors='ignore')
df_combined.insert(0, 'S.No', range(1, len(df_combined) + 1))

# Step 5: Save the combined data
df_combined.to_csv(output_file, index=False)

# Step 6: Inform the user
logging.info("Combined data saved successfully to '%s'.", output_file)



In [None]:
import pandas as pd

# Load the combined CSV (same file name the combining cell writes)
df = pd.read_csv('bitcoin_combined_data.csv')

# Preview the first rows
df.head()


In [None]:
df.info()


In [None]:
df.head()


## Clean and Preprocess Data


In [None]:
logging.info(df.columns.tolist())


In [None]:
df.describe(include='all')


In [None]:
df.describe(include='all')


### Step 2: Data Cleaning using Dataprep

In [None]:
pip install dataprep

In [None]:
from dataprep.clean import clean_headers
import pandas as pd

# Clean the column names, then discard every row that still contains a missing value
# (forward-filling would be the alternative to dropping).
df = clean_headers(df).dropna()

# Preview
df.head()



In [None]:
logging.info(df.columns.tolist())


## Exploratory Data Analysis (EDA)


### Step 3: EDA using Dataprep

In [None]:
from dataprep.eda import create_report

# Full interactive report
create_report(df)


In [None]:
from IPython.display import display
from dataprep.eda import plot

# Plot specific column (univariate).
# A notebook renders only the last expression of a cell automatically, so this
# intermediate result must be displayed explicitly or it is silently discarded.
display(plot(df, "price_usd"))

# Explore all features (last expression: rendered automatically)
plot(df)


### Univariate Analysis with dataprep.eda.plot()


In [None]:
from IPython.display import display
from dataprep.eda import plot

# Plot price distribution.
# Displayed explicitly: only the final expression of a cell is rendered on its own.
display(plot(df, "price_usd"))

# Optional: plot timestamp just to view frequency (not always useful for time series)
plot(df, "timestamp")


### Rolling Statistics & Trend Smoothing

In [None]:
import matplotlib.pyplot as plt

# Overlay a 5-observation moving average on the raw price series
price = df['price_usd']
smoothed = price.rolling(window=5).mean()

fig, ax = plt.subplots(figsize=(12, 5))
smoothed.plot(ax=ax, label='5-Point Rolling Mean')
price.plot(ax=ax, label='Original', alpha=0.5)
ax.set_title("Bitcoin Price with Rolling Mean")
ax.set_xlabel("Timestamp")
ax.set_ylabel("USD Price")
ax.legend()
ax.grid(True)
plt.show()


### Price Change Over Time (Percentage Change & Volatility)


In [None]:
# Percentage change between consecutive observations
# (the first row has no predecessor, so its value is NaN)
df['price_change_pct'] = df['price_usd'].pct_change() * 100

# Draw the series and decorate the axes pandas returns
ax = df['price_change_pct'].plot(figsize=(12,5), title="Percentage Change in Price")
ax.set_xlabel("Timestamp")
ax.set_ylabel("Change (%)")
ax.grid(True)
plt.show()


### Outlier Detection using IQR Method


In [None]:
# Inter-quartile range of the price and the 1.5 * IQR fences around it
price = df['price_usd']
Q1 = price.quantile(0.25)
Q3 = price.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Rows whose price falls outside either fence
too_low = price < lower_fence
too_high = price > upper_fence
outliers = df[too_low | too_high]

logging.info(f"Number of outliers: {len(outliers)}")

# Price line with the outlying observations marked in red
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(df.index, df['price_usd'], label='Price')
ax.scatter(outliers.index, outliers['price_usd'], color='red', label='Outliers')
ax.set_title("Bitcoin Price with Outliers Highlighted")
ax.set_xlabel("Timestamp")
ax.set_ylabel("USD Price")
ax.legend()
ax.grid(True)
plt.show()


### Histogram and KDE of Bitcoin Prices

In [None]:
!pip install seaborn
import seaborn as sns

plt.figure(figsize=(10, 5))
sns.histplot(df['price_usd'], bins=30, kde=True)
plt.title("Histogram & KDE of Bitcoin Prices")
plt.xlabel("Price (USD)")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()


## Time Series Forecasting


### Step 4: Time Series Analysis using ARIMA

In [None]:
!pip install statsmodels
!!pip install scikit-learn

In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error


In [None]:
# Use only price data
ts = df['price_usd'].copy()

# Index the series by its real timestamps.
# While 'timestamp' is still a column, df.index is just a row counter; converting
# that with pd.to_datetime would interpret the integers as nanoseconds since 1970.
if 'timestamp' in df.columns:
    ts.index = pd.DatetimeIndex(pd.to_datetime(df['timestamp']))
else:
    # 'timestamp' has already been moved into the index (cell re-run later on)
    ts.index = pd.to_datetime(df.index)
ts = ts.sort_index()

# Plot original time series
ts.plot(figsize=(12,5), title='Bitcoin Price Over Time')
plt.ylabel('Price (USD)')
plt.grid(True)
plt.show()


In [None]:
def check_stationarity(series, alpha=0.05):
    """Run an Augmented Dickey-Fuller test on ``series`` and log the verdict.

    Missing values are dropped before the test is run.

    Parameters
    ----------
    series : pandas.Series
        Series to test.
    alpha : float, default 0.05
        Significance level; the series is reported as stationary when the
        test's p-value is less than or equal to ``alpha``.

    Returns
    -------
    bool
        True when the series is judged stationary at level ``alpha``.
    """
    result = adfuller(series.dropna())
    adf_statistic, p_value = result[0], result[1]
    logging.info(f"ADF Statistic: {adf_statistic}")
    logging.info(f"p-value: {p_value}")

    is_stationary = bool(p_value <= alpha)
    if is_stationary:
        logging.info("✅ The series is stationary.")
    else:
        logging.info("❌ The series is NOT stationary. Differencing is needed.")
    return is_stationary

check_stationarity(ts)


In [None]:
# First difference the series
ts_diff = ts.diff().dropna()

# Re-check stationarity after differencing
check_stationarity(ts_diff)


In [None]:
ts_diff.plot(figsize=(12, 5), title="1st Order Differenced Bitcoin Price")
plt.grid(True)
plt.show()


In [None]:
# (p, d, q): five autoregressive lags, first-order differencing, no moving-average terms
arima_order = (5, 1, 0)
model = ARIMA(ts, order=arima_order)
model_fit = model.fit()

# Log the fit summary so AIC/BIC can be inspected
logging.info(model_fit.summary())


In [65]:
# Force timestamp index to be proper datetime.
# Guarded so the cell can be re-run: once 'timestamp' has been moved into the index
# it is no longer a column, and an unguarded second run raises KeyError.
if 'timestamp' in df.columns:
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df.set_index('timestamp')

# Redefine time series on the datetime index, in chronological order
ts = df['price_usd'].sort_index()


In [None]:
# Forecast horizon (number of future observations)
steps = 50

# Sampling interval = the most frequent gap between consecutive timestamps
sampling_gaps = df.index.to_series().diff()
freq = pd.to_timedelta(sampling_gaps.mode()[0])
last_timestamp = ts.index[-1]
future_dates = pd.date_range(start=last_timestamp + freq, periods=steps, freq=freq)

# Point forecast, re-labelled with the future timestamps built above
forecast = model_fit.forecast(steps=steps)
forecast.index = future_dates

# Last 100 observations followed by the forecast
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(ts[-100:], label='Historical (Last 100)')
ax.plot(forecast.index, forecast, label='Forecast', linestyle='--', color='red')
ax.set_title(f"Bitcoin Price Forecast using ARIMA(5,1,0) - {steps} Steps Ahead")
ax.set_xlabel("Timestamp")
ax.set_ylabel("USD Price")
ax.legend()
ax.grid(True)
plt.show()



We fitted a manually specified ARIMA(5,1,0) model to forecast Bitcoin prices 50 steps into the future. The model applies first-order differencing (d=1) internally to address non-stationarity and uses five autoregressive lags to capture short-term dynamics. The forecast, shown in red, extends from the latest observed prices and maintains a stable outlook, reflecting the recent market behavior. Confidence intervals are computed in the next cell to assess the prediction’s reliability.



In [None]:
# Step 1: Forecast object exposing both the mean prediction and its interval
forecast_obj = model_fit.get_forecast(steps=steps)
forecast_mean = forecast_obj.predicted_mean
conf_int = forecast_obj.conf_int()

# Step 2: Label both results with the future timestamps
forecast_mean.index = future_dates
conf_int.index = future_dates

# Step 3: History, forecast and shaded interval (first column = lower, second = upper)
lower_bound = conf_int.iloc[:, 0]
upper_bound = conf_int.iloc[:, 1]

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(ts[-100:], label='Historical (Last 100)')
ax.plot(forecast_mean.index, forecast_mean, color='red', linestyle='--', label='Forecast')
ax.fill_between(forecast_mean.index, lower_bound, upper_bound,
                color='pink', alpha=0.3, label='95% Confidence Interval')
ax.set_title(f"Bitcoin Price Forecast with 95% Confidence Interval ({steps} Steps Ahead)")
ax.set_xlabel("Timestamp")
ax.set_ylabel("USD Price")
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# Same figure as the interval plot, additionally written to disk as a 300-dpi PNG
lower_bound = conf_int.iloc[:, 0]
upper_bound = conf_int.iloc[:, 1]

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(ts[-100:], label='Historical (Last 100)')
ax.plot(forecast_mean.index, forecast_mean, color='red', linestyle='--', label='Forecast')
ax.fill_between(forecast_mean.index, lower_bound, upper_bound,
                color='pink', alpha=0.3, label='95% Confidence Interval')
ax.set_title(f"Bitcoin Price Forecast with 95% Confidence Interval ({steps} Steps Ahead)")
ax.set_xlabel("Timestamp")
ax.set_ylabel("USD Price")
ax.legend()
ax.grid(True)
fig.savefig("bitcoin_forecast_plot.png", dpi=300)
plt.show()


In [None]:
!pip install pmdarima


In [70]:
ts.index = pd.to_datetime(ts.index)


In [71]:
import warnings
# Suppress every warning for the rest of the kernel session.
# NOTE(review): this also hides convergence and index-frequency warnings from the
# model fits below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")


In [None]:
logging.info(ts.isna().sum())


In [None]:
from pmdarima import auto_arima

# Search settings: non-seasonal model, p and q start at 1 and are capped at 5,
# progress is traced, and failing candidate fits are skipped quietly.
search_options = dict(
    start_p=1, start_q=1,
    max_p=5, max_q=5,
    seasonal=False,
    trace=True,
    error_action='ignore',
    suppress_warnings=True,
)

# Automatically find best (p, d, q) based on AIC
stepwise_model = auto_arima(ts, **search_options)

# View the model summary
logging.info(stepwise_model.summary())


In [74]:
# Forecast n future steps (e.g., 50)
steps = 50
forecast_auto = stepwise_model.predict(n_periods=steps)

# Future timestamps, spaced by the most frequent gap between observations
freq = pd.to_timedelta(df.index.to_series().diff().mode()[0])
future_dates = pd.date_range(start=ts.index[-1] + freq, periods=steps, freq=freq)

# Convert forecast to a Series with the datetime index.
# predict() can hand back a pandas Series with its own index; pd.Series(series, index=...)
# would then re-align on labels and produce all-NaN, so pass the raw values instead.
forecast_auto_series = pd.Series(np.asarray(forecast_auto), index=future_dates)


In [None]:
from pmdarima import auto_arima
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Ensure datetime index
ts.index = pd.to_datetime(ts.index)
ts = ts.sort_index()

# Step 1: Auto-ARIMA model (non-seasonal, p and q capped at 5)
stepwise_model = auto_arima(ts,
                            start_p=1, start_q=1,
                            max_p=5, max_q=5,
                            seasonal=False,
                            trace=True,
                            error_action='ignore',
                            suppress_warnings=True)

# Step 2: Forecast future steps
steps = 100
forecast_auto = stepwise_model.predict(n_periods=steps)

# Step 3: Create future timestamps, spaced by the most frequent observation gap
freq = pd.to_timedelta(df.index.to_series().diff().mode()[0])
future_dates = pd.date_range(start=ts.index[-1] + freq, periods=steps, freq=freq)

# Pass raw values: if predict() returned a Series, pd.Series(series, index=...) would
# re-align on labels and yield all-NaN.
forecast_auto_series = pd.Series(np.asarray(forecast_auto), index=future_dates)

To enhance model selection, we applied Auto-ARIMA, which automatically identified ARIMA(2,2,4) as the optimal configuration based on AIC. The model generated a 100-step forecast of future Bitcoin prices, shown in green on the chart. As recent price behavior was relatively stable, the forecast reflects a smooth, slightly upward trend. The forecast is drawn as a dashed line with markers so that it is easy to distinguish from the historical data.



In [76]:
# Re-run the prediction and wrap it so it is a Series whatever predict() returned
forecast_auto = pd.Series(stepwise_model.predict(n_periods=steps))

# Future timestamps: one per forecast step, starting one interval after the last observation
first_future = ts.index[-1] + freq
future_dates = pd.date_range(start=first_future, periods=steps, freq=freq)

# Attach the future index to the raw values (avoids label re-alignment)
forecast_values = forecast_auto.values
forecast_series = pd.Series(forecast_values, index=future_dates)


In [None]:
# Last 100 observations, the Auto-ARIMA forecast, and a marker where the forecast begins
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(ts[-100:], label='Historical', color='blue')
ax.plot(forecast_series, label='Auto-ARIMA Forecast', color='green', linestyle='--', marker='o')
ax.axvline(x=ts.index[-1], color='gray', linestyle=':', label='Forecast Start')
ax.set_title("Bitcoin Forecast using Auto-ARIMA (Fixed NaNs)")
ax.set_xlabel("Timestamp")
ax.set_ylabel("USD Price")
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# Forecast with confidence intervals
# NOTE(review): this rebinds forecast_mean / conf_int, which the manual-ARIMA plotting
# cells also use — re-running those cells afterwards will plot these values instead.
forecast_mean, conf_int = stepwise_model.predict(n_periods=steps, return_conf_int=True)

# Convert to Series / DataFrame on the future timestamps.
# Raw values are passed on purpose: if predict() returned a pandas Series,
# pd.Series(series, index=future_dates) would re-align on labels and yield all-NaN.
forecast_series = pd.Series(np.asarray(forecast_mean), index=future_dates)
conf_int_df = pd.DataFrame(np.asarray(conf_int), index=future_dates, columns=['Lower', 'Upper'])

# Plot with shaded confidence band
plt.figure(figsize=(12, 5))
plt.plot(ts[-100:], label='Historical', color='blue')
plt.plot(forecast_series, label='Auto-ARIMA Forecast', color='green', linestyle='--', marker='o')
plt.fill_between(future_dates, conf_int_df['Lower'], conf_int_df['Upper'],
                 color='green', alpha=0.2, label='95% Confidence Interval')
plt.axvline(x=ts.index[-1], color='gray', linestyle=':', label='Forecast Start')
plt.title("Bitcoin Forecast using Auto-ARIMA with 95% Confidence Interval")
plt.xlabel("Timestamp")
plt.ylabel("USD Price")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


### Manual ARIMA vs Auto-ARIMA Forecast Comparison

In [None]:
ts.index = pd.to_datetime(ts.index)
ts = ts.sort_index()

# Shared horizon and future timestamps for both models
steps = 100
freq = pd.to_timedelta(ts.index.to_series().diff().mode()[0])
future_dates = pd.date_range(start=ts.index[-1] + freq, periods=steps, freq=freq)

# Both forecasts are re-labelled with future_dates from their raw values.
# np.asarray accepts a pandas Series as well as a plain ndarray, so this works
# whichever of the two predict()/forecast() happens to return.
manual_forecast = model_fit.forecast(steps=steps)
manual_series = pd.Series(np.asarray(manual_forecast), index=future_dates)

auto_forecast, _ = stepwise_model.predict(n_periods=steps, return_conf_int=True)
auto_series = pd.Series(np.asarray(auto_forecast), index=future_dates)

plt.figure(figsize=(12, 5))
plt.plot(ts[-100:], label='Historical', color='blue', linewidth=1.5)
plt.plot(manual_series, label='Manual ARIMA(5,1,0)', color='red', linestyle='--', linewidth=2)
plt.plot(auto_series, label='Auto-ARIMA', color='green', linestyle='--', linewidth=2, marker='o')
plt.axvline(x=ts.index[-1], color='gray', linestyle=':', label='Forecast Start')

# Labels and layout (the title follows `steps` instead of hard-coding the horizon)
plt.title(f"Manual ARIMA vs Auto-ARIMA Forecast ({steps} Steps Ahead)")
plt.xlabel("Timestamp")
plt.ylabel("USD Price")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Sanity check: first values and length of each forecast and of the future index
manual_preview = manual_forecast.head()
manual_length = len(manual_forecast)
logging.info("Manual ARIMA forecast:\n%s", manual_preview)
logging.info("Length: %d", manual_length)

auto_preview = auto_forecast.head()
auto_length = len(auto_forecast)
logging.info("Auto-ARIMA forecast:\n%s", auto_preview)
logging.info("Length: %d", auto_length)

index_preview = future_dates[:5]
index_length = len(future_dates)
logging.info("Future index:\n%s", index_preview)
logging.info("Length: %d", index_length)


### 🔍 Forecast Comparison: Manual ARIMA vs Auto-ARIMA

This plot compares the Bitcoin price forecast generated using:

- **Manual ARIMA(5,1,0)**: A manually specified model with fixed order (p=5, d=1, q=0)
- **Auto-ARIMA**: A model selected automatically using AIC optimization, which chose ARIMA(2,2,4)

**Key observations:**
- The **manual ARIMA** forecast is relatively flat, indicating a conservative outlook likely influenced by recent price stabilization.
- The **Auto-ARIMA** forecast shows an upward trend, capturing momentum from the recent price increase.
- Auto-ARIMA also adapts better to the data’s structure, using differencing (d=2) to ensure stationarity.

This highlights the benefit of using model selection techniques like Auto-ARIMA, especially for volatile and evolving time series like cryptocurrency prices.


In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# In-sample comparison: the last `steps` observations against each model's fitted values
actual = ts[-steps:]
window = len(actual)

# Fitted values of both models, trimmed to the same trailing window
manual_fitted = model_fit.fittedvalues[-window:]
auto_in_sample = pd.Series(stepwise_model.predict_in_sample(), index=ts.index)
auto_fitted = auto_in_sample[-window:]

# Root of the mean squared error for each model
mse_manual = mean_squared_error(actual, manual_fitted)
mse_auto = mean_squared_error(actual, auto_fitted)
rmse_manual = np.sqrt(mse_manual)
rmse_auto = np.sqrt(mse_auto)

logging.info(f" Manual ARIMA(5,1,0) RMSE: {rmse_manual:.2f}")
logging.info(f" Auto-ARIMA RMSE: {rmse_auto:.2f}")


### 📈 RMSE Comparison: Manual ARIMA vs Auto-ARIMA

To evaluate model performance, we compared the in-sample Root Mean Squared Error (RMSE) of each model's fitted values over the last 100 observations of the training data:

- 📊 **Manual ARIMA(5,1,0)** RMSE: **27.33**
- 🤖 **Auto-ARIMA (ARIMA(2,2,4))** RMSE: **26.66**

Although the difference is small, **Auto-ARIMA achieved a slightly lower RMSE**, indicating better fit to the historical Bitcoin price series. It also shows stronger responsiveness to recent trends in the forecast plot, reinforcing its suitability for dynamic time series like cryptocurrency markets.

This demonstrates the value of data-driven model selection using tools like Auto-ARIMA over fixed manual configurations.


## Results and Visualization


### Exploratory and Forecast Visualizations of Real-Time Bitcoin Prices using Dataprep.eda

In [None]:
from dataprep.eda import create_report

# Generate a full interactive report of historical Bitcoin price data
create_report(df)


### 📊 Dataprep.eda Summary & Project Alignment

Using a single command—`create_report(df)`—**Dataprep.eda** generated a comprehensive exploratory data analysis report, offering immediate and actionable insights into the structure and quality of the Bitcoin price dataset. This automated report provided detailed summaries of variable distributions, missing values, statistical properties, and outlier detection without requiring any manual plotting or scripting.

For instance, it identified that:
- `price_usd` is slightly **left-skewed**, indicating clustering at higher values.
- `price_change_pct` is highly **right-skewed**, with over **66%** of the values being zero, highlighting limited short-term price fluctuations.

These insights are critical for time series modeling, particularly in financial datasets where subtle shifts can affect forecasting outcomes. Additionally, the visualizations—such as KDE plots, Q-Q plots, correlation matrices, and missing value heatmaps—offered an in-depth view of the dataset's behavior over time.

Overall, **Dataprep.eda** demonstrated its effectiveness by simplifying traditionally complex and time-consuming data profiling tasks. It successfully supported the project’s objective by enabling efficient, scalable, and visually rich analysis of real-time Bitcoin price trends within a big data context.








### Univariate Plot (price_usd only)

In [None]:
from dataprep.eda import plot

plot(df, "price_usd")


### Correlation Heatmap


In [None]:
from dataprep.eda import plot_correlation

# Shows correlation between all numerical variables
plot_correlation(df)


### Missing Values Visualization

In [None]:
from dataprep.eda import plot_missing

# Visualize where missing data exists in the dataset
plot_missing(df)


### Bivariate Analysis: `price_usd` vs `price_change_pct`

In [None]:
plot(df, "price_usd", "price_change_pct")


### Time Granularity Binning

In [None]:
# Derive the hour of day from the index and compare price against it.
# Requires df to be indexed by datetime values (`.hour` is read from df.index);
# note that this adds an 'hour' column to df.
df["hour"] = df.index.hour
plot(df, "hour", "price_usd")


### Final EDA Report (including the derived `price_change_pct` and `hour` columns)



In [None]:
create_report(df, title="Bitcoin Data EDA")

## ✅ Final Summary: Bitcoin Time Series Forecasting & Visualization with Dataprep.eda

This project successfully demonstrates how real-time Bitcoin price data can be ingested, analyzed, and forecasted using Python-based tools, with a focus on the `Dataprep.eda` library for automated exploratory data analysis. In this analysis, we explored Bitcoin price trends using Dataprep and applied ARIMA for forecasting. The model captured the trend reasonably well, but further tuning and testing with additional features could improve accuracy.

Dataprep significantly streamlined the time series workflow by automating key data preparation tasks such as cleaning, exploration, and visualization. With simple commands like clean_headers() and create_report(), it enabled rapid preprocessing and insight generation without manual coding. For this Bitcoin price forecasting project, Dataprep’s EDA tools made it easy to detect trends, missing values, and anomalies, setting a strong foundation for time series modeling. While the forecasting itself used ARIMA models from other libraries, Dataprep played a critical role in preparing and understanding the data efficiently before modeling.



### 📌 Visualization Objective
> *"Generate insightful visualizations of historical Bitcoin prices, forecast results, and potential future trends using the visualization capabilities of Dataprep library."*

### ✅ Accomplishments

- **Historical Data Exploration**:
  - Used `plot()` and `create_report()` to analyze the distribution, skewness, and time-based trends of Bitcoin prices.
  - Verified data quality (missing values, duplicates, outliers) with `plot_missing()` and EDA summary.

- **Forecast Result Visualization**:
  - Implemented and compared both **manual ARIMA(5,1,0)** and **Auto-ARIMA** models.
  - Visualized forecasts over time with proper alignment and confidence intervals.
  - Evaluated forecast performance using RMSE metrics.

- **Future Trend Insights**:
  - Forecasts extended 100 time steps beyond the current dataset, clearly showing trend continuation or flattening.
  - Used confidence bands and time-aligned plotting to communicate uncertainty and directionality.

### 🎯 Outcome
The exploratory visualizations were generated with minimal code using Dataprep's high-level APIs, while the forecast plots were drawn with matplotlib from the statsmodels and pmdarima results. Together they support the project’s main goal: to showcase how `Dataprep.eda` can handle real-time Bitcoin price data efficiently in a big data context.



### End of Notebook