In [9]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.preprocessing import StandardScaler
import joblib
import warnings
warnings.filterwarnings('ignore')

# Set style for plots
plt.style.use('seaborn-v0_8')
%matplotlib inline

In [10]:
# Load your preprocessed dataset
df = pd.read_csv('processed_data.csv')
df['date'] = pd.to_datetime(df['date'])

# Display basic info
print(f"Dataset shape: {df.shape}")
print(f"Date range: {df['date'].min()} to {df['date'].max()}")
df.head()

Dataset shape: (1000, 11)
Date range: 2022-03-16 00:00:00 to 2022-03-17 00:00:00


Unnamed: 0.1,Unnamed: 0,coin,symbol,price,1h,24h,7d,24h_volume,mkt_cap,date,liq_ratio
0,0,67,68,40859.46,0.022,0.03,0.055,35390760000.0,770991500000.0,2022-03-16,0.045903
1,1,148,150,2744.41,0.024,0.034,0.065,19748700000.0,327104400000.0,2022-03-16,0.060374
2,2,417,445,1.0,-0.001,-0.001,0.0,57934970000.0,79965160000.0,2022-03-16,0.724503
3,3,45,62,383.43,0.018,0.028,0.004,1395854000.0,64043820000.0,2022-03-16,0.021795
4,4,439,442,0.999874,-0.001,0.0,-0.0,3872274000.0,52222140000.0,2022-03-16,0.07415


In [11]:
# Define target variable
target_column = 'liq_ratio'

# Exclude non-feature columns
exclude_columns = ['coin', 'symbol', 'date', target_column, '24h_volume', 'mkt_cap']

# Get feature columns (all numeric columns except excluded ones)
feature_columns = [col for col in df.select_dtypes(include=[np.number]).columns 
                  if col not in exclude_columns]

X = df[feature_columns]
y = df[target_column]

print(f"Number of features: {len(feature_columns)}")
print("Feature names:", feature_columns)

Number of features: 5
Feature names: ['Unnamed: 0', 'price', '1h', '24h', '7d']


In [12]:
# Sort by date first to ensure chronological order
df_sorted = df.sort_values('date')

# Find the split point (last 3 months of data for testing)
split_date = df_sorted['date'].max() - pd.DateOffset(months=3)

# Create masks for train/test split
train_mask = df_sorted['date'] <= split_date
test_mask = df_sorted['date'] > split_date

# Apply the split
X_train, X_test = X.loc[train_mask], X.loc[test_mask]
y_train, y_test = y.loc[train_mask], y.loc[test_mask]

print(f"Training set: {X_train.shape[0]} samples (until {split_date.date()})")
print(f"Test set: {X_test.shape[0]} samples (after {split_date.date()})")

Training set: 0 samples (until 2021-12-17)
Test set: 1000 samples (after 2021-12-17)


In [14]:
# # Initialize scaler
# scaler = StandardScaler()

# # Fit on training data, transform both train and test
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# # Convert back to DataFrame for better visualization
# X_train_scaled = pd.DataFrame(X_train_scaled, columns=feature_columns, index=X_train.index)
# X_test_scaled = pd.DataFrame(X_test_scaled, columns=feature_columns, index=X_test.index)

In [6]:
# Dictionary to store models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42, verbosity=0),
    'LightGBM': LGBMRegressor(n_estimators=100, random_state=42, verbose=-1)
}