# Yiedl Crypto Data Analysis and Modeling

This notebook demonstrates how to download and analyze Yiedl crypto data, and build models with polynomial feature combinations using XGBoost and LightGBM.

In [None]:
# Install required packages
!pip install -q requests pandas numpy matplotlib sklearn xgboost lightgbm h2o scikit-learn polars numerapi pyarrow

## Download Yiedl Data

We'll download both the latest and historical Yiedl crypto datasets.

In [None]:
import requests
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
from sklearn.model_selection import train_test_split
import polars as pl
from pathlib import Path
import time

# Helper function to download files with retry mechanism
def download_file(url, output_filename, max_retries=3, timeout=300):
    """Download ``url`` to ``output_filename``, retrying on request errors.

    The body is always streamed in 1 MB chunks into a sibling
    ``<output_filename>.part`` file, which is moved to ``output_filename`` only
    after the whole transfer succeeded. An interrupted download therefore never
    leaves a truncated file at the final path, where a later ``exists()`` check
    would mistake it for a complete download.

    Args:
        url: Address fetched with an HTTP GET.
        output_filename: Destination path (``str`` or ``pathlib.Path``).
        max_retries: Maximum number of attempts; values <= 0 make no attempt.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        True if the file was downloaded, False if every attempt failed (or no
        attempt was made).
    """
    partial_path = Path(f"{output_filename}.part")

    for attempt in range(max_retries):
        try:
            print(f"Download attempt {attempt + 1}/{max_retries}")
            # The context manager releases the connection even if streaming fails
            with requests.get(url, stream=True, timeout=timeout) as response:
                response.raise_for_status()  # Raise an exception for HTTP errors

                # Get total file size if available (0 when the header is missing)
                total_size = int(response.headers.get('content-length', 0))

                # Progress is only reported for files larger than 10MB
                show_progress = total_size > 10 * 1024 * 1024
                if show_progress:
                    print(f"Downloading {output_filename} ({total_size / (1024 * 1024):.1f} MB)")

                downloaded = 0
                start_time = time.time()
                last_print_time = start_time

                with open(partial_path, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=1024*1024):  # 1MB chunks
                        if not chunk:
                            continue
                        file.write(chunk)
                        downloaded += len(chunk)

                        if not show_progress:
                            continue

                        # Update progress every 5 seconds
                        current_time = time.time()
                        if current_time - last_print_time > 5:
                            speed = downloaded / (current_time - start_time) / (1024 * 1024)  # MB/s
                            percent = 100 * downloaded / total_size if total_size > 0 else 0
                            print(f"Progress: {percent:.1f}% ({downloaded / (1024 * 1024):.1f}/{total_size / (1024 * 1024):.1f} MB) - {speed:.1f} MB/s")
                            last_print_time = current_time

            # Publish the file only once it is complete
            os.replace(partial_path, output_filename)
            print(f"File downloaded successfully as {output_filename}")
            return True

        except requests.exceptions.RequestException as e:
            print(f"Download failed (attempt {attempt + 1}/{max_retries}): {e}")
            # Discard whatever was written during the failed attempt
            if partial_path.exists():
                partial_path.unlink()
            if attempt < max_retries - 1:
                wait_time = 2 ** attempt  # Exponential backoff
                print(f"Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                print("Max retries exceeded.")
                return False

    # Only reached when max_retries <= 0: nothing was attempted
    return False

# Create data directory if it doesn't exist
data_dir = Path("../data/yiedl")
data_dir.mkdir(exist_ok=True, parents=True)

# Download Yiedl latest dataset
print("Downloading Yiedl latest dataset...")
latest_url = 'https://api.yiedl.ai/yiedl/v1/downloadDataset?type=latest'
latest_path = data_dir / "yiedl_latest.parquet"
if latest_path.exists():
    print(f"File already exists: {latest_path}")
elif not download_file(latest_url, latest_path):
    # The following cells read this file, so stop here with a clear message
    # instead of failing later with a less obvious error.
    raise RuntimeError(f"Could not download the latest dataset from {latest_url}")

# Download Yiedl historical dataset (which is a zip file)
print("\nDownloading Yiedl historical dataset...")
historical_url = 'https://api.yiedl.ai/yiedl/v1/downloadDataset?type=historical'
historical_zip_path = data_dir / "yiedl_historical.zip"
historical_path = data_dir / "yiedl_historical.parquet"

if historical_path.exists():
    print(f"File already exists: {historical_path}")
else:
    # Reuse a previously downloaded archive; otherwise download it now
    zip_available = historical_zip_path.exists() or download_file(historical_url, historical_zip_path)

    if not zip_available:
        # The historical data is only inspected when present, so this is not fatal
        print("Historical dataset could not be downloaded; skipping extraction.")
    else:
        # Extract the zip file
        print("Extracting historical dataset...")
        try:
            with zipfile.ZipFile(historical_zip_path, 'r') as zip_ref:
                zip_files = zip_ref.namelist()
                if len(zip_files) == 1:
                    # Extract the single member, then move it to yiedl_historical.parquet
                    extracted_file = zip_ref.extract(zip_files[0], path=data_dir)
                    os.replace(extracted_file, historical_path)
                    print(f"Extracted to {historical_path}")
                else:
                    print(f"Unexpected files in zip: {zip_files}")
        except zipfile.BadZipFile:
            # Most likely a truncated earlier download: remove it so that the
            # next run of this cell downloads the archive again.
            historical_zip_path.unlink()
            print(f"{historical_zip_path} is not a valid zip archive; it was removed. Re-run this cell to download it again.")

## Load and Explore the Data

Let's load and explore both datasets to understand their structure.

In [None]:
# Load datasets using polars for better memory management
print("Loading latest dataset...")
df_latest = pl.read_parquet(latest_path)

# Print basic info about the latest dataset
print(f"\nLatest dataset shape: {df_latest.shape}")
print("\nLatest dataset columns preview:")
print(df_latest.columns[:10])
print(f"Total columns: {len(df_latest.columns)}")

# Print the first few rows of the latest dataset
print("\nLatest dataset sample:")
display(df_latest.head().to_pandas())

# For historical data, we'll check if it exists but not load it completely to avoid memory issues
if historical_path.exists():
    print("\nLoading historical dataset (only metadata)...")
    schema = pl.read_parquet_schema(historical_path)
    # polars renamed pl.count() to pl.len(); use the new name when the installed
    # version provides it and fall back to the old one otherwise.
    row_count_expr = pl.len() if hasattr(pl, "len") else pl.count()
    # Lazy scan: only the row count is computed, the data is not materialized
    num_rows = pl.scan_parquet(historical_path).select(row_count_expr).collect().item()
    print(f"Historical dataset has approximately {num_rows} rows")
    print(f"Historical dataset has {len(schema)} columns")
    print("\nHistorical dataset columns preview:")
    print(list(schema.keys())[:10])
else:
    print("Historical dataset not found. Please extract it first.")

## Data Preprocessing

Before modeling, let's group the columns by type (PVM, sentiment, on-chain) based on their name prefixes:

In [None]:
# Function to identify column types based on naming patterns
def identify_column_groups(columns):
    """Group column names by their prefix.

    Args:
        columns: Iterable of column names (strings) that can be iterated more
            than once, e.g. a list.

    Returns:
        Dict with the keys ``'pvm'``, ``'sentiment'``, ``'onchain'``,
        ``'date_symbol'`` and ``'other'``. The prefix groups and ``'other'``
        keep the order of ``columns``; ``'date_symbol'`` is always
        ``['date', 'symbol']``, whether or not those names occur in ``columns``.
    """
    pvm_cols = [col for col in columns if col.startswith('pvm_')]
    sentiment_cols = [col for col in columns if col.startswith('sentiment_')]
    onchain_cols = [col for col in columns if col.startswith('onchain_')]
    date_symbol_cols = ['date', 'symbol']

    # Build the lookup once: a set gives constant-time membership tests instead
    # of re-concatenating and scanning the four lists for every column.
    grouped = set(pvm_cols) | set(sentiment_cols) | set(onchain_cols) | set(date_symbol_cols)
    other_cols = [col for col in columns if col not in grouped]

    return {
        'pvm': pvm_cols,
        'sentiment': sentiment_cols,
        'onchain': onchain_cols,
        'date_symbol': date_symbol_cols,
        'other': other_cols
    }

# Group the columns of the latest dataset by prefix
column_groups = identify_column_groups(df_latest.columns)

# Report how many columns ended up in each group
print("Column Group Summary:")
for group in column_groups:
    print(f"{group}: {len(column_groups[group])} columns")

# Show up to five example names for every non-empty group
for group, cols in column_groups.items():
    if not cols:
        continue
    print(f"\nSample {group} columns: {cols[:5]}")

## Create a Target Variable

Since we do not have a specific target variable for this exercise, we'll create a synthetic one based on the data features. This will allow us to demonstrate the modeling process.

In [None]:
# Function to create a synthetic target variable for demonstration purposes
def create_synthetic_target(df, method='pvm_based', seed=42):
    """Create a binary demonstration target for ``df``.

    With ``method='pvm_based'`` the first column of ``df`` whose name starts with
    ``'pvm_'`` is thresholded at its median (1 if above, 0 otherwise; missing
    values compare as not-above and become 0). The column is looked up in ``df``
    itself, so the function does not depend on notebook-level variables.

    NOTE: the target is a deterministic function of that column, so the column
    (and any feature derived from it) leaks the target if it is used as a model
    input.

    Args:
        df: Frame exposing ``columns``, ``select(name).to_pandas()`` and
            ``len()`` (the notebook passes a polars DataFrame).
        method: ``'pvm_based'`` or anything else to force the random fallback.
        seed: Seed of the random fallback, so re-running gives the same target.

    Returns:
        A pandas Series (PVM-based) or a NumPy integer array (fallback) of 0/1
        values with one entry per row of ``df``.
    """
    if method == 'pvm_based':
        # Get a PVM column if available
        pvm_cols = [col for col in df.columns if col.startswith('pvm_')]
        if pvm_cols:
            # Use the first PVM column as a basis
            pvm_col = pvm_cols[0]
            # Convert to pandas for easier manipulation
            pvm_series = df.select(pvm_col).to_pandas()[pvm_col]
            # Create target: 1 if value > median, 0 otherwise
            return (pvm_series > pvm_series.median()).astype(int)

    # Fallback: create a reproducible random target
    print("Using random target as fallback")
    return np.random.default_rng(seed).integers(0, 2, size=len(df))

# Work on a subset of the columns to keep memory usage manageable:
# the date/symbol columns plus the first 50 columns of each feature group.
columns_to_use = list(column_groups['date_symbol']) + [
    col
    for group_name in ('pvm', 'sentiment', 'onchain')
    for col in column_groups[group_name][:50]
]

print(f"Using {len(columns_to_use)} columns for analysis")

# Materialize the selected columns as a pandas DataFrame
df_pandas = df_latest.select(columns_to_use).to_pandas()

# Attach the synthetic target (computed from the full polars frame)
df_pandas['target'] = create_synthetic_target(df_latest)

# Class balance of the target
print("\nTarget distribution:")
print(df_pandas['target'].value_counts())

# How many distinct symbols the dataset covers
print(f"\nNumber of unique symbols: {df_pandas['symbol'].nunique()}")

# Preview the identifying columns next to the target
print("\nSample rows with target:")
display(df_pandas[['date', 'symbol', 'target']].head(10))

## Feature Engineering with Polynomial Combinations

Let's create polynomial features and interactions between the different data types.

In [None]:
from sklearn.preprocessing import PolynomialFeatures
import warnings
import gc  # For garbage collection
warnings.filterwarnings('ignore')

# Function to create polynomial features for specific column groups
def create_polynomial_features(df, feature_cols, degree=2, interaction_only=False, max_features=100):
    """Expand ``feature_cols`` of ``df`` with scikit-learn's ``PolynomialFeatures``.

    Missing values are replaced by 0 before the expansion, and at most the first
    ``max_features`` columns are used. No bias column is generated.

    Args:
        df: pandas DataFrame holding the input columns.
        feature_cols: List of column names to expand.
        degree: Polynomial degree passed to ``PolynomialFeatures``.
        interaction_only: Passed to ``PolynomialFeatures``.
        max_features: Upper bound on the number of input columns.

    Returns:
        DataFrame indexed like ``df`` whose columns are named by
        ``get_feature_names_out``.
    """
    # Zero-fill missing values on the requested columns
    inputs = df[feature_cols].fillna(0)

    # Cap the number of input columns to keep the expansion affordable
    selected = feature_cols
    if len(selected) > max_features:
        print(f"Limiting features from {len(feature_cols)} to {max_features}")
        selected = selected[:max_features]
        inputs = inputs[selected]

    expander = PolynomialFeatures(
        degree=degree,
        interaction_only=interaction_only,
        include_bias=False,
    )
    expanded = expander.fit_transform(inputs)

    poly_df = pd.DataFrame(
        expanded,
        columns=expander.get_feature_names_out(selected),
        index=df.index,
    )

    # Release the intermediates before returning
    del inputs, expanded
    gc.collect()

    return poly_df

# Function to process features in batches to manage memory
def process_features_in_batches(df, feature_cols, batch_size=20, degree=2, interaction_only=True):
    """Expand ``feature_cols`` batch by batch and join the results column-wise.

    Every consecutive slice of ``batch_size`` columns is passed to
    ``create_polynomial_features`` on its own, so combinations are only formed
    between columns of the same batch.

    Args:
        df: pandas DataFrame holding the input columns.
        feature_cols: List of column names to expand.
        batch_size: Number of input columns per batch.
        degree: Polynomial degree for every batch.
        interaction_only: Passed through to ``create_polynomial_features``.

    Returns:
        DataFrame indexed like ``df`` with the columns of all batches; it has no
        columns when ``feature_cols`` is empty.
    """
    total = len(feature_cols)
    chunks = []

    for batch_no, start in enumerate(range(0, total, batch_size), start=1):
        stop = min(start + batch_size, total)
        print(f"Processing batch {batch_no}: features {start+1}-{stop}")

        chunks.append(
            create_polynomial_features(
                df,
                feature_cols[start:stop],
                degree=degree,
                interaction_only=interaction_only,
                max_features=batch_size,
            )
        )

        # Free whatever the batch left behind before starting the next one
        gc.collect()

    # Nothing to combine: hand back an empty frame aligned with df
    if not chunks:
        return pd.DataFrame(index=df.index)

    combined_df = pd.concat(chunks, axis=1)
    # Drop the list of partial frames now that they are merged
    del chunks
    gc.collect()
    return combined_df

# Create polynomial features for each group
print("Creating polynomial features...")


def _polynomial_features_for_prefix(frame, prefix, label):
    """Return (input column names, polynomial feature frame) for one column group.

    The group consists of the columns of ``frame`` whose name starts with
    ``prefix``; ``label`` is only used in the progress messages. The returned
    frame has no columns when the group is empty.
    """
    group_cols = frame.columns[frame.columns.str.startswith(prefix)].tolist()
    if not group_cols:
        print(f"No {label} features found")
        return group_cols, pd.DataFrame(index=frame.index)

    print(f"Creating polynomial features for {len(group_cols)} {label} features")
    group_poly_df = process_features_in_batches(
        frame, group_cols, batch_size=20, degree=2, interaction_only=True
    )
    print(f"Created {group_poly_df.shape[1]} polynomial {label} features")
    return group_cols, group_poly_df


# One call per feature group instead of three copies of the same code
pvm_features, pvm_poly_df = _polynomial_features_for_prefix(df_pandas, 'pvm_', 'PVM')
sentiment_features, sentiment_poly_df = _polynomial_features_for_prefix(df_pandas, 'sentiment_', 'sentiment')
onchain_features, onchain_poly_df = _polynomial_features_for_prefix(df_pandas, 'onchain_', 'onchain')

# Create cross-category interaction features
print("\nCreating cross-category interaction features...")
cross_features = []
if pvm_features and sentiment_features:
    # Select a subset of features from each category (slicing already handles
    # groups with fewer than 10 columns)
    pvm_subset = pvm_features[:10]
    sentiment_subset = sentiment_features[:10]

    # Combine the subsets and create cross features
    combined_features = pvm_subset + sentiment_subset
    cross_pvm_sentiment = create_polynomial_features(
        df_pandas, combined_features, degree=2, interaction_only=True, max_features=40
    )

    # Only keep interactions between different categories
    cross_cols = [col for col in cross_pvm_sentiment.columns
                  if any(p in col for p in pvm_subset) and any(s in col for s in sentiment_subset)]

    if cross_cols:
        cross_features.append(cross_pvm_sentiment[cross_cols])
        print(f"Created {len(cross_cols)} PVM-Sentiment interaction features")

    # Clean up to free memory
    del cross_pvm_sentiment
    gc.collect()

# Combine all feature sets
feature_dfs = [pvm_poly_df, sentiment_poly_df, onchain_poly_df] + cross_features
all_poly_features = pd.concat(feature_dfs, axis=1)

# PolynomialFeatures also returns the degree-1 terms under the original column
# names. Those columns already exist in df_pandas, so keeping them would give
# the modeling frame duplicate column names, which xgboost.DMatrix rejects.
already_present = all_poly_features.columns.isin(df_pandas.columns)
all_poly_features = all_poly_features.loc[:, ~already_present]

print(f"\nTotal polynomial features created: {all_poly_features.shape[1]}")

# Clean up individual feature dataframes to free memory
del pvm_poly_df, sentiment_poly_df, onchain_poly_df, cross_features, feature_dfs
gc.collect()

# Combine with original features for modeling
modeling_df = pd.concat([df_pandas.drop(['date', 'symbol'], axis=1), all_poly_features], axis=1)
assert modeling_df.columns.is_unique, "modeling_df must not contain duplicate column names"

# Remove any constant columns to reduce dimensionality
# (a column is kept when at least one value differs from its first row)
modeling_df = modeling_df.loc[:, (modeling_df != modeling_df.iloc[0]).any()]

print(f"\nFinal dataset shape for modeling: {modeling_df.shape}")
print(f"Number of features: {modeling_df.shape[1] - 1}")

# Free up memory
del all_poly_features
gc.collect()

## Train-Test Split

Let's split the data into training and testing sets.

In [None]:
# Split the data
# The synthetic target is a median threshold on the first PVM column (see
# create_synthetic_target). Leaving that column, or any interaction built from
# it, among the features would let the models read the target straight off the
# input, so those columns are excluded here.
target_source_cols = column_groups['pvm'][:1]  # empty when the random fallback target was used
feature_frame = modeling_df.drop('target', axis=1)
# Interaction names join their inputs with a space, e.g. "col_a col_b"
leaky_cols = sorted({
    col for col in feature_frame.columns
    if any(source in col.split(' ') for source in target_source_cols)
})
if leaky_cols:
    print(f"Excluding {len(leaky_cols)} features derived from the target source column {target_source_cols[0]}")

X = feature_frame.drop(columns=leaky_cols)
y = modeling_df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training set: {X_train.shape}")
print(f"Testing set: {X_test.shape}")

## Train XGBoost Model

Let's train an XGBoost model using the polynomial features.

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# Create DMatrix objects for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set XGBoost parameters with GPU acceleration
# NOTE(review): newer XGBoost releases configure the GPU through a `device`
# parameter instead of 'gpu_hist'/'gpu_id' - confirm against the installed version.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 6,
    'eta': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 1,
    'tree_method': 'gpu_hist',  # Use GPU acceleration
    'gpu_id': 0,  # Use first GPU
    'seed': 42
}

# Defined outside the try block so the CPU fallback never depends on how far the
# GPU attempt got. Early stopping monitors the last entry ('test').
watchlist = [(dtrain, 'train'), (dtest, 'test')]


def _train_xgboost(train_params):
    """Train a booster on dtrain with the shared early-stopping settings."""
    return xgb.train(
        train_params,
        dtrain,
        num_boost_round=1000,
        evals=watchlist,
        early_stopping_rounds=50,
        verbose_eval=100
    )


# Try the GPU first and fall back to the CPU if that fails
try:
    print("Training XGBoost model with GPU acceleration...")
    xgb_model = _train_xgboost(params)
except Exception as e:
    print(f"GPU training failed with error: {e}")
    print("Falling back to CPU training...")
    params['tree_method'] = 'hist'  # Fall back to CPU
    params.pop('gpu_id', None)  # Remove GPU parameter
    xgb_model = _train_xgboost(params)

# Make predictions with the trees up to the best early-stopping iteration, so the
# rounds trained after the best score do not affect the result
y_pred_proba = xgb_model.predict(dtest, iteration_range=(0, xgb_model.best_iteration + 1))
y_pred = (y_pred_proba > 0.5).astype(int)

# Evaluate the model
# NOTE: the test set is also used for early stopping, so these scores are
# optimistic estimates.
accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

print(f"\nXGBoost Model Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"AUC: {auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Get feature importance (only features used in at least one split are listed)
importance = xgb_model.get_score(importance_type='gain')
importance_df = pd.DataFrame({'Feature': list(importance.keys()), 'Importance': list(importance.values())})
importance_df = importance_df.sort_values('Importance', ascending=False).reset_index(drop=True)

# Plot feature importance
plt.figure(figsize=(14, 8))
plt.barh(importance_df['Feature'][:20], importance_df['Importance'][:20])
plt.title('XGBoost Feature Importance (Top 20)')
plt.xlabel('Importance')
plt.gca().invert_yaxis()  # most important feature at the top
plt.tight_layout()
plt.show()

## Train LightGBM Model

Let's also train a LightGBM model and compare performance.

In [None]:
import lightgbm as lgb

# Create LightGBM datasets
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Set LightGBM parameters with GPU acceleration
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'device': 'gpu',  # Use GPU
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    'seed': 42
}


def _train_lightgbm(train_params):
    """Train a LightGBM booster with early stopping and periodic logging.

    Early stopping and logging are configured through callbacks: the
    `early_stopping_rounds` / `verbose_eval` keyword arguments of `lgb.train`
    were removed in LightGBM 4, where passing them raises a TypeError.
    """
    return lgb.train(
        train_params,
        train_data,
        num_boost_round=1000,
        valid_sets=[train_data, test_data],
        valid_names=['train', 'test'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=100),
        ]
    )


# Try training with GPU, fall back to CPU if not available
try:
    print("Training LightGBM model with GPU acceleration...")
    lgb_model = _train_lightgbm(params)
except Exception as e:
    print(f"GPU training failed with error: {e}")
    print("Falling back to CPU training...")
    # Remove GPU parameters
    params['device'] = 'cpu'
    params.pop('gpu_platform_id', None)
    params.pop('gpu_device_id', None)

    lgb_model = _train_lightgbm(params)

# Make predictions
y_pred_proba_lgb = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
y_pred_lgb = (y_pred_proba_lgb > 0.5).astype(int)

# Evaluate the model
# NOTE: the test set is also used for early stopping, so these scores are
# optimistic estimates.
accuracy_lgb = accuracy_score(y_test, y_pred_lgb)
auc_lgb = roc_auc_score(y_test, y_pred_proba_lgb)

print(f"\nLightGBM Model Performance:")
print(f"Accuracy: {accuracy_lgb:.4f}")
print(f"AUC: {auc_lgb:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_lgb))

# Plot feature importance
# plot_importance creates its own figure unless it is given an axes object, so
# the axes is passed explicitly; otherwise an empty 14x8 figure is shown too.
fig, ax = plt.subplots(figsize=(14, 8))
lgb.plot_importance(lgb_model, ax=ax, max_num_features=20, importance_type='gain')
ax.set_title('LightGBM Feature Importance (Top 20)')
plt.tight_layout()
plt.show()

## Compare Model Performance

Let's compare the performance of XGBoost and LightGBM models.

In [None]:
# Side-by-side summary of the two models' headline metrics
model_comparison = pd.DataFrame(
    {
        'Model': ['XGBoost', 'LightGBM'],
        'Accuracy': [accuracy, accuracy_lgb],
        'AUC': [auc, auc_lgb],
    }
)

print("Model Performance Comparison:")
display(model_comparison)

# ROC curves of both models on the test set
from sklearn.metrics import roc_curve

# False/true positive rates per model (thresholds are not needed)
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_pred_proba)
fpr_lgb, tpr_lgb, _ = roc_curve(y_test, y_pred_proba_lgb)

# Draw both curves plus the diagonal of a random classifier
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(fpr_xgb, tpr_xgb, label=f'XGBoost (AUC = {auc:.4f})')
ax.plot(fpr_lgb, tpr_lgb, label=f'LightGBM (AUC = {auc_lgb:.4f})')
ax.plot([0, 1], [0, 1], 'k--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve Comparison')
ax.legend()
ax.grid(alpha=0.3)
plt.show()

## Feature Importance Analysis

Let's analyze the most important features from both models.

In [None]:
# Get LightGBM feature importance
# LightGBM replaces whitespace in feature names with underscores, so the names it
# reports for interaction features ("a_b") do not match the ones XGBoost reports
# ("a b"). The model was trained on the columns of X in order, so the original
# names are restored by position when the counts agree.
lgb_feature_names = lgb_model.feature_name()
if len(lgb_feature_names) == X.shape[1]:
    lgb_feature_names = X.columns.tolist()

lgb_importance = pd.DataFrame({
    'Feature': lgb_feature_names,
    'Importance': lgb_model.feature_importance(importance_type='gain')
})
lgb_importance = lgb_importance.sort_values('Importance', ascending=False).reset_index(drop=True)

# Compare top features from both models
top_features_xgb = set(importance_df['Feature'][:10])
top_features_lgb = set(lgb_importance['Feature'][:10])
common_features = top_features_xgb.intersection(top_features_lgb)

print(f"Number of common top features: {len(common_features)}")
print("Common top features:")
for feature in sorted(common_features):  # sorted for a stable listing across runs
    print(f"- {feature}")

# Analyze feature types in top features
def categorize_feature(feature):
    """Classify a feature name as 'Interaction', 'PVM', 'Sentiment', 'Onchain' or 'Other'.

    Names containing a space are interaction terms (the polynomial expansion
    joins its input names with a space). This is checked first: every
    interaction name starts with the prefix of its first input, so checking the
    prefixes first would never report 'Interaction'.

    Args:
        feature: Feature name.

    Returns:
        The category label as a string.
    """
    if ' ' in feature:  # Interaction feature
        return 'Interaction'
    if feature.startswith('pvm'):
        return 'PVM'
    if feature.startswith('sentiment'):
        return 'Sentiment'
    if feature.startswith('onchain'):
        return 'Onchain'
    return 'Other'

# Label the 20 most important features of each model with their category
xgb_top_categorized = importance_df.head(20).assign(
    Category=lambda top: top['Feature'].apply(categorize_feature)
)
lgb_top_categorized = lgb_importance.head(20).assign(
    Category=lambda top: top['Feature'].apply(categorize_feature)
)

# How often each category appears among the top features
print("\nXGBoost top feature categories:")
print(xgb_top_categorized['Category'].value_counts())

print("\nLightGBM top feature categories:")
print(lgb_top_categorized['Category'].value_counts())

## Save Models for Future Use

Let's save the trained models for future use.

In [None]:
import joblib
import os

# Make sure the output directory exists
models_dir = Path("../models/yiedl")
models_dir.mkdir(parents=True, exist_ok=True)

# Target files for the artifacts
xgb_model_file = models_dir / "xgboost_model.json"
lgb_model_file = models_dir / "lightgbm_model.txt"
feature_columns_file = models_dir / "feature_columns.joblib"

# XGBoost booster
xgb_model.save_model(xgb_model_file)
print(f"XGBoost model saved to {xgb_model_file}")

# LightGBM booster (the path is passed as a plain string)
lgb_model.save_model(str(lgb_model_file))
print(f"LightGBM model saved to {lgb_model_file}")

# Names of the feature columns, in the order of X
joblib.dump(X.columns.tolist(), feature_columns_file)
print(f"Feature columns saved to {feature_columns_file}")

## Conclusion

In this notebook, we:

1. Downloaded and explored the Yiedl crypto datasets
2. Created polynomial feature combinations
3. Built XGBoost and LightGBM models
4. Compared model performance
5. Analyzed feature importance

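The models performed well on our synthetic target variable, and we identified important features for predicting the target. Keep in mind that the target is synthetic and derived from one of the PVM features, so these results demonstrate the workflow rather than real predictive power.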