In [None]:
import pandas as pd
import numpy as np
import joblib
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('dark_background')


In [None]:
# Load the arbitrage training set (path is relative to the notebook's folder)
# and display its (rows, columns) shape.
df = pd.read_csv('../data/arbitrage_training_data.csv')
df.shape


In [None]:
# Preview the first rows of the loaded frame.
df.head()


In [None]:
# Count how often each value of the target column occurs (class balance check).
df['is_profitable'].value_counts()


In [None]:
# Summary statistics for the frame's columns.
df.describe()


In [None]:
# Model inputs, in the exact column order the estimators are trained on.
feature_cols = [
    # pairwise exchange spreads
    'spread_binance_coinbase',
    'spread_binance_kraken',
    'spread_coinbase_kraken',
    # per-exchange volume
    'volume_binance',
    'volume_coinbase',
    'volume_kraken',
    # market / calendar context
    'volatility',
    'hour_of_day',
    'day_of_week',
    'liquidity_score',
    'max_spread_bps',
]

# Plain NumPy arrays: feature matrix and target vector.
X = df[feature_cols].values
y = df['is_profitable'].values

# Hold out 20% of the rows for evaluation; stratifying keeps the label ratio
# the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train.shape, X_test.shape


In [None]:
# Learn the scaling statistics from the training partition only, then apply
# that same transform to both partitions (no test-set leakage).
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Gradient-boosted tree classifier. It is fitted on the *unscaled* features,
# so inference must feed it unscaled features as well.
xgb_params = dict(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss',
)
xgb_clf = xgb.XGBClassifier(**xgb_params)
xgb_clf.fit(X_train, y_train, verbose=False)


In [None]:
# Hard labels and positive-class probabilities on the held-out rows.
y_pred = xgb_clf.predict(X_test)
y_prob = xgb_clf.predict_proba(X_test)[:, 1]

# (accuracy, F1, ROC AUC) -- AUC is computed from probabilities, not labels.
(
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred),
    roc_auc_score(y_test, y_prob),
)


In [None]:
# Feature importances reported by the fitted classifier, largest first.
(
    pd.DataFrame(
        {
            'feature': feature_cols,
            'importance': xgb_clf.feature_importances_,
        }
    )
    .sort_values('importance', ascending=False)
)


In [None]:
# Pick the best available compute backend: CUDA GPU first, then Apple MPS,
# falling back to the CPU. (The original only ever considered MPS or CPU, so
# CUDA machines silently trained on the CPU.)
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
device


In [None]:
# class ArbitrageNet(nn.Module):
#     def __init__(self, input_size):
#         super().__init__()
#         self.fc1 = nn.Linear(input_size, 64)
#         self.fc2 = nn.Linear(64, 32)
#         self.fc3 = nn.Linear(32, 1)
#     def forward(self, x):
#         x = torch.relu(self.fc1(x))
#         x = torch.relu(self.fc2(x))
#         return torch.sigmoid(self.fc3(x))

class ArbitrageNet(nn.Module):
    """Feed-forward binary classifier: input -> 128 -> 64 -> 32 -> 1 (sigmoid).

    The first two hidden layers are followed by dropout (p=0.3 and p=0.2).
    Because the last layer is a sigmoid, ``forward`` returns values in [0, 1]
    with shape ``(batch, 1)``.
    """

    def __init__(self, input_size):
        super().__init__()
        # (hidden width, dropout probability or None) for each hidden layer.
        hidden_spec = ((128, 0.3), (64, 0.2), (32, None))

        layers = []
        in_features = input_size
        for out_features, drop_p in hidden_spec:
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
            if drop_p is not None:
                layers.append(nn.Dropout(drop_p))
            in_features = out_features

        # Single-unit output head squashed to a probability.
        layers.append(nn.Linear(in_features, 1))
        layers.append(nn.Sigmoid())

        # Attribute name and layer order determine the state_dict keys
        # ("net.0.weight", ...), so both must stay as they are.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        probabilities = self.net(x)
        return probabilities

# Build the network with one input per feature column and move it to the
# selected device.
model = ArbitrageNet(len(feature_cols)).to(device)


In [None]:
# float32 copies of the scaled training features and of the labels; the
# labels get a trailing axis so they line up with the (batch, 1) model output.
X_t = torch.as_tensor(X_train_scaled, dtype=torch.float32)
y_t = torch.as_tensor(y_train, dtype=torch.float32).unsqueeze(1)

# Reshuffled mini-batches of 512 rows each epoch.
train_ds = TensorDataset(X_t, y_t)
loader = DataLoader(train_ds, batch_size=512, shuffle=True)

# Adam optimiser; plain BCE is correct here because the network already ends
# in a sigmoid.
opt = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.BCELoss()


In [None]:
# Training mode keeps dropout active during optimisation.
model.train()
losses = []  # mean batch loss per epoch
for epoch in range(50):
    batch_losses = []
    for bx, by in loader:
        bx = bx.to(device)
        by = by.to(device)

        opt.zero_grad()
        predictions = model(bx)
        loss = loss_fn(predictions, by)
        loss.backward()
        opt.step()

        batch_losses.append(loss.item())

    losses.append(sum(batch_losses) / len(loader))

    # Progress line every tenth epoch.
    if (epoch + 1) % 10 == 0:
        print(f'epoch {epoch+1}: {losses[-1]:.4f}')


In [None]:
# Training curve: mean batch loss recorded for each epoch.
plt.plot(losses)
plt.xlabel('epoch')
plt.ylabel('loss')


In [None]:
# Inference mode (dropout off); gradients are not needed for scoring.
model.eval()
xt = torch.FloatTensor(X_test_scaled).to(device)
with torch.no_grad():
    test_output = model(xt)

# Back to a flat NumPy vector of positive-class probabilities.
yp = test_output.cpu().numpy().flatten()

# Threshold at 0.5 for hard labels; ROC AUC uses the raw probabilities.
y_pred_nn = (yp > 0.5).astype(int)
(
    accuracy_score(y_test, y_pred_nn),
    f1_score(y_test, y_pred_nn),
    roc_auc_score(y_test, yp),
)


In [None]:
# Side-by-side confusion matrices on the held-out rows:
# XGBoost on the first axis, neural network on the second.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', ax=axes[0], cmap='Blues')
axes[0].set_title('xgboost')
sns.heatmap(confusion_matrix(y_test, y_pred_nn), annot=True, fmt='d', ax=axes[1], cmap='Blues')
axes[1].set_title('nn')


In [None]:
# Everything needed for inference is written to one sibling directory.
models_dir = Path('../models')
models_dir.mkdir(exist_ok=True)

# scikit-learn style objects are persisted with joblib.
for artifact, filename in (
    (xgb_clf, 'xgboost_classifier.joblib'),
    (scaler, 'scaler.joblib'),
):
    joblib.dump(artifact, models_dir / filename)

# Feature order, one name per line, so callers can rebuild the input vector.
(models_dir / 'feature_cols.txt').write_text('\n'.join(feature_cols))

# The network is stored as a state_dict together with the metadata required
# to reconstruct it (input width and feature order).
checkpoint = {
    'model_state_dict': model.state_dict(),
    'input_size': len(feature_cols),
    'feature_cols': feature_cols,
}
torch.save(checkpoint, models_dir / 'arbitrage_net.pth')


In [None]:
# Show what is now present in the models directory.
list(models_dir.iterdir())


In [None]:
# Smoke test: reload the persisted XGBoost classifier and score the first five
# held-out rows. X_test is passed unscaled here.
xgb_loaded = joblib.load(models_dir / 'xgboost_classifier.joblib')
xgb_loaded.predict_proba(X_test[:5])[:, 1]


# ArbFinder - Arbitrage Analysis & ML Model Training

This notebook provides:
1. Synthetic data generation for training
2. XGBoost model for arbitrage opportunity prediction
3. Neural Network model for arbitrage profitability classification
4. Model evaluation and saving


In [None]:
# Install dependencies
# !pip install pandas numpy scikit-learn xgboost torch matplotlib seaborn joblib


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from pathlib import Path
import warnings
import joblib

# ML imports
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix, roc_auc_score, mean_squared_error
)

# XGBoost
import xgboost as xgb

# Neural Network
try:
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import DataLoader, TensorDataset
    TORCH_AVAILABLE = True
except ImportError:
    TORCH_AVAILABLE = False
    print("PyTorch not available, will use sklearn MLP instead")
    from sklearn.neural_network import MLPClassifier, MLPRegressor

warnings.filterwarnings('ignore')

# Seed every RNG the notebook draws from so Restart & Run All reproduces the
# same numbers. NumPy alone is not enough: PyTorch weight initialisation,
# dropout and DataLoader shuffling use torch's own generator.
np.random.seed(42)
if TORCH_AVAILABLE:
    torch.manual_seed(42)

print(f"Analysis started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"PyTorch available: {TORCH_AVAILABLE}")


## 1. Synthetic Data Generation

Generate realistic arbitrage training data with:
- Price data from multiple exchanges
- Spread calculations
- Volume data
- Labels for profitable opportunities


In [None]:
def generate_synthetic_data(n_samples=10000, seed=42):
    """
    Generate synthetic arbitrage data for model training.

    All random draws come from a private ``np.random.RandomState(seed)``, so
    calling this function no longer resets NumPy's global RNG as a side
    effect. The draws are made in the same order as before, so the generated
    values are identical to the previous ``np.random.seed(seed)`` version.

    Args:
        n_samples: Number of rows to generate.
        seed: Seed for the private random generator.

    Returns:
        pandas.DataFrame with one row per sample and these columns:

        Features
        - price_exchange_a / _b / _c: price on exchanges A, B and C
        - spread_ab / spread_ac / spread_bc: relative price difference of each
          pair, in basis points
        - volume_a / volume_b / volume_c: volume on each exchange
        - volatility: simulated recent price volatility
        - hour_of_day: integer in 0-23
        - day_of_week: integer in 0-6

        Targets
        - profit_bps: noisy net profit in basis points
        - is_profitable: 1 if profit_bps > 10, else 0
    """
    rng = np.random.RandomState(seed)

    # Common base price shared by all exchanges (e.g., BTC around 50000).
    base_prices = rng.uniform(40000, 60000, n_samples)

    # Exchange-specific deviations from the base price:
    # A is the baseline, B is biased slightly upward, C is noisier and biased down.
    price_a = base_prices + rng.normal(0, 10, n_samples)
    price_b = base_prices + rng.normal(5, 15, n_samples)
    price_c = base_prices + rng.normal(-3, 20, n_samples)

    # Pairwise spreads in basis points (1 bps = 0.01%).
    spread_ab = ((price_b - price_a) / price_a) * 10000
    spread_ac = ((price_c - price_a) / price_a) * 10000
    spread_bc = ((price_c - price_b) / price_b) * 10000

    # Volume (in quote currency).
    volume_a = rng.exponential(50000, n_samples)
    volume_b = rng.exponential(40000, n_samples)
    volume_c = rng.exponential(30000, n_samples)

    # Volatility (stands in for a rolling std of returns).
    volatility = rng.exponential(0.02, n_samples)

    # Time features (upper bounds are exclusive).
    hour_of_day = rng.randint(0, 24, n_samples)
    day_of_week = rng.randint(0, 7, n_samples)

    # Per-exchange fees as decimal fractions.
    fee_a = 0.001   # 0.1%
    fee_b = 0.005   # 0.5%
    fee_c = 0.0026  # 0.26%

    # Largest absolute spread over the three pairs (either trade direction).
    max_spread = np.maximum.reduce(
        [spread_ab, spread_ac, spread_bc, -spread_ab, -spread_ac, -spread_bc]
    )

    # Round-trip fee (buy fee + sell fee) per pair, in bps.
    total_fee_ab = (fee_a + fee_b) * 10000  # ~60 bps
    total_fee_ac = (fee_a + fee_c) * 10000  # ~36 bps
    total_fee_bc = (fee_b + fee_c) * 10000  # ~76 bps

    # Best-case net profit: largest spread minus the cheapest pair's fee.
    # NOTE(review): the largest spread and the cheapest fee may belong to
    # different exchange pairs, so this is an optimistic simplification rather
    # than the profit of any single executable trade -- confirm this is intended.
    # NOTE(review): with the noise scales above, spreads are only a few bps
    # while the cheapest round-trip fee is ~36 bps, so positive labels look
    # extremely rare -- check the class balance printed by the caller.
    min_fee = np.minimum.reduce([total_fee_ab, total_fee_ac, total_fee_bc])
    profit_bps = max_spread - min_fee

    # Noise standing in for execution slippage, timing, etc.
    profit_bps = profit_bps + rng.normal(0, 5, n_samples)

    # Label: profitable only when the noisy net profit exceeds 10 bps.
    is_profitable = (profit_bps > 10).astype(int)

    df = pd.DataFrame({
        'price_exchange_a': price_a,
        'price_exchange_b': price_b,
        'price_exchange_c': price_c,
        'spread_ab': spread_ab,
        'spread_ac': spread_ac,
        'spread_bc': spread_bc,
        'volume_a': volume_a,
        'volume_b': volume_b,
        'volume_c': volume_c,
        'volatility': volatility,
        'hour_of_day': hour_of_day,
        'day_of_week': day_of_week,
        'profit_bps': profit_bps,
        'is_profitable': is_profitable,
    })

    return df


# Build the 50,000-row training frame and report its size and class balance
# (count and percentage of rows labelled profitable), then preview it.
print("Generating synthetic training data...")
df = generate_synthetic_data(n_samples=50000)
print(f"Generated {len(df)} samples")
print(f"Profitable opportunities: {df['is_profitable'].sum()} ({df['is_profitable'].mean()*100:.1f}%)")
print(f"\nData shape: {df.shape}")
df.head()


In [None]:
# Prepare features and targets
feature_cols = [
    'spread_ab', 'spread_ac', 'spread_bc',
    'volume_a', 'volume_b', 'volume_c',
    'volatility', 'hour_of_day', 'day_of_week'
]

X = df[feature_cols].values
y_class = df['is_profitable'].values  # For classification
y_reg = df['profit_bps'].values       # For regression

# Split features and BOTH targets in a single call so every output shares the
# same row partition. Previously the regression target was split by a second,
# non-stratified call; a stratified and a non-stratified split select
# different rows even with the same random_state, so y_train_reg / y_test_reg
# did not correspond to the rows in X_train / X_test.
(
    X_train, X_test,
    y_train_class, y_test_class,
    y_train_reg, y_test_reg,
) = train_test_split(
    X, y_class, y_reg,
    test_size=0.2, random_state=42, stratify=y_class
)

# Scale features: statistics come from the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"Features: {feature_cols}")


## 2. XGBoost Model

XGBoost is excellent for tabular data and handles feature interactions well.
We train both a classifier (is_profitable) and regressor (profit_bps).


In [None]:
# XGBoost Classifier for profitability prediction.
# It is fitted on the unscaled feature matrix (X_train, not X_train_scaled).
print("Training XGBoost Classifier...")

# `use_label_encoder` is intentionally not passed: the option is deprecated
# in recent XGBoost releases and only triggers an "unused parameter" warning.
# The labels are already 0/1 integers.
xgb_clf = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss'
)

xgb_clf.fit(X_train, y_train_class)

# Predictions: hard labels plus positive-class probabilities (for ROC AUC)
y_pred_xgb = xgb_clf.predict(X_test)
y_prob_xgb = xgb_clf.predict_proba(X_test)[:, 1]

# Evaluation
print("\n--- XGBoost Classifier Results ---")
print(f"Accuracy:  {accuracy_score(y_test_class, y_pred_xgb):.4f}")
print(f"Precision: {precision_score(y_test_class, y_pred_xgb):.4f}")
print(f"Recall:    {recall_score(y_test_class, y_pred_xgb):.4f}")
print(f"F1 Score:  {f1_score(y_test_class, y_pred_xgb):.4f}")
print(f"ROC AUC:   {roc_auc_score(y_test_class, y_prob_xgb):.4f}")

print("\nClassification Report:")
print(classification_report(y_test_class, y_pred_xgb, target_names=['Not Profitable', 'Profitable']))


In [None]:
# XGBoost Regressor: predicts the continuous profit (bps) from the same
# unscaled features the classifier uses.
print("Training XGBoost Regressor...")

reg_params = dict(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
xgb_reg = xgb.XGBRegressor(**reg_params)
xgb_reg.fit(X_train, y_train_reg)

# Held-out predictions and their errors.
y_pred_reg = xgb_reg.predict(X_test)
abs_errors = np.abs(y_test_reg - y_pred_reg)

# RMSE penalises large misses more heavily; MAE is the typical miss size.
mse = mean_squared_error(y_test_reg, y_pred_reg)
rmse = np.sqrt(mse)
mae = np.mean(abs_errors)

print("\n--- XGBoost Regressor Results ---")
print(f"RMSE: {rmse:.4f} bps")
print(f"MAE:  {mae:.4f} bps")


In [None]:
# Feature importance of the XGBoost classifier, largest first
importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': xgb_clf.feature_importances_
}).sort_values('importance', ascending=False)

# savefig does not create missing directories, so make sure the export
# folder exists; otherwise this cell fails on a fresh checkout.
exports_dir = Path('../exports')
exports_dir.mkdir(parents=True, exist_ok=True)

plt.figure(figsize=(10, 6))
plt.barh(importance['feature'], importance['importance'])
plt.xlabel('Feature Importance')
plt.title('XGBoost Feature Importance')
plt.gca().invert_yaxis()  # most important feature at the top
plt.tight_layout()
plt.savefig(exports_dir / 'xgboost_feature_importance.png', dpi=150)
plt.show()

print("\nFeature Importance:")
print(importance.to_string(index=False))


## 3. Neural Network Model

A feedforward neural network for arbitrage prediction.
Uses PyTorch if available, otherwise sklearn MLPClassifier.


In [None]:
if TORCH_AVAILABLE:
    # PyTorch Neural Network
    class ArbitrageNet(nn.Module):
        """Feed-forward binary classifier with a configurable hidden stack.

        Each hidden layer is Linear -> ReLU -> BatchNorm1d -> Dropout(0.2);
        the head is a single Linear unit followed by a sigmoid, so the output
        is a value in [0, 1] with shape (batch, 1).

        Args:
            input_size: Number of input features.
            hidden_sizes: Width of each hidden layer, in order. Any iterable
                of ints works; the default is an immutable tuple so that it
                cannot be shared and mutated across instances.
        """

        def __init__(self, input_size, hidden_sizes=(64, 32, 16)):
            super().__init__()

            layers = []
            prev_size = input_size

            for hidden_size in hidden_sizes:
                layers.append(nn.Linear(prev_size, hidden_size))
                layers.append(nn.ReLU())
                layers.append(nn.BatchNorm1d(hidden_size))
                layers.append(nn.Dropout(0.2))
                prev_size = hidden_size

            # Output head: one unit squashed to a probability.
            layers.append(nn.Linear(prev_size, 1))
            layers.append(nn.Sigmoid())

            self.network = nn.Sequential(*layers)

        def forward(self, x):
            return self.network(x)

    # Prepare PyTorch data: float32 tensors of the *scaled* features; labels
    # get a trailing axis so they match the (batch, 1) model output.
    X_train_tensor = torch.FloatTensor(X_train_scaled)
    y_train_tensor = torch.FloatTensor(y_train_class).unsqueeze(1)
    X_test_tensor = torch.FloatTensor(X_test_scaled)
    y_test_tensor = torch.FloatTensor(y_test_class).unsqueeze(1)

    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)

    # Initialize model; plain BCE is correct because the net ends in a sigmoid.
    model = ArbitrageNet(input_size=X_train.shape[1])
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    print("Neural Network Architecture:")
    print(model)
else:
    print("Using sklearn MLPClassifier instead of PyTorch")

In [None]:
# Train the neural network (PyTorch if available, otherwise sklearn MLP) and
# report the same five metrics for whichever backend ran. Both branches only
# produce y_pred_nn / y_prob_nn; the reporting is shared below.
if TORCH_AVAILABLE:
    print("Training PyTorch Neural Network...")

    epochs = 50
    train_losses = []  # mean batch loss per epoch

    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        avg_loss = epoch_loss / len(train_loader)
        train_losses.append(avg_loss)

        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}")

    # Evaluate: eval mode disables dropout and uses BatchNorm running stats.
    model.eval()
    with torch.no_grad():
        y_prob_nn = model(X_test_tensor).numpy().flatten()
        y_pred_nn = (y_prob_nn > 0.5).astype(int)

    nn_name = "PyTorch Neural Network"

else:
    # Use sklearn MLP with the same hidden layout as the PyTorch network.
    print("Training sklearn MLP Classifier...")

    mlp_clf = MLPClassifier(
        hidden_layer_sizes=(64, 32, 16),
        activation='relu',
        solver='adam',
        max_iter=200,
        random_state=42,
        early_stopping=True,
        validation_fraction=0.1
    )

    mlp_clf.fit(X_train_scaled, y_train_class)

    y_pred_nn = mlp_clf.predict(X_test_scaled)
    y_prob_nn = mlp_clf.predict_proba(X_test_scaled)[:, 1]

    nn_name = "sklearn MLP Classifier"

# Shared report (previously duplicated verbatim in both branches).
print(f"\n--- {nn_name} Results ---")
print(f"Accuracy:  {accuracy_score(y_test_class, y_pred_nn):.4f}")
print(f"Precision: {precision_score(y_test_class, y_pred_nn):.4f}")
print(f"Recall:    {recall_score(y_test_class, y_pred_nn):.4f}")
print(f"F1 Score:  {f1_score(y_test_class, y_pred_nn):.4f}")
print(f"ROC AUC:   {roc_auc_score(y_test_class, y_prob_nn):.4f}")


## 4. Model Comparison & Visualization


In [None]:
# Compare models: two confusion matrices plus a grouped bar chart of metrics.


def _classification_scores(y_true, y_pred, y_prob):
    """Return [accuracy, precision, recall, F1, ROC AUC], matching `metrics` below."""
    return [
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
        roc_auc_score(y_true, y_prob),
    ]


fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Confusion matrices
cm_xgb = confusion_matrix(y_test_class, y_pred_xgb)
cm_nn = confusion_matrix(y_test_class, y_pred_nn)

sns.heatmap(cm_xgb, annot=True, fmt='d', cmap='Blues', ax=axes[0])
axes[0].set_title('XGBoost Confusion Matrix')
axes[0].set_xlabel('Predicted')
axes[0].set_ylabel('Actual')

sns.heatmap(cm_nn, annot=True, fmt='d', cmap='Greens', ax=axes[1])
axes[1].set_title('Neural Network Confusion Matrix')
axes[1].set_xlabel('Predicted')
axes[1].set_ylabel('Actual')

# Model comparison
metrics = ['Accuracy', 'Precision', 'Recall', 'F1', 'ROC AUC']
xgb_scores = _classification_scores(y_test_class, y_pred_xgb, y_prob_xgb)
nn_scores = _classification_scores(y_test_class, y_pred_nn, y_prob_nn)

x = np.arange(len(metrics))
width = 0.35  # bar width; the two bars of a group sit either side of the tick

axes[2].bar(x - width/2, xgb_scores, width, label='XGBoost', color='steelblue')
axes[2].bar(x + width/2, nn_scores, width, label='Neural Net', color='forestgreen')
axes[2].set_ylabel('Score')
axes[2].set_title('Model Comparison')
axes[2].set_xticks(x)
axes[2].set_xticklabels(metrics, rotation=45)
axes[2].legend()
axes[2].set_ylim(0, 1)

plt.tight_layout()

# savefig does not create missing directories, so make sure the export
# folder exists; otherwise this cell fails on a fresh checkout.
exports_dir = Path('../exports')
exports_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(exports_dir / 'model_comparison.png', dpi=150)
plt.show()


## 5. Save Models

Save trained models for deployment:
- XGBoost: .joblib format
- Neural Network: .pth format (PyTorch) or .joblib (sklearn)
- Scaler: for preprocessing new data


In [None]:
# Destination directory for every persisted artifact
models_dir = Path('../models')
models_dir.mkdir(exist_ok=True)

# XGBoost models: write both files first, then report both paths
print("Saving XGBoost models...")
xgb_artifacts = (
    (xgb_clf, 'xgboost_classifier.joblib'),
    (xgb_reg, 'xgboost_regressor.joblib'),
)
for estimator, filename in xgb_artifacts:
    joblib.dump(estimator, models_dir / filename)
for _, filename in xgb_artifacts:
    print(f"  Saved: {models_dir / filename}")

# Fitted scaler, needed to preprocess inputs for the neural network
scaler_path = models_dir / 'scaler.joblib'
joblib.dump(scaler, scaler_path)
print(f"  Saved: {scaler_path}")

# Neural network: PyTorch checkpoint if torch was used, sklearn MLP otherwise
if not TORCH_AVAILABLE:
    nn_path = models_dir / 'mlp_classifier.joblib'
    joblib.dump(mlp_clf, nn_path)
else:
    # state_dict plus the constructor arguments needed to rebuild the network
    nn_path = models_dir / 'neural_net.pth'
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'input_size': X_train.shape[1],
        'hidden_sizes': [64, 32, 16],
    }
    torch.save(checkpoint, nn_path)
print(f"  Saved: {nn_path}")

# Feature order, one name per line, for reference at inference time
cols_path = models_dir / 'feature_cols.txt'
cols_path.write_text('\n'.join(feature_cols))
print(f"  Saved: {cols_path}")

print("\nAll models saved successfully!")


## 6. Model Loading & Inference Example

Demonstrate how to load and use the saved models.


In [None]:
# Example: Load and use models for inference
print("Loading models for inference...")

# Load scaler (only the neural network consumes scaled features)
loaded_scaler = joblib.load(models_dir / 'scaler.joblib')

# Load XGBoost classifier
loaded_xgb = joblib.load(models_dir / 'xgboost_classifier.joblib')

# Example new data point (simulated arbitrage opportunity).
# Values must follow the order of `feature_cols`.
new_data = np.array([[
    15.0,   # spread_ab (bps)
    -5.0,   # spread_ac (bps)
    20.0,   # spread_bc (bps)
    45000,  # volume_a
    38000,  # volume_b
    32000,  # volume_c
    0.015,  # volatility
    14,     # hour_of_day
    2       # day_of_week (0-6)
]])

# Preprocess: scaled copy for models trained on scaled inputs (the neural net)
new_data_scaled = loaded_scaler.transform(new_data)

# Predict. The XGBoost classifier was fitted on the raw, unscaled feature
# matrix, so it must be given raw features here too. Feeding it the
# standardised vector (as this cell used to) puts every value on a different
# scale from the tree split thresholds and yields meaningless predictions.
prediction = loaded_xgb.predict(new_data)[0]
probability = loaded_xgb.predict_proba(new_data)[0, 1]

print("\n--- Inference Example ---")
print(f"Input features: {new_data[0]}")
print(f"Prediction: {'Profitable' if prediction == 1 else 'Not Profitable'}")
print(f"Probability: {probability:.4f}")


## Summary

Models trained:
1. **XGBoost Classifier** - Best for fast inference, handles feature interactions well
2. **XGBoost Regressor** - Predicts actual profit in basis points
3. **Neural Network** - Can capture complex non-linear patterns

Files saved in `../models/`:
- `xgboost_classifier.joblib` - Classification model
- `xgboost_regressor.joblib` - Regression model
- `neural_net.pth` or `mlp_classifier.joblib` - Neural network
- `scaler.joblib` - Feature scaler
- `feature_cols.txt` - List of feature names
