In [None]:
# Kaggle-compatible Google Drive integration
# Quietly install PyDrive2 so the pydrive2 imports in this cell can resolve.
!pip install -q pydrive2

import os
import json
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from oauth2client.client import GoogleCredentials

# Note: This is a simplified version for Kaggle environment
# Full Google Drive integration requires authentication
# which is handled differently in Kaggle vs Colab

# Announce that Drive access is stubbed out and list the two alternatives.
# The lines are emitted in one call; the printed text is unchanged.
print("\n".join([
    "Google Drive integration is disabled in Kaggle environment",
    "To use Google Drive with this notebook:",
    "1. Run this notebook in Google Colab instead",
    "2. Or use Kaggle Datasets for data storage",
]))

# Create placeholder functions to avoid errors
def setup_pydrive():
    """Placeholder for Drive authentication.

    Prints a notice that Drive setup is disabled.

    Returns:
        Always ``None``; no drive client is created.
    """
    notice = "Drive setup is disabled in Kaggle environment"
    print(notice)
    return None

def create_folder_if_not_exists(drive, folder_name):
    """Placeholder for creating a Drive folder.

    Args:
        drive: Unused by this placeholder.
        folder_name: Name echoed in the printed notice.

    Returns:
        The fixed string ``"placeholder-folder-id"``.
    """
    placeholder_id = "placeholder-folder-id"
    print(f"Would create folder: {folder_name} (disabled in Kaggle)")
    return placeholder_id

def save_notebook_to_drive(drive, folder_id, file_name):
    """Placeholder for uploading a notebook to Drive.

    Args:
        drive: Unused by this placeholder.
        folder_id: Unused by this placeholder.
        file_name: Name echoed in the printed notice.

    Returns:
        The fixed string ``"placeholder-file-id"``.
    """
    placeholder_id = "placeholder-file-id"
    print(f"Would save file: {file_name} (disabled in Kaggle)")
    return placeholder_id

# Use Kaggle's built-in output directory for saving files
# The directory is created if missing; exist_ok avoids an error when it is already there.
output_dir = "/kaggle/working"
os.makedirs(output_dir, exist_ok=True)
print(f"Files will be saved to: {output_dir}")

# Run the Drive workflow end to end using the placeholder helpers defined in
# this cell. Any exception is reported with a message instead of stopping the
# notebook.
try:
    # Setup PyDrive (placeholder in Kaggle)
    drive = setup_pydrive()
    
    # Create Numer_crypto folder if it doesn't exist (placeholder)
    folder_id = create_folder_if_not_exists(drive, "Numer_crypto")
    
    # Save the current notebook to the folder (placeholder)
    notebook_name = "numerai_sparkling_water_kaggle.ipynb"
    file_id = save_notebook_to_drive(drive, folder_id, notebook_name)
    
except Exception as e:
    print(f"Error with PyDrive: {e}")


In [None]:
# Financial Modeling Prep API Integration
# Quietly install the HTTP client and DataFrame library imported below.
!pip install -q requests pandas

import requests
import pandas as pd

# Financial Modeling Prep API key
# Read from the environment so the secret is not stored in the notebook:
# export FMP_API_KEY (for example from a Kaggle secret) before running.
# When the variable is unset the key is an empty string, so requests are sent
# without a usable key.
# NOTE(review): any key that was committed in this notebook's history should
# be treated as exposed and rotated.
import os

FMP_API_KEY = os.environ.get("FMP_API_KEY", "")

def get_economic_indicators(timeout=30):
    """Get economic indicators from Financial Modeling Prep API.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A DataFrame built from the JSON payload when the response status is
        200. An empty DataFrame (after printing a message) when the status is
        not 200, or when the payload cannot be decoded or turned into a frame.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the request exceeds ``timeout``.
    """
    url = f"https://financialmodelingprep.com/api/v3/economic/country_indicator?apikey={FMP_API_KEY}"
    # Without a timeout requests may wait indefinitely and hang the cell.
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Error fetching economic indicators: {response.status_code}")
        return pd.DataFrame()

    try:
        return pd.DataFrame(response.json())
    except ValueError as exc:
        # ValueError covers both a body that is not valid JSON and a payload
        # pandas cannot frame (for example a dict holding only scalars).
        print(f"Error fetching economic indicators: {exc}")
        return pd.DataFrame()

def get_country_currency_mapping(timeout=30):
    """Build a mapping from the FX pair list of the Financial Modeling Prep API.

    Every payload entry whose ``name`` has the form ``"AAA/BBB"`` contributes
    ``{"AAA": "BBB"}`` (both halves stripped of whitespace). When several
    entries share the same left-hand side, the last one in the payload wins.

    NOTE(review): despite the function name, the keys and values are the two
    halves of a pair name rather than country names -- confirm this is what
    callers expect.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The mapping described above, or an empty dict (after printing a
        message) when the status is not 200 or the payload is not a list.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the request exceeds ``timeout``.
        ValueError: If the response body is not valid JSON.
    """
    url = f"https://financialmodelingprep.com/api/v3/fx?apikey={FMP_API_KEY}"
    # Without a timeout requests may wait indefinitely and hang the cell.
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Error fetching currency mapping: {response.status_code}")
        return {}

    data = response.json()
    if not isinstance(data, list):
        print("Error fetching currency mapping: unexpected response payload")
        return {}

    currency_map = {}
    for item in data:
        # Skip entries that are not records with a textual pair name.
        name = item.get("name") if isinstance(item, dict) else None
        if not isinstance(name, str):
            continue
        parts = name.split("/")
        if len(parts) == 2:
            currency_map[parts[0].strip()] = parts[1].strip()

    return currency_map

def get_crypto_country_associations(cryptocurrencies):
    """Associate cryptocurrencies with entries of the FX pair mapping.

    For every ``key -> value`` pair returned by
    ``get_country_currency_mapping()``, the value is looked up in a fixed,
    illustrative affinity table; each requested cryptocurrency listed there
    yields one output row.

    The affinity table is sample data for demonstration, not the result of any
    analysis.

    Args:
        cryptocurrencies: Iterable of cryptocurrency names to consider.

    Returns:
        A DataFrame with columns ``country``, ``cryptocurrency`` and
        ``currency_code``; it has no rows when nothing matches.
    """
    # Only the FX mapping is needed; the previously fetched economic
    # indicators were never used and cost an extra HTTP request.
    currency_map = get_country_currency_mapping()

    # For demonstration, we'll create some sample associations
    # In a real implementation, this would use more sophisticated analysis
    country_crypto_affinity = {
        "USD": ["Bitcoin", "Ethereum", "Ripple"],
        "EUR": ["Ethereum", "Cardano"],
        "JPY": ["Ripple", "Cardano"],
        "GBP": ["Bitcoin", "Ethereum"],
        "CHF": ["Bitcoin", "Solana"],
        "CAD": ["Ethereum", "Cardano"],
        "AUD": ["Ripple", "Solana"],
        "CNY": ["Bitcoin", "Ethereum"]
    }

    mapping_data = []
    for country, currency_code in currency_map.items():
        # The affinity lookup does not depend on the inner loop, so do it once.
        affine_cryptos = country_crypto_affinity.get(currency_code)
        if not affine_cryptos:
            continue
        for crypto in cryptocurrencies:
            if crypto in affine_cryptos:
                mapping_data.append({
                    "country": country,
                    "cryptocurrency": crypto,
                    "currency_code": currency_code
                })

    # Naming the columns keeps the schema stable even when there are no rows.
    return pd.DataFrame(
        mapping_data, columns=["country", "cryptocurrency", "currency_code"]
    )

# Smoke-test the Financial Modeling Prep helpers; any failure is reported
# with a message instead of stopping the notebook.
try:
    print("Testing Financial Modeling Prep API integration...")
    economic_indicators = get_economic_indicators()
    if economic_indicators.empty:
        print("No economic indicators retrieved")
    else:
        print(f"Successfully retrieved {len(economic_indicators)} economic indicators")
        print(economic_indicators.head(3))

    # Build the cryptocurrency associations for a fixed list of coins
    cryptocurrencies = ["Bitcoin", "Ethereum", "Ripple", "Cardano", "Solana"]
    crypto_country_data = get_crypto_country_associations(cryptocurrencies)
    if crypto_country_data.empty:
        print("No cryptocurrency-country associations created")
    else:
        print(f"\nSuccessfully created {len(crypto_country_data)} cryptocurrency-country associations")
        print(crypto_country_data.head(3))

        # Persist the associations as CSV in the Kaggle working directory
        output_path = "/kaggle/working/crypto_country_associations.csv"
        crypto_country_data.to_csv(output_path, index=False)
        print(f"Saved associations to {output_path}")
except Exception as exc:
    print(f"Error testing Financial Modeling Prep API: {exc}")


# Numerai Crypto Competitie Voorspellingsmodel met H2O Sparkling Water

Dit notebook implementeert een voorspellingsmodel voor de Numerai/Numerai Crypto competitie met behulp van H2O Sparkling Water, dat H2O integreert met Apache Spark voor gedistribueerde verwerking.

## Installatie van benodigde packages

Eerst moeten we Java, Spark en H2O Sparkling Water installeren. Dit kan enige tijd duren.

In [None]:
# Install Java (required by H2O and Spark)
!apt-get update -qq
!apt-get install -y default-jre > /dev/null
!java -version

# Install Spark and PySpark
!pip install -q pyspark==3.1.2

# Install H2O Sparkling Water
# NOTE(review): the package version is not pinned here -- confirm the resolved
# release is compatible with the pinned pyspark above.
!pip install -q h2o-pysparkling-3.1

# Install the other required packages
# NOTE(review): h2o is installed unpinned next to the Sparkling Water package;
# confirm the two do not conflict.
!pip install -q numerapi pandas h2o cloudpickle==2.2.1 pyarrow scikit-learn scipy==1.10.1 matplotlib

## Importeren van benodigde libraries

In [None]:
from numerapi import NumerAPI
import pandas as pd
import json
import os
import numpy as np
import time
import matplotlib.pyplot as plt

# Spark imports
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col

# H2O Sparkling Water imports
from pysparkling import H2OContext
# Original import that causes error: from pysparkling.ml import H2OXGBoostEstimator
# Using the correct import for H2OXGBoostEstimator
from h2o.estimators.xgboost import H2OXGBoostEstimator

import h2o
import cloudpickle

## Initialiseren van Spark en H2O Sparkling Water

In [None]:
# Build (or reuse) the Spark session: 4g for driver and executors, two cores
# per executor, and the G1 garbage collector on both JVMs.
spark = (
    SparkSession.builder
    .appName("NumeraiSparklingWater")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.cores", "2")
    .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC")
    .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
    .getOrCreate()
)

# Create (or reuse) the H2O Sparkling Water context
h2o_context = H2OContext.getOrCreate()

# Report the Spark, H2O and Sparkling Water versions
# NOTE(review): confirm the installed h2o release exposes a callable version().
print(f"Spark version: {spark.version}")
print(f"H2O cluster version: {h2o.version()}")
print(f"Sparkling Water version: {h2o_context.getSparklingWaterVersion()}")

## Initialiseren van de Numerai API

In [None]:
# Initialise the Numerai API client
# API keys are required for submitting predictions
# napi = NumerAPI(public_id="UW_PUBLIC_ID", secret_key="UW_SECRET_KEY")
# No credentials are passed here; this notebook only calls download_dataset on the client.
napi = NumerAPI()

## Data downloaden en laden

In [None]:
# Use one of the newest data versions
DATA_VERSION = "v5.0"

# Create the data directory (pure Python instead of a shell call, so a
# failure raises instead of passing silently)
os.makedirs(DATA_VERSION, exist_ok=True)

# Download data
print("Downloading training data...")
napi.download_dataset(f"{DATA_VERSION}/train.parquet")
napi.download_dataset(f"{DATA_VERSION}/features.json")

# Load feature metadata; the context manager closes the file handle, which a
# bare json.load(open(...)) would leave open.
with open(f"{DATA_VERSION}/features.json") as metadata_file:
    feature_metadata = json.load(metadata_file)
print("Available feature sets:", list(feature_metadata["feature_sets"].keys()))
features = feature_metadata["feature_sets"]["small"]  # use "small" for faster testing, "medium" or "all" for better performance

## Data laden met PySpark

In [None]:
# Load the training data with Spark
print("Loading training data with Spark...")
train_spark = spark.read.parquet(f"{DATA_VERSION}/train.parquet")

# Keep only the required columns
columns_to_select = ["era"] + features + ["target"]
train_spark = train_spark.select(*columns_to_select)

# Downsample for speed (optional)
print("Preparing data for training...")
# Fetch the unique eras and sample 25% of them (every 4th era).
# distinct().collect() returns rows in no guaranteed order, so the labels are
# sorted first; otherwise the slice picks an arbitrary, run-dependent subset.
# Assumes era labels sort chronologically as plain values (e.g. zero-padded
# strings) -- TODO confirm.
unique_eras = sorted(row.era for row in train_spark.select("era").distinct().collect())
sampled_eras = unique_eras[::4]
train_spark = train_spark.filter(col("era").isin(sampled_eras))

# Inspect the data
print(f"Training data count: {train_spark.count()}")
print(f"Number of features: {len(features)}")
print(f"Number of eras: {len(sampled_eras)}")

# Show the schema
train_spark.printSchema()

## Data voorbereiden met PySpark

In [None]:
# Prepare the data with a Spark ML feature transformer
print("Preparing feature vector with Spark...")

# Assemble all feature columns into a single vector column named "features"
# NOTE(review): re-running this cell transforms an already-transformed frame;
# presumably Spark rejects the duplicate "features" output column -- confirm.
assembler = VectorAssembler(inputCols=features, outputCol="features")
train_spark = assembler.transform(train_spark)

# Show a sample of the transformed data
train_spark.select("era", "features", "target").show(5, truncate=True)

## Converteren van Spark DataFrame naar H2O Frame

In [None]:
# Convert the Spark DataFrame to an H2O Frame
print("Converting Spark DataFrame to H2O Frame...")
train_h2o = h2o_context.asH2OFrame(train_spark)

# Print a summary of the H2O Frame
train_h2o.describe()

## Model trainen met H2O XGBoost via Sparkling Water

In [None]:
# Train the H2O XGBoost model and time the whole cell
print("Training H2O XGBoost model via Sparkling Water...")
start_time = time.time()

# Estimator class for the model configured below
from h2o.estimators.xgboost import H2OXGBoostEstimator

# Hyperparameters collected in one place, then unpacked into the estimator
xgb_params = dict(
    ntrees=2000,
    max_depth=5,
    learn_rate=0.01,
    sample_rate=0.8,
    col_sample_rate=0.8,
    # Original note: "auto" selects the GPU when available.
    # NOTE(review): not verifiable from this notebook -- confirm against the
    # H2O XGBoost parameter documentation.
    tree_method="auto",
    booster="gbtree",
    seed=42,
)
xgb_model = H2OXGBoostEstimator(**xgb_params)

# Fit on the feature columns with "target" as the response
xgb_model.train(training_frame=train_h2o, x=features, y="target")

training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f} seconds")

# Show model information
print(xgb_model)

## Feature importance visualiseren

In [None]:
# Plot the first 20 rows of the variable-importance table as horizontal bars
feature_importance = xgb_model.varimp(use_pandas=True)
if feature_importance is not None:
    top_features = feature_importance[:20]
    positions = range(len(top_features))

    plt.figure(figsize=(10, 8))
    plt.barh(positions, top_features['relative_importance'])
    plt.yticks(positions, top_features['variable'])
    plt.title('H2O XGBoost Feature Importance (top 20)')
    plt.xlabel('Relative Importance')
    plt.tight_layout()
    plt.show()

## Model opslaan als MOJO

In [None]:
# Save the model as a MOJO (Model Object, Optimized) into the current
# directory, also requesting the genmodel jar; the returned value is kept as
# the MOJO path and reused when copying files to the Kaggle output directory.
mojo_path = xgb_model.download_mojo(path="./", get_genmodel_jar=True)
print(f"Model saved as MOJO: {mojo_path}")

## Validatiedata laden en voorbereiden met PySpark

In [None]:
# Download the validation data for testing
print("Downloading validation data for testing...")
napi.download_dataset(f"{DATA_VERSION}/validation.parquet")

# Load with Spark, keep only the needed columns, restrict to rows whose
# data_type is "validation", and cap at 1000 rows for memory efficiency
print("Loading validation data with Spark...")
columns_to_select = ["era", "data_type"] + features
validation_spark = (
    spark.read.parquet(f"{DATA_VERSION}/validation.parquet")
    .select(*columns_to_select)
    .filter(col("data_type") == "validation")
    .limit(1000)
)

# Assemble the feature vector with the assembler built for the training data
validation_spark = assembler.transform(validation_spark)

# Convert the Spark DataFrame to an H2O Frame
validation_h2o = h2o_context.asH2OFrame(validation_spark)

## Voorspellingen maken met het model

In [None]:
# Make predictions with the model
print("Making predictions...")
predictions_h2o = xgb_model.predict(validation_h2o)

# Convert the H2O Frame back to a Spark DataFrame
predictions_spark = h2o_context.asSparkFrame(predictions_h2o)

# Show predictions
print("Sample predictions:")
predictions_spark.show(5)

## Voorspellingsfunctie definiëren

In [None]:
# Prediction function backed by the trained H2O model
def predict(
    live_features: pd.DataFrame,
    live_benchmark_models: pd.DataFrame
) -> pd.DataFrame:
    """Score live rows with the trained H2O XGBoost model.

    The columns listed in the module-level ``features`` are taken from
    ``live_features``, sent through Spark and the shared ``assembler``,
    converted to an H2O frame and scored with ``xgb_model``.

    Args:
        live_features: Frame containing at least the ``features`` columns.
        live_benchmark_models: Accepted but not used by this function.

    Returns:
        A single-column frame named ``prediction`` carrying the index of
        ``live_features``. Values are attached by position; this assumes row
        order survives the Spark/H2O round trip -- TODO confirm.
    """
    # pandas -> Spark, then add the assembled feature vector column
    spark_frame = assembler.transform(spark.createDataFrame(live_features[features]))

    # Spark -> H2O, then score
    h2o_frame = h2o_context.asH2OFrame(spark_frame)
    scored = xgb_model.predict(h2o_frame)

    # H2O -> plain values from the "predict" column
    predicted_values = h2o.as_list(scored)["predict"].values

    # Re-attach the caller's index and name the output column
    return pd.Series(predicted_values, index=live_features.index).to_frame("prediction")

## Voorspellingsfunctie testen

In [None]:
# Convert the Spark DataFrame back to pandas for testing
validation_pd = validation_spark.toPandas()

# Test the prediction function
print("Testing prediction function...")
# Create an empty DataFrame for benchmark_models (not used by our prediction function)
empty_benchmark = pd.DataFrame(index=validation_pd.index)
predictions = predict(validation_pd, empty_benchmark)

print(f"Predictions shape: {predictions.shape}")
print("\nSample predictions:")
print(predictions.head())

## Voorspellingsfunctie opslaan met cloudpickle

In [None]:
# Pickle the prediction function
# NOTE(review): predict reads the module-level spark, assembler, h2o_context
# and xgb_model objects; whether cloudpickle can serialise them is not shown
# here -- confirm the file loads and runs in a fresh process.
print("Saving prediction function with cloudpickle...")
p = cloudpickle.dumps(predict)
with open("numerai_sparkling_water_model.pkl", "wb") as f:
    f.write(p)

print("Prediction function saved as 'numerai_sparkling_water_model.pkl'")

## Kaggle specifieke functies voor het opslaan van resultaten

In [None]:
# Save the results in the Kaggle output directory so they can be downloaded
# or reused as a dataset.
# The copies are done in Python rather than with shell magics: a failing
# shell command does not raise, so the except branch could never run and the
# success message was printed even when nothing was copied.
import os
import shutil

kaggle_output_dir = "/kaggle/working/output"
try:
    # Create the output directory
    os.makedirs(kaggle_output_dir, exist_ok=True)

    # Copy the important files (paths containing spaces are handled too)
    shutil.copy("numerai_sparkling_water_model.pkl", kaggle_output_dir)
    shutil.copy(mojo_path, kaggle_output_dir)

    print("Model bestanden opgeslagen in Kaggle output directory")
except Exception as e:
    print(f"Fout bij opslaan in Kaggle output: {e}")

## Voordelen van Sparkling Water

In [None]:
# A comparison between standard H2O and Sparkling Water could go here.
# The list is emitted in one call; the printed text is unchanged.
print("\n".join([
    "Sparkling Water Voordelen:",
    "1. Gedistribueerde verwerking met Spark voor grote datasets",
    "2. Combinatie van Spark's data processing met H2O's machine learning algoritmes",
    "3. Betere schaalbaarheid voor complexe modellen en grote datasets",
    "4. Mogelijkheid om Spark ML Pipeline te integreren met H2O modellen",
    f"5. Onze training duurde {training_time:.2f} seconden met Sparkling Water",
]))

## Afsluiten van Spark en H2O

In [None]:
# Shut down the H2O cluster
h2o.cluster().shutdown()

# Stop the Spark session
spark.stop()

In [None]:
# Financial Modeling Prep API Integration
import requests
import pandas as pd

# API credentials are read from the environment so no secret is stored in the
# notebook: export FMP_API_KEY and DEEPSEEK_API_KEY (for example from Kaggle
# secrets) before running. Unset variables yield empty strings.
# NOTE(review): any keys that were committed in this notebook's history should
# be treated as exposed and rotated.
import os

FMP_API_KEY = os.environ.get("FMP_API_KEY", "")
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")

# Function to get economic indicators from Financial Modeling Prep
def get_economic_indicators(timeout=30):
    """Fetch economic indicators from Financial Modeling Prep as a DataFrame.

    Note: this redefines a function of the same name from an earlier cell of
    this notebook; it uses a different endpoint and raises on failure instead
    of returning an empty frame.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A DataFrame built from the JSON payload.

    Raises:
        RuntimeError: If the response status is not 200. The message carries
            only the status code, so the request URL (which embeds the API
            key) is not echoed by callers that print the exception.
        requests.exceptions.RequestException: On connection errors or when
            the request exceeds ``timeout``.
        ValueError: If the body is not valid JSON or cannot be framed.
    """
    url = f"https://financialmodelingprep.com/api/v3/economic/economic_indicators?apikey={FMP_API_KEY}"
    # Without a timeout requests may wait indefinitely and hang the cell.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(
            f"Economic indicators request failed with HTTP {response.status_code}"
        )
    data = response.json()
    return pd.DataFrame(data)

# Get country and currency data
def get_country_currency_data(timeout=30):
    """Combine the FMP country list with currency codes seen in FX tickers.

    Currency codes are collected from the two halves of every ``"AAA/BBB"``
    value in the ``ticker`` column of the FX payload. For each country record,
    the first three-letter code (in sorted order) that occurs as a substring
    of the record's ``currency`` text, compared case-insensitively, is used
    as its currency code; the code is an empty string when none matches.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A DataFrame with columns ``country_name``, ``country_code``,
        ``currency_name`` and ``currency_code``, one row per country record.

    Raises:
        RuntimeError: If either response status is not 200. The message
            carries only the status code, not the key-bearing URL.
        requests.exceptions.RequestException: On connection errors or when a
            request exceeds ``timeout``.
        KeyError: If the FX payload has no ``ticker`` column.
    """
    fx_url = f"https://financialmodelingprep.com/api/v3/fx?apikey={FMP_API_KEY}"
    # Without a timeout requests may wait indefinitely and hang the cell.
    fx_response = requests.get(fx_url, timeout=timeout)
    if fx_response.status_code != 200:
        raise RuntimeError(f"FX request failed with HTTP {fx_response.status_code}")
    fx_data = fx_response.json()

    # Get country profiles for ISO codes
    country_url = f"https://financialmodelingprep.com/api/v4/country_list?apikey={FMP_API_KEY}"
    country_response = requests.get(country_url, timeout=timeout)
    if country_response.status_code != 200:
        raise RuntimeError(
            f"Country list request failed with HTTP {country_response.status_code}"
        )
    country_data = country_response.json()

    # Create comprehensive country-currency mapping
    country_df = pd.DataFrame(country_data)
    fx_df = pd.DataFrame(fx_data)

    # Extract currency codes from FX pairs; values that are not text or do
    # not split into exactly two halves are skipped instead of raising.
    currency_codes = set()
    for pair in fx_df["ticker"].values:
        if not isinstance(pair, str):
            continue
        parts = pair.split("/")
        if len(parts) == 2:
            currency_codes.update(parts)

    # Sets iterate in an unspecified order, so the candidates are sorted once
    # up front: the chosen code is then the same on every run.
    candidate_codes = sorted(code for code in currency_codes if len(code) == 3)

    # Create final mapping dataframe
    mapping_data = []
    for country in country_df.to_dict("records"):
        country_name = country.get("name", "")
        country_code = country.get("code", "")
        currency_name = country.get("currency", "")

        # Missing values arrive as NaN after the DataFrame round trip, so only
        # real text is searched.
        searchable_name = currency_name.upper() if isinstance(currency_name, str) else ""
        currency_code = next(
            (code for code in candidate_codes if code.upper() in searchable_name),
            "",
        )

        mapping_data.append({
            "country_name": country_name,
            "country_code": country_code,
            "currency_name": currency_name,
            "currency_code": currency_code
        })

    return pd.DataFrame(mapping_data)

# Get economic indicators; a failure is reported and the notebook continues
try:
    economic_indicators = get_economic_indicators()
    print("Economic Indicators:")
    print(economic_indicators.head())
except Exception as e:
    print(f"Error fetching economic indicators: {e}")

# Get country-currency mapping
try:
    country_currency_mapping = get_country_currency_data()
    # The leading newline is written as an escape; a literal line break inside
    # the quotes is a syntax error.
    print("\nCountry-Currency Mapping:")
    print(country_currency_mapping.head(20))

    # Save the mapping to CSV
    country_currency_mapping.to_csv("country_currency_mapping.csv", index=False)
    print("\nSaved country-currency mapping to CSV file")
except Exception as e:
    print(f"Error creating country-currency mapping: {e}")


In [None]:
# DeepSeek API Integration for Crypto-Country Association
import requests
import json

def _parse_embedded_json(content):
    """Parse a JSON object or array embedded in free text.

    The outermost ``{...}`` span is tried first, then the outermost ``[...]``
    span; the value from the first span that decodes is returned.

    Raises:
        LookupError: If the text contains neither kind of span.
        json.JSONDecodeError: If spans were found but none of them decodes.
    """
    last_error = None
    for opener, closer in (("{", "}"), ("[", "]")):
        start_idx = content.find(opener)
        end_idx = content.rfind(closer)
        if start_idx == -1 or end_idx == -1:
            continue
        try:
            return json.loads(content[start_idx:end_idx + 1])
        except json.JSONDecodeError as exc:
            last_error = exc
    if last_error is None:
        raise LookupError("No JSON found in response")
    raise last_error


def get_crypto_country_associations(cryptocurrencies, timeout=60):
    """Ask the DeepSeek chat API where each cryptocurrency is registered.

    Note: this redefines a function of the same name from an earlier cell of
    this notebook with different behaviour and return type.

    Args:
        cryptocurrencies: Iterable of cryptocurrency names, joined into the
            prompt with ``", "``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The JSON value (object or array) parsed out of the model's reply, or
        a dict with an ``"error"`` key (plus ``"raw_response"`` when a reply
        was received but held no parseable JSON). Failures inside the request
        and parsing step are returned as such a dict rather than raised.
    """
    url = "https://api.deepseek.com/v1/chat/completions"

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {DEEPSEEK_API_KEY}"
    }

    crypto_list = ", ".join(cryptocurrencies)

    data = {
        "model": "deepseek-chat",
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant that provides accurate information about cryptocurrencies."
            },
            {
                "role": "user",
                "content": f"For each of these cryptocurrencies: {crypto_list}, provide the country where they have their entity registered or where they primarily report taxes. Return the data in JSON format with cryptocurrency name, country name, and ISO country code."
            }
        ],
        "temperature": 0.1,
        "max_tokens": 2000
    }

    try:
        # Without a timeout requests may wait indefinitely and hang the cell.
        response = requests.post(url, headers=headers, json=data, timeout=timeout)
        if response.status_code != 200:
            # Surface the HTTP failure instead of a generic "no response".
            return {"error": f"DeepSeek API request failed with HTTP {response.status_code}"}
        response_data = response.json()

        if "choices" in response_data and len(response_data["choices"]) > 0:
            content = response_data["choices"][0]["message"]["content"]

            # Extract JSON from the response; array replies are accepted as
            # well as object replies.
            try:
                return _parse_embedded_json(content)
            except LookupError:
                return {"error": "No JSON found in response", "raw_response": content}
            except json.JSONDecodeError:
                return {"error": "Failed to parse JSON", "raw_response": content}
        else:
            return {"error": "No response from DeepSeek API"}
    except Exception as e:
        return {"error": str(e)}

# Example usage: query the associations for a fixed list of coins, print the
# result and save it as CSV unless an error was reported.
cryptocurrencies = ["Bitcoin", "Ethereum", "Ripple", "Cardano", "Solana"]
try:
    crypto_country_data = get_crypto_country_associations(cryptocurrencies)
    print("Cryptocurrency Country Associations:")
    print(json.dumps(crypto_country_data, indent=2))

    # Convert to DataFrame and save
    if not isinstance(crypto_country_data, dict) or not crypto_country_data.get("error"):
        crypto_df = pd.DataFrame(crypto_country_data)
        crypto_df.to_csv("crypto_country_associations.csv", index=False)
        # The leading newline is written as an escape; a literal line break
        # inside the quotes is a syntax error.
        print("\nSaved cryptocurrency country associations to CSV file")
except Exception as e:
    print(f"Error getting cryptocurrency country associations: {e}")
