In [21]:
import sqlite3
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset

def create_dataloader(db_file, num_samples):
    """Build a shuffled DataLoader over a random sample of joined match rows.

    The matches, players, match_players, match_player_actions and openings
    tables are read from the SQLite database, inner-joined on their key
    columns, and a random subset of the joined rows is converted to tensors.

    Args:
        db_file: Path to the SQLite database file.
        num_samples: Number of joined rows to draw at random. If fewer rows
            are available, all of them are used.

    Returns:
        A DataLoader (batch size 32, shuffled) over a TensorDataset of
        (float32 features, int64 labels), where the label is the 'victory'
        column and the features are every other column.

    Raises:
        ValueError: Propagated from pandas if a column cannot be parsed as
            numeric.
    """
    conn = sqlite3.connect(db_file)
    try:
        # Read in the relevant tables as dataframes
        matches_df = pd.read_sql_query('SELECT * FROM matches', conn)
        players_df = pd.read_sql_query('SELECT * FROM players', conn)
        match_players_df = pd.read_sql_query('SELECT * FROM match_players', conn)
        match_player_actions_df = pd.read_sql_query('SELECT * FROM match_player_actions', conn)
        # NOTE(review): the original merged an undefined `openings_df`; the
        # table name 'openings' is assumed from the naming pattern of the
        # other tables -- confirm against the database schema.
        openings_df = pd.read_sql_query('SELECT * FROM openings', conn)
    finally:
        # Release the database handle even if one of the queries fails.
        conn.close()

    # Merge the dataframes on the foreign keys
    merged_df = pd.merge(matches_df, match_players_df, on='match_id')
    merged_df = pd.merge(merged_df, players_df, on='player_id')
    merged_df = pd.merge(merged_df, match_player_actions_df, on='match_player_id')
    merged_df = pd.merge(merged_df, openings_df, on='opening_id')

    # Sample whole rows *before* splitting off the label so that features and
    # labels stay aligned and have the same length (TensorDataset requires it).
    sample_size = min(num_samples, len(merged_df))
    sampled_df = merged_df.sample(n=sample_size)

    # Extract the label column and convert it to numerical values
    labels = pd.to_numeric(sampled_df['victory'], downcast='integer')

    # Extract the feature columns and convert them to numerical values
    features = sampled_df.drop(columns=['victory'])
    features = features.apply(pd.to_numeric, downcast='float')

    # Convert the data and labels to PyTorch tensors
    data = torch.tensor(features.values, dtype=torch.float32)
    labels = torch.tensor(labels.values, dtype=torch.long)

    # Create a PyTorch dataset and dataloader
    dataset = TensorDataset(data, labels)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

    return dataloader

In [22]:
# Build the training and validation loaders from the same database file.
# NOTE(review): each call draws its own random sample, so the two loaders
# may contain overlapping rows -- confirm this is intended.
train_loader = create_dataloader(db_file='bigdata.db', num_samples=5000)
val_loader = create_dataloader(db_file='bigdata.db', num_samples=5000)

MemoryError: 

In [None]:
#Preprocess and normalize the data

import numpy as np
from sklearn.preprocessing import StandardScaler

def preprocess_data(train_loader, val_loader):
    """Standardize the features of both loaders using training-set statistics.

    The per-column mean and population standard deviation are computed from
    the training features only and then applied to both the training and the
    validation features, so no validation information leaks into the scaling.

    Args:
        train_loader: DataLoader over a TensorDataset of (features, labels),
            as built by create_dataloader.
        val_loader: DataLoader over a TensorDataset of (features, labels).

    Returns:
        A (train_loader, val_loader) tuple of new DataLoaders over normalized
        copies of the data. Batch size and shuffling match the input loaders;
        the input loaders and their tensors are left unmodified.

    Raises:
        ValueError: If the training dataset contains no samples.
    """
    train_features = train_loader.dataset.tensors[0]
    if train_features.shape[0] == 0:
        raise ValueError("training dataset is empty; cannot compute normalization statistics")

    # Accumulate the statistics in float64 to limit rounding error.
    train_features64 = train_features.double()
    mean = train_features64.mean(dim=0)
    std = train_features64.std(dim=0, unbiased=False)
    # A constant column has zero spread; divide by 1 instead so it maps to 0
    # rather than NaN/inf.
    std = torch.where(std > 0, std, torch.ones_like(std))

    normalized_loaders = []
    for loader in (train_loader, val_loader):
        features, labels = loader.dataset.tensors
        scaled = ((features.double() - mean) / std).to(features.dtype)
        # Keep the shuffling behaviour of the loader being replaced.
        reshuffles = isinstance(loader.sampler, torch.utils.data.RandomSampler)
        normalized_loaders.append(
            DataLoader(
                TensorDataset(scaled, labels),
                batch_size=loader.batch_size,
                shuffle=reshuffles,
            )
        )

    return tuple(normalized_loaders)
# Rebind both loader names to whatever preprocess_data returns, then report completion.
train_loader, val_loader = preprocess_data(
    train_loader=train_loader,
    val_loader=val_loader,
)
print("All done with preprocessing and normalizing the data!")

In [None]:
#Use CART model from sklearn

from sklearn.tree import DecisionTreeClassifier

# Create a CART model
model = DecisionTreeClassifier()

# DecisionTreeClassifier.fit() builds a new tree on every call, so fitting
# once per mini-batch would keep only the last batch. Gather every batch from
# the training DataLoader and fit a single time on the full training set.
train_features = []
train_labels = []
for x, y in train_loader:
    train_features.append(x)
    train_labels.append(y)
model.fit(torch.cat(train_features).numpy(), torch.cat(train_labels).numpy())

# Evaluate the model on the validation DataLoader
correct = 0
total = 0
for x, y in val_loader:
    y_pred = model.predict(x.numpy())
    correct += int((y_pred == y.numpy()).sum())
    total += y.shape[0]
# An empty validation loader yields NaN instead of a ZeroDivisionError.
accuracy = correct / total if total else float('nan')
print(f"Validation accuracy: {accuracy:.3f}")