In [None]:
# Mixture-of-Experts (MoE) module: several expert MLPs whose outputs are blended by a learned softmax gate.

In [None]:
class MoE(nn.Module):
    """Dense mixture-of-experts: every expert runs on every input and a softmax
    gate blends their outputs.

    Each expert is a two-layer MLP (Linear -> ReLU -> Linear). The gating
    network is a two-layer MLP followed by a softmax over the experts, so the
    gate weights for each input are non-negative and sum to 1.

    Args:
        input_size: Size of the last (feature) dimension of the input.
        num_experts: Number of expert networks; must be at least 1.
        expert_hidden_size: Hidden width of each expert MLP.
        output_size: Size of the last dimension of the output.
        gating_hidden_size: Hidden width of the gating MLP. ``None`` (or any
            other falsy value such as 0) falls back to ``input_size``.

    Raises:
        ValueError: If ``num_experts`` is smaller than 1.
    """

    def __init__(self, input_size, num_experts, expert_hidden_size, output_size, gating_hidden_size=None):
        super().__init__()

        # Fail early: with no experts, forward() would only crash later inside
        # torch.stack on an empty list.
        if num_experts < 1:
            raise ValueError(f"num_experts must be >= 1, got {num_experts}")

        # Expert networks
        self.experts = nn.ModuleList([
            nn.Sequential(
                nn.Linear(input_size, expert_hidden_size),
                nn.ReLU(),
                nn.Linear(expert_hidden_size, output_size)
            )
            for _ in range(num_experts)
        ])

        # Gating network
        gating_hidden_size = gating_hidden_size or input_size
        self.gating_network = nn.Sequential(
            nn.Linear(input_size, gating_hidden_size),
            nn.ReLU(),
            nn.Linear(gating_hidden_size, num_experts),
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        """Return the gate-weighted sum of all expert outputs.

        Args:
            x: Tensor of shape ``[..., input_size]``. Any number of leading
                dimensions is accepted (a single sample, a batch, or a batch
                of sequences).

        Returns:
            Tensor of shape ``[..., output_size]``.
        """
        # Compute gating weights
        gating_weights = self.gating_network(x)  # [..., num_experts]

        # Compute expert outputs
        expert_outputs = torch.stack(
            [expert(x) for expert in self.experts], dim=-1
        )  # [..., output_size, num_experts]

        # Weighted sum of expert outputs. The broadcast axis is inserted
        # relative to the END of the shape (-2) so it always lines up with
        # output_size, regardless of how many leading dimensions x has.
        output = torch.sum(expert_outputs * gating_weights.unsqueeze(-2), dim=-1)  # [..., output_size]

        return output


In [None]:
# Replace the MyModule class with MoE in the pipeline. TODO: the hyperparameter search ranges still need tuning.

In [None]:
def create_pipeline_with_moe(trial):
    """Build a preprocessing + MoE classifier pipeline for one tuning trial.

    Hyperparameters are drawn from ``trial`` in a fixed order (num_experts,
    expert_hidden_size, gating_hidden_size, dropout, max_epochs, lr).

    Relies on names defined elsewhere in the notebook: ``preprocessor``,
    ``to_float32``, ``X_train``, ``classes``, ``class_weights_tensor``,
    ``device``, ``MoE``, ``NeuralNetClassifier`` and ``Pipeline``.

    Side effect: the shared ``preprocessor`` and ``to_float32`` objects are
    fitted in place on ``X_train`` in order to measure the width of the
    preprocessed feature matrix.

    Args:
        trial: Optuna-style trial object exposing ``suggest_int``,
            ``suggest_float`` and ``suggest_categorical``.

    Returns:
        An unfitted-classifier ``Pipeline`` with steps ``'preprocessor'``,
        ``'to_float32'`` and ``'nn'``.
    """
    # Hyperparameters for MoE
    num_experts = trial.suggest_int('num_experts', 2, 8)
    expert_hidden_size = trial.suggest_categorical('expert_hidden_size', [64, 128, 256])
    gating_hidden_size = trial.suggest_int('gating_hidden_size', 32, 128)
    # NOTE(review): 'dropout' is sampled but nothing consumes it -- MoE takes no
    # dropout argument. The call is kept so the trial's recorded parameters and
    # the sampling order stay unchanged; either wire it into the module or
    # remove it from the search space.
    trial.suggest_float('dropout', 0.0, 0.5)

    # Training hyperparameters (sampled here, in the original order, so the
    # net below can be constructed in one step).
    max_epochs = trial.suggest_int('max_epochs', 20, 50)
    lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)

    # The MoE input width is only known after preprocessing, so run the
    # feature steps once and read the column count. A single fit_transform
    # avoids a separate fit pass followed by a transform pass on X_train.
    feature_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('to_float32', to_float32)
    ])
    X_train_processed = feature_pipeline.fit_transform(X_train)
    num_input_features = X_train_processed.shape[1]

    # NeuralNetClassifier with MoE, built with the real input size up front
    # rather than a None placeholder patched afterwards via set_params.
    net = NeuralNetClassifier(
        module=MoE,
        module__input_size=num_input_features,
        module__num_experts=num_experts,
        module__expert_hidden_size=expert_hidden_size,
        module__output_size=len(classes),
        module__gating_hidden_size=gating_hidden_size,
        max_epochs=max_epochs,
        lr=lr,
        optimizer=torch.optim.Adam,
        criterion=nn.CrossEntropyLoss(weight=class_weights_tensor),
        batch_size=256,
        iterator_train__shuffle=True,
        device=device,
        verbose=0,
    )

    return Pipeline([
        ('preprocessor', preprocessor),
        ('to_float32', to_float32),
        ('nn', net)
    ])
