
Commit 2c006e9

Update the package modules and fix the import error.
2 parents: a404585 + 966155e

15 files changed: +309 additions, -156 deletions


.github/workflows/ci.yml

Lines changed: 0 additions & 96 deletions
This file was deleted.

diffusionLM/__init__.py

Lines changed: 8 additions & 3 deletions

@@ -1,13 +1,17 @@
 """DiffusionLM: A Diffusion-based Language Model Package"""

 from .trainer import trainer, TrainingError, evaluate
-from .model import DiffusionLLM, DiffusionConfig
+from .model import (
+    DiffusionLLM,
+    DiffusionConfig,
+    mask_tokens_for_diffusion
+)
 from .save_model import (
     save_model,
     load_model,
     ModelSaveError,
     registerANDpush,
-    ModelRegistrationError,
+    ModelRegistrationError
 )
 from .utils import (
     prepare_dataset,
@@ -16,7 +20,7 @@
     DatasetError,
     setup_logging,
     DiffusionLMError,
-    handle_errors,
+    handle_errors
 )

 __version__ = "0.1.0"
@@ -43,4 +47,5 @@
     "DatasetError",
     "prepare_dataset",
     "DatasetPreparationError",
+    "mask_tokens_for_diffusion",
 ]
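
With mask_tokens_for_diffusion re-exported from the package root, the import that previously failed now resolves at the top level. A minimal check of the fixed surface, using only names listed in __all__ above:

from diffusionLM import (
    DiffusionLLM,
    DiffusionConfig,
    mask_tokens_for_diffusion,
)

# All three names are re-exported by diffusionLM/__init__.py after this commit;
# before it, mask_tokens_for_diffusion was missing from the package-level exports.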

diffusionLM/model/MLP.py

Lines changed: 22 additions & 2 deletions

@@ -1,10 +1,21 @@
-
 import torch
 import torch.nn as nn
 import torch.nn.functional as F

 class MLP(nn.Module):
-    """Feed-forward network"""
+    """
+    Feed-forward neural network (MLP) used in transformer blocks.
+
+    This class implements a two-layer feed-forward network with GELU activation and dropout.
+
+    Args:
+        config: Configuration object containing model hyperparameters.
+
+    Attributes:
+        fc1: First linear layer.
+        fc2: Second linear layer.
+        dropout: Dropout layer applied after the second linear layer.
+    """
     def __init__(self, config):
         super().__init__()
         self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
@@ -19,6 +30,15 @@ def _init_weights(self, module):
            nn.init.zeros_(module.bias)

     def forward(self, x):
+        """
+        Perform a forward pass through the MLP.
+
+        Args:
+            x: Input tensor of shape [batch_size, seq_length, hidden_size].
+
+        Returns:
+            Output tensor of shape [batch_size, seq_length, hidden_size].
+        """
         h = F.gelu(self.fc1(x))
         h = self.fc2(h)
         h = self.dropout(h)
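
Outside the repo, the behavior documented above fits in a few lines. The following is a hedged sketch rather than the package's MLP: the config object and _init_weights are dropped, and the hidden size, intermediate size, and dropout probability are assumed values.

import torch
import torch.nn as nn
import torch.nn.functional as F

class SketchMLP(nn.Module):
    """Two-layer feed-forward block with GELU and dropout, as described in the docstring above."""
    def __init__(self, hidden_size=768, intermediate_size=3072, dropout_p=0.1):
        super().__init__()
        self.fc1 = nn.Linear(hidden_size, intermediate_size)
        self.fc2 = nn.Linear(intermediate_size, hidden_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        h = F.gelu(self.fc1(x))   # expand to the intermediate size, apply GELU
        h = self.fc2(h)           # project back to the hidden size
        return self.dropout(h)    # dropout after the second linear layer

x = torch.randn(2, 16, 768)       # [batch_size, seq_length, hidden_size]
print(SketchMLP()(x).shape)       # torch.Size([2, 16, 768])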

diffusionLM/model/__init__.py

Lines changed: 10 additions & 0 deletions

@@ -2,9 +2,19 @@

 from .transformers_model import DiffusionLLM, DiffusionConfig
 from .mask_token import mask_tokens_for_diffusion
+from .attention import MultiHeadAttention
+from .diffusionLM import LLaDAModel
+from .MLP import MLP
+from .time_embedding import TimeEmbedding
+from .transformer_block import TransformerBlock

 __all__ = [
     "DiffusionLLM",
     "DiffusionConfig",
     "mask_tokens_for_diffusion",
+    "MultiHeadAttention",
+    "LLaDAModel",
+    "MLP",
+    "TimeEmbedding",
+    "TransformerBlock",
 ]

diffusionLM/model/attention.py

Lines changed: 28 additions & 2 deletions

@@ -1,11 +1,23 @@
-
 import torch
 import torch.nn as nn
 import torch.nn.functional as F


 class MultiHeadAttention(nn.Module):
-    """Multi-head attention module"""
+    """
+    Multi-head self-attention mechanism.
+    This class implements the scaled dot-product attention mechanism with multiple attention heads.
+    Args:
+        config: Configuration object containing model hyperparameters.
+
+    Attributes:
+        q_proj: Linear layer for projecting queries.
+        k_proj: Linear layer for projecting keys.
+        v_proj: Linear layer for projecting values.
+        out_proj: Linear layer for projecting the output.
+        attn_dropout: Dropout layer for attention probabilities.
+        resid_dropout: Dropout layer for residual connections.
+    """
     def __init__(self, config):
         super().__init__()
         self.config = config
@@ -48,6 +60,20 @@ def merge_heads(self, x, batch_size):
         return x.view(batch_size, -1, self.hidden_size)

     def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
+        """
+        Perform a forward pass through the multi-head attention mechanism.
+
+        Args:
+            hidden_states: Input tensor of shape [batch_size, seq_length, hidden_size].
+            attention_mask: Optional mask to avoid attending to padding tokens.
+            head_mask: Optional mask for specific attention heads.
+            output_attentions: Whether to return attention probabilities.
+
+        Returns:
+            A tuple containing:
+                - output: Output tensor of shape [batch_size, seq_length, hidden_size].
+                - attention_probs (optional): Attention probabilities if output_attentions=True.
+        """
         batch_size, seq_length = hidden_states.shape[:2]

         # Project to queries, keys, values
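
For readers skimming the new docstring, here is a self-contained sketch of the scaled dot-product, multi-head computation it describes. It is not the repo's class: the config object is replaced by explicit arguments, the split/merge of heads is inlined, and the head count, layer sizes, and dropout probability are assumptions.

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class SketchMultiHeadAttention(nn.Module):
    def __init__(self, hidden_size=768, num_heads=12, dropout_p=0.1):
        super().__init__()
        assert hidden_size % num_heads == 0
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        self.q_proj = nn.Linear(hidden_size, hidden_size)
        self.k_proj = nn.Linear(hidden_size, hidden_size)
        self.v_proj = nn.Linear(hidden_size, hidden_size)
        self.out_proj = nn.Linear(hidden_size, hidden_size)
        self.attn_dropout = nn.Dropout(dropout_p)
        self.resid_dropout = nn.Dropout(dropout_p)

    def forward(self, hidden_states, attention_mask=None):
        b, t, _ = hidden_states.shape
        # Project and split into heads: [b, num_heads, t, head_dim].
        q = self.q_proj(hidden_states).view(b, t, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(hidden_states).view(b, t, self.num_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(hidden_states).view(b, t, self.num_heads, self.head_dim).transpose(1, 2)
        scores = q @ k.transpose(-2, -1) / math.sqrt(self.head_dim)
        if attention_mask is not None:
            # Mask is expected to hold large negative values at padded positions,
            # broadcastable to [b, num_heads, t, t].
            scores = scores + attention_mask
        probs = self.attn_dropout(F.softmax(scores, dim=-1))
        out = (probs @ v).transpose(1, 2).reshape(b, t, -1)   # merge heads back
        return self.resid_dropout(self.out_proj(out))

x = torch.randn(2, 16, 768)
print(SketchMultiHeadAttention()(x).shape)   # torch.Size([2, 16, 768])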

diffusionLM/model/diffusionLM.py

Lines changed: 24 additions & 4 deletions

@@ -8,9 +8,31 @@

 class LLaDAModel(nn.Module):
     """
-    LLaDA: Large Language Diffusion Model
-    Uses a transformer backbone with timestep conditioning for diffusion-based language modeling
+    A torch-based language model that incorporates diffusion-based generation through time step conditioning.
+    The model allows for various text generation strategies including random sampling, confidence-based sampling,
+    semi-autoregressive generation, and beam search.
+    Attributes:
+        config: Configuration object containing model hyperparameters
+        wte (nn.Embedding): Token embeddings
+        wpe (nn.Embedding): Position embeddings
+        dropout (nn.Dropout): Dropout layer
+        h (nn.ModuleList): List of transformer blocks
+        ln_f (nn.LayerNorm): Final layer normalization
+        time_embed (TimeEmbedding): Time step embedding module
+        time_proj (nn.ModuleList): Time projection layers for each transformer block
+        lm_head (nn.Linear): Output projection to vocabulary
+    Methods:
+        forward(input_ids, attention_mask, timesteps, labels):
+            Forward pass through the model for training and inference
+        generate(prompt, max_length, num_inference_steps, temperature, strategy, top_p, top_k, num_beams, return_scores):
+            Generate text using various sampling strategies and the reverse diffusion process
+        generate_stream(prompt, max_length, num_inference_steps, temperature, strategy, top_p, top_k, num_beams, callback_fn):
+    Example:
+        >>> config = ModelConfig(vocab_size=50257, hidden_size=768)
+        >>> model = LLaDAModel(config)
+        >>> output = model.generate(prompt="Hello", max_length=50, temperature=0.7)
     """
+
     def __init__(self, config):
         super().__init__()
         self.config = config
@@ -61,13 +83,11 @@ def forward(
     ) -> Dict[str, torch.Tensor]:
         """
         Forward pass through the model
-
         Args:
             input_ids: Tensor of token ids [batch_size, seq_len]
             attention_mask: Mask tensor [batch_size, seq_len]
             timesteps: Current diffusion timesteps [batch_size]
             labels: Target token ids for masked positions [batch_size, seq_len]
-
         Returns:
             dict with loss and logits
         """

diffusionLM/model/mask_token.py

Lines changed: 9 additions & 5 deletions

@@ -8,15 +8,19 @@ def mask_tokens_for_diffusion(
     mask_token_id: int,
 ):
     """
-    Apply forward diffusion process by masking tokens according to timestep.
+    Apply forward diffusion process by masking tokens according to the timestep.

     Args:
-        batch: Batch of token sequences
-        timestep: Current time step (between 0 and 1)
-        mask_token_id: ID of the mask token
+        batch: A dictionary containing input token sequences and attention masks.
+        timestep: Current timestep (between 0 and 1) for masking probability.
+        mask_token_id: ID of the mask token to replace selected tokens.

     Returns:
-        Dictionary with masked inputs and labels
+        A dictionary containing:
+            - input_ids: Masked input token IDs.
+            - attention_mask: Attention mask for the input.
+            - labels: Labels for the masked tokens (-100 for unmasked tokens).
+            - mask_ratio: Ratio of tokens that were masked.
     """
     input_ids = batch["input_ids"].clone()
     attention_mask = batch["attention_mask"]
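
The expanded Returns section pins down the contract precisely enough to restate it outside the repo. Below is a standalone sketch of that forward-masking process, assuming the simplest choice of masking each non-padding token independently with probability equal to the timestep; the package's actual sampling may differ.

import torch

def sketch_mask_for_diffusion(batch, timestep, mask_token_id):
    """Mask tokens with probability tied to the timestep; return inputs, labels, and the realized ratio."""
    input_ids = batch["input_ids"].clone()
    attention_mask = batch["attention_mask"]

    # Only real (non-padding) tokens are eligible; each is masked with probability ~ timestep.
    mask = (torch.rand_like(input_ids, dtype=torch.float) < timestep) & attention_mask.bool()

    labels = torch.full_like(input_ids, -100)   # -100 is ignored by cross-entropy
    labels[mask] = input_ids[mask]              # supervise only the masked positions
    input_ids[mask] = mask_token_id

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
        "mask_ratio": mask.float().mean(),
    }

batch = {"input_ids": torch.randint(5, 100, (2, 8)), "attention_mask": torch.ones(2, 8, dtype=torch.long)}
out = sketch_mask_for_diffusion(batch, timestep=0.5, mask_token_id=0)
print(out["mask_ratio"])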

diffusionLM/model/time_embedding.py

Lines changed: 16 additions & 3 deletions

@@ -3,7 +3,17 @@
 import torch

 class TimeEmbedding(nn.Module):
-    """Embedding for diffusion timesteps."""
+    """
+    Embedding layer for diffusion timesteps.
+
+    This class generates sinusoidal embeddings for diffusion timesteps and projects them to a higher-dimensional space.
+
+    Args:
+        time_embed_dim: Dimensionality of the time embedding.
+
+    Attributes:
+        time_embed: Sequential model for projecting sinusoidal embeddings.
+    """
     def __init__(self, time_embed_dim):
         super().__init__()
         self.time_embed_dim = time_embed_dim
@@ -15,10 +25,13 @@ def __init__(self, time_embed_dim):

     def forward(self, timesteps):
         """
+        Generate time embeddings for the given timesteps.
+
         Args:
-            timesteps: [batch_size] tensor of timestep values
+            timesteps: Tensor of shape [batch_size] containing timestep values.
+
         Returns:
-            [batch_size, time_embed_dim] tensor of embeddings
+            Tensor of shape [batch_size, time_embed_dim] containing time embeddings.
         """
         half_dim = self.time_embed_dim // 8
         emb = math.log(10000) / (half_dim - 1)
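
The last two context lines show the start of the standard sinusoidal construction (log-spaced frequencies over half_dim = time_embed_dim // 8). A self-contained sketch of that idea follows; the SiLU projection layers and their sizes are assumptions, while the sinusoidal basis and the [batch_size, time_embed_dim] output shape come from the diff.

import math
import torch
import torch.nn as nn

class SketchTimeEmbedding(nn.Module):
    def __init__(self, time_embed_dim=256):
        super().__init__()
        self.time_embed_dim = time_embed_dim
        # Project the sinusoidal features up to the full embedding dimension (layer sizes are illustrative).
        self.time_embed = nn.Sequential(
            nn.Linear(time_embed_dim // 4, time_embed_dim),
            nn.SiLU(),
            nn.Linear(time_embed_dim, time_embed_dim),
        )

    def forward(self, timesteps):
        half_dim = self.time_embed_dim // 8
        # Log-spaced frequencies, as in transformer positional encodings.
        emb = math.log(10000) / (half_dim - 1)
        freqs = torch.exp(-emb * torch.arange(half_dim, device=timesteps.device))
        angles = timesteps.float()[:, None] * freqs[None, :]                     # [batch_size, half_dim]
        sinusoid = torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1)      # [batch_size, half_dim * 2]
        return self.time_embed(sinusoid)                                          # [batch_size, time_embed_dim]

t = torch.rand(4)                      # diffusion timesteps in (0, 1)
print(SketchTimeEmbedding()(t).shape)  # torch.Size([4, 256])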
