
Commit 2c006e9

Update the package modules and fix the import error.
2 parents: a404585 + 966155e

15 files changed: +309 additions, -156 deletions


.github/workflows/ci.yml

Lines changed: 0 additions & 96 deletions
This file was deleted.

diffusionLM/__init__.py

Lines changed: 8 additions & 3 deletions

@@ -1,13 +1,17 @@
 """DiffusionLM: A Diffusion-based Language Model Package"""

 from .trainer import trainer, TrainingError, evaluate
-from .model import DiffusionLLM, DiffusionConfig
+from .model import (
+    DiffusionLLM,
+    DiffusionConfig,
+    mask_tokens_for_diffusion
+)
 from .save_model import (
     save_model,
     load_model,
     ModelSaveError,
     registerANDpush,
-    ModelRegistrationError,
+    ModelRegistrationError
 )
 from .utils import (
     prepare_dataset,
@@ -16,7 +20,7 @@
     DatasetError,
     setup_logging,
     DiffusionLMError,
-    handle_errors,
+    handle_errors
 )

 __version__ = "0.1.0"
@@ -43,4 +47,5 @@
     "DatasetError",
     "prepare_dataset",
     "DatasetPreparationError",
+    "mask_tokens_for_diffusion",
 ]
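
With mask_tokens_for_diffusion re-exported from the package root, the import that previously failed now resolves at the top level. A minimal check of the fixed surface, using only names listed in __all__ above:

from diffusionLM import (
    DiffusionLLM,
    DiffusionConfig,
    mask_tokens_for_diffusion,
)

# All three names are re-exported by diffusionLM/__init__.py after this commit;
# before it, mask_tokens_for_diffusion was missing from the package-level exports.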

diffusionLM/model/MLP.py

Lines changed: 22 additions & 2 deletions

@@ -1,10 +1,21 @@
-
 import torch
 import torch.nn as nn
 import torch.nn.functional as F

 class MLP(nn.Module):
-    """Feed-forward network"""
+    """
+    Feed-forward neural network (MLP) used in transformer blocks.
+
+    This class implements a two-layer feed-forward network with GELU activation and dropout.
+
+    Args:
+        config: Configuration object containing model hyperparameters.
+
+    Attributes:
+        fc1: First linear layer.
+        fc2: Second linear layer.
+        dropout: Dropout layer applied after the second linear layer.
+    """
     def __init__(self, config):
         super().__init__()
         self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
@@ -19,6 +30,15 @@ def _init_weights(self, module):
            nn.init.zeros_(module.bias)

     def forward(self, x):
+        """
+        Perform a forward pass through the MLP.
+
+        Args:
+            x: Input tensor of shape [batch_size, seq_length, hidden_size].
+
+        Returns:
+            Output tensor of shape [batch_size, seq_length, hidden_size].
+        """
         h = F.gelu(self.fc1(x))
         h = self.fc2(h)
         h = self.dropout(h)
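
Outside the repo, the behavior documented above fits in a few lines. The following is a hedged sketch rather than the package's MLP: the config object and _init_weights are dropped, and the hidden size, intermediate size, and dropout probability are assumed values.

import torch
import torch.nn as nn
import torch.nn.functional as F

class SketchMLP(nn.Module):
    """Two-layer feed-forward block with GELU and dropout, as described in the docstring above."""
    def __init__(self, hidden_size=768, intermediate_size=3072, dropout_p=0.1):
        super().__init__()
        self.fc1 = nn.Linear(hidden_size, intermediate_size)
        self.fc2 = nn.Linear(intermediate_size, hidden_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        h = F.gelu(self.fc1(x))   # expand to the intermediate size, apply GELU
        h = self.fc2(h)           # project back to the hidden size
        return self.dropout(h)    # dropout after the second linear layer

x = torch.randn(2, 16, 768)       # [batch_size, seq_length, hidden_size]
print(SketchMLP()(x).shape)       # torch.Size([2, 16, 768])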

diffusionLM/model/__init__.py

Lines changed: 10 additions & 0 deletions

@@ -2,9 +2,19 @@

 from .transformers_model import DiffusionLLM, DiffusionConfig
 from .mask_token import mask_tokens_for_diffusion
+from .attention import MultiHeadAttention
+from .diffusionLM import LLaDAModel
+from .MLP import MLP
+from .time_embedding import TimeEmbedding
+from .transformer_block import TransformerBlock

 __all__ = [
     "DiffusionLLM",
     "DiffusionConfig",
     "mask_tokens_for_diffusion",
+    "MultiHeadAttention",
+    "LLaDAModel",
+    "MLP",
+    "TimeEmbedding",
+    "TransformerBlock",
 ]

diffusionLM/model/attention.py

Lines changed: 28 additions & 2 deletions

@@ -1,11 +1,23 @@
-
 import torch
 import torch.nn as nn
 import torch.nn.functional as F


 class MultiHeadAttention(nn.Module):
-    """Multi-head attention module"""
+    """
+    Multi-head self-attention mechanism.
+    This class implements the scaled dot-product attention mechanism with multiple attention heads.
+    Args:
+        config: Configuration object containing model hyperparameters.
+
+    Attributes:
+        q_proj: Linear layer for projecting queries.
+        k_proj: Linear layer for projecting keys.
+        v_proj: Linear layer for projecting values.
+        out_proj: Linear layer for projecting the output.
+        attn_dropout: Dropout layer for attention probabilities.
+        resid_dropout: Dropout layer for residual connections.
+    """
     def __init__(self, config):
         super().__init__()
         self.config = config
@@ -48,6 +60,20 @@ def merge_heads(self, x, batch_size):
         return x.view(batch_size, -1, self.hidden_size)

     def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
+        """
+        Perform a forward pass through the multi-head attention mechanism.
+
+        Args:
+            hidden_states: Input tensor of shape [batch_size, seq_length, hidden_size].
+            attention_mask: Optional mask to avoid attending to padding tokens.
+            head_mask: Optional mask for specific attention heads.
+            output_attentions: Whether to return attention probabilities.
+
+        Returns:
+            A tuple containing:
+                - output: Output tensor of shape [batch_size, seq_length, hidden_size].
+                - attention_probs (optional): Attention probabilities if output_attentions=True.
+        """
         batch_size, seq_length = hidden_states.shape[:2]

         # Project to queries, keys, values
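
For readers skimming the new docstring, here is a self-contained sketch of the scaled dot-product, multi-head computation it describes. It is not the repo's class: the config object is replaced by explicit arguments, the split/merge of heads is inlined, and the head count, layer sizes, and dropout probability are assumptions.

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class SketchMultiHeadAttention(nn.Module):
    def __init__(self, hidden_size=768, num_heads=12, dropout_p=0.1):
        super().__init__()
        assert hidden_size % num_heads == 0
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        self.q_proj = nn.Linear(hidden_size, hidden_size)
        self.k_proj = nn.Linear(hidden_size, hidden_size)
        self.v_proj = nn.Linear(hidden_size, hidden_size)
        self.out_proj = nn.Linear(hidden_size, hidden_size)
        self.attn_dropout = nn.Dropout(dropout_p)
        self.resid_dropout = nn.Dropout(dropout_p)

    def forward(self, hidden_states, attention_mask=None):
        b, t, _ = hidden_states.shape
        # Project and split into heads: [b, num_heads, t, head_dim].
        q = self.q_proj(hidden_states).view(b, t, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(hidden_states).view(b, t, self.num_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(hidden_states).view(b, t, self.num_heads, self.head_dim).transpose(1, 2)
        scores = q @ k.transpose(-2, -1) / math.sqrt(self.head_dim)
        if attention_mask is not None:
            # Mask is expected to hold large negative values at padded positions,
            # broadcastable to [b, num_heads, t, t].
            scores = scores + attention_mask
        probs = self.attn_dropout(F.softmax(scores, dim=-1))
        out = (probs @ v).transpose(1, 2).reshape(b, t, -1)   # merge heads back
        return self.resid_dropout(self.out_proj(out))

x = torch.randn(2, 16, 768)
print(SketchMultiHeadAttention()(x).shape)   # torch.Size([2, 16, 768])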

diffusionLM/model/diffusionLM.py

Lines changed: 24 additions & 4 deletions

@@ -8,9 +8,31 @@

 class LLaDAModel(nn.Module):
     """
-    LLaDA: Large Language Diffusion Model
-    Uses a transformer backbone with timestep conditioning for diffusion-based language modeling
+    A torch-based language model that incorporates diffusion-based generation through time step conditioning.
+    The model allows for various text generation strategies including random sampling, confidence-based sampling,
+    semi-autoregressive generation, and beam search.
+    Attributes:
+        config: Configuration object containing model hyperparameters
+        wte (nn.Embedding): Token embeddings
+        wpe (nn.Embedding): Position embeddings
+        dropout (nn.Dropout): Dropout layer
+        h (nn.ModuleList): List of transformer blocks
+        ln_f (nn.LayerNorm): Final layer normalization
+        time_embed (TimeEmbedding): Time step embedding module
+        time_proj (nn.ModuleList): Time projection layers for each transformer block
+        lm_head (nn.Linear): Output projection to vocabulary
+    Methods:
+        forward(input_ids, attention_mask, timesteps, labels):
+            Forward pass through the model for training and inference
+        generate(prompt, max_length, num_inference_steps, temperature, strategy, top_p, top_k, num_beams, return_scores):
+            Generate text using various sampling strategies and the reverse diffusion process
+        generate_stream(prompt, max_length, num_inference_steps, temperature, strategy, top_p, top_k, num_beams, callback_fn):
+    Example:
+        >>> config = ModelConfig(vocab_size=50257, hidden_size=768)
+        >>> model = LLaDAModel(config)
+        >>> output = model.generate(prompt="Hello", max_length=50, temperature=0.7)
     """
+
     def __init__(self, config):
         super().__init__()
         self.config = config
@@ -61,13 +83,11 @@ def forward(
     ) -> Dict[str, torch.Tensor]:
         """
         Forward pass through the model
-
         Args:
             input_ids: Tensor of token ids [batch_size, seq_len]
             attention_mask: Mask tensor [batch_size, seq_len]
             timesteps: Current diffusion timesteps [batch_size]
             labels: Target token ids for masked positions [batch_size, seq_len]
-
         Returns:
             dict with loss and logits
         """

diffusionLM/model/mask_token.py

Lines changed: 9 additions & 5 deletions

@@ -8,15 +8,19 @@ def mask_tokens_for_diffusion(
     mask_token_id: int,
 ):
     """
-    Apply forward diffusion process by masking tokens according to timestep.
+    Apply forward diffusion process by masking tokens according to the timestep.

     Args:
-        batch: Batch of token sequences
-        timestep: Current time step (between 0 and 1)
-        mask_token_id: ID of the mask token
+        batch: A dictionary containing input token sequences and attention masks.
+        timestep: Current timestep (between 0 and 1) for masking probability.
+        mask_token_id: ID of the mask token to replace selected tokens.

     Returns:
-        Dictionary with masked inputs and labels
+        A dictionary containing:
+            - input_ids: Masked input token IDs.
+            - attention_mask: Attention mask for the input.
+            - labels: Labels for the masked tokens (-100 for unmasked tokens).
+            - mask_ratio: Ratio of tokens that were masked.
     """
     input_ids = batch["input_ids"].clone()
     attention_mask = batch["attention_mask"]
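
The expanded Returns section pins down the contract precisely enough to restate it outside the repo. Below is a standalone sketch of that forward-masking process, assuming the simplest choice of masking each non-padding token independently with probability equal to the timestep; the package's actual sampling may differ.

import torch

def sketch_mask_for_diffusion(batch, timestep, mask_token_id):
    """Mask tokens with probability tied to the timestep; return inputs, labels, and the realized ratio."""
    input_ids = batch["input_ids"].clone()
    attention_mask = batch["attention_mask"]

    # Only real (non-padding) tokens are eligible; each is masked with probability ~ timestep.
    mask = (torch.rand_like(input_ids, dtype=torch.float) < timestep) & attention_mask.bool()

    labels = torch.full_like(input_ids, -100)   # -100 is ignored by cross-entropy
    labels[mask] = input_ids[mask]              # supervise only the masked positions
    input_ids[mask] = mask_token_id

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
        "mask_ratio": mask.float().mean(),
    }

batch = {"input_ids": torch.randint(5, 100, (2, 8)), "attention_mask": torch.ones(2, 8, dtype=torch.long)}
out = sketch_mask_for_diffusion(batch, timestep=0.5, mask_token_id=0)
print(out["mask_ratio"])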

diffusionLM/model/time_embedding.py

Lines changed: 16 additions & 3 deletions

@@ -3,7 +3,17 @@
 import torch

 class TimeEmbedding(nn.Module):
-    """Embedding for diffusion timesteps."""
+    """
+    Embedding layer for diffusion timesteps.
+
+    This class generates sinusoidal embeddings for diffusion timesteps and projects them to a higher-dimensional space.
+
+    Args:
+        time_embed_dim: Dimensionality of the time embedding.
+
+    Attributes:
+        time_embed: Sequential model for projecting sinusoidal embeddings.
+    """
     def __init__(self, time_embed_dim):
         super().__init__()
         self.time_embed_dim = time_embed_dim
@@ -15,10 +25,13 @@ def __init__(self, time_embed_dim):

     def forward(self, timesteps):
         """
+        Generate time embeddings for the given timesteps.
+
         Args:
-            timesteps: [batch_size] tensor of timestep values
+            timesteps: Tensor of shape [batch_size] containing timestep values.
+
         Returns:
-            [batch_size, time_embed_dim] tensor of embeddings
+            Tensor of shape [batch_size, time_embed_dim] containing time embeddings.
         """
         half_dim = self.time_embed_dim // 8
         emb = math.log(10000) / (half_dim - 1)
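
The last two context lines show the start of the standard sinusoidal construction (log-spaced frequencies over half_dim = time_embed_dim // 8). A self-contained sketch of that idea follows; the SiLU projection layers and their sizes are assumptions, while the sinusoidal basis and the [batch_size, time_embed_dim] output shape come from the diff.

import math
import torch
import torch.nn as nn

class SketchTimeEmbedding(nn.Module):
    def __init__(self, time_embed_dim=256):
        super().__init__()
        self.time_embed_dim = time_embed_dim
        # Project the sinusoidal features up to the full embedding dimension (layer sizes are illustrative).
        self.time_embed = nn.Sequential(
            nn.Linear(time_embed_dim // 4, time_embed_dim),
            nn.SiLU(),
            nn.Linear(time_embed_dim, time_embed_dim),
        )

    def forward(self, timesteps):
        half_dim = self.time_embed_dim // 8
        # Log-spaced frequencies, as in transformer positional encodings.
        emb = math.log(10000) / (half_dim - 1)
        freqs = torch.exp(-emb * torch.arange(half_dim, device=timesteps.device))
        angles = timesteps.float()[:, None] * freqs[None, :]                     # [batch_size, half_dim]
        sinusoid = torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1)      # [batch_size, half_dim * 2]
        return self.time_embed(sinusoid)                                          # [batch_size, time_embed_dim]

t = torch.rand(4)                      # diffusion timesteps in (0, 1)
print(SketchTimeEmbedding()(t).shape)  # torch.Size([4, 256])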
