-
Notifications
You must be signed in to change notification settings - Fork 13.7k
convert : set expert gating func in base class #17279
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -825,6 +825,15 @@ def set_gguf_parameters(self): | |
| self.gguf_writer.add_expert_group_used_count(n_group_used) | ||
| logger.info(f"gguf: expert groups used count = {n_group_used}") | ||
|
|
||
| if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func"], optional=True)) is not None: | ||
| if score_func == "sigmoid": | ||
| self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) | ||
| elif score_func == "softmax": | ||
| self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) | ||
| else: | ||
| raise ValueError(f"Unsupported expert score gating function value: {score_func}") | ||
| logger.info(f"gguf: expert score gating function = {score_func}") | ||
|
|
||
| if (head_dim := self.hparams.get("head_dim")) is not None: | ||
| self.gguf_writer.add_key_length(head_dim) | ||
| self.gguf_writer.add_value_length(head_dim) | ||
|
|
@@ -2553,15 +2562,6 @@ def set_gguf_parameters(self): | |
| if (n_dense_layers := self.hparams.get("num_dense_layers")) is not None: | ||
| self.gguf_writer.add_leading_dense_block_count(n_dense_layers) | ||
|
|
||
| # Expert Gating Function | ||
| score_func = self.hparams.get("score_func") | ||
| if score_func == "sigmoid": | ||
| self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) | ||
| elif score_func == "softmax": | ||
| self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) | ||
| elif score_func is not None: | ||
| raise ValueError(f"Unsupported score_function value: {score_func}") | ||
|
|
||
| # Route normalization and scaling | ||
| if (route_norm := self.hparams.get("route_norm")) is not None: | ||
| self.gguf_writer.add_expert_weights_norm(route_norm) | ||
|
|
@@ -7182,13 +7182,6 @@ def set_gguf_parameters(self): | |
| self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) | ||
| self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) | ||
|
|
||
| if hparams["scoring_func"] == "sigmoid": | ||
| self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) | ||
| elif hparams["scoring_func"] == "softmax": | ||
| self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) | ||
| else: | ||
| raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}") | ||
|
|
||
| self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) | ||
|
|
||
| rope_scaling = self.hparams.get("rope_scaling") or {} | ||
|
|
@@ -7294,12 +7287,6 @@ def __init__(self, *args, **kwargs): | |
|
|
||
| def set_gguf_parameters(self): | ||
| super().set_gguf_parameters() | ||
| if self.hparams["scoring_func"] == "sigmoid": | ||
| self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) | ||
| elif self.hparams["scoring_func"] == "softmax": | ||
| self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) | ||
| else: | ||
| raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}") | ||
|
|
||
| self.gguf_writer.add_expert_feed_forward_length(self.find_hparam(["intermediate_size"])) | ||
| self.gguf_writer.add_rope_dimension_count(self.find_hparam(["rotary_dim"])) | ||
|
|
@@ -7392,11 +7379,6 @@ def set_gguf_parameters(self): | |
| self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"]) | ||
| self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"]) | ||
|
|
||
| if self.hparams["scoring_func"] == "noaux_tc": | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, that's on purpose; as mentioned in the OP, this was incorrectly set in |
||
| self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) | ||
| else: | ||
| raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}") | ||
|
|
||
| def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): | ||
| if name.endswith("e_score_correction_bias"): | ||
| name = name.replace("e_score_correction_bias", "e_score_correction.bias") | ||
|
|
@@ -8717,13 +8699,6 @@ def set_gguf_parameters(self): | |
| self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"]) | ||
| self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) | ||
|
|
||
| if hparams["score_function"] == "sigmoid": | ||
| self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) | ||
| elif hparams["score_function"] == "softmax": | ||
| self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) | ||
| else: | ||
| raise ValueError(f"Unsupported score_function value: {hparams['score_function']}") | ||
|
|
||
| if (nextn_layers := self.hparams.get("num_nextn_predict_layers")) is not None: | ||
| self.gguf_writer.add_nextn_predict_layers(nextn_layers) | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
I'm wondering if there is a way to mark this as required (optional=False) for certain models.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Then you'd have to maintain a list, not very practical, besides if your model requires this it will throw at graph build.