## Target Function Extraction

### Implementation1

In [1]:
import re
from pprint import pprint

def extract_between(text, start, end, extend=False, multiple=False):
    target = r'(.*)' if extend else r'(.*?)'
    pattern = re.escape(start) + target + re.escape(end)
    matches = re.findall(pattern, text)

    if multiple:
        results = []
        for match in matches:
            results.append(match.strip())
        return results
    else:
        # Return the first match if available, else None
        return matches[0].strip() if matches else None

# Example usage
text = """
@converter(torch.sin, torch.Tensor.sin, channel_ordering_strategy=ChannelOrderingStrategy.MINIMUM_TRANSPOSITIONS)
def converter_sin(input, *args, **kwargs):
    def func(input, *args, **kwargs):
        return tf.math.sin(input)
    return func


@converter(torch.cos, torch.Tensor.cos, channel_ordering_strategy=ChannelOrderingStrategy.MINIMUM_TRANSPOSITIONS)
def converter_cos(input, *args, **kwargs):
    def func(input, *args, **kwargs):
        return tf.math.cos(input)
    return func


@converter(torch.add, torch.Tensor.add, torch.Tensor.add_, torch.Tensor.__add__, torch.Tensor.__iadd__, torch.Tensor.__radd__, channel_ordering_strategy=ChannelOrderingStrategy.MINIMUM_TRANSPOSITIONS_OR_PYTORCH, autocast=True)
def converter_add(input, other, *args, **kwargs):
    def func(input, other, *args, **kwargs):
        return input + other
    return func
"""


### Tests

#### end = ', channel_ordering_strategy'

It makes the `extend` argument useless, but free from the additional arguments to the target functions.

In [2]:
# Test the function
start = '@converter('
end = ', channel_ordering_strategy'

bools = [True, False]

for _bool_ex in bools:
	extend = _bool_ex
	for _bool_mul in bools:
		multiple = _bool_mul  
		print("extend:", str(_bool_ex), "multiple: ", str(_bool_mul))
		pprint(extract_between(text, start, end, extend=extend, multiple=multiple))
		print("\n")


extend: True multiple:  True
['torch.sin, torch.Tensor.sin',
 'torch.cos, torch.Tensor.cos',
 'torch.add, torch.Tensor.add, torch.Tensor.add_, torch.Tensor.__add__, '
 'torch.Tensor.__iadd__, torch.Tensor.__radd__']


extend: True multiple:  False
'torch.sin, torch.Tensor.sin'


extend: False multiple:  True
['torch.sin, torch.Tensor.sin',
 'torch.cos, torch.Tensor.cos',
 'torch.add, torch.Tensor.add, torch.Tensor.add_, torch.Tensor.__add__, '
 'torch.Tensor.__iadd__, torch.Tensor.__radd__']


extend: False multiple:  False
'torch.sin, torch.Tensor.sin'




#### end = ','

It makes the `extend` argument functional, but it can include non-target results.

In [3]:
# Test the function
start = '@converter('
end = ','

bools = [True, False]

for _bool_ex in bools:
	extend = _bool_ex
	for _bool_mul in bools:
		multiple = _bool_mul  
		print("extend:", str(_bool_ex), "multiple: ", str(_bool_mul))
		pprint(extract_between(text, start, end, extend=extend, multiple=multiple))
		print("\n")

extend: True multiple:  True
['torch.sin, torch.Tensor.sin',
 'torch.cos, torch.Tensor.cos',
 'torch.add, torch.Tensor.add, torch.Tensor.add_, torch.Tensor.__add__, '
 'torch.Tensor.__iadd__, torch.Tensor.__radd__, '
 'channel_ordering_strategy=ChannelOrderingStrategy.MINIMUM_TRANSPOSITIONS_OR_PYTORCH']


extend: True multiple:  False
'torch.sin, torch.Tensor.sin'


extend: False multiple:  True
['torch.sin', 'torch.cos', 'torch.add']


extend: False multiple:  False
'torch.sin'




## Implementation2

- discard `extend` argument
- use out[0] for single output instead
- split each result as `list(str)`


In [2]:
import re

def extract_between(text, start, end, multiple=False):
    target = r'(.*?)'
    pattern = re.escape(start) + target + re.escape(end)
    matches = re.findall(pattern, text)

    if multiple:
        results = []
        for match in matches:
            results.append([item.strip() for item in match.strip().split(',')])
        return results
    else:
        if matches:
            return [item.strip() for item in matches[0].strip().split(',')]
        else:
            return None

In [4]:
# Test the function
start = '@converter('
end = ', channel_ordering_strategy'

bools = [True, False]

for _bool_mul in bools:
	multiple = _bool_mul  
	print("multiple: ", str(_bool_mul))
	pprint(extract_between(text, start, end, multiple=multiple))
	print("\n")


multiple:  True


NameError: name 'pprint' is not defined

### Conclusion

Select the Implementation2 because it is more stable.

## `__doc__` Extration

In [1]:
# Example usage
text = """
from typing import Optional, Union, List, Tuple, Sequence, Any

import torch
import torch.nn.functional as F
from torch import Tensor
from torch.types import _int, _bool, Number, _dtype, _size, _layout, _device

import tensorflow as tf
import keras.layers

from nobuco.commons import ChannelOrder, ChannelOrderingStrategy
from nobuco.converters.channel_ordering import set_channel_order, get_channel_order
from nobuco.converters.node_converter import converter
from nobuco.converters.tensor import _dim_make_positive, dim_pytorch2keras, perm_keras2pytorch, _permute, \
    perm_pytorch2keras, _ensure_iterable, _dims_make_positive, dims_pytorch2keras
from nobuco.converters.type_cast import dtype_pytorch2keras


@converter(torch.sin, torch.Tensor.sin, channel_ordering_strategy=ChannelOrderingStrategy.MINIMUM_TRANSPOSITIONS)
def converter_sin(input, *args, **kwargs):
    def func(input, *args, **kwargs):
        return tf.math.sin(input)
    return func


@converter(torch.cos, torch.Tensor.cos, channel_ordering_strategy=ChannelOrderingStrategy.MINIMUM_TRANSPOSITIONS)
def converter_cos(input, *args, **kwargs):
    def func(input, *args, **kwargs):
        return tf.math.cos(input)
    return func
"""

- Target Preparation

In [5]:
targets = extract_between(text, start, end, multiple=True)
targets

[['torch.sin', 'torch.Tensor.sin'], ['torch.cos', 'torch.Tensor.cos']]

### `__doc__` Analysis

- Both, function and class have
    Args and Example

- class have one additional useful information
    Shape

In [14]:
import torch
import torch.nn as nn

In [13]:
target_func = getattr(torch, 'sin')
print(target_func.__doc__)


sin(input, *, out=None) -> Tensor

Returns a new tensor with the sine of the elements of :attr:`input`.

.. math::
    \text{out}_{i} = \sin(\text{input}_{i})

Args:
    input (Tensor): the input tensor.

Keyword args:
    out (Tensor, optional): the output tensor.

Example::

    >>> a = torch.randn(4)
    >>> a
    tensor([-0.5461,  0.1347, -2.7266, -0.2746])
    >>> torch.sin(a)
    tensor([-0.5194,  0.1343, -0.4032, -0.2711])



In [15]:
target_func = getattr(nn, 'Conv2d')
print(target_func.__doc__)

Applies a 2D convolution over an input signal composed of several input
    planes.

    In the simplest case, the output value of the layer with input size
    :math:`(N, C_{\text{in}}, H, W)` and output :math:`(N, C_{\text{out}}, H_{\text{out}}, W_{\text{out}})`
    can be precisely described as:

    .. math::
        \text{out}(N_i, C_{\text{out}_j}) = \text{bias}(C_{\text{out}_j}) +
        \sum_{k = 0}^{C_{\text{in}} - 1} \text{weight}(C_{\text{out}_j}, k) \star \text{input}(N_i, k)


    where :math:`\star` is the valid 2D `cross-correlation`_ operator,
    :math:`N` is a batch size, :math:`C` denotes a number of channels,
    :math:`H` is a height of input planes in pixels, and :math:`W` is
    width in pixels.
    

    This module supports :ref:`TensorFloat32<tf32_on_ampere>`.

    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.

    * :attr:`stride` controls the stride for the cross-correlation, 

### Implementation1

- Output is the dictionary type.
- Name of headers must be specified.
- Find given '{header_name}:' patterns

In [33]:
text = target_func.__doc__

In [22]:
import re

def parse_sections(text):
    # Define the section headers
    headers = ['Args', 'Shape', 'Attributes', 'Example', 'Examples', 'Note']

    # Create a regex pattern to match the headers
    header_pattern = re.compile(r'^\s*(' + '|'.join(headers) + r')\s*:\s*$', re.MULTILINE)

    # Find all matches for headers and their positions
    matches = list(header_pattern.finditer(text))

    # Initialize the result dictionary
    sections = {}

    # Iterate over the matches to extract sections
    for i in range(len(matches)):
        start_header = matches[i]
        end_header = matches[i+1] if i+1 < len(matches) else None

        # Extract the section name
        section_name = start_header.group(1)

        # Extract the section content
        start_pos = start_header.end()
        end_pos = end_header.start() if end_header else len(text)
        section_content = text[start_pos:end_pos].strip()

        # Add the section to the dictionary
        sections[section_name] = section_content

    return sections

In [49]:
headers = ['Args', 'Shape', 'Attributes', 'Example', 'Examples', 'Note']

# Create a regex pattern to match the headers
header_pattern = re.compile(r'^\s*(' + '|'.join(headers) + r')\s*:\s*$', re.MULTILINE)

# Find all matches for headers and their positions
matches = list(header_pattern.finditer(target_func.__doc__))

matches

[<re.Match object; span=(2521, 2531), match='\n    Note:'>,
 <re.Match object; span=(3000, 3010), match='\n    Note:'>,
 <re.Match object; span=(3390, 3400), match='\n    Note:'>,
 <re.Match object; span=(3615, 3625), match='\n    Note:'>,
 <re.Match object; span=(3717, 3727), match='\n    Args:'>,
 <re.Match object; span=(4586, 4602), match='    \n\n    Shape:'>,
 <re.Match object; span=(5237, 5253), match='\n    Attributes:'>,
 <re.Match object; span=(6022, 6037), match='\n    Examples:\n'>]

In [24]:
pprint(parse_sections(target_func.__doc__))

{'Args': 'in_channels (int): Number of channels in the input image\n'
         '        out_channels (int): Number of channels produced by the '
         'convolution\n'
         '        kernel_size (int or tuple): Size of the convolving kernel\n'
         '        stride (int or tuple, optional): Stride of the convolution. '
         'Default: 1\n'
         '        padding (int, tuple or str, optional): Padding added to all '
         'four sides of\n'
         '            the input. Default: 0\n'
         "        padding_mode (str, optional): ``'zeros'``, ``'reflect'``,\n"
         "            ``'replicate'`` or ``'circular'``. Default: ``'zeros'``\n"
         '        dilation (int or tuple, optional): Spacing between kernel '
         'elements. Default: 1\n'
         '        groups (int, optional): Number of blocked connections from '
         'input\n'
         '            channels to output channels. Default: 1\n'
         '        bias (bool, optional): If ``True``, adds

### Implementation2

- Output is jsonl type.
- Name of headers do not need to be specified.

- Remove indent from all lines
- Starts with an alphabet
- Ends with `:`

- Less stable
    - Non-header can be added
    - There is no loss of the real headers

In [86]:
def parse_sections(text, header_indent, unit_indent):
    lines = text.split('\n')
    parsed_data = []
    current_header = None
    current_content = []

    for line in lines:
        # Remove leading indentation
        line = line[header_indent:]

        # Check if the line is a header
        if line and line[0].isalpha() and line.strip().endswith(':'):
            # If there's a current header, save its content before starting a new one
            if current_header is not None:
                parsed_data.append({'header': current_header, 'content': '\n'.join(current_content)})
                current_content = []
            # Set new header
            current_header = line.rstrip(':')
        else:
            if current_header is not None:
                # Add line to current content
                current_content.append(line[unit_indent:])

    # Don't forget to save the last header-content pair
    if current_header is not None:
        parsed_data.append({'header': current_header, 'content': '\n'.join(current_content)})

    return parsed_data

In [87]:
function_doc = getattr(torch, 'sin').__doc__
print(function_doc)


sin(input, *, out=None) -> Tensor

Returns a new tensor with the sine of the elements of :attr:`input`.

.. math::
    \text{out}_{i} = \sin(\text{input}_{i})

Args:
    input (Tensor): the input tensor.

Keyword args:
    out (Tensor, optional): the output tensor.

Example::

    >>> a = torch.randn(4)
    >>> a
    tensor([-0.5461,  0.1347, -2.7266, -0.2746])
    >>> torch.sin(a)
    tensor([-0.5194,  0.1343, -0.4032, -0.2711])



In [88]:
class_doc = getattr(nn, 'Conv2d').__doc__
print(class_doc)

Applies a 2D convolution over an input signal composed of several input
    planes.

    In the simplest case, the output value of the layer with input size
    :math:`(N, C_{\text{in}}, H, W)` and output :math:`(N, C_{\text{out}}, H_{\text{out}}, W_{\text{out}})`
    can be precisely described as:

    .. math::
        \text{out}(N_i, C_{\text{out}_j}) = \text{bias}(C_{\text{out}_j}) +
        \sum_{k = 0}^{C_{\text{in}} - 1} \text{weight}(C_{\text{out}_j}, k) \star \text{input}(N_i, k)


    where :math:`\star` is the valid 2D `cross-correlation`_ operator,
    :math:`N` is a batch size, :math:`C` denotes a number of channels,
    :math:`H` is a height of input planes in pixels, and :math:`W` is
    width in pixels.
    

    This module supports :ref:`TensorFloat32<tf32_on_ampere>`.

    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.

    * :attr:`stride` controls the stride for the cross-correlation, 

In [89]:
parsed_class_doc = parse_sections(class_doc, indent=4)
parsed_class_doc

TypeError: parse_sections() got an unexpected keyword argument 'indent'

In [90]:
print(function_doc)


sin(input, *, out=None) -> Tensor

Returns a new tensor with the sine of the elements of :attr:`input`.

.. math::
    \text{out}_{i} = \sin(\text{input}_{i})

Args:
    input (Tensor): the input tensor.

Keyword args:
    out (Tensor, optional): the output tensor.

Example::

    >>> a = torch.randn(4)
    >>> a
    tensor([-0.5461,  0.1347, -2.7266, -0.2746])
    >>> torch.sin(a)
    tensor([-0.5194,  0.1343, -0.4032, -0.2711])



In [93]:
parsed_class_doc = parse_sections(class_doc, header_indent=4, unit_indent=4)

started_header = False
for session in parsed_class_doc:
    if "Note" in session['header']:
        started_header = True
    if started_header:
        print(session['header'], ':')
        print(session['content'], '\n')

Note :
When `groups == in_channels` and `out_channels == K * in_channels`,
where `K` is a positive integer, this operation is also known as a "depthwise convolution".

In other words, for an input of size :math:`(N, C_{in}, L_{in})`,
a depthwise convolution with a depthwise multiplier `K` can be performed with the arguments
:math:`(C_\text{in}=C_\text{in}, C_\text{out}=C_\text{in} \times \text{K}, ..., \text{groups}=C_\text{in})`.
 

Note :
In some circumstances when given tensors on a CUDA device and using CuDNN, this operator may select a nondeterministic algorithm to increase performance. If this is undesirable, you can try to make the operation deterministic (potentially at a performance cost) by setting ``torch.backends.cudnn.deterministic = True``. See :doc:`/notes/randomness` for more information.
 

Note :
``padding='valid'`` is the same as no padding. ``padding='same'`` pads
the input so the output has the shape as the input. However, this mode
doesn't support any stride value

In [92]:
parsed_function_doc = parse_sections(function_doc, header_indent=0, unit_indent=4)
for session in parsed_function_doc:
    print(session['header'], ':')
    print(session['content'], '\n')

Args :
input (Tensor): the input tensor.
 

Keyword args :
out (Tensor, optional): the output tensor.
 

Example :

>>> a = torch.randn(4)
>>> a
tensor([-0.5461,  0.1347, -2.7266, -0.2746])
>>> torch.sin(a)
tensor([-0.5194,  0.1343, -0.4032, -0.2711])
 



### Conclusion

Select the Implementation2 for the following reasons

- for the sake of RAG, jsonl structure has its advantage
- we are free from the header specification

## Class Converter Template

### Phase1 : Constructor Args

In [101]:
class_doc = getattr(nn, 'Linear').__doc__
parsed_class_doc = parse_sections(class_doc, header_indent=4, unit_indent=4)
for session in parsed_class_doc:
    print(session['header'], ':')
    print(session['content'], '\n')

Args :
in_features: size of each input sample
out_features: size of each output sample
bias: If set to ``False``, the layer will not learn an additive bias.
    Default: ``True``
 

Shape :
- Input: :math:`(*, H_{in})` where :math:`*` means any number of
  dimensions including none and :math:`H_{in} = \text{in\_features}`.
- Output: :math:`(*, H_{out})` where all but the last dimension
  are the same shape as the input and :math:`H_{out} = \text{out\_features}`.
 

Attributes :
weight: the learnable weights of the module of shape
    :math:`(\text{out\_features}, \text{in\_features})`. The values are
    initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where
    :math:`k = \frac{1}{\text{in\_features}}`
bias:   the learnable bias of the module of shape :math:`(\text{out\_features})`.
        If :attr:`bias` is ``True``, the values are initialized from
        :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
        :math:`k = \frac{1}{\text{in\_features}}`
 

Examples :

>>> m 

In [106]:
parsed_function_doc

[{'header': 'Args', 'content': 'input (Tensor): the input tensor.\n'},
 {'header': 'Keyword args',
  'content': 'out (Tensor, optional): the output tensor.\n'},
 {'header': 'Example',
  'content': '\n>>> a = torch.randn(4)\n>>> a\ntensor([-0.5461,  0.1347, -2.7266, -0.2746])\n>>> torch.sin(a)\ntensor([-0.5194,  0.1343, -0.4032, -0.2711])\n'}]

In [None]:
torch_model = torch.nn.Conv2d(
    in_channels=1, 
    out_channels=2, 
    kernel_size=3
)

In [123]:
function_doc

'\nsin(input, *, out=None) -> Tensor\n\nReturns a new tensor with the sine of the elements of :attr:`input`.\n\n.. math::\n    \\text{out}_{i} = \\sin(\\text{input}_{i})\n\nArgs:\n    input (Tensor): the input tensor.\n\nKeyword args:\n    out (Tensor, optional): the output tensor.\n\nExample::\n\n    >>> a = torch.randn(4)\n    >>> a\n    tensor([-0.5461,  0.1347, -2.7266, -0.2746])\n    >>> torch.sin(a)\n    tensor([-0.5194,  0.1343, -0.4032, -0.2711])\n'

In [190]:
# Phase1

# Prompt Template Blue Print
# Not just the exact target, but the context
# We'd better start with the smallest example

doc = class_doc
target = 'torch.nn.Linear'

phase1_example = f'''
Generate the code lines to initiate an instance of the target module.

target: torch.nn.Conv2d

output:
torch_model = torch.nn.Conv2d(
    in_channels=10, 
    out_channels=20, 
    kernel_size=3
)
'''

phase1_prompt = f'''

<<Instruction>>

Given the context, complete the <<Task>>.

<<Documentation for {target}>>

{doc}

<<Example>>
{phase1_example}

<<Task>>

Write the code lines to initiate an instance of the target module in python code block.

target: {target}

output:


'''

In [169]:
print(phase1_prompt)



<<Instruction>>

Given the context, complete the <<Task>>.

<<Documentation for torch.nn.Linear>>

Applies a linear transformation to the incoming data: :math:`y = xA^T + b`

    This module supports :ref:`TensorFloat32<tf32_on_ampere>`.

    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.

    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        bias: If set to ``False``, the layer will not learn an additive bias.
            Default: ``True``

    Shape:
        - Input: :math:`(*, H_{in})` where :math:`*` means any number of
          dimensions including none and :math:`H_{in} = \text{in\_features}`.
        - Output: :math:`(*, H_{out})` where all but the last dimension
          are the same shape as the input and :math:`H_{out} = \text{out\_features}`.

    Attributes:
        weight: the learnable weights of the module of shape
            :

### Phase2

In [191]:
# Phase 2

# Check if an instance is initiated by the generated text
# If the test fails, return to Phase1
# If passes, do the next

# Phase1

# Prompt Template Blue Print
# Not just the exact target, but the context
# We'd better start with the smallest example

doc = class_doc
target = 'torch.nn.Linear'

phase1_output = """
torch_model = torch.nn.Linear(
    in_features=32, 
    out_features=64, 
    bias=True
)
"""

phase2_example = f'''
Write the inputs of the target's forward function, and the line of forward as well in python code block.

target: 
torch_model = torch.nn.Conv2d(
    in_channels=10, 
    out_channels=20, 
    kernel_size=3
)

output:

```python
inputs = [torch.randn(1, 10, 32, 32)]
output = torch_model.forward(*inputs)

```

'''

phase2_prompt = f'''

<<Instruction>>

Given the context, complete the <<Task>>.

<<Documentation for {target}>>

{doc}

<<Example>>
{phase2_example}

<<Task>>

Write the inputs of the target's forward function, and the line of forward as well in python code block.

target: {phase1_output}

output:

'''

In [148]:
print(phase2_prompt)



<<Instruction>>

Given the context, complete the <<Task>>.

<<Documentation for torch.nn.Linear>>

Applies a linear transformation to the incoming data: :math:`y = xA^T + b`

    This module supports :ref:`TensorFloat32<tf32_on_ampere>`.

    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.

    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        bias: If set to ``False``, the layer will not learn an additive bias.
            Default: ``True``

    Shape:
        - Input: :math:`(*, H_{in})` where :math:`*` means any number of
          dimensions including none and :math:`H_{in} = \text{in\_features}`.
        - Output: :math:`(*, H_{out})` where all but the last dimension
          are the same shape as the input and :math:`H_{out} = \text{out\_features}`.

    Attributes:
        weight: the learnable weights of the module of shape
            :

In [150]:
print(phase1_output)


torch_model = torch.nn.Linear(
    in_features=32, 
    out_features=64, 
    bias=True
)



In [151]:
torch_model = torch.nn.Linear(
    in_features=32, 
    out_features=64, 
    bias=True
)

In [152]:
inputs = [torch.randn(128, 32)]
output = torch_model.forward(*inputs)


### Experiment


In [154]:
def generate_phase2_prompt(phase1_output):
    return f'''

<<Instruction>>

Given the context, complete the <<Task>>.

<<Documentation for {target}>>

{doc}

<<Example>>
{example}

<<Task>>

Write the inputs of the target's forward function, and the line of forward as well in python code block.

target: {phase1_output}

output:

'''

In [160]:
print(phase1_prompt)



<<Instruction>>

Given the context, complete the <<Task>>.

<<Documentation for torch.nn.Linear>>

Applies a linear transformation to the incoming data: :math:`y = xA^T + b`

    This module supports :ref:`TensorFloat32<tf32_on_ampere>`.

    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.

    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        bias: If set to ``False``, the layer will not learn an additive bias.
            Default: ``True``

    Shape:
        - Input: :math:`(*, H_{in})` where :math:`*` means any number of
          dimensions including none and :math:`H_{in} = \text{in\_features}`.
        - Output: :math:`(*, H_{out})` where all but the last dimension
          are the same shape as the input and :math:`H_{out} = \text{out\_features}`.

    Attributes:
        weight: the learnable weights of the module of shape
            :

#### GPT4o

- Phase1

```python
torch_model = torch.nn.Linear(
    in_features=20, 
    out_features=30, 
    bias=True
)
```

In [161]:
torch_model = torch.nn.Linear(
    in_features=20, 
    out_features=30, 
    bias=True
)

In [163]:
print(generate_phase2_prompt('''
torch_model = torch.nn.Linear(
    in_features=20, 
    out_features=30, 
    bias=True
)
'''))



<<Instruction>>

Given the context, complete the <<Task>>.

<<Documentation for torch.nn.Linear>>

Applies a linear transformation to the incoming data: :math:`y = xA^T + b`

    This module supports :ref:`TensorFloat32<tf32_on_ampere>`.

    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.

    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        bias: If set to ``False``, the layer will not learn an additive bias.
            Default: ``True``

    Shape:
        - Input: :math:`(*, H_{in})` where :math:`*` means any number of
          dimensions including none and :math:`H_{in} = \text{in\_features}`.
        - Output: :math:`(*, H_{out})` where all but the last dimension
          are the same shape as the input and :math:`H_{out} = \text{out\_features}`.

    Attributes:
        weight: the learnable weights of the module of shape
            :

- Phase2

```python
inputs = [torch.randn(128, 20)]
output = torch_model.forward(*inputs)
```

In [164]:
inputs = [torch.randn(128, 20)]
output = torch_model.forward(*inputs)

#### GPT3.5

```python
torch_model = torch.nn.Linear(
    in_features=20, 
    out_features=30
)
```

In [165]:
torch_model = torch.nn.Linear(
    in_features=20, 
    out_features=30
)

In [166]:
print(generate_phase2_prompt('''
torch_model = torch.nn.Linear(
    in_features=20, 
    out_features=30
)
'''))



<<Instruction>>

Given the context, complete the <<Task>>.

<<Documentation for torch.nn.Linear>>

Applies a linear transformation to the incoming data: :math:`y = xA^T + b`

    This module supports :ref:`TensorFloat32<tf32_on_ampere>`.

    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.

    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        bias: If set to ``False``, the layer will not learn an additive bias.
            Default: ``True``

    Shape:
        - Input: :math:`(*, H_{in})` where :math:`*` means any number of
          dimensions including none and :math:`H_{in} = \text{in\_features}`.
        - Output: :math:`(*, H_{out})` where all but the last dimension
          are the same shape as the input and :math:`H_{out} = \text{out\_features}`.

    Attributes:
        weight: the learnable weights of the module of shape
            :

In [167]:
inputs = [torch.randn(128, 20)]
output = torch_model.forward(*inputs)

#### GPT3.5 fhgenie

```python
torch_model = torch.nn.Linear(
    in_features=10,
    out_features=5,
    bias=True
)
```

In [170]:
torch_model = torch.nn.Linear(
    in_features=10,
    out_features=5,
    bias=True
)

In [171]:
print(generate_phase2_prompt('''
torch_model = torch.nn.Linear(
    in_features=10,
    out_features=5,
    bias=True
)
'''))



<<Instruction>>

Given the context, complete the <<Task>>.

<<Documentation for torch.nn.Linear>>

Applies a linear transformation to the incoming data: :math:`y = xA^T + b`

    This module supports :ref:`TensorFloat32<tf32_on_ampere>`.

    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.

    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        bias: If set to ``False``, the layer will not learn an additive bias.
            Default: ``True``

    Shape:
        - Input: :math:`(*, H_{in})` where :math:`*` means any number of
          dimensions including none and :math:`H_{in} = \text{in\_features}`.
        - Output: :math:`(*, H_{out})` where all but the last dimension
          are the same shape as the input and :math:`H_{out} = \text{out\_features}`.

    Attributes:
        weight: the learnable weights of the module of shape
            :

In [173]:

input = torch.randn(32, 10)  # Example input
output = torch_model(input)  # Forward pass


## Unittest

In [178]:
# Class target is assumed

# The target to be converted
constructor = 'torch.nn.Linear'

# The function name representing the functions properly. Ex) log, do_something.
# Option1 : Use the name of the first function. (preferred)
# Option2 : Ask AI.
# No matter what we choose, we must refine them.
class_name = "linear_layer"

phase1_output = """
torch_model = torch.nn.Linear(
    in_features=10,
    out_features=5,
    bias=True
)
"""

# Inputs as a list generated by AI.
# It will be unpacked to be passed.
# Would kwargs be better than args? It would be nice to try both.
inputs = 'torch.randn(32, 10)'

# Another inputs are required to generate the target instance.
# Try kwargs here.
construction_args = r"{'in_channels':10, 'out_channels':10, 'kernel_size'=1}"

template = f"""
def test_{class_name}_converter(self):
    # Initialize the model directly from its constructor
    {phase1_output}
    torch_model.eval()
	# Initialize the model and input tensor
    inputs = {inputs}

    # Convert the model and ensure the HTML trace is saved
    keras_model = nobuco.pytorch_to_keras(
        torch_model,
        args=[*inputs], kwargs=None,
        inputs_channel_order=nobuco.ChannelOrder.TENSORFLOW,
        outputs_channel_order=nobuco.ChannelOrder.TENSORFLOW,
        save_trace_html=True
    )

    # Read the contents of the trace.html file
    with open('trace.html', 'r', encoding='utf-8') as file:
        trace_html = file.read()

    # Assertions for the content of trace_html
    self.assertNotIn('Max diff', trace_html, "The trace HTML should not contain 'Max diff'")
"""

In [207]:
def generate_unittest_template(class_name, phase1_output, inputs):
    return f"""
def test_{class_name}_converter(self):
    # Initialize the model directly from its constructor
    {phase1_output}
    torch_model.eval()
	# Initialize the model and input tensor
    {inputs}

    # Convert the model and ensure the HTML trace is saved
    keras_model = nobuco.pytorch_to_keras(
        torch_model,
        args=[*inputs], kwargs=None,
        inputs_channel_order=nobuco.ChannelOrder.TENSORFLOW,
        outputs_channel_order=nobuco.ChannelOrder.TENSORFLOW,
        save_trace_html=True
    )

    # Read the contents of the trace.html file
    with open('trace.html', 'r', encoding='utf-8') as file:
        trace_html = file.read()

    # Assertions for the content of trace_html
    self.assertNotIn('Max diff', trace_html, "The trace HTML should not contain 'Max diff'")
"""

In [179]:
print(template)


def test_linear_layer_converter(self):
    # Initialize the model directly from its constructor
    
torch_model = torch.nn.Linear(
    in_features=10,
    out_features=5,
    bias=True
)

    torch_model.eval()
	# Initialize the model and input tensor
    inputs = torch.randn(32, 10)

    # Convert the model and ensure the HTML trace is saved
    keras_model = nobuco.pytorch_to_keras(
        torch_model,
        args=[*inputs], kwargs=None,
        inputs_channel_order=nobuco.ChannelOrder.TENSORFLOW,
        outputs_channel_order=nobuco.ChannelOrder.TENSORFLOW,
        save_trace_html=True
    )

    # Read the contents of the trace.html file
    with open('trace.html', 'r', encoding='utf-8') as file:
        trace_html = file.read()

    # Assertions for the content of trace_html
    self.assertNotIn('Max diff', trace_html, "The trace HTML should not contain 'Max diff'")



### New Target

In [196]:
def generate_phase1_prompt(target, doc, example):
    return f'''

<<Instruction>>

Given the context, complete the <<Task>>.

<<Documentation for {target}>>

{doc}

<<Example>>
{example}

<<Task>>

Write the code lines to initiate an instance of the target module in python code block.

target: {target}

output:


'''

In [185]:
def generate_phase2_prompt(target, doc, example, phase1_output):
    return f'''

<<Instruction>>

Given the context, complete the <<Task>>.

<<Documentation for {target}>>

{doc}

<<Example>>
{example}

<<Task>>

Write the inputs of the target's forward function, and the line of forward as well in python code block.

target: {phase1_output}

output:

'''

In [197]:
target = "nn.modules.activation.MultiheadAttention"

In [198]:
class_doc = nn.modules.activation.MultiheadAttention.__doc__

In [199]:
print(generate_phase1_prompt(target, class_doc, phase1_example))



<<Instruction>>

Given the context, complete the <<Task>>.

<<Documentation for nn.modules.activation.MultiheadAttention>>

Allows the model to jointly attend to information
    from different representation subspaces as described in the paper:
    `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_.

    Multi-Head Attention is defined as:

    .. math::
        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O

    where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.

    ``nn.MultiHeadAttention`` will use the optimized implementations of
    ``scaled_dot_product_attention()`` when possible.

    In addition to support for the new ``scaled_dot_product_attention()``
    function, for speeding up Inference, MHA will use
    fastpath inference with support for Nested Tensors, iff:

    - self attention is being computed (i.e., ``query``, ``key``, and ``value`` are the same tensor).
    - inputs are batched (3D) with ``batch_first==True``
    - E

```python
torch_model = nn.modules.activation.MultiheadAttention(
    embed_dim=256,
    num_heads=8,
    dropout=0.1,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    kdim=None,
    vdim=None,
    batch_first=False
)
```

In [200]:
torch_model = nn.modules.activation.MultiheadAttention(
    embed_dim=256,
    num_heads=8,
    dropout=0.1,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    kdim=None,
    vdim=None,
    batch_first=False
)

In [201]:
phase1_output = """
torch_model = nn.modules.activation.MultiheadAttention(
    embed_dim=256,
    num_heads=8,
    dropout=0.1,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    kdim=None,
    vdim=None,
    batch_first=False
)
"""

In [202]:
print(generate_phase2_prompt(target, class_doc, phase2_example, phase1_output))



<<Instruction>>

Given the context, complete the <<Task>>.

<<Documentation for nn.modules.activation.MultiheadAttention>>

Allows the model to jointly attend to information
    from different representation subspaces as described in the paper:
    `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_.

    Multi-Head Attention is defined as:

    .. math::
        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O

    where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.

    ``nn.MultiHeadAttention`` will use the optimized implementations of
    ``scaled_dot_product_attention()`` when possible.

    In addition to support for the new ``scaled_dot_product_attention()``
    function, for speeding up Inference, MHA will use
    fastpath inference with support for Nested Tensors, iff:

    - self attention is being computed (i.e., ``query``, ``key``, and ``value`` are the same tensor).
    - inputs are batched (3D) with ``batch_first==True``
    - E

```python
query = torch.randn(32, 10, 64)  # Example query tensor
key = torch.randn(32, 20, 64)  # Example key tensor
value = torch.randn(32, 20, 64)  # Example value tensor
output, attn_output_weights = torch_model(query, key, value)
```

In [210]:
torch_model = nn.modules.activation.MultiheadAttention(
    embed_dim=256,
    num_heads=8,
    dropout=0.1,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    kdim=None,
    vdim=None,
    batch_first=False
)

query = torch.randn(32, 10, 64)  # Example query tensor
key = torch.randn(32, 20, 64)  # Example key tensor
value = torch.randn(32, 20, 64)  # Example value tensor
output, attn_output_weights = torch_model(query, key, value)


AssertionError: was expecting embedding dimension of 256, but got 64

In [211]:
debugging_template = """

<<Executed Code>>

torch_model = nn.modules.activation.MultiheadAttention(
    embed_dim=256,
    num_heads=8,
    dropout=0.1,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    kdim=None,
    vdim=None,
    batch_first=False
)

query = torch.randn(32, 10, 64)  # Example query tensor
key = torch.randn(32, 20, 64)  # Example key tensor
value = torch.randn(32, 20, 64)  # Example value tensor
output, attn_output_weights = torch_model(query, key, value)

<<Error>>

File ~/.local/lib/python3.10/site-packages/torch/nn/modules/activation.py:1241, in MultiheadAttention.forward(self, query, key, value, key_padding_mask, need_weights, attn_mask, average_attn_weights, is_causal)
   1227     attn_output, attn_output_weights = F.multi_head_attention_forward(
   1228         query, key, value, self.embed_dim, self.num_heads,
   1229         self.in_proj_weight, self.in_proj_bias,
   (...)
   1238         average_attn_weights=average_attn_weights,
   1239         is_causal=is_causal)
   1240 else:
-> 1241     attn_output, attn_output_weights = F.multi_head_attention_forward(
   1242         query, key, value, self.embed_dim, self.num_heads,
   1243         self.in_proj_weight, self.in_proj_bias,
   1244         self.bias_k, self.bias_v, self.add_zero_attn,
   1245         self.dropout, self.out_proj.weight, self.out_proj.bias,
   1246         training=self.training,
   1247         key_padding_mask=key_padding_mask,
   1248         need_weights=need_weights,
   1249         attn_mask=attn_mask,
   1250         average_attn_weights=average_attn_weights,
   1251         is_causal=is_causal)
   1252 if self.batch_first and is_batched:
   1253     return attn_output.transpose(1, 0), attn_output_weights

File ~/.local/lib/python3.10/site-packages/torch/nn/functional.py:5280, in multi_head_attention_forward(query, key, value, embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias, bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight, out_proj_bias, training, key_padding_mask, need_weights, attn_mask, use_separate_proj_weight, q_proj_weight, k_proj_weight, v_proj_weight, static_k, static_v, average_attn_weights, is_causal)
   5274     if key_padding_mask is not None:
   5275         # We have the attn_mask, and use that to merge kpm into it.
   5276         # Turn off use of is_causal hint, as the merged mask is no
   5277         # longer causal.
   5278         is_causal = False
-> 5280 assert embed_dim == embed_dim_to_check, \
   5281     f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}"
   5282 if isinstance(embed_dim, torch.Tensor):
   5283     # embed_dim can be a tensor when JIT tracing
   5284     head_dim = embed_dim.div(num_heads, rounding_mode='trunc')

AssertionError: was expecting embedding dimension of 256, but got 64

<<Task>>

Given the error, rewrite the code in python code block.

"""

In [208]:
query = torch.randn(32, 10, 64)  # Example query tensor
key = torch.randn(32, 20, 64)  # Example key tensor
value = torch.randn(32, 20, 64)  # Example value tensor
inputs = """
query = torch.randn(32, 10, 64)  # Example query tensor
key = torch.randn(32, 20, 64)  # Example key tensor
value = torch.randn(32, 20, 64)  # Example value tensor
inputs = (query, key, value)
"""



In [214]:
print(class_doc)

Allows the model to jointly attend to information
    from different representation subspaces as described in the paper:
    `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_.

    Multi-Head Attention is defined as:

    .. math::
        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O

    where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.

    ``nn.MultiHeadAttention`` will use the optimized implementations of
    ``scaled_dot_product_attention()`` when possible.

    In addition to support for the new ``scaled_dot_product_attention()``
    function, for speeding up Inference, MHA will use
    fastpath inference with support for Nested Tensors, iff:

    - self attention is being computed (i.e., ``query``, ``key``, and ``value`` are the same tensor).
    - inputs are batched (3D) with ``batch_first==True``
    - Either autograd is disabled (using ``torch.inference_mode`` or ``torch.no_grad``) or no tensor argument ``requires_grad``
    -

In [None]:
query = torch.randn(32, 10, 64)  # Example query tensor
key = torch.randn(32, 20, 64)  # Example key tensor
value = torch.randn(32, 20, 64)  # Example value tensor
inputs = (query, key, value)

```python
torch_model = nn.modules.activation.MultiheadAttention(
    embed_dim=64,
    num_heads=8,
    dropout=0.1,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    kdim=None,
    vdim=None,
    batch_first=False
)

query = torch.randn(32, 10, 64)  # Example query tensor
key = torch.randn(32, 20, 64)  # Example key tensor
value = torch.randn(32, 20, 64)  # Example value tensor
output, attn_output_weights = torch_model(query, key, value)
```

In [212]:

torch_model = nn.modules.activation.MultiheadAttention(
    embed_dim=64,
    num_heads=8,
    dropout=0.1,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    kdim=None,
    vdim=None,
    batch_first=False
)

query = torch.randn(32, 10, 64)  # Example query tensor
key = torch.randn(32, 20, 64)  # Example key tensor
value = torch.randn(32, 20, 64)  # Example value tensor
output, attn_output_weights = torch_model(query, key, value)


RuntimeError: shape '[32, 80, 8]' is invalid for input of size 40960

In [None]:
"""

<<Executed Code>>


torch_model = nn.modules.activation.MultiheadAttention(
    embed_dim=64,
    num_heads=8,
    dropout=0.1,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    kdim=None,
    vdim=None,
    batch_first=False
)

query = torch.randn(32, 10, 64)  # Example query tensor
key = torch.randn(32, 20, 64)  # Example key tensor
value = torch.randn(32, 20, 64)  # Example value tensor
output, attn_output_weights = torch_model(query, key, value)


<<Error>>

File ~/.local/lib/python3.10/site-packages/torch/nn/modules/activation.py:1241, in MultiheadAttention.forward(self, query, key, value, key_padding_mask, need_weights, attn_mask, average_attn_weights, is_causal)
   1227     attn_output, attn_output_weights = F.multi_head_attention_forward(
   1228         query, key, value, self.embed_dim, self.num_heads,
   1229         self.in_proj_weight, self.in_proj_bias,
   (...)
   1238         average_attn_weights=average_attn_weights,
   1239         is_causal=is_causal)
   1240 else:
-> 1241     attn_output, attn_output_weights = F.multi_head_attention_forward(
   1242         query, key, value, self.embed_dim, self.num_heads,
   1243         self.in_proj_weight, self.in_proj_bias,
   1244         self.bias_k, self.bias_v, self.add_zero_attn,
   1245         self.dropout, self.out_proj.weight, self.out_proj.bias,
   1246         training=self.training,
   1247         key_padding_mask=key_padding_mask,
   1248         need_weights=need_weights,
   1249         attn_mask=attn_mask,
   1250         average_attn_weights=average_attn_weights,
   1251         is_causal=is_causal)
   1252 if self.batch_first and is_batched:
   1253     return attn_output.transpose(1, 0), attn_output_weights

File ~/.local/lib/python3.10/site-packages/torch/nn/functional.py:5346, in multi_head_attention_forward(query, key, value, embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias, bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight, out_proj_bias, training, key_padding_mask, need_weights, attn_mask, use_separate_proj_weight, q_proj_weight, k_proj_weight, v_proj_weight, static_k, static_v, average_attn_weights, is_causal)
   5344 q = q.view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
   5345 if static_k is None:
-> 5346     k = k.view(k.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
   5347 else:
   5348     # TODO finish disentangling control flow so we don't do in-projections when statics are passed
   5349     assert static_k.size(0) == bsz * num_heads, \
   5350         f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}"

RuntimeError: shape '[32, 80, 8]' is invalid for input of size 40960

<<Task>>

Given the error, rewrite the code in python code block.

"""

In [217]:
torch_model = nn.modules.activation.MultiheadAttention(
    embed_dim=64,
    num_heads=8,
    dropout=0.1,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    kdim=None,
    vdim=None,
    batch_first=False
)

query = torch.randn(32, 10, 64)  # Example query tensor
key = torch.randn(32, 20, 64)  # Example key tensor
value = torch.randn(32, 20, 64)  # Example value tensor
output, attn_output_weights = torch_model(query, key, value)

RuntimeError: shape '[32, 80, 8]' is invalid for input of size 40960

In [216]:

torch_model = nn.modules.activation.MultiheadAttention(
    embed_dim=64,
    num_heads=8,
    dropout=0.1,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    kdim=None,
    vdim=None,
    batch_first=True  # Change batch_first to True
)

query = torch.randn(32, 10, 64)  # Example query tensor
key = torch.randn(32, 20, 64)  # Example key tensor
value = torch.randn(32, 20, 64)  # Example value tensor
output, attn_output_weights = torch_model(query, key, value)

In [209]:
print(generate_unittest_template("MultiheadAttention", phase1_output, inputs))


def test_MultiheadAttention_converter(self):
    # Initialize the model directly from its constructor
    
torch_model = nn.modules.activation.MultiheadAttention(
    embed_dim=256,
    num_heads=8,
    dropout=0.1,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    kdim=None,
    vdim=None,
    batch_first=False
)

    torch_model.eval()
	# Initialize the model and input tensor
    
query = torch.randn(32, 10, 64)  # Example query tensor
key = torch.randn(32, 20, 64)  # Example key tensor
value = torch.randn(32, 20, 64)  # Example value tensor
inputs = (query, key, value)


    # Convert the model and ensure the HTML trace is saved
    keras_model = nobuco.pytorch_to_keras(
        torch_model,
        args=[*inputs], kwargs=None,
        inputs_channel_order=nobuco.ChannelOrder.TENSORFLOW,
        outputs_channel_order=nobuco.ChannelOrder.TENSORFLOW,
        save_trace_html=True
    )

    # Read the contents of the trace.html file
    with open('trace.html', 'r', en

### GPT4 32K fhgenie


```python
multihead_attn = nn.MultiheadAttention(
    embed_dim=512, 
    num_heads=8
)
```

In [236]:
multihead_attn = nn.modules.activation.MultiheadAttention(
    embed_dim=512, 
    num_heads=8
)

In [237]:
phase1_output = '''
multihead_attn = nn.MultiheadAttention(
    embed_dim=512, 
    num_heads=8
)
'''

In [238]:
print(generate_phase2_prompt(target, class_doc, phase2_example, phase1_output))



<<Instruction>>

Given the context, complete the <<Task>>.

<<Documentation for nn.modules.activation.MultiheadAttention>>

Allows the model to jointly attend to information
    from different representation subspaces as described in the paper:
    `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_.

    Multi-Head Attention is defined as:

    .. math::
        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O

    where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.

    ``nn.MultiHeadAttention`` will use the optimized implementations of
    ``scaled_dot_product_attention()`` when possible.

    In addition to support for the new ``scaled_dot_product_attention()``
    function, for speeding up Inference, MHA will use
    fastpath inference with support for Nested Tensors, iff:

    - self attention is being computed (i.e., ``query``, ``key``, and ``value`` are the same tensor).
    - inputs are batched (3D) with ``batch_first==True``
    - E

In [239]:
query = torch.randn(5, 32, 512)
key = torch.randn(5, 32, 512)
value = torch.randn(5, 32, 512)
output, weights = multihead_attn.forward(query, key, value)
inputs_tensor = (query, key, value)
inputs = """
query = torch.randn(5, 32, 512)
key = torch.randn(5, 32, 512)
value = torch.randn(5, 32, 512)
inputs = [query, key, value]

"""

In [240]:
inputs_tensor = (query, key, value)

In [223]:
print(generate_unittest_template("MultiheadAttention", phase1_output, inputs))


def test_MultiheadAttention_converter(self):
    # Initialize the model directly from its constructor
    
multihead_attn = nn.MultiheadAttention(
    embed_dim=512, 
    num_heads=8
)

    torch_model.eval()
	# Initialize the model and input tensor
    
query = torch.randn(5, 32, 512)
key = torch.randn(5, 32, 512)
value = torch.randn(5, 32, 512)
inputs = (query, key, value)



    # Convert the model and ensure the HTML trace is saved
    keras_model = nobuco.pytorch_to_keras(
        torch_model,
        args=[*inputs], kwargs=None,
        inputs_channel_order=nobuco.ChannelOrder.TENSORFLOW,
        outputs_channel_order=nobuco.ChannelOrder.TENSORFLOW,
        save_trace_html=True
    )

    # Read the contents of the trace.html file
    with open('trace.html', 'r', encoding='utf-8') as file:
        trace_html = file.read()

    # Assertions for the content of trace_html
    self.assertNotIn('Max diff', trace_html, "The trace HTML should not contain 'Max diff'")



test failed

In [242]:
type(multihead_attn)

torch.nn.modules.activation.MultiheadAttention

In [249]:
import nobuco 
import torch
import torch.nn as nn


torch_model = nn.MultiheadAttention(
    embed_dim=512, 
    num_heads=8,
    batch_first=True
)

query = torch.randn(1, 32, 512)
key = torch.randn(1, 32, 512)
value = torch.randn(1, 32, 512)
output, weights = torch_model.forward(query, key, value)
inputs = (query, key, value)

keras_model = nobuco.pytorch_to_keras(
    torch_model,
    args=[*inputs_tensor], kwargs=None,
    inputs_channel_order=nobuco.ChannelOrder.TENSORFLOW,
    outputs_channel_order=nobuco.ChannelOrder.TENSORFLOW,
)


Traceback (most recent call last):
  File "/workspaces/nobuco/nobuco/converters/validation.py", line 47, in validate
    diffs = validate_diff_default(keras_op, pytorch_op, input_args, input_kwargs, output_tensors)
  File "/workspaces/nobuco/nobuco/converters/validation.py", line 89, in validate_diff_default
    raise Exception(f"Tensor shapes of output #{i} don't match: (Pytorch) {list(t_pt.shape)} vs {list(t_tf.shape)} (Tensorflow)")
Exception: Tensor shapes of output #1 don't match: (Pytorch) [5, 32, 32] vs [5, 8, 32, 32] (Tensorflow)


Legend:
    [32mGreen[0m — conversion successful
    [33mYellow[0m — conversion imprecise
    [31mRed[0m — conversion failed
    [31m[7mRed[0m — no converter found
    [0m[1mBold[0m — conversion applied directly
    * — subgraph reused
    [7mTensor[0m — this output is not dependent on any of subgraph's input tensors
    [4mTensor[0m — this input is a parameter / constant
    [90mTensor[0m — this tensor is useless

[90m I [0m[90m File "/workspaces/nobuco/nobuco/trace/trace.py", line 460[0m 
[90m D [0m[90m File "/home/codespace/.local/lib/python3.10/site-packages/torch/nn/modules/activation.py", line 906 [0m 
[31m[1m C [0m[90m File "/workspaces/nobuco/nobuco/node_converters/attention.py", line 14 [0m 
[31m[1mMultiheadAttention[torch.nn.modules.activation][0m(float32_0<5,32,512>[0m, float32_1<5,32,512>[0m, float32_2<5,32,512>[0m) -> (float32_37<5,32,512>[0m, float32_36<5,32,32>[0m)
[31m[1m │ [0m [0mtranspose[torch.Tensor][0m(float32_0<5,32,512>