In [1]:
import torch
import sys
sys.path.insert(0, "./src")
from glonet import Glonet

Error importing huggingface_hub.hf_api: No module named 'filelock'




In [2]:
if torch.cuda.is_available():
    print("CUDA is available.")
    print(f"Device name: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA is not available.")

CUDA is available.
Device name: NVIDIA H100 NVL


In [3]:
model_path = "/Odyssey/public/glonet/TrainedWeights/glonet_p1.pt"

In [4]:
model = torch.jit.load(model_path, map_location=torch.device('cuda'))

In [5]:
# Move model to device, freeze all parameters, and prepare a gradient-tracked input
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Freeze parameters so only input can receive gradients
for p in model.parameters():
    p.requires_grad = False
model.eval()

# Try to inspect forward signature to guide a dummy input (best-effort).
schema = None
try:
    if hasattr(model, 'forward'):
        try:
            schema = str(model.forward.schema)
        except Exception:
            schema = None
except Exception:
    schema = None
print('forward schema:', schema)

# Fallback dummy input: adjust shape if your model expects different dims
# Common shape: (batch, channels, H, W). Replace as needed before running.
x = torch.randn(1, 2, 5, 672, 1440, device=device, requires_grad=True)

# Run forward; models may return tensor, tuple, or dict depending on implementation
try:
    out = model(x)
    print('forward executed')
except Exception as e:
    print('forward failed:', e)
    out = None

# Inspect output and attempt a backward pass to verify gradient flows into input
if out is None:
    print('No output to backprop')
else:
    # pick a tensor to reduce to scalar for backward
    if torch.is_tensor(out):
        loss = out.sum()
    elif isinstance(out, (tuple, list)):
        out_t = None
        for o in out:
            if torch.is_tensor(o):
                out_t = o
                break
        if out_t is None:
            print('no tensor in output to backprop')
            loss = None
        else:
            loss = out_t.sum()
    elif isinstance(out, dict):
        # pick the first tensor value in dict
        loss = None
        for v in out.values():
            if torch.is_tensor(v):
                loss = v.sum()
                break
    else:
        loss = None

    if loss is not None:
        loss.backward()
        if x.grad is not None:
            print('input.grad norm:', x.grad.norm().item())
        else:
            print('input.grad is None — gradient did not flow into input')
    else:
        print('Could not construct a scalar loss for backward')

forward schema: forward(__torch__.torch.nn.modules.container.___torch_mangle_1012.Sequential self, Tensor input) -> Tensor
forward executed


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [6]:
model.to('cpu')
torch.cuda.empty_cache()

**Why the error happens** (short)

- That RuntimeError occurs because you called backward() on a tensor that does not require gradients and has no grad_fn. In other words the tensor is not connected to any autograd graph.
- Common reasons in your setup:
 - The object you call backward() on is a Python float or a tensor with requires_grad=False (e.g., you summed or converted to .item(), .numpy(), or detached it).
 - The model's forward executed inside a torch.no_grad() block or used .detach() / .cpu().numpy() somewhere, preventing gradients.
 - The TorchScript module you loaded is an inference-only graph that returns detached results (or its forward uses no_grad()).
 - You froze parameters (that’s fine for input grads) but the forward still prevents grad flow.

In [7]:
# Diagnostic helper: finds first tensor in output and prints grad info
def find_tensor(o):
    if torch.is_tensor(o):
        return o
    if isinstance(o, (list, tuple)):
        for v in o:
            t = find_tensor(v)
            if t is not None:
                return t
    if isinstance(o, dict):
        for v in o.values():
            t = find_tensor(v)
            if t is not None:
                return t
    return None

out_t = find_tensor(out)
print("x.requires_grad:", getattr(x, "requires_grad", None))
if out_t is None:
    print("No tensor found in model output (out is None or not tensor-like).")
else:
    print("out_t type:", type(out_t))
    print("out_t.requires_grad:", out_t.requires_grad)
    print("out_t.grad_fn:", out_t.grad_fn)
    # If it does not require grad, show likely suspects
    if not out_t.requires_grad:
        print("-> Output does not require grad. Check for torch.no_grad(), .detach(), or conversions to numpy/float inside forward.")
    else:
        # safe backward check
        loss = out_t.sum()
        print("loss.requires_grad:", loss.requires_grad)
        loss.backward()
        print("x.grad is None?:", x.grad is None)
        if x.grad is not None:
            print("x.grad norm:", x.grad.norm().item())

x.requires_grad: True
out_t type: <class 'torch.Tensor'>
out_t.requires_grad: False
out_t.grad_fn: None
-> Output does not require grad. Check for torch.no_grad(), .detach(), or conversions to numpy/float inside forward.


***How to interpret results and fixes***

- If `out_t.requires_grad` is False:
  - Inspect your model forward for torch.no_grad(), .detach(), or explicit .cpu().numpy() / .item() conversions. Remove or alter them so the forward keeps tensors connected to the graph.
  - If you loaded a scripted/inference-only TorchScript (from tracing or an exported inference build), load the original model class + state_dict instead (instantiate `Glonet(...)`, load state_dict) so autograd is available.
- If the output is a Python float or you see `loss` created from `.item()` / `.numpy()`, don't convert to float before backward; keep it as a tensor.
- If `x.requires_grad` is False (shouldn’t be in your code): recreate x with requires_grad=True.
- If model intentionally uses `with torch.no_grad()` for some ops, remove that block for the ops that must be differentiable, or implement a separate "differentiable forward" for optimization.

If you want, I can:

- Edit the notebook to replace the fallback dummy input shape with the correct shape automatically (I can try to infer it), and add the diagnostic prints directly into the cell.
- Help load the model from source/state_dict instead of using `torch.jit.load` (if you have the class definition and checkpoint).


In [8]:
new_model = Glonet(dim=(2, 5, 672, 1440))

In [9]:
new_model

Glonet(
  (space): mspace(
    (norm): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
    (enc): Sequential(
      (0): Inception(
        (conv1): Conv2d(128, 32, kernel_size=(1, 1), stride=(1, 1))
        (lls): Sequential(
          (0): GroupConv2d(
            (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=8)
            (norm): GroupNorm(8, 64, eps=1e-05, affine=True)
            (activate): LeakyReLU(negative_slope=0.2, inplace=True)
          )
          (1): GroupConv2d(
            (conv): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2), groups=8)
            (norm): GroupNorm(8, 64, eps=1e-05, affine=True)
            (activate): LeakyReLU(negative_slope=0.2, inplace=True)
          )
          (2): GroupConv2d(
            (conv): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=8)
            (norm): GroupNorm(8, 64, eps=1e-05, affine=True)
            (activate): LeakyReLU(negative_slope=0.2,

In [10]:
model

RecursiveScriptModule(
  original_name=Sequential
  (0): RecursiveScriptModule(
    original_name=Glonet
    (jump): RecursiveScriptModule(
      original_name=residual
      (maps): RecursiveScriptModule(
        original_name=Encoder
        (enc): RecursiveScriptModule(
          original_name=Sequential
          (0): RecursiveScriptModule(
            original_name=ConvSC
            (conv): RecursiveScriptModule(
              original_name=BasicConv2d
              (conv): RecursiveScriptModule(original_name=Conv2d)
              (norm): RecursiveScriptModule(original_name=GroupNorm)
              (act): RecursiveScriptModule(original_name=LeakyReLU)
            )
          )
          (1): RecursiveScriptModule(
            original_name=ConvSC
            (conv): RecursiveScriptModule(
              original_name=BasicConv2d
              (conv): RecursiveScriptModule(original_name=Conv2d)
              (norm): RecursiveScriptModule(original_name=GroupNorm)
              (act)

Inspect `model` contents — diagnostic snippet below.

In [11]:
# diagnostic
print("type(model):", type(model))
print("hasattr(model, 'state_dict'):", hasattr(model, 'state_dict'))
print("hasattr(model, 'named_parameters'):", hasattr(model, 'named_parameters'))
try:
    sd = model.state_dict()
    print("state_dict keys sample:", list(sd.keys())[:20])
    print("state_dict len:", len(sd))
except Exception as e:
    print("state_dict() not usable:", e)

# also inspect a sample of named parameters/buffers
if hasattr(model, 'named_parameters'):
    print("sample named_parameters:", [n for n, _ in list(model.named_parameters())[:20]])
if hasattr(model, 'named_buffers'):
    print("sample named_buffers:", [n for n, _ in list(model.named_buffers())[:20]])

type(model): <class 'torch.jit._script.RecursiveScriptModule'>
hasattr(model, 'state_dict'): True
hasattr(model, 'named_parameters'): True
state_dict keys sample: ['0.jump.maps.enc.0.conv.conv.weight', '0.jump.maps.enc.0.conv.conv.bias', '0.jump.maps.enc.0.conv.norm.weight', '0.jump.maps.enc.0.conv.norm.bias', '0.jump.maps.enc.1.conv.conv.weight', '0.jump.maps.enc.1.conv.conv.bias', '0.jump.maps.enc.1.conv.norm.weight', '0.jump.maps.enc.1.conv.norm.bias', '0.jump.maps.enc.2.conv.conv.weight', '0.jump.maps.enc.2.conv.conv.bias', '0.jump.maps.enc.2.conv.norm.weight', '0.jump.maps.enc.2.conv.norm.bias', '0.jump.maps.enc.3.conv.conv.weight', '0.jump.maps.enc.3.conv.conv.bias', '0.jump.maps.enc.3.conv.norm.weight', '0.jump.maps.enc.3.conv.norm.bias', '0.jump.mmb.enc.0.conv1.weight', '0.jump.mmb.enc.0.conv1.bias', '0.jump.mmb.enc.0.lls.0.conv.weight', '0.jump.mmb.enc.0.lls.0.conv.bias']
state_dict len: 1033
sample named_parameters: ['0.jump.maps.enc.0.conv.conv.weight', '0.jump.maps.enc.0.co

If `state_dict` available: `use new_model.load_state_dict(...)`.

In [12]:
# try re-loading file as a dict first (safe, no overwrite of 'model' variable)
ck = torch.jit.load(model_path, map_location=torch.device('cuda'))
if isinstance(ck, dict):
    # common wrappers
    if 'model_state_dict' in ck:
        sd = ck['model_state_dict']
    elif 'state_dict' in ck:
        sd = ck['state_dict']
    else:
        sd = ck
    # Attempt to load
    missing, unexpected = new_model.load_state_dict(sd, strict=False)
    print("missing keys:", missing)
    print("unexpected keys:", unexpected)
else:
    print("torch.load returned non-dict (it's probably a ScriptModule).")

torch.load returned non-dict (it's probably a ScriptModule).


If `model.state_dict()` is available (ScriptModule may support this), try direct load

In [13]:
try:
    sd = model.state_dict()
    missing, unexpected = new_model.load_state_dict(sd, strict=False)
    print("loaded state_dict -> missing:", missing, " unexpected:", unexpected)
except Exception as e:
    print("model.state_dict() not available:", e)

loaded state_dict -> missing: ['space.norm.weight', 'space.norm.bias', 'space.enc.0.conv1.weight', 'space.enc.0.conv1.bias', 'space.enc.0.lls.0.conv.weight', 'space.enc.0.lls.0.conv.bias', 'space.enc.0.lls.0.norm.weight', 'space.enc.0.lls.0.norm.bias', 'space.enc.0.lls.1.conv.weight', 'space.enc.0.lls.1.conv.bias', 'space.enc.0.lls.1.norm.weight', 'space.enc.0.lls.1.norm.bias', 'space.enc.0.lls.2.conv.weight', 'space.enc.0.lls.2.conv.bias', 'space.enc.0.lls.2.norm.weight', 'space.enc.0.lls.2.norm.bias', 'space.enc.1.conv1.weight', 'space.enc.1.conv1.bias', 'space.enc.1.lls.0.conv.weight', 'space.enc.1.lls.0.conv.bias', 'space.enc.1.lls.0.norm.weight', 'space.enc.1.lls.0.norm.bias', 'space.enc.1.lls.1.conv.weight', 'space.enc.1.lls.1.conv.bias', 'space.enc.1.lls.1.norm.weight', 'space.enc.1.lls.1.norm.bias', 'space.enc.1.lls.2.conv.weight', 'space.enc.1.lls.2.conv.bias', 'space.enc.1.lls.2.norm.weight', 'space.enc.1.lls.2.norm.bias', 'space.enc.2.conv1.weight', 'space.enc.2.conv1.bias',

Otherwise: copy named_parameters and named_buffers, or fall back to shape-based mapping.

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
new_model.to(device)

# collect source params/buffers
src_params = {n: p for n, p in model.named_parameters()} if hasattr(model, 'named_parameters') else {}
src_bufs   = {n: b for n, b in model.named_buffers()}    if hasattr(model, 'named_buffers')    else {}

copied = []
with torch.no_grad():
    for name, tgt in new_model.named_parameters():
        src = src_params.get(name)
        if src is None:
            continue
        if tuple(src.shape) == tuple(tgt.shape):
            tgt.copy_(src.to(tgt.device).to(tgt.dtype))
            copied.append(name)

    for name, tgt in new_model.named_buffers():
        src = src_bufs.get(name)
        if src is None:
            continue
        if tuple(src.shape) == tuple(tgt.shape):
            tgt.copy_(src.to(tgt.device).to(tgt.dtype))

print("params copied by name:", len(copied))

params copied by name: 0


In [15]:
# build lists of remaining src/tgt tensors
remaining_src = [p for n, p in src_params.items() if n not in set(copied)]
remaining_tgt = [(n, p) for n, p in new_model.named_parameters() if n not in set(copied)]

with torch.no_grad():
    for tgt_name, tgt in remaining_tgt:
        for i, src in enumerate(remaining_src):
            if tuple(src.shape) == tuple(tgt.shape):
                tgt.copy_(src.to(tgt.device).to(tgt.dtype))
                print(f"copied by shape: {tgt_name} <- src_index_{i} shape={src.shape}")
                remaining_src.pop(i)
                break

copied by shape: space.norm.weight <- src_index_568 shape=torch.Size([64])
copied by shape: space.norm.bias <- src_index_585 shape=torch.Size([64])
copied by shape: space.enc.0.conv1.bias <- src_index_289 shape=torch.Size([32])
copied by shape: space.enc.0.lls.0.conv.bias <- src_index_601 shape=torch.Size([64])
copied by shape: space.enc.0.lls.0.norm.weight <- src_index_618 shape=torch.Size([64])
copied by shape: space.enc.0.lls.0.norm.bias <- src_index_635 shape=torch.Size([64])
copied by shape: space.enc.0.lls.1.conv.bias <- src_index_652 shape=torch.Size([64])
copied by shape: space.enc.0.lls.1.norm.weight <- src_index_669 shape=torch.Size([64])
copied by shape: space.enc.0.lls.1.norm.bias <- src_index_686 shape=torch.Size([64])
copied by shape: space.enc.0.lls.2.conv.bias <- src_index_871 shape=torch.Size([64])
copied by shape: space.enc.0.lls.2.norm.weight <- src_index_888 shape=torch.Size([64])
copied by shape: space.enc.0.lls.2.norm.bias <- src_index_905 shape=torch.Size([64])
c

- If `model` is a traced/inference TorchScript that explicitly uses `torch.no_grad()` or detaches its outputs, gradients cannot flow even if you copy weights; you'll need the original model class + state_dict to get a differentiable forward.
- Name mismatches are common when model code changed or when using wrappers (DataParallel, prefix differences). Manual mapping may be required.
- Always check shapes before copying; copying mismatched shapes will raise errors.
- After copying, set `new_model.eval()` or `.train()` as needed, and move it to the device.

In [16]:
new_model.eval()
# create a small dummy input matching new_model expectation and run a forward
x_test = torch.randn(1, 2, 5, 672, 1440, device=device)  # replace with correct dims
try:
    out = new_model(x_test)
    print("new_model forward OK, out type:", type(out))
except Exception as e:
    print("forward failed:", e)

new_model forward OK, out type: <class 'torch.Tensor'>


- ScriptModule often supports `state_dict()` and `named_parameters()`; prefer `state_dict()` when available.
- Always use `torch.no_grad()` when copying `.data` to avoid creating graph connections.
- Ensure dtype/device match: use `.to(tgt.device).to(tgt.dtype)` when copying.
- If names differ due to wrappers or refactoring, manual name mapping might be needed.
- If the ScriptModule was traced/inference-only, weights still copy fine; the problem you had earlier came from trying `torch.load` with the wrong flags — using `torch.jit.load` (what you already did) is correct.

Empty gpu's memory

In [17]:
import gc
model.to('cpu')
del x_test
del out
gc.collect()
torch.cuda.empty_cache()

try:
    torch.cuda.reset_peak_memory_stats()
except Exception:
    pass

In [18]:

# Test that new_model allows gradients w.r.t. input (freeze weights, track input)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
new_model.to(device)
# Freeze model weights
for p in new_model.parameters():
    p.requires_grad = False
new_model.eval()

# Create input with requires_grad=True. Adjust shape if your model expects different dims.
x = torch.randn(1, 2, 5, 672, 1440, device=device, requires_grad=True)
print('x.requires_grad:', x.requires_grad)

# Forward pass and inspect output's grad properties
try:
    out = new_model(x)
    print('forward ok, out type:', type(out))
except Exception as e:
    print('forward failed:', e)
    out = None

def first_tensor(o):
    if torch.is_tensor(o):
        return o
    if isinstance(o, (list, tuple)):
        for v in o:
            t = first_tensor(v)
            if t is not None:
                return t
    if isinstance(o, dict):
        for v in o.values():
            t = first_tensor(v)
            if t is not None:
                return t
    return None

if out is None:
    print('No output to test')
else:
    out_t = first_tensor(out)
    if out_t is None:
        print('No tensor found inside model output')
    else:
        print('out_t.shape:', getattr(out_t, 'shape', None))
        print('out_t.requires_grad:', out_t.requires_grad)
        print('out_t.grad_fn:', out_t.grad_fn)
        if not out_t.requires_grad:
            print('Output does not require grad — likely the forward detached tensors or used no_grad.')
        else:
            # backprop and check x.grad
            loss = out_t.sum()
            print('loss.requires_grad:', loss.requires_grad)
            loss.backward()
            if x.grad is None:
                print('x.grad is None — gradient did not flow into input')
            else:
                print('x.grad norm:', x.grad.norm().item())

x.requires_grad: True
forward ok, out type: <class 'torch.Tensor'>
out_t.shape: torch.Size([1, 5, 672, 1440])
out_t.requires_grad: True
out_t.grad_fn: <SelectBackward0 object at 0x71afede19d20>
loss.requires_grad: True
x.grad norm: 98.10212707519531


In [19]:
# Simple gradient descent on input `x` to verify gradients flow into the input
# If `x` exists from earlier cells we reuse it, otherwise create a random init.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
new_model.to(device)
for p in new_model.parameters():
    p.requires_grad = False
new_model.eval()

try:
    x_init = x.detach().clone().to(device)
    print('reusing existing `x` from earlier cells')
except NameError:
    x_init = torch.randn(1, 2, 5, 672, 1440, device=device)
    print('created new random `x_init`')

# Wrap the input as a Parameter so it can be optimized by an optimizer
x_param = torch.nn.Parameter(x_init, requires_grad=True)
opt = torch.optim.SGD([x_param], lr=1e-2, momentum=0.7)

start_norm = x_param.data.norm().item()
print(f'start x norm: {start_norm:.6f}')

# small training loop
steps = 100
for i in range(steps):
    opt.zero_grad()
    out = new_model(x_param)

    # helper to find first tensor in output
    def first_tensor(o):
        if torch.is_tensor(o):
            return o
        if isinstance(o, (list, tuple)):
            for v in o:
                t = first_tensor(v)
                if t is not None:
                    return t
        if isinstance(o, dict):
            for v in o.values():
                t = first_tensor(v)
                if t is not None:
                    return t
        return None

    out_t = first_tensor(out)
    if out_t is None:
        print('Model returned no tensor; cannot compute loss. Stopping.')
        break

    loss = out_t.sum()  # simple scalar to exercise gradients
    loss.backward()

    grad_norm = x_param.grad.norm().item() if x_param.grad is not None else float('nan')
    print(f'step {i:02d} loss={loss.item():.6e} x.grad_norm={grad_norm:.6e} x.norm={x_param.data.norm().item():.6f}')

    opt.step()

end_norm = x_param.data.norm().item()
print(f'end x norm: {end_norm:.6f} (changed by {end_norm - start_norm:.6f})')
print('final x_param.requires_grad:', x_param.requires_grad)
print('out_t.requires_grad:', getattr(out_t, 'requires_grad', None), 'out_t.grad_fn:', getattr(out_t, 'grad_fn', None))


reusing existing `x` from earlier cells
start x norm: 3111.773682
step 00 loss=3.222114e+04 x.grad_norm=9.810213e+01 x.norm=3111.773682
step 01 loss=3.212335e+04 x.grad_norm=9.829488e+01 x.norm=3111.771973
step 02 loss=3.196002e+04 x.grad_norm=9.815225e+01 x.norm=3111.770020
step 03 loss=3.174892e+04 x.grad_norm=9.814280e+01 x.norm=3111.768311
step 04 loss=3.150391e+04 x.grad_norm=9.867900e+01 x.norm=3111.768555
step 05 loss=3.123515e+04 x.grad_norm=9.872153e+01 x.norm=3111.771240
step 06 loss=3.094846e+04 x.grad_norm=9.851047e+01 x.norm=3111.777100
step 07 loss=3.065337e+04 x.grad_norm=9.836958e+01 x.norm=3111.785400
step 08 loss=3.035120e+04 x.grad_norm=9.775131e+01 x.norm=3111.797607
step 09 loss=3.004380e+04 x.grad_norm=9.781171e+01 x.norm=3111.813232
step 10 loss=2.973248e+04 x.grad_norm=9.803963e+01 x.norm=3111.832275
step 11 loss=2.941838e+04 x.grad_norm=9.788206e+01 x.norm=3111.854980
step 12 loss=2.910250e+04 x.grad_norm=9.806225e+01 x.norm=3111.881348
step 13 loss=2.878544e+0