{"payload":{"feedbackUrl":"https://github.com/orgs/community/discussions/53140","repo":{"id":88071101,"defaultBranch":"devel","name":"pytorch","ownerLogin":"csarofeen","currentUserCanPush":false,"isFork":true,"isEmpty":false,"createdAt":"2017-04-12T16:02:31.000Z","ownerAvatar":"https://avatars.githubusercontent.com/u/22205833?v=4","public":true,"private":false,"isOrgOwned":false},"refInfo":{"name":"","listCacheKey":"v0:1680023399.010941","currentOid":""},"activityList":{"items":[{"before":"609089bc27375bee121f6a18f9436dc376f7d8bf","after":"0d62871ea1a715ff94db2e3275cd39d50753cc17","ref":"refs/heads/llu/ln_backward_merge","pushedAt":"2023-04-20T00:48:42.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"liqiangxl","name":"Liqiang Lu","path":"/liqiangxl","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/116412316?s=80&v=4"},"commit":{"message":"add findVectorizedOutputOf","shortMessageHtmlLink":"add findVectorizedOutputOf"}},{"before":"61595ed5e0ae72b8ccba717231303351da347fcc","after":"609089bc27375bee121f6a18f9436dc376f7d8bf","ref":"refs/heads/llu/ln_backward_merge","pushedAt":"2023-04-19T20:45:29.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"liqiangxl","name":"Liqiang Lu","path":"/liqiangxl","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/116412316?s=80&v=4"},"commit":{"message":"propagate vectorization to cached_gmem_reload","shortMessageHtmlLink":"propagate vectorization to cached_gmem_reload"}},{"before":"9b39928380dcdd530e8e2e3646c3518bdea19271","after":"61595ed5e0ae72b8ccba717231303351da347fcc","ref":"refs/heads/llu/ln_backward_merge","pushedAt":"2023-04-19T13:41:01.000Z","pushType":"push","commitsCount":2,"pusher":{"login":"liqiangxl","name":"Liqiang Lu","path":"/liqiangxl","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/116412316?s=80&v=4"},"commit":{"message":"rename to vectorization_factor_outer","shortMessageHtmlLink":"rename to vectorization_factor_outer"}},{"before":"2a82abcb53a3a0d2ce841b26e37b37f08072a704","after":"9b39928380dcdd530e8e2e3646c3518bdea19271","ref":"refs/heads/llu/ln_backward_merge","pushedAt":"2023-04-18T18:53:59.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"liqiangxl","name":"Liqiang Lu","path":"/liqiangxl","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/116412316?s=80&v=4"},"commit":{"message":"revise disjoint set check","shortMessageHtmlLink":"revise disjoint set check"}},{"before":"be11c7d5e0f9b21a1f7632558bcab9e05e5169ab","after":"2a82abcb53a3a0d2ce841b26e37b37f08072a704","ref":"refs/heads/llu/ln_backward_merge","pushedAt":"2023-04-17T20:56:48.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"liqiangxl","name":"Liqiang Lu","path":"/liqiangxl","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/116412316?s=80&v=4"},"commit":{"message":"detect links through consumer's producer","shortMessageHtmlLink":"detect links through consumer's producer"}},{"before":"8f677253edb1a96231642004f8bb5d01e92d249c","after":"be11c7d5e0f9b21a1f7632558bcab9e05e5169ab","ref":"refs/heads/llu/ln_backward_merge","pushedAt":"2023-04-13T18:56:33.000Z","pushType":"force_push","commitsCount":0,"pusher":{"login":"liqiangxl","name":"Liqiang Lu","path":"/liqiangxl","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/116412316?s=80&v=4"},"commit":{"message":"disjointset and vect","shortMessageHtmlLink":"disjointset and vect"}},{"before":"93c4cd9131e5390a8181ea36206de4abf15e9c72","after":"8f677253edb1a96231642004f8bb5d01e92d249c","ref":"refs/heads/llu/ln_backward_merge","pushedAt":"2023-04-06T15:03:00.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"liqiangxl","name":"Liqiang Lu","path":"/liqiangxl","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/116412316?s=80&v=4"},"commit":{"message":"avoid propagatge parallelization to boundary nodes","shortMessageHtmlLink":"avoid propagatge parallelization to boundary nodes"}},{"before":"f1f94131bf857167cb023daa815aebb037bf8bd2","after":"93c4cd9131e5390a8181ea36206de4abf15e9c72","ref":"refs/heads/llu/ln_backward_merge","pushedAt":"2023-04-05T23:38:16.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"liqiangxl","name":"Liqiang Lu","path":"/liqiangxl","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/116412316?s=80&v=4"},"commit":{"message":"use hasReduction api","shortMessageHtmlLink":"use hasReduction api"}},{"before":"79c6df1b414ac9873407d493290397ea6dd17195","after":"f1f94131bf857167cb023daa815aebb037bf8bd2","ref":"refs/heads/llu/ln_backward_merge","pushedAt":"2023-04-05T23:31:38.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"liqiangxl","name":"Liqiang Lu","path":"/liqiangxl","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/116412316?s=80&v=4"},"commit":{"message":"check consumer's shared non-reduction producer","shortMessageHtmlLink":"check consumer's shared non-reduction producer"}},{"before":"97779cde55f6f001d573fcecd7133024f6cf4fe7","after":"79c6df1b414ac9873407d493290397ea6dd17195","ref":"refs/heads/llu/ln_backward_merge","pushedAt":"2023-04-05T14:31:22.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"liqiangxl","name":"Liqiang Lu","path":"/liqiangxl","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/116412316?s=80&v=4"},"commit":{"message":"dependency check","shortMessageHtmlLink":"dependency check"}},{"before":"5a9e814306d58ac19730a3ca8dbaa30e2961861c","after":"97779cde55f6f001d573fcecd7133024f6cf4fe7","ref":"refs/heads/llu/ln_backward_merge","pushedAt":"2023-04-04T15:40:33.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"liqiangxl","name":"Liqiang Lu","path":"/liqiangxl","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/116412316?s=80&v=4"},"commit":{"message":"check shared consumer","shortMessageHtmlLink":"check shared consumer"}},{"before":"d551ceb206eb445d2807f826ac96f152190a4d56","after":"5a9e814306d58ac19730a3ca8dbaa30e2961861c","ref":"refs/heads/llu/ln_backward_merge","pushedAt":"2023-04-03T18:43:46.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"liqiangxl","name":"Liqiang Lu","path":"/liqiangxl","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/116412316?s=80&v=4"},"commit":{"message":"check vect factor of tmp buffer","shortMessageHtmlLink":"check vect factor of tmp buffer"}},{"before":"e09ded3fa6d164bfdaf5075829113cedd0c2550d","after":"d551ceb206eb445d2807f826ac96f152190a4d56","ref":"refs/heads/llu/ln_backward_merge","pushedAt":"2023-03-31T16:08:38.000Z","pushType":"force_push","commitsCount":0,"pusher":{"login":"liqiangxl","name":"Liqiang Lu","path":"/liqiangxl","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/116412316?s=80&v=4"},"commit":{"message":"address comments","shortMessageHtmlLink":"address comments"}},{"before":"d29863e1a5c81d8792a2954ce6480e0ce9a6e8ef","after":null,"ref":"refs/heads/ab/matmul_scheduler_in_segmenter","pushedAt":"2023-03-28T17:09:59.010Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"zasdfgbnm","name":"Gao, Xiang","path":"/zasdfgbnm","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/1032377?s=80&v=4"}},{"before":"a72b978da7450d0aac0ea1f08c2e4f8dd0bca69f","after":null,"ref":"refs/heads/llu/ln_backward_directfix","pushedAt":"2023-03-27T17:56:12.268Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"liqiangxl","name":"Liqiang Lu","path":"/liqiangxl","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/116412316?s=80&v=4"}},{"before":"640ff6de5d918f05c2137d719bdf6bd120a128d7","after":"e09ded3fa6d164bfdaf5075829113cedd0c2550d","ref":"refs/heads/llu/ln_backward_merge","pushedAt":"2023-03-24T17:32:30.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"liqiangxl","name":"Liqiang Lu","path":"/liqiangxl","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/116412316?s=80&v=4"},"commit":{"message":"inner reduction and outer reduction should have shared input","shortMessageHtmlLink":"inner reduction and outer reduction should have shared input"}},{"before":"42bcbe95d1dff287a0b3cb89e6aef43d88c05a0d","after":"640ff6de5d918f05c2137d719bdf6bd120a128d7","ref":"refs/heads/llu/ln_backward_merge","pushedAt":"2023-03-24T17:07:11.000Z","pushType":"push","commitsCount":2,"pusher":{"login":"liqiangxl","name":"Liqiang Lu","path":"/liqiangxl","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/116412316?s=80&v=4"},"commit":{"message":"comments","shortMessageHtmlLink":"comments"}},{"before":"c5d838588cf6740c4b2ee77d0bc9ac7f437bfb58","after":"42bcbe95d1dff287a0b3cb89e6aef43d88c05a0d","ref":"refs/heads/llu/ln_backward_merge","pushedAt":"2023-03-22T15:15:13.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"liqiangxl","name":"Liqiang Lu","path":"/liqiangxl","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/116412316?s=80&v=4"},"commit":{"message":"revise","shortMessageHtmlLink":"revise"}},{"before":"6b8596ebf9f6b2c085d99350608a11f6b213045f","after":"c5d838588cf6740c4b2ee77d0bc9ac7f437bfb58","ref":"refs/heads/llu/ln_backward_merge","pushedAt":"2023-03-22T12:50:50.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"liqiangxl","name":"Liqiang Lu","path":"/liqiangxl","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/116412316?s=80&v=4"},"commit":{"message":"Apply suggestions from code review\n\nCo-authored-by: Naoya Maruyama <naoyam@users.noreply.github.com>","shortMessageHtmlLink":"Apply suggestions from code review"}},{"before":"9a9510061ad2d7081a31f35cc924520ef2ad1e55","after":null,"ref":"refs/heads/fuser","pushedAt":"2023-03-20T18:03:12.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"zasdfgbnm","name":"Gao, Xiang","path":"/zasdfgbnm","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/1032377?s=80&v=4"}},{"before":"c8c8cd7dcba47c5898de07195d3ea4ff53825d39","after":null,"ref":"refs/heads/rebase-matmul_swizzle_gen","pushedAt":"2023-03-20T18:02:56.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"zasdfgbnm","name":"Gao, Xiang","path":"/zasdfgbnm","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/1032377?s=80&v=4"}},{"before":"d04cd5bdc30e2baffb81522539055972ca39df53","after":null,"ref":"refs/heads/matmul_swizzle_gen","pushedAt":"2023-03-20T17:57:01.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"zasdfgbnm","name":"Gao, Xiang","path":"/zasdfgbnm","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/1032377?s=80&v=4"}},{"before":"9a9510061ad2d7081a31f35cc924520ef2ad1e55","after":"c8c8cd7dcba47c5898de07195d3ea4ff53825d39","ref":"refs/heads/rebase-matmul_swizzle_gen","pushedAt":"2023-03-20T17:56:57.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"zasdfgbnm","name":"Gao, Xiang","path":"/zasdfgbnm","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/1032377?s=80&v=4"},"commit":{"message":"[MatMul] Prolog build out, adding automatic swizzle generator for a few tile sizes (#1900)\n\n* use custom propagator in ampere TN\r\n\r\n* add tile ordering utilities\r\n\r\n* initial matmul scheduler implementation\r\n\r\n* use matmul scheduler prototype on ampere and turing test cases\r\n\r\n* extend to support Volta\r\n\r\n* minor cleanup\r\n\r\n* comment cleanup\r\n\r\n* minor fix\r\n\r\n* add fragment iteration and use it in matmul scheduler\r\n\r\n* use scheduler params for tests\r\n\r\n* fragment support in double buffer\r\n\r\n* add register double buffering test cases\r\n\r\n* clean up custom transform propagator\r\n\r\n* rebase fix\r\n\r\n* comment\r\n\r\n* move bounded selector to common area\r\n\r\n* Add logic to handle fake boundary tensors in selection.\r\n\r\n* naming and comment\r\n\r\n* remove unused parameters from mma node\r\n\r\n* remove unnecessary parameters from mma ir node\r\n\r\n* rename scheduling variables\r\n\r\n* change accumulator tv interface\r\n\r\n* Update torch/csrc/jit/codegen/cuda/scheduler/utils.h\r\n\r\nCo-authored-by: Gao, Xiang <qasdfgtyuiop@gmail.com>\r\n\r\n* PR feedback\r\n\r\n* pipe through parallel type position\r\n\r\n* Revert \"fragment support in double buffer\"\r\n\r\nThis reverts commit d12a90fcce5cd02aca7c98ea5f29ea01bc85df6f.\r\n\r\n* use cache op to handle double buffer input\r\n\r\n* add more comment in matmul scheduler\r\n\r\n* more comments\r\n\r\n* comment fix\r\n\r\n* rebase fix\r\n\r\n* add inline pred for cpasync\r\n\r\n* minor cleanup\r\n\r\n* add inlining test in unit\r\n\r\n* add option to dump ptx\r\n\r\n* add ampere xor swizzle gen\r\n\r\n* minor scheduler fix; add bank conflict helper\r\n\r\n* minor update and enable single word access checker\r\n\r\n* minor fixes and symmetric 4 warp recipe tests\r\n\r\n* rebase fix\r\n\r\n* fix rebase\r\n\r\n* add cyclic shift for non-power-of-2 swizzle period\r\n\r\n* fix swizzle handling in replay\r\n\r\n* add a few more tile support\r\n\r\n* minor fix\r\n\r\n* add 6 warp test cases\r\n\r\n* add skip swizzle option for replay matching\r\n\r\n* cleanup\r\n\r\n* add small repro for the replay fix\r\n\r\n* Fix missing thread predicates\r\n\r\nUnlikely to matter, but should be necessary\r\n\r\n* fix merge\r\n\r\n* fix merge\r\n\r\n* format\r\n\r\n* Rebase #1900 (#2009)\r\n\r\n* hash update - bug fix for branches (#83865)\r\n\r\nhash updates for xla were failing because the current pinned hash is a branch, so the git command for getting the date couldn't find the branch due to not having a local version of the branch.  Fixed by checking out the branch to make sure it exists locally.\r\n\r\nexample of failure: https://github.com/pytorch/pytorch/runs/7913835742?check_suite_focus=true\r\n\r\nTest plan:\r\nmade it pull request trigger and ran, to get this:\r\nhttps://github.com/pytorch/pytorch/runs/7959221184?check_suite_focus=true\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83865\r\nApproved by: https://github.com/zengk95\r\n\r\n* [FSDP] Remove unneeded checks (#83150)\r\n\r\n@awgu pointed out these checks aren't really doing anything, as they just make sure we're setting training state in certain ways throughout FSDP and is sort of arbitrary. So, removing them to avoid confusion.\r\n\r\nWe still keep the checking around `_post_backward_called` because this is needed in `finalize_params` for now.\r\n\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83150\r\nApproved by: https://github.com/awgu\r\n\r\n* [BE] Revert distributed change in https://github.com/pytorch/pytorch/pull/68779 (#83181)\r\n\r\nhttps://github.com/pytorch/pytorch/issues/82641 points out a regression in how inputs / outputs are processed by DDP, blocking their HF use case. It was narrowed down to https://github.com/pytorch/pytorch/pull/68779 and reverting the distributed change there fixes the issue.\r\n\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83181\r\nApproved by: https://github.com/kumpera\r\n\r\n* Transpose scheduler small dim sizes better support (#1910)\r\n\r\n* Optimize transpose copy on CPU using fbgemm transpose (#83327)\r\n\r\n### Description\r\nOptimize transpose copy on CPU using fbgemm transpose\r\n\r\n### Testing\r\nsingle socket (28cores):\r\n```\r\nbefore: torch.Size([10, 128, 10, 124]) -> torch.Size([10, 128, 124, 10]) fp32: 4.819e-05 ms; bf16: 4.846e-05 ms\r\n        torch.Size([10, 128, 30, 124]) -> torch.Size([10, 128, 124, 30]) fp32: 0.000171 ms; bf16: 0.000129 ms\r\n\r\nafter: torch.Size([10, 128, 10, 124]) -> torch.Size([10, 128, 124, 10])  fp32: 2.439e-05 ms; bf16: 2.152e-05 ms\r\n        torch.Size([10, 128, 30, 124]) -> torch.Size([10, 128, 124, 30]) fp32: 0.000132 ms; bf16: 3.916e-05 ms\r\n```\r\nsingle core:\r\n```\r\nbefore: torch.Size([10, 128, 10, 124]) -> torch.Size([10, 128, 124, 10]) fp32: 0.00109 ms;  bf16: 0.00103 ms\r\n        torch.Size([10, 128, 30, 124]) -> torch.Size([10, 128, 124, 30]) fp32: 0.00339 ms; bf16: 0.00295 ms\r\n\r\nafter: torch.Size([10, 128, 10, 124]) -> torch.Size([10, 128, 124, 10]) fp32: 0.000566  ms; bf16: 0.000382 ms\r\n        torch.Size([10, 128, 30, 124]) -> torch.Size([10, 128, 124, 30]) fp32: 0.00282 ms; bf16: 0.000999 ms\r\n```\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83327\r\nApproved by: https://github.com/frank-wei\r\n\r\n* Grouped grid welford (#1921)\r\n\r\nEnables grouping of grid welford ops across iterations. Same functionality as the iteration grouping for GridReduction. This ins intended to improve the outer-norm grid persistence in batchnorm-like fusions.\r\n\r\n* [ONNX] Use `errors.SymbolicValueError` for more context (#83332)\r\n\r\nReplace runtime errors in torch.onnx with `errors.SymbolicValueError` for more context around jit values.\r\n\r\n- Extend `_unimplemented`, `_onnx_unsupported`, `_onnx_opset_unsupported`, `_onnx_opset_unsupported_detailed` errors to include JIT value information\r\n- Replace plain RuntimeError with `errors.SymbolicValueError`\r\n- Clean up: Use `_is_bool` to replace string comparison on jit types\r\n- Clean up: Remove the todo `Remove type ignore after #81112`\r\n\r\n#77316\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83332\r\nApproved by: https://github.com/AllenTiTaiWang, https://github.com/thiagocrepaldi, https://github.com/BowenBao\r\n\r\n* [quant][fx] Add support for quantized matmul (#83885)\r\n\r\nSummary:\r\natt, probably missed the op during migration to the reference flow\r\n\r\nTest Plan:\r\npython test/test_quantization.py TestQuantizeFxOps.test_qmatmul\r\n\r\nReviewers:\r\n\r\nSubscribers:\r\n\r\nTasks:\r\n\r\nTags:\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83885\r\nApproved by: https://github.com/andrewor14\r\n\r\n* Misc fixes/tuning for transpose scheduler (#1912)\r\n\r\n* [nn] split rnn_utils test from test_nn.py (#83675)\r\n\r\nRef: https://github.com/pytorch/pytorch/issues/63085\r\nProposed folder structure\r\n```\r\n-> test\r\n  -> nn\r\n    -> test_conv.py\r\n    -> test_pooling.py\r\n    -> .....\r\n```\r\n\r\nThis PR: Moves test related RNN utilities to a different file.\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83675\r\nApproved by: https://github.com/albanD\r\n\r\n* [optim] rprop: handle complex params as independent real params (#83858)\r\n\r\nRef #65711\r\n\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83858\r\nApproved by: https://github.com/albanD\r\n\r\n* [xla hash update] update the pinned xla hash (#83899)\r\n\r\nThis PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).\r\nUpdate the pinned xla hash.\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83899\r\nApproved by: https://github.com/pytorchbot\r\n\r\n* [ROCm] More Sparse UTs enablement and more hipification mappings. (#78939)\r\n\r\nEnables:\r\n\r\n test_bmm_cuda_float64\r\n test_bmm_deterministic_cuda_float64\r\n test_csr_matvec_cuda_complex128\r\n test_csr_matvec_cuda_complex64\r\n test_csr_matvec_cuda_float32\r\n test_csr_matvec_cuda_float64\r\n\r\nTo enable the above tests had to add some more hip mappings for the hipification process.\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/78939\r\nApproved by: https://github.com/pruthvistony, https://github.com/malfet\r\n\r\n* Normalize DLPack stride to 1 where shape < 2 (#83158)\r\n\r\nFixes #83069. Also move all the dlpack tests to a new file., `test_dlpack.py`.\r\n\r\nThe fix involves always allocating a \"strides\" int array when converting to dlPack and deleting the strides when the capsule descructor is called. Then the strides are copied from the tensor, and `strides[i]` is set to `1` where `shape[i] < 2`.\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83158\r\nApproved by: https://github.com/ezyang\r\n\r\n* Remove DBR quantization from the codebase (#83642)\r\n\r\nSummary:\r\n\r\nDBR quantization is a no-go for now because it does not align well with\r\nPyTorch 2.0 plans and we do not want to build yet another tracing system.\r\n\r\nDeleting it from the codebase for now since there are no plans to develop\r\nthis in the near future. We can bring it back at a later time if necessary.\r\n\r\nTest plan:\r\n\r\nCI\r\n\r\nDifferential Revision: [D38839556](https://our.internmc.facebook.com/intern/diff/D38839556)\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83642\r\nApproved by: https://github.com/andrewor14, https://github.com/jerryzh168\r\n\r\n* Refactored ops on size to be dispatcher ops (#83719)\r\n\r\nAn example of how the graph looks now.\r\n```\r\ndef forward(self, x_1):\r\n    size = torch.ops.math.size(x_1, 0)\r\n    size_1 = torch.ops.math.size(x_1, 1);  x_1 = None\r\n    ones = torch.ops.aten.ones.default([1], device = device(type='cpu'), pin_memory = False)\r\n    expand_sym_int = torch.ops.aten.expand.SymInt(ones, [size, size_1]);  ones = size = size_1 = None\r\n    cos_default = torch.ops.aten.cos.default(expand_sym_int);  expand_sym_int = None\r\n    return (cos_default,)\r\n```\r\n\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83719\r\nApproved by: https://github.com/ezyang\r\n\r\n* Fix stride issue with faketensors (#83822)\r\n\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83822\r\nApproved by: https://github.com/ezyang, https://github.com/ngimel\r\n\r\n* Nullary RNGOp (#1892)\r\n\r\n* [ROCm] restore MIOpen benchmark flag default to true (#82656)\r\n\r\n### Description\r\nPR https://github.com/pytorch/pytorch/pull/77438 allowed MIOpen to support the benchmark flag. Previously, the benchmark flag was ignored by MIOpen such that benchmarking was always turned on. This commit restores the behavior that MIOpen benchmarking is by default turned on.\r\n\r\n### Testing\r\nCI unit tests cover this capability.  Torchvision models demonstrate the performance delta.\r\n\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/82656\r\nApproved by: https://github.com/ngimel\r\n\r\n* Update retry action to latest version (#83911)\r\n\r\nWe're running into EPERM issues when trying to install nvidia tools, see failure example https://github.com/pytorch/pytorch/runs/7975726013?check_suite_focus=true.\r\n```\r\nWARNING: The nvidia-drm module will not be installed. As a result, DRM-KMS will not function with this installation of the NVIDIA driver.\r\n\r\n/home/ec2-user/actions-runner/_work/_actions/nick-fields/retry/71062288b76e2b6214ebde0e673ce0de1755740a/dist/index.js:1049\r\n            throw err;\r\n            ^\r\n\r\nError: kill EPERM\r\n    at process.kill (internal/process/per_thread.js:199:13)\r\n    at killPid (/home/ec2-user/actions-runner/_work/_actions/nick-fields/retry/71062288b76e2b6214ebde0e673ce0de1755740a/dist/index.js:1059:17)\r\n    at /home/ec2-user/actions-runner/_work/_actions/nick-fields/retry/71062288b76e2b6214ebde0e673ce0de1755740a/dist/index.js:1036:21\r\n    at Array.forEach (<anonymous>)\r\n    at /home/ec2-user/actions-runner/_work/_actions/nick-fields/retry/71062288b76e2b6214ebde0e673ce0de1755740a/dist/index.js:1034:23\r\n    at Array.forEach (<anonymous>)\r\n    at killAll (/home/ec2-user/actions-runner/_work/_actions/nick-fields/retry/71062288b76e2b6214ebde0e673ce0de1755740a/dist/index.js:1033:27)\r\n    at /home/ec2-user/actions-runner/_work/_actions/nick-fields/retry/71062288b76e2b6214ebde0e673ce0de1755740a/dist/index.js:1024:13\r\n    at ChildProcess.onClose (/home/ec2-user/actions-runner/_work/_actions/nick-fields/retry/71062288b76e2b6214ebde0e673ce0de1755740a/dist/index.js:1080:17)\r\n    at ChildProcess.emit (events.js:314:20) {\r\n  errno: 'EPERM',\r\n  code: 'EPERM',\r\n  syscall: 'kill'\r\n}\r\n\r\n```\r\n\r\nThe root issue probably lies elsewhere but this action is not helping/the errors seem to say it's unable to kill child processes. A more recent commit in that repo uses spawn instead of exec which might make a difference.\r\n\r\nRegardless, we should keep our actions up to date anyway.\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83911\r\nApproved by: https://github.com/malfet\r\n\r\n* [PyTorch] Remove unused sstream/string includes from c10/macros/Macros.h (#83353)\r\n\r\nNothing in the rest of the header seems to use these.\r\n\r\nDifferential Revision: [D38672680](https://our.internmc.facebook.com/intern/diff/D38672680/)\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83353\r\nApproved by: https://github.com/malfet\r\n\r\n* [functorch] add linalg cross batch rule (#83759)\r\n\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83759\r\nApproved by: https://github.com/zou3519\r\n\r\n* Improve DistanceKernel.cu (#83811)\r\n\r\ninclude device_sqrt\r\nreplace reduce_agg by BlockReduce\r\nchoose implementation by impl_fptr instead of error-prone copy-and-paste\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83811\r\nApproved by: https://github.com/ngimel\r\n\r\n* reinplace pass: bugfix for output node replacement (#83845)\r\n\r\nCleaned up some of the arg replacement logic to use tree_map, so it handles FX nodes that have nested containers.\r\n\r\nSee the added test: when you write a function that returns a list, the `output` node in the FX graph shows up as having `node.args = tuple(immutable_list(...))`\r\n\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83845\r\nApproved by: https://github.com/ezyang\r\n\r\n* reinplace pass: special handling for view_scatter ops (#83846)\r\n\r\nThere is already special handling in the reinplacing pass for removing `{view}_scatter` ops, but there is another case that needs special handling. In this code:\r\n```\r\n         def f():\r\n             a = torch.zeros(4, 4, 4)\r\n             a[:, 2:] = torch.ones(4, 2, 4)\r\n             return a\r\n```\r\n\r\nTracing normally with `make_fx()` gives you:\r\n```\r\n\r\ndef forward(self):\r\n    zeros = torch.ops.aten.zeros.default([4, 4, 4], device = device(type='cpu'), pin_memory = False)\r\n    ones = torch.ops.aten.ones.default([4, 2, 4], device = device(type='cpu'), pin_memory = False)\r\n    slice_tensor = torch.ops.aten.slice.Tensor(zeros, 0, 0, 9223372036854775807)\r\n    slice_tensor_1 = torch.ops.aten.slice.Tensor(slice_tensor, 1, 2, 9223372036854775807);  slice_tensor = None\r\n    copy__default = torch.ops.aten.copy_.default(slice_tensor_1, ones);  slice_tensor_1 = ones = None\r\n    return zeros\r\n```\r\nFunctionalizing it gives you:\r\n\r\n```\r\ndef forward(self):\r\n    zeros = torch.ops.aten.zeros.default([4, 4, 4], device = device(type='cpu'), pin_memory = False)\r\n    ones = torch.ops.aten.ones.default([4, 2, 4], device = device(type='cpu'), pin_memory = False)\r\n    slice_tensor = torch.ops.aten.slice.Tensor(zeros, 0, 0, 9223372036854775807)\r\n    slice_tensor_1 = torch.ops.aten.slice.Tensor(slice_tensor, 1, 2, 9223372036854775807);  slice_tensor = None\r\n    slice_tensor_2 = torch.ops.aten.slice.Tensor(zeros, 0, 0, 9223372036854775807)\r\n    slice_scatter_default = torch.ops.aten.slice_scatter.default(slice_tensor_2, ones, 1, 2, 9223372036854775807);  slice_tensor_2 = ones = None\r\n    slice_scatter_default_1 = torch.ops.aten.slice_scatter.default(zeros, slice_scatter_default, 0, 0, 9223372036854775807);  zeros = slice_scatter_default = None\r\n    return slice_scatter_default_1\r\n```\r\n\r\nNotice that there are not any functional ops to directly re-inplace! What actually happened is that functionalization turned the `copy_()` into a `copy()`, but the out-of-place `copy()` operator gets optimized away because it's a no-op (when the input and output metadata are the same, `out = copy(a, b)` just returns `b`).\r\n\r\nWhat we actually want is to replace this line:\r\n```\r\nslice_scatter_default = torch.ops.aten.slice_scatter.default(slice_tensor_2, ones, 1, 2, ...);\r\n```\r\nwith this:\r\n```\r\nnew_slice = torch.ops.aten.slice.Tensor(slice_tensor_2, 1, 2, ...);\r\n_ = torch.ops.aten.copy_.default(new_slice, ones)\r\n```\r\n\r\nIn the above, we're taking a fresh slice of the \"base\" tensor, and performing a `copy_()` on the slice, adding back what functionalization removed.\r\n\r\nWe actually need to create a fresh \"slice\" node, because we're not guaranteed that one already exists in the graph (technically there should be one, but it might have been DCE'd by the time we hit re-inplacing)\r\n\r\nI also updated the docs for re-inplacing to more closely match the order of the logic.\r\n\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83846\r\nApproved by: https://github.com/ezyang\r\n\r\n* Move ATenNVRTC.h include from `jit_utils.h` to `jit_utils.cpp` (#83886)\r\n\r\nIn general, `.h` files should only include headers that are used in the header\r\n\r\nFixes https://github.com/pytorch/pytorch/issues/83856\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83886\r\nApproved by: https://github.com/ngimel\r\n\r\n* Allow None arguments for elementwise type promotion wrapper and fix clamp with None arguments (#83586)\r\n\r\nFixes https://github.com/pytorch/torchdynamo/issues/759\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83586\r\nApproved by: https://github.com/ezyang, https://github.com/ngimel\r\n\r\n* Enable NCCL_DESYNC_DEBUG when TORCH_DISTRIBUTED_DEBUG=DETAIL (#83881)\r\n\r\nAutomatically enable `NCCL_DESYNC_DEBUG` when `TORCH_DISTRIBUTED_DEBUG` is set to `DETAIL`.\r\nSaving user from setting two env variables.\r\n\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83881\r\nApproved by: https://github.com/malfet, https://github.com/rohan-varma, https://github.com/H-Huang\r\n\r\n* Strenghten preconditions of linalg.cross (#83798)\r\n\r\nThis makes `linalg.cross` array API complaint (https://github.com/data-apis/array-api/issues/415) and fixes a few bugs.\r\n\r\nFixes https://github.com/pytorch/pytorch/issues/77629\r\nFixes https://github.com/pytorch/pytorch/issues/83756\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83798\r\nApproved by: https://github.com/mruberry\r\n\r\n* Fix view_func replay in no-grad mode (#83872)\r\n\r\nFixes https://github.com/pytorch/pytorch/issues/83828\r\n\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83872\r\nApproved by: https://github.com/albanD\r\n\r\n* [vulkan] Add VMA as a third_party subrepo (#83906)\r\n\r\nthe [VulkanMemoryAllocator](https://github.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator) is a popular library for GPU memory allocation using Vulkan. The Vulkan backend has a dependency on it, but since it is only a single header file we currently include it by checking it into the repo under [aten/src/ATen/native/vulkan/api/vk_mem_alloc.h](https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/vulkan/api/vk_mem_alloc.h). However, it is better to check it in as a third party submodule, since it allows better version tracking.\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83906\r\nApproved by: https://github.com/kimishpatel\r\n\r\n* [torchgen] Add documentation for `autogen` keyword (#83610)\r\n\r\nThis is a follow up for #81437. This PR explains what operator can use `autogen` and what will be generated. Also talked about generated kernels and where to find them.\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83610\r\nApproved by: https://github.com/albanD, https://github.com/bdhirsh\r\n\r\n* remove assertEqualIgnoreTypes from test/distributions/test_distributions.py (#83709)\r\n\r\nSee https://github.com/pytorch/pytorch/issues/38095\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83709\r\nApproved by: https://github.com/kit1980\r\n\r\n* [fix] edge case in `MaxPool1d` and add ErrorInputs (#83553)\r\n\r\nFixes #83224\r\n\r\ncc @kshitij12345 @albanD!\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83553\r\nApproved by: https://github.com/albanD\r\n\r\n* [complex] conv_transpose1d (#79694)\r\n\r\nReference: https://github.com/pytorch/pytorch/issues/71108\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/79694\r\nApproved by: https://github.com/ngimel\r\n\r\n* Revert \"Strenghten preconditions of linalg.cross (#83798)\"\r\n\r\nThis reverts commit 7f0198e7390eff2f2f5fcb33ce36c99ec3b7f55e.\r\n\r\nReverted https://github.com/pytorch/pytorch/pull/83798 on behalf of https://github.com/janeyx99 due to Sorry, land race caused functorch issues https://hud.pytorch.org/pytorch/pytorch/commit/7f0198e7390eff2f2f5fcb33ce36c99ec3b7f55e\r\n\r\n* Fix load_extra_only api for flatbuffers and enable flatbuffers in mobile for OSS properly (#83855)\r\n\r\n`_load_extra_only_for_mobile` API hasn't handled flatbuffers logic yet. Update the api accordingly.\r\n\r\nAlso find out mobile build in OSS doesn't build with flatbuffers. Filed task T129996445 to track\r\n\r\nDifferential Revision: [D38890847](https://our.internmc.facebook.com/intern/diff/D38890847/)\r\n\r\n**NOTE FOR REVIEWERS**: This PR has internal Facebook specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D38890847/)!\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83855\r\nApproved by: https://github.com/qihqi\r\n\r\n* Prefer signal from land checks over PR signals (#83715)\r\n\r\n# The problem\r\n\r\nWhen a dev forks their branch from a red master build, their branch can fail CI checks for reasons unrelated to their changes, but the same checks would however pass in the land validation commit (which is rebased off of viable/strict)\r\n\r\nToday, in the above scenario the `merge -l` command fails because mergebot sees the failing checks in the PR, which is not helpful when that same check passes in land validation.\r\n\r\n# The solution\r\nThis PR changes the behavior so that:\r\n1. If both the PR and land validation ran a workflow, only look at the results from land validation\r\n2. If only the PR ran a specific workflow (e.g. for CLA Check or a nightly run) then continue to look the result from the PR (which matches existing behavior)\r\n\r\n### Bonus fixes\r\nIt also includes a few extra BE fixes:\r\n- Replaces the tuple we used to pass workflow check results around with a named tuple so that it's easier to tell what data is being used\r\n- Reduces the number of API calls to github by ~50% during merges.  Before, we were pulling results from github every time and then filtering it down to the relevant category of checks (e.g. failed/pending/startup_failed). Now, our filters share the check results\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83715\r\nApproved by: https://github.com/zengk95\r\n\r\n* Don't introduce new overload for SymInt (#83628)\r\n\r\nPreviously, we introduced new SymInt overloads for every function we wanted.  This led to a lot of boilerplate, and also a lot of confusion about how the overloads needed to be implemented.\r\n\r\nThis PR takes a simpler but more risky approach: just take the original function and changes its ints to SymInts.\r\n\r\nThis is BC-breaking in the following ways:\r\n\r\n* The C++ API for registering implementations for aten operators will change from int64_t to SymInt whenever you make this change. Code generated registrations in PyTorch do not change as codegen handles the translation automatically, but manual registrations will need to follow the change.  Typically, if you now accept a SymInt where you previously only took int64_t, you have to convert it back manually.  This will definitely break XLA, see companion PR https://github.com/pytorch/xla/pull/3914 Note that not all dispatch keys get the automatic translation; all the composite keys and Meta keys are modified to take SymInt directly (because they should handle them directly), and so there are adjustments for this.\r\n\r\nThis is not BC-breaking in the following ways:\r\n\r\n* The user facing C++ API remains compatible.  Even if a function changes from int to SymInt, the default C++ binding still takes only ints.  (e.g., at::empty(IntArrayRef, ...).  To call with SymInts, you must call at::empty_symint instead. This involved adding two more signatures to CppSignatureGroup; in many cases I refactored code to iterate over all signatures in the group instead of hard-coding the two that previously existed.\r\n* This is TorchScript compatible; internally we treat SymInts as ints so there is no change to what happens at runtime in TorchScript. In particular, it's OK to reference an empty schema by its old type (using int types), as long as you're not doing string equality (which you shouldn't be), these parse to the same underyling type.\r\n\r\nStructure of the PR:\r\n\r\n* The general strategy of this PR is that, even when you write `SymInt` inside `native_functions.yaml`, sometimes, we will treat it *as if* it were an `int`. This idea pervades the codegen changes, where we have a translation from SymInt to c10::SymInt or int64_t, and this is controlled by a symint kwarg which I added and then audited all call sites to decide which I wanted. Here are some of the major places where we pick one or the other:\r\n  * The C++ FunctionSchema representation represents `SymInt` as `int`. There are a few places we do need to know that we actually have a SymInt and we consult `real_type()` to get the real type in this case. In particular:\r\n    * When we do schema validation of C++ operator registration, we must compare against true schema (as the C++ API will provide `c10::SymInt`, and this will only be accepted if the schema is `SymInt`. This is handled with cloneWithRealTypes before we check for schema differences.\r\n    * In `toIValue` argument parsing, we parse against the true schema value. For backwards compatibility reasons, I do still accept ints in many places where Layout/SymInt/etc were expected. (Well, accepting int where SymInt is expected is not BC, it's just the right logic!)\r\n  * In particular, because SymInt never shows up as type() in FunctionSchema, this means that we no longer need a dedicated Tag::SymInt. This is good, because SymInts never show up in mobile anyway.\r\n* Changes to functorch/aten are mostly about tracking changes to the C++ API registration convention. Additionally, since SymInt overloads no longer exist, registrations for SymInt implementations are deleted. In many cases, the old implementations did not properly support SymInts; I did not add any new functionality with this PR, but I did try to annotate with TODOs where this is work to do. Finally, because the signature of `native::` API changed from int to SymInt, I need to find alternative APIs for people who were directly calling these functions to call. Typically, I insert a new dispatch call when perf doesn't matter, or use `at::compositeexplicitautograd` namespace to handle other caes.\r\n* The change to `make_boxed_from_unboxed_functor.h` is so that we accept a plain IntList IValue anywhere a SymIntList is expected; these are read-only arguments so covariant typing is OK.\r\n* I change how unboxing logic works slightly. Previously, we interpret the C++ type for Layout/etc directly as IntType JIT type, which works well because the incoming IValue is tagged as an integer. Now, we interpret the C++ type for Layout as its true type, e.g., LayoutType (change to `jit_type.h`), but then we accept an int IValue for it anyway. This makes it symmetric with SymInt, where we interpret the C++ type as SymIntType, and then accept SymInt and int IValues for it.\r\n* I renamed the `empty.names` overload to `empty_names` to make it less confusing (I kept mixing it up with the real empty overload)\r\n* I deleted the `empty.SymInt` overload, which ended up killing a pile of functions. (This was originally a separate PR but the profiler expect test was giving me grief so I folded it in.)\r\n* I deleted the LazyDynamicOpsTest tests. These were failing after these changes, and I couldn't figure out why they used to be passing: they make use of `narrow_copy` which didn't actually support SymInts; they were immediately converted to ints.\r\n* I bashed LTC into working. The patches made here are not the end of the story. The big problem is that SymInt translates into Value, but what if you have a list of SymInt? This cannot be conveniently represented in the IR today, since variadic Values are not supported. To work around this, I translate SymInt[] into plain int[] (this is fine for tests because LTC dynamic shapes never actually worked); but this will need to be fixed for proper LTC SymInt support. The LTC codegen also looked somewhat questionable; I added comments based on my code reading.\r\n\r\nSigned-off-by: Edward Z. Yang <ezyang@fb.com>\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83628\r\nApproved by: https://github.com/albanD, https://github.com/bdhirsh\r\n\r\n* Remove CoreMLMemoryObserver (#83703)\r\n\r\nSummary: We added this observer to help us diagnose memory issues that have since resolved. It should be safe to clean this up.\r\n\r\nTest Plan: Diff just removed logging, so just build IG and confirm no errors.\r\n\r\nDifferential Revision: D38843701\r\n\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83703\r\nApproved by: https://github.com/mcr229\r\n\r\n* ci: Remove dead code related to android uploads (#83930)\r\n\r\nThese uploads actually never got triggeredhappened in nightlies so\r\nremoving it altogether. Someone can re-add in the future if they feel\r\nthese are important but I can't find an instance of this running since\r\nwe migrated so I have a hard time believing anyone will miss it.\r\n\r\nhttps://hud.pytorch.org/hud/pytorch/pytorch/nightly/1?per_page=50&name_filter=android\r\n\r\nSigned-off-by: Eli Uriegas <eliuriegas@fb.com>\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83930\r\nApproved by: https://github.com/atalman, https://github.com/malfet\r\n\r\n* [fx][pass infra] Adding error catching (#83933)\r\n\r\nExample:\r\n\r\n```\r\n======================================================================\r\nERROR: test_pass_manager_error (fx.test_pass_infra.TestPassManager)\r\n----------------------------------------------------------------------\r\nTraceback (most recent call last):\r\n  File \"/Users/angelayi/Projects/pytorch/torch/fx/passes/infra/pass_manager.py\", line 285, in __call__\r\n    res = fn(module)\r\n  File \"/Users/angelayi/Projects/pytorch/test/fx/test_pass_infra.py\", line 164, in pass_fail\r\n    raise RuntimeError(\"bad\")\r\nRuntimeError: bad\r\n\r\nThe above exception was the direct cause of the following exception:\r\n\r\nTraceback (most recent call last):\r\n  File \"/Users/angelayi/Projects/pytorch/test/fx/test_pass_infra.py\", line 170, in test_pass_manager_error\r\n    pm(traced_m)\r\n  File \"/Users/angelayi/Projects/pytorch/torch/fx/passes/infra/pass_manager.py\", line 289, in __call__\r\n    raise RuntimeError(msg) from e\r\nRuntimeError: An error occured when running the 'pass_fail' pass after the following passes: ['replace_add_with_mul_pass', 'replace_mul_with_div_pass']\r\n```\r\n\r\nFixes #ISSUE_NUMBER\r\n\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83933\r\nApproved by: https://github.com/SherlockNoMad\r\n\r\n* Back out \"Support regex-style matching for Any and Oneof (#82853)\" (#83922)\r\n\r\nReviewed By: hl475\r\n\r\nDifferential Revision: D38945806\r\n\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83922\r\nApproved by: https://github.com/hl475\r\n\r\n* Fix use-dict-literal lint (#83718)\r\n\r\nFix use-dict-literal pylint suggestions by changing `dict()` to `{}`. This PR should do the change for every Python file except test/jit/test_list_dict.py, where I think the intent is to test the constructor.\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83718\r\nApproved by: https://github.com/albanD\r\n\r\n* Revert \"Optimize transpose copy on CPU using fbgemm transpose (#83327)\"\r\n\r\nThis reverts commit 04d8da88a6a1abf0da2b11096c85244bf38d3b2a.\r\n\r\nReverted https://github.com/pytorch/pytorch/pull/83327 on behalf of https://github.com/weiwangmeta due to breaking internal builds/causing out-of-bounds errors/training accuracy\r\n\r\n* Add hypothesis to requirements.txt (#83740)\r\n\r\nSigned-off-by: Edward Z. Yang <ezyang@fb.com>\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83740\r\nApproved by: https://github.com/zhxchen17, https://github.com/janeyx99, https://github.com/zou3519\r\n\r\n* [fbia] Keep Track of full qualified name before and after remote sharding (#83889)\r\n\r\nSummary: track qualname changes in embedding sharding & FX split, and compose target qualname in the end of FBIA transform stage, so we can use the qualname mapping in XL materialize stage\r\n\r\nTest Plan:\r\nCI/CD\r\n\r\nwith DISABLE_XLEBB_MATERIALIZATION = True\r\nhttps://fburl.com/fblearner/a8yljbux\r\n\r\nwith DISABLE_XLEBB_MATERIALIZATION = False\r\nhttps://fburl.com/fblearner/2nvi0dam\r\n\r\nReviewed By: lliu315gt\r\n\r\nDifferential Revision: D38772525\r\n\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83889\r\nApproved by: https://github.com/houseroad\r\n\r\n* add merge blocking to ci: sev template (#83940)\r\n\r\nas in title, so that by default, ci: sev will block merges\r\n\r\nthe line can be removed to not block merges\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83940\r\nApproved by: https://github.com/huydhn, https://github.com/janeyx99, https://github.com/malfet, https://github.com/seemethere\r\n\r\n* Move nnapi code from ATen common code to specific library (#83748)\r\n\r\nSummary: Currently we include nnapi code in all targets using ATen even if it's not used (actually there is no usage and being deprecated). Move it to `nnapi_backend_lib` for now.\r\n\r\nTest Plan: Sandcastle.\r\n\r\nDifferential Revision: D38761095\r\n\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83748\r\nApproved by: https://github.com/salilsdesai, https://github.com/SS-JIA\r\n\r\n* Task: T129772171 remove assertEqualIgnoreTypes from test/test_nn.py (#83870)\r\n\r\nSee https://github.com/pytorch/pytorch/issues/38095\r\nReplaced assertEqualIgnoreType with assertEqual\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83870\r\nApproved by: https://github.com/kit1980\r\n\r\n* [Nested Tensor] Make offset copy and move assignment more explicit. (#83488)\r\n\r\nCurrently the nested tensor construction for the offset_ parameter takes in references and in the chain of delegation uses value. This could lead to unnecessary copies.  Whenever a nested tensor impl is constructed it should take ownership of all its metadata. The only non-trivially copyable metadata associated with the class is `offsets_`.\r\n\r\nThe goal of this PR is to make sure that consumers of nested_tensor_impl constructors ensure that they are passing offsets as a temporary - either buy explicitly copying a reference, or by constructing the offsets vector in the scope of construction.\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83488\r\nApproved by: https://github.com/albanD, https://github.com/bdhirsh\r\n\r\n* Remove conj kernels for real dtypes (#80374)\r\n\r\n`conj_physical_stub` is currently implemented for all dtypes despite\r\nit just being a plain copy for real dtypes. So, instead we should\r\ndefer to the existing copy kernel in these cases.\r\n\r\nOn my build for one CUDA architecture, I see a 2.2 MB decrease in\r\n`libtorch_cuda.so` size.\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/80374\r\nApproved by: https://github.com/ngimel, https://github.com/atalman\r\n\r\n* [BE][CUDA] Use packed_accessor64 (#83949)\r\n\r\nNot sure why we are ignoring those, but SoftMax.cu alone\r\ngenerates 100+ lines of warnings:\r\n```\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu: In function ‘at::Tensor at::native::_GLOBAL__N__39f8a8aa_10_SoftMax_cu_75209b9c::get_offsets(const at::Tensor&, const IntArrayRef&, int64_t)’:\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:261:69: warning: ‘at::GenericPackedTensorAccessor<T, N, PtrTraits, index_t> at::Tensor::packed_accessor() const & [with T = long int; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations]\r\n   auto indices_accessor = indices.packed_accessor<int64_t, 2>();\r\n                                                                     ^\r\n/home/nshulga/git/pytorch/pytorch/build/aten/src/ATen/core/TensorBody.h:245:1: note: declared here\r\n   GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {\r\n ^ ~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu: In instantiation of ‘void at::native::_GLOBAL__N__39f8a8aa_10_SoftMax_cu_75209b9c::cuda_sparse_coo_softmax(at::Tensor&, const at::Tensor&, int64_t) [with scalar_t = double; bool LogSoftMax = false; int64_t = long int]’:\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:607:924:   required from here\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:423:6: warning: ‘at::GenericPackedTensorAccessor<T, N, PtrTraits, index_t> at::Tensor::packed_accessor() const & [with T = double; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations]\r\n   auto values_accessor = values_2.packed_accessor<scalar_t, 2>();\r\n      ^~~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/build/aten/src/ATen/core/TensorBody.h:245:1: note: declared here\r\n   GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {\r\n ^ ~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:426:6: warning: ‘at::GenericPackedTensorAccessor<T, N, PtrTraits, index_t> at::Tensor::packed_accessor() const & [with T = double; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations]\r\n   auto out_values_accessor = out_values_2.packed_accessor<scalar_t, 2>();\r\n      ^~~~~~~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/build/aten/src/ATen/core/TensorBody.h:245:1: note: declared here\r\n   GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {\r\n ^ ~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu: In instantiation of ‘void at::native::_GLOBAL__N__39f8a8aa_10_SoftMax_cu_75209b9c::cuda_sparse_coo_softmax(at::Tensor&, const at::Tensor&, int64_t) [with scalar_t = float; bool LogSoftMax = false; int64_t = long int]’:\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:607:1677:   required from here\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:423:6: warning: ‘at::GenericPackedTensorAccessor<T, N, PtrTraits, index_t> at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations]\r\n   auto values_accessor = values_2.packed_accessor<scalar_t, 2>();\r\n      ^~~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/build/aten/src/ATen/core/TensorBody.h:245:1: note: declared here\r\n   GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {\r\n ^ ~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:426:6: warning: ‘at::GenericPackedTensorAccessor<T, N, PtrTraits, index_t> at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations]\r\n   auto out_values_accessor = out_values_2.packed_accessor<scalar_t, 2>();\r\n      ^~~~~~~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/build/aten/src/ATen/core/TensorBody.h:245:1: note: declared here\r\n   GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {\r\n ^ ~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu: In instantiation of ‘void at::native::_GLOBAL__N__39f8a8aa_10_SoftMax_cu_75209b9c::cuda_sparse_coo_softmax(at::Tensor&, const at::Tensor&, int64_t) [with scalar_t = double; bool LogSoftMax = true; int64_t = long int]’:\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:623:927:   required from here\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:423:6: warning: ‘at::GenericPackedTensorAccessor<T, N, PtrTraits, index_t> at::Tensor::packed_accessor() const & [with T = double; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations]\r\n   auto values_accessor = values_2.packed_accessor<scalar_t, 2>();\r\n      ^~~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/build/aten/src/ATen/core/TensorBody.h:245:1: note: declared here\r\n   GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {\r\n ^ ~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:426:6: warning: ‘at::GenericPackedTensorAccessor<T, N, PtrTraits, index_t> at::Tensor::packed_accessor() const & [with T = double; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations]\r\n   auto out_values_accessor = out_values_2.packed_accessor<scalar_t, 2>();\r\n      ^~~~~~~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/build/aten/src/ATen/core/TensorBody.h:245:1: note: declared here\r\n   GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {\r\n ^ ~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu: In instantiation of ‘void at::native::_GLOBAL__N__39f8a8aa_10_SoftMax_cu_75209b9c::cuda_sparse_coo_softmax(at::Tensor&, const at::Tensor&, int64_t) [with scalar_t = float; bool LogSoftMax = true; int64_t = long int]’:\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:623:1679:   required from here\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:423:6: warning: ‘at::GenericPackedTensorAccessor<T, N, PtrTraits, index_t> at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations]\r\n   auto values_accessor = values_2.packed_accessor<scalar_t, 2>();\r\n      ^~~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/build/aten/src/ATen/core/TensorBody.h:245:1: note: declared here\r\n   GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {\r\n ^ ~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:426:6: warning: ‘at::GenericPackedTensorAccessor<T, N, PtrTraits, index_t> at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations]\r\n   auto out_values_accessor = out_values_2.packed_accessor<scalar_t, 2>();\r\n      ^~~~~~~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/build/aten/src/ATen/core/TensorBody.h:245:1: note: declared here\r\n   GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {\r\n ^ ~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu: In instantiation of ‘void at::native::_GLOBAL__N__39f8a8aa_10_SoftMax_cu_75209b9c::cuda_sparse_coo_softmax_backward(at::Tensor&, const at::Tensor&, const at::Tensor&, int64_t, c10::ScalarType) [with scalar_t = double; bool LogSoftMax = false; int64_t = long int]’:\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:641:977:   required from here\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:542:6: warning: ‘at::GenericPackedTensorAccessor<T, N, PtrTraits, index_t> at::Tensor::packed_accessor() const & [with T = double; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations]\r\n   auto values_accessor = values_2.packed_accessor<scalar_t, 2>();\r\n      ^~~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/build/aten/src/ATen/core/TensorBody.h:245:1: note: declared here\r\n   GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {\r\n ^ ~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:545:6: warning: ‘at::GenericPackedTensorAccessor<T, N, PtrTraits, index_t> at::Tensor::packed_accessor() const & [with T = double; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations]\r\n   auto out_values_accessor = out_values_2.packed_accessor<scalar_t, 2>();\r\n      ^~~~~~~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/build/aten/src/ATen/core/TensorBody.h:245:1: note: declared here\r\n   GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {\r\n ^ ~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:548:6: warning: ‘at::GenericPackedTensorAccessor<T, N, PtrTraits, index_t> at::Tensor::packed_accessor() const & [with T = double; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations]\r\n   auto grad_values_accessor = grad_values_2.packed_accessor<scalar_t, 2>();\r\n      ^~~~~~~~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/build/aten/src/ATen/core/TensorBody.h:245:1: note: declared here\r\n   GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {\r\n ^ ~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu: In instantiation of ‘void at::native::_GLOBAL__N__39f8a8aa_10_SoftMax_cu_75209b9c::cuda_sparse_coo_softmax_backward(at::Tensor&, const at::Tensor&, const at::Tensor&, int64_t, c10::ScalarType) [with scalar_t = float; bool LogSoftMax = false; int64_t = long int]’:\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:641:1775:   required from here\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:542:6: warning: ‘at::GenericPackedTensorAccessor<T, N, PtrTraits, index_t> at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations]\r\n   auto values_accessor = values_2.packed_accessor<scalar_t, 2>();\r\n      ^~~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/build/aten/src/ATen/core/TensorBody.h:245:1: note: declared here\r\n   GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {\r\n ^ ~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:545:6: warning: ‘at::GenericPackedTensorAccessor<T, N, PtrTraits, index_t> at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations]\r\n   auto out_values_accessor = out_values_2.packed_accessor<scalar_t, 2>();\r\n      ^~~~~~~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/build/aten/src/ATen/core/TensorBody.h:245:1: note: declared here\r\n   GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {\r\n ^ ~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:548:6: warning: ‘at::GenericPackedTensorAccessor<T, N, PtrTraits, index_t> at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations]\r\n   auto grad_values_accessor = grad_values_2.packed_accessor<scalar_t, 2>();\r\n      ^~~~~~~~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/build/aten/src/ATen/core/TensorBody.h:245:1: note: declared here\r\n   GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {\r\n ^ ~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu: In instantiation of ‘void at::native::_GLOBAL__N__39f8a8aa_10_SoftMax_cu_75209b9c::cuda_sparse_coo_softmax_backward(at::Tensor&, const at::Tensor&, const at::Tensor&, int64_t, c10::ScalarType) [with scalar_t = double; bool LogSoftMax = true; int64_t = long int]’:\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:661:980:   required from here\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:542:6: warning: ‘at::GenericPackedTensorAccessor<T, N, PtrTraits, index_t> at::Tensor::packed_accessor() const & [with T = double; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations]\r\n   auto values_accessor = values_2.packed_accessor<scalar_t, 2>();\r\n      ^~~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/build/aten/src/ATen/core/TensorBody.h:245:1: note: declared here\r\n   GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {\r\n ^ ~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:545:6: warning: ‘at::GenericPackedTensorAccessor<T, N, PtrTraits, index_t> at::Tensor::packed_accessor() const & [with T = double; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations]\r\n   auto out_values_accessor = out_values_2.packed_accessor<scalar_t, 2>();\r\n      ^~~~~~~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/build/aten/src/ATen/core/TensorBody.h:245:1: note: declared here\r\n   GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {\r\n ^ ~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:548:6: warning: ‘at::GenericPackedTensorAccessor<T, N, PtrTraits, index_t> at::Tensor::packed_accessor() const & [with T = double; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations]\r\n   auto grad_values_accessor = grad_values_2.packed_accessor<scalar_t, 2>();\r\n      ^~~~~~~~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/build/aten/src/ATen/core/TensorBody.h:245:1: note: declared here\r\n   GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {\r\n ^ ~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu: In instantiation of ‘void at::native::_GLOBAL__N__39f8a8aa_10_SoftMax_cu_75209b9c::cuda_sparse_coo_softmax_backward(at::Tensor&, const at::Tensor&, const at::Tensor&, int64_t, c10::ScalarType) [with scalar_t = float; bool LogSoftMax = true; int64_t = long int]’:\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:661:1777:   required from here\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:542:6: warning: ‘at::GenericPackedTensorAccessor<T, N, PtrTraits, index_t> at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations]\r\n   auto values_accessor = values_2.packed_accessor<scalar_t, 2>();\r\n      ^~~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/build/aten/src/ATen/core/TensorBody.h:245:1: note: declared here\r\n   GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {\r\n ^ ~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:545:6: warning: ‘at::GenericPackedTensorAccessor<T, N, PtrTraits, index_t> at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations]\r\n   auto out_values_accessor = out_values_2.packed_accessor<scalar_t, 2>();\r\n      ^~~~~~~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/build/aten/src/ATen/core/TensorBody.h:245:1: note: declared here\r\n   GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {\r\n ^ ~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:548:6: warning: ‘at::GenericPackedTensorAccessor<T, N, PtrTraits, index_t> at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations]\r\n   auto grad_values_accessor = grad_values_2.packed_accessor<scalar_t, 2>();\r\n      ^~~~~~~~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/build/aten/src/ATen/core/TensorBody.h:245:1: note: declared here\r\n   GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {\r\n ^ ~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu: In instantiation of ‘std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> at::native::_GLOBAL__N__39f8a8aa_10_SoftMax_cu_75209b9c::compute_pool_max(const at::Tensor&, const at::Tensor&, const IntArrayRef&, int64_t, int64_t) [with scalar_t = double; bool requireMxRows = true; at::IntArrayRef = c10::ArrayRef<long int>; int64_t = long int]’:\r\n/tmp/tmpxft_000040e0_00000000-6_SoftMax.cudafe1.stub.c:16:557:   required from here\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:347:6: warning: ‘at::GenericPackedTensorAccessor<T, N, PtrTraits, index_t> at::Tensor::packed_accessor() const & [with T = double; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations]\r\n     auto values_accessor =\r\n      ^~~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/build/aten/src/ATen/core/TensorBody.h:245:1: note: declared here\r\n   GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {\r\n ^ ~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu: In instantiation of ‘std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> at::native::_GLOBAL__N__39f8a8aa_10_SoftMax_cu_75209b9c::compute_pool_max(const at::Tensor&, const at::Tensor&, const IntArrayRef&, int64_t, int64_t) [with scalar_t = float; bool requireMxRows = true; at::IntArrayRef = c10::ArrayRef<long int>; int64_t = long int]’:\r\n/tmp/tmpxft_000040e0_00000000-6_SoftMax.cudafe1.stub.c:18:556:   required from here\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:347:6: warning: ‘at::GenericPackedTensorAccessor<T, N, PtrTraits, index_t> at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations]\r\n     auto values_accessor =\r\n      ^~~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/build/aten/src/ATen/core/TensorBody.h:245:1: note: declared here\r\n   GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {\r\n ^ ~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu: In instantiation of ‘std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> at::native::_GLOBAL__N__39f8a8aa_10_SoftMax_cu_75209b9c::compute_pool_max(const at::Tensor&, const at::Tensor&, const IntArrayRef&, int64_t, int64_t) [with scalar_t = double; bool requireMxRows = false; at::IntArrayRef = c10::ArrayRef<long int>; int64_t = long int]’:\r\n/tmp/tmpxft_000040e0_00000000-6_SoftMax.cudafe1.stub.c:20:557:   required from here\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:347:6: warning: ‘at::GenericPackedTensorAccessor<T, N, PtrTraits, index_t> at::Tensor::packed_accessor() const & [with T = double; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations]\r\n     auto values_accessor =\r\n      ^~~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/build/aten/src/ATen/core/TensorBody.h:245:1: note: declared here\r\n   GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {\r\n ^ ~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu: In instantiation of ‘std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> at::native::_GLOBAL__N__39f8a8aa_10_SoftMax_cu_75209b9c::compute_pool_max(const at::Tensor&, const at::Tensor&, const IntArrayRef&, int64_t, int64_t) [with scalar_t = float; bool requireMxRows = false; at::IntArrayRef = c10::ArrayRef<long int>; int64_t = long int]’:\r\n/tmp/tmpxft_000040e0_00000000-6_SoftMax.cudafe1.stub.c:21:556:   required from here\r\n/home/nshulga/git/pytorch/pytorch/aten/src/ATen/native/sparse/cuda/SoftMax.cu:347:6: warning: ‘at::GenericPackedTensorAccessor<T, N, PtrTraits, index_t> at::Tensor::packed_accessor() const & [with T = float; long unsigned int N = 2; PtrTraits = at::DefaultPtrTraits; index_t = long int]’ is deprecated: packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead [-Wdeprecated-declarations]\r\n     auto values_accessor =\r\n      ^~~~~~~~~~~~~~~\r\n/home/nshulga/git/pytorch/pytorch/build/aten/src/ATen/core/TensorBody.h:245:1: note: declared here\r\n   GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {\r\n ^ ~~~~~~~~~~~~~\r\n```\r\n\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83949\r\nApproved by: https://github.com/ngimel\r\n\r\n* Support returning symbolic strides from t.stride() in Python (#83842)\r\n\r\nSigned-off-by: Edward Z. Yang <ezyang@fb.com>\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83842\r\nApproved by: https://github.com/albanD, https://github.com/Chillee, https://github.com/bdhirsh\r\n\r\n* Support the XPU backend untyped storage (#83952)\r\n\r\nSimple add XPU backend in untyped torch storage.\r\n\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83952\r\nApproved by: https://github.com/ezyang\r\n\r\n* Support NCCL Premul Sum (#81272)\r\n\r\nThis PR adds the support for https://docs.nvidia.com/deeplearning/nccl/archives/nccl_21212/user-guide/docs/api/ops.html?highlight=premul#c.ncclRedOpCreatePreMulSum.\r\n\r\nThe major changes include\r\n- convert enum ReduceOp to struct\r\n- add premul sum specific paths to init.cpp and Ops.cpp.\r\n\r\nnote:\r\n- For pip wheels / conda binaries to support this, ~~I think https://github.com/pytorch/pytorch/pull/79132 would be needed~~ https://github.com/pytorch/pytorch/pull/82775 landed\r\n\r\nThe commit titled \"add nccl premul\" whose current hash is https://github.com/pytorch/pytorch/pull/81272/commits/cb99ad67447b5899ecf8c4c3d78deaafa1cc09b8 was authored by @mcarilli and @ptrblck.\r\n\r\ncc @ptrblck\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/81272\r\nApproved by: https://github.com/kwen2501\r\n\r\n* Test type promotion assertignoretypes (#83867)\r\n\r\nSee #38095\r\n\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83867\r\nApproved by: https://github.com/kit1980, https://github.com/mruberry\r\n\r\n* [Profiler] record nn.Module's parameters (#83209)\r\n\r\nSummary:\r\nRecord nn.Module's parameters for detaild memory profiling:\r\n- extend 'module_' in value cache  & NNModuleInfo to save parameters\r\n- python binding and unit test case\r\n\r\nTest Plan: buck run mode/opt //caffe2/test:profiler -- -r test_nnmodule\r\n\r\nDifferential Revision: D38379717\r\n\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83209\r\nApproved by: https://github.com/robieta\r\n\r\n* [xla hash update] update the pinned xla hash (#83967)\r\n\r\nThis PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).\r\nUpdate the pinned xla hash.\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83967\r\nApproved by: https://github.com/pytorchbot\r\n\r\n* Fix `ir_utils::hasBlockSync` + misc fixes in transpose scheduler (#1924)\r\n\r\n* Fix LTC build warnings (#83955)\r\n\r\nAddresses `Wc++98-compat-extra-semi` warning from https://github.com/llvm/torch-mlir/issues/1264 by removing extraneous semicolon after autogen LTC native function definitions.\r\n\r\n```\r\n/home/runner/work/torch-mlir/torch-mlir/build/tools/torch-mlir/python/torch_mlir/csrc/base_lazy_backend/generated/LazyNativeFunctions.cpp:4241:6: warning: extra ';' outside of a function is incompatible with C++98 [-Wc++98-compat-extra-semi]\r\n    };\r\n     ^\r\n```\r\n\r\ncc: @wconstab @desertfire @ke1337 @antoniojkim\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83955\r\nApproved by: https://github.com/wconstab\r\n\r\n* Strenghten preconditions of linalg.cross (#83798)\r\n\r\nThis makes `linalg.cross` array API complaint (https://github.com/data-apis/array-api/issues/415) and fixes a few bugs.\r\n\r\nFixes https://github.com/pytorch/pytorch/issues/77629\r\nFixes https://github.com/pytorch/pytorch/issues/83756\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83798\r\nApproved by: https://github.com/mruberry\r\n\r\n* Make linalg.inv composite of linalg.solve (#80074)\r\n\r\nThe `getri` kernel calls inside `getrs` so we can do so explicitly\r\nourselves and save ourselves from having to maintain an extra kernel.\r\nThis way we just need to optimise `lu_factor` and `lu_solve` and `inv`\r\nwill be as efficient as it can be, as it'll be choosing the best backend\r\nto perform the factorisation and the best backend (not necessarily the\r\nsame) to perform the solve.\r\n\r\nFixes https://github.com/pytorch/pytorch/issues/77498\r\n\r\nThe benchmarks: https://github.com/pytorch/pytorch/pull/80074#issuecomment-1164309071\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/80074\r\nApproved by: https://github.com/IvanYashchuk, https://github.com/albanD, https://github.com/malfet\r\n\r\n* Support a stable double backward on linalg.det for real inputs (#80217)\r\n\r\nThe complex case still fails. I do not know why.\r\n\r\nFixes https://github.com/pytorch/pytorch/issues/62327\r\nFixes https://github.com/pytorch/pytorch/issues/53364\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/80217\r\nApproved by: https://github.com/nikitaved, https://github.com/albanD, https://github.com/malfet\r\n\r\n* [LTC] Add custom lazy tensor save function (#83294)\r\n\r\nWe need a custom `save` function for checkpointing a lazy model, similar to what exists in PyTorch/XLA:\r\nhttps://github.com/pytorch/xla/blob/3eb8a9d9eb4ebb0b064461c3704650241625654e/torch_xla/core/xla_model.py#L994\r\nThe purpose of this function is to move any lazy tensors to CPU before saving the checkpoint.\r\n\r\nThe way I implemented it was to create a general structure visitor, adapted from a function that we use quite often in Cerebras internal repositories. If there is a better tool already available in PyTorch that does the same things, I'm open to suggestions.\r\n\r\nCC: @wconstab @Krovatkin @JackCaoG\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83294\r\nApproved by: https://github.com/wconstab\r\n\r\n* move pooling test from test_nn to test/nn/test_pooling (#83915)\r\n\r\nRef #63085\r\n\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83915\r\nApproved by: https://github.com/albanD\r\n\r\n* [ONNX] Remove static None graph output (#82623)\r\n\r\nFixes #82370\r\n* Unify the export behavior regarding static None outputs. These are\r\ndropped for both traced graph and TorchScript graph export.\r\n* `Optional` outputs are not affected.\r\nFixes #82370\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/82623\r\nApproved by: https://github.com/AllenTiTaiWang, https://github.com/abock\r\n\r\n* [TorchTidy Fix] Don't try to collect strides for non-strided tensors (#83935)\r\n\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/83935\r\nApproved by: https://github.com/robieta, https://github.com/slgong-fb\r\n\r\n* [WIP] Validating input_col for certain datapipes (#80267)\r\n\r\nFollow up from #79344.\r\n\r\nCurrently WIP due to multiple test failures.\r\n\r\nWaiting for #80140 to land\r\nPull Request resolved: https://github.com/pytorch/pytorch/pull/80267\r\nApproved by:…","shortMessageHtmlLink":"[MatMul] Prolog build out, adding automatic swizzle generator for a f…"}},{"before":null,"after":"9a9510061ad2d7081a31f35cc924520ef2ad1e55","ref":"refs/heads/rebase-matmul_swizzle_gen","pushedAt":"2023-03-20T17:54:44.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"zasdfgbnm","name":"Gao, Xiang","path":"/zasdfgbnm","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/1032377?s=80&v=4"},"commit":{"message":"tracking NVIDIA/Fuser","shortMessageHtmlLink":"tracking NVIDIA/Fuser"}},{"before":"86b103bd7ef1a807dc403a9ae9c005049bffb980","after":null,"ref":"refs/heads/rebase-tracking-matmul","pushedAt":"2023-03-20T17:52:58.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"zasdfgbnm","name":"Gao, Xiang","path":"/zasdfgbnm","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/1032377?s=80&v=4"}},{"before":"d5f95763bd4f7cb2ef01440b8a70304936d3a8e9","after":null,"ref":"refs/heads/tracking-matmul","pushedAt":"2023-03-20T17:14:19.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"zasdfgbnm","name":"Gao, Xiang","path":"/zasdfgbnm","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/1032377?s=80&v=4"}},{"before":"9a9510061ad2d7081a31f35cc924520ef2ad1e55","after":"86b103bd7ef1a807dc403a9ae9c005049bffb980","ref":"refs/heads/rebase-tracking-matmul","pushedAt":"2023-03-20T17:14:15.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"zasdfgbnm","name":"Gao, Xiang","path":"/zasdfgbnm","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/1032377?s=80&v=4"},"commit":{"message":"[MatMul][DO NOT MERGE] Tracking matmul (#2022)\n\n* use custom propagator in ampere TN\r\n\r\n* add tile ordering utilities\r\n\r\n* initial matmul scheduler implementation\r\n\r\n* use matmul scheduler prototype on ampere and turing test cases\r\n\r\n* extend to support Volta\r\n\r\n* minor cleanup\r\n\r\n* comment cleanup\r\n\r\n* minor fix\r\n\r\n* add fragment iteration and use it in matmul scheduler\r\n\r\n* use scheduler params for tests\r\n\r\n* fragment support in double buffer\r\n\r\n* add register double buffering test cases\r\n\r\n* clean up custom transform propagator\r\n\r\n* rebase fix\r\n\r\n* comment\r\n\r\n* move bounded selector to common area\r\n\r\n* Add logic to handle fake boundary tensors in selection.\r\n\r\n* naming and comment\r\n\r\n* remove unused parameters from mma node\r\n\r\n* remove unnecessary parameters from mma ir node\r\n\r\n* rename scheduling variables\r\n\r\n* change accumulator tv interface\r\n\r\n* Update torch/csrc/jit/codegen/cuda/scheduler/utils.h\r\n\r\nCo-authored-by: Gao, Xiang <qasdfgtyuiop@gmail.com>\r\n\r\n* PR feedback\r\n\r\n* pipe through parallel type position\r\n\r\n* Revert \"fragment support in double buffer\"\r\n\r\nThis reverts commit d12a90fcce5cd02aca7c98ea5f29ea01bc85df6f.\r\n\r\n* use cache op to handle double buffer input\r\n\r\n* add more comment in matmul scheduler\r\n\r\n* more comments\r\n\r\n* comment fix\r\n\r\n* rebase fix\r\n\r\n* add inline pred for cpasync\r\n\r\n* minor cleanup\r\n\r\n* add inlining test in unit\r\n\r\n* add option to dump ptx\r\n\r\n* add ampere xor swizzle gen\r\n\r\n* minor scheduler fix; add bank conflict helper\r\n\r\n* minor update and enable single word access checker\r\n\r\n* minor fixes and symmetric 4 warp recipe tests\r\n\r\n* rebase fix\r\n\r\n* fix rebase\r\n\r\n* add cyclic shift for non-power-of-2 swizzle period\r\n\r\n* fix swizzle handling in replay\r\n\r\n* add a few more tile support\r\n\r\n* minor fix\r\n\r\n* add 6 warp test cases\r\n\r\n* add skip swizzle option for replay matching\r\n\r\n* add address compute type\r\n\r\n* add address compute insertion logic\r\n\r\n* pipe through index logic\r\n\r\n* add swizzle tile check\r\n\r\n* WAR on double buffer and thread loop\r\n\r\n* WIP matmul integration\r\n\r\n* skip initialization op for lifting record\r\n\r\n* format and fix double buffered index lifting\r\n\r\n* avoid re-swizzle\r\n\r\n* fix pattern matching\r\n\r\n* interaction between hoisting and double buffer\r\n\r\n* disable hoisting with lifted address tensor\r\n\r\n* index lifting by default in matmul scheduler\r\n\r\n* slightly refactor for loop interface\r\n\r\n* use base index attribute in hoisted mem index\r\n\r\n* add skew loop pass\r\n\r\n* Use skew double buffer pass in matmul scheduler\r\n\r\n* add peeling attribute\r\n\r\n* Add predicate peeling pass\r\n\r\n* minor fix on circular buffer prolog\r\n\r\n* WAR on circular buffer peeled predicate.\r\n\r\n* add circular buffered test case\r\n\r\n* add test back (rebase fix)\r\n\r\n* fix index lift handling\r\n\r\n* integrate into matmul scheduler\r\n\r\n* lift last iteration of initialization when peeling main loop\r\n\r\n* enable peeling by default\r\n\r\n* add base address field in tensor index\r\n\r\n* add pointer data type\r\n\r\n* codegen for base address option\r\n\r\n* pointer mod take 1\r\n\r\n* minor update\r\n\r\n* (wip) increment mode\r\n\r\n* add loop interleaving pass\r\n\r\n* use interleaving in matmul scheduler\r\n\r\n* add predicate record\r\n\r\n* fix lifting logic\r\n\r\n* pipe through predicate address compute insertion\r\n\r\n* (initial integration) hoist predicate index\r\n\r\n* cleanup\r\n\r\n* add small repro for the replay fix\r\n\r\n* fix lifting logic\r\n\r\n* lift consumer write address\r\n\r\n* fix divisibility and contig id check; enable 4warp\r\n\r\n* update test cases\r\n\r\n* [fix] check nullptr\r\n\r\n* rebase fix\r\n\r\n* comments\r\n\r\n* format; rebase fix\r\n\r\n* rebase fix\r\n\r\n* cleanup; comment\r\n\r\n* add validation and condition in scheduler\r\n\r\n* cleanup ; comment\r\n\r\n* lift read db index\r\n\r\n* inplace write double buffer update\r\n\r\n* lift cvta out of main loop\r\n\r\n* increment gmem load\r\n\r\n* [hack] decrement index\r\n\r\n* rebase fix\r\n\r\n* clean up\r\n\r\n* comment ; cleanup\r\n\r\n* minor fix\r\n\r\n* minor fix\r\n\r\n* [MOVE] circular buffer fix\r\n\r\n* make interleaving factor an option\r\n\r\n* comments ; clean up\r\n\r\n* optionally reorder the tiles to support legacy test (FIXME)\r\n\r\n* fix nondivisible split\r\n\r\n* comment\r\n\r\n* add option to disable nvfuser_zero\r\n\r\n* auto conversion to aligned sync\r\n\r\n* minor fix on index hoisting\r\n\r\n* minor fix\r\n\r\n* pipe through cpasyncCG\r\n\r\n* add matmul benchmark\r\n\r\n* more benchmark and test extension\r\n\r\n* Fix missing thread predicates\r\n\r\nUnlikely to matter, but should be necessary\r\n\r\n* fix merge\r\n\r\n* fix merge\r\n\r\n* Quick cleanup of things I noticed while reviewing\r\n\r\n* format\r\n\r\n* fixes\r\n\r\n* save\r\n\r\n* fix\r\n\r\n* fixes\r\n\r\n* fixes\r\n\r\n* fixes\r\n\r\n* fix\r\n\r\n* cleanup\r\n\r\n* fixes\r\n\r\n* fixes\r\n\r\n* more cleanup\r\n\r\n* more fixes\r\n\r\n* fix\r\n\r\n* new style expr\r\n\r\n* Merge branch 'devel' of github.com:csarofeen/pytorch into tracking-matmul\r\n\r\n* fix\r\n\r\n* fix\r\n\r\n* move\r\n\r\n* move\r\n\r\n* fix include\r\n\r\n* fix\r\n\r\n* try-fix\r\n\r\n* temporarily disable distributeMul\r\n\r\n* fix\r\n\r\n* position of toSmem\r\n\r\n* fix\r\n\r\n* format\r\n\r\n* fix\r\n\r\n* fix\r\n\r\n* fix\r\n\r\n* fix\r\n\r\n* cleanup\r\n\r\n* Add missing helper functions for AddressCompute expr\r\n\r\n* Fix code format issues in helpers for AddressCompute expr\r\n\r\n* changes_to_export_kernels (#2406)\r\n\r\n* fix\r\n\r\n* cleanup\r\n\r\n* more cleanup\r\n\r\n* gcd cleanup\r\n\r\n* NVFuserTest.FusionAmpereMatmulSASSModifiersCheck_CUDA\r\n\r\n* fixes\r\n\r\n* fix\r\n\r\n* fix\r\n\r\n* fix\r\n\r\n* fix\r\n\r\n* fix\r\n\r\n* fix\r\n\r\n* fix\r\n\r\n* fix\r\n\r\n* Add debugging utility RAII guard for printting scopes\r\n\r\n* unformat\r\n\r\n* fix\r\n\r\n* fix\r\n\r\n* Tracking matmul cleanup (#2557)\r\n\r\n---------\r\n\r\nCo-authored-by: shmsong <shisong@umich.edu>\r\nCo-authored-by: S. Song <41357537+shmsong@users.noreply.github.com>\r\nCo-authored-by: Naoya Maruyama <nmaruyama@nvidia.com>\r\nCo-authored-by: Andrzej Bekas <118676880+drzejan2@users.noreply.github.com>\r\nCo-authored-by: Michel Migdal <120487391+mmigdal-nv@users.noreply.github.com>","shortMessageHtmlLink":"[MatMul][DO NOT MERGE] Tracking matmul (<a class=\"issue-link js-issue-link\" data-error-text=\"Failed to load title\" data-id=\"1391964174\" data-permission-text=\"Title is private\" data-url=\"https://github.com/csarofeen/pytorch/issues/2022\" data-hovercard-type=\"pull_request\" data-hovercard-url=\"/csarofeen/pytorch/pull/2022/hovercard\" href=\"https://github.com/csarofeen/pytorch/pull/2022\">#2022</a>)"}},{"before":null,"after":"9a9510061ad2d7081a31f35cc924520ef2ad1e55","ref":"refs/heads/rebase-tracking-matmul","pushedAt":"2023-03-20T17:12:26.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"zasdfgbnm","name":"Gao, Xiang","path":"/zasdfgbnm","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/1032377?s=80&v=4"},"commit":{"message":"tracking NVIDIA/Fuser","shortMessageHtmlLink":"tracking NVIDIA/Fuser"}},{"before":"9b7e946a9ceb73a1fd2eef5244a4574551c0ae48","after":"d5f95763bd4f7cb2ef01440b8a70304936d3a8e9","ref":"refs/heads/tracking-matmul","pushedAt":"2023-03-20T17:10:51.000Z","pushType":"push","commitsCount":11,"pusher":{"login":"zasdfgbnm","name":"Gao, Xiang","path":"/zasdfgbnm","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/1032377?s=80&v=4"},"commit":{"message":"Merge branch 'devel' of github.com:csarofeen/pytorch into tracking-matmul","shortMessageHtmlLink":"Merge branch 'devel' of github.com:csarofeen/pytorch into tracking-ma…"}},{"before":"90e668face8dabc9f5eed2a11aeb7cde6324ad7d","after":null,"ref":"refs/heads/rebase-assumptions","pushedAt":"2023-03-20T17:07:40.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"zasdfgbnm","name":"Gao, Xiang","path":"/zasdfgbnm","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/1032377?s=80&v=4"}}],"hasNextPage":true,"hasPreviousPage":false,"activityType":"all","actor":null,"timePeriod":"all","sort":"DESC","perPage":30,"cursor":"djE6ks8AAAADHHHm_gA","startCursor":null,"endCursor":null}},"title":"Activity · csarofeen/pytorch"}