diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md
index fb5deed5f6..1c355366d6 100644
--- a/docs/advanced/input_files/input-main.md
+++ b/docs/advanced/input_files/input-main.md
@@ -3957,6 +3957,13 @@ Currently supported: `RPA`, `LDA`, `PBE`, `HSE`, `HF`.
 - **Description**:  The number of 2-particle states to be solved
 - **Default**: 0
 
+### lr_unrestricted
+- **Type**: Boolean
+- **Description**: Whether to use unrestricted construction for LR-TDDFT (the matrix size will be doubled).
+  - True:  Always use unrestricted LR-TDDFT. 
+  - False: Use unrestricted LR-TDDFT only when the system is open-shell.
+- **Default**: False
+
 ### abs_wavelen_range
 
 - **Type**: Real Real
diff --git a/source/Makefile.Objects b/source/Makefile.Objects
index 8bad2ec8bf..c6aa1b0c8d 100644
--- a/source/Makefile.Objects
+++ b/source/Makefile.Objects
@@ -729,7 +729,6 @@ OBJS_TENSOR=tensor.o\
     operator_lr_exx.o\
     kernel_xc.o\
     pot_hxc_lrtd.o\
-    hsolver_lrtd.o\
     lr_spectrum.o\
     hamilt_casida.o\
     esolver_lrtd_lcao.o\
diff --git a/source/module_basis/module_ao/parallel_2d.h b/source/module_basis/module_ao/parallel_2d.h
index d3c88d5783..8aeea9792f 100644
--- a/source/module_basis/module_ao/parallel_2d.h
+++ b/source/module_basis/module_ao/parallel_2d.h
@@ -15,6 +15,7 @@ class Parallel_2D
     ~Parallel_2D() = default;
 
     Parallel_2D& operator=(Parallel_2D&& rhs) = default;
+    Parallel_2D(Parallel_2D&& rhs) = default;
 
     /// number of local rows
     int get_row_size() const
diff --git a/source/module_esolver/esolver.cpp b/source/module_esolver/esolver.cpp
index 3501cc79b2..b0247fd63e 100644
--- a/source/module_esolver/esolver.cpp
+++ b/source/module_esolver/esolver.cpp
@@ -187,13 +187,8 @@ ESolver* init_esolver(const Input_para& inp, UnitCell& ucell)
     else if (esolver_type == "lr_lcao")
     {
         // use constructor rather than Init function to initialize reference (instead of pointers) to ucell
-        if (PARAM.globalv.gamma_only_local){
-            return new LR::ESolver_LR<double, double>(inp, ucell);
-        } else if (PARAM.inp.nspin < 2) {
-            return new LR::ESolver_LR<std::complex<double>, double>(inp, ucell);
-        } else {
-            throw std::runtime_error("LR-TDDFT is not implemented for spin polarized case");
-}
+        if (PARAM.globalv.gamma_only_local) { return new LR::ESolver_LR<double, double>(inp, ucell); }
+        else { return new LR::ESolver_LR<std::complex<double>, double>(inp, ucell); }
     }
     else if (esolver_type == "ksdft_lr_lcao")
     {
diff --git a/source/module_io/read_input_item_tddft.cpp b/source/module_io/read_input_item_tddft.cpp
index 544fe00ebb..ccd3190c54 100644
--- a/source/module_io/read_input_item_tddft.cpp
+++ b/source/module_io/read_input_item_tddft.cpp
@@ -327,6 +327,12 @@ void ReadInput::item_lr_tddft()
         read_sync_bool(input.out_wfc_lr);
         this->add_item(item);
     }
+    {
+        Input_Item item("lr_unrestricted");
+        item.annotation = "Whether to use unrestricted construction for LR-TDDFT";
+        read_sync_bool(input.lr_unrestricted);
+        this->add_item(item);
+    }
     {
         Input_Item item("abs_wavelen_range");
         item.annotation = "the range of wavelength(nm) to output the absorption spectrum ";
@@ -337,10 +343,6 @@ void ReadInput::item_lr_tddft()
                 para.input.abs_wavelen_range.push_back(std::stod(item.str_values[i]));
             }
             };
-        item.check_value = [](const Input_Item& item, const Parameter& para) {
-            auto& awr = para.input.abs_wavelen_range;
-            if (awr.size() < 2) { ModuleBase::WARNING_QUIT("ReadInput", "abs_wavelen_range must have two values"); }
-            };
         sync_doublevec(input.abs_wavelen_range, 2, 0.0);
         this->add_item(item);
     }
diff --git a/source/module_io/test/read_input_ptest.cpp b/source/module_io/test/read_input_ptest.cpp
index 7953002c3a..a36cb5cd7d 100644
--- a/source/module_io/test/read_input_ptest.cpp
+++ b/source/module_io/test/read_input_ptest.cpp
@@ -419,6 +419,7 @@ TEST_F(InputParaTest, ParaRead)
     EXPECT_EQ(param.inp.xc_kernel, "LDA");
     EXPECT_EQ(param.inp.lr_solver, "dav");
     EXPECT_DOUBLE_EQ(param.inp.lr_thr, 1e-2);
+    EXPECT_FALSE(param.inp.lr_unrestricted);
     EXPECT_FALSE(param.inp.out_wfc_lr);
     EXPECT_EQ(param.inp.abs_wavelen_range.size(), 2);
     EXPECT_DOUBLE_EQ(param.inp.abs_wavelen_range[0], 0.0);
diff --git a/source/module_lr/AX/AX.h b/source/module_lr/AX/AX.h
index 360a4c9cec..36a6d5aebd 100644
--- a/source/module_lr/AX/AX.h
+++ b/source/module_lr/AX/AX.h
@@ -13,13 +13,13 @@ namespace LR
         const psi::Psi<double>& c,
         const int& nocc,
         const int& nvirt,
-        psi::Psi<double>& AX_istate);
+        double* AX_istate);
     void cal_AX_blas(
         const std::vector<container::Tensor>& V_istate,
         const psi::Psi<double>& c,
         const int& nocc,
         const int& nvirt,
-        psi::Psi<double>& AX_istate,
+        double* AX_istate,
         const bool add_on = true);
 #ifdef __MPI
     void cal_AX_pblas(
@@ -27,11 +27,11 @@ namespace LR
         const Parallel_2D& pmat,
         const psi::Psi<double>& c,
         const Parallel_2D& pc,
-        int naos,
-        int nocc,
-        int nvirt,
-        Parallel_2D& pX,
-        psi::Psi<double>& AX_istate,
+        const int& naos,
+        const int& nocc,
+        const int& nvirt,
+        const Parallel_2D& pX,
+        double* AX_istate,
         const bool add_on=true);
 #endif
     // complex
@@ -40,13 +40,13 @@ namespace LR
         const psi::Psi<std::complex<double>>& c,
         const int& nocc,
         const int& nvirt,
-        psi::Psi<std::complex<double>>& AX_istate);
+        std::complex<double>* AX_istate);
     void cal_AX_blas(
         const std::vector<container::Tensor>& V_istate,
         const psi::Psi<std::complex<double>>& c,
         const int& nocc,
         const int& nvirt,
-        psi::Psi<std::complex<double>>& AX_istate,
+        std::complex<double>* AX_istate,
         const bool add_on = true);
 
 #ifdef __MPI
@@ -55,11 +55,11 @@ namespace LR
         const Parallel_2D& pmat,
         const psi::Psi<std::complex<double>>& c,
         const Parallel_2D& pc,
-        int naos,
-        int nocc,
-        int nvirt,
-        Parallel_2D& pX,
-        psi::Psi<std::complex<double>>& AX_istate,
+        const int& naos,
+        const int& nocc,
+        const int& nvirt,
+        const Parallel_2D& pX,
+        std::complex<double>* AX_istate,
         const bool add_on = true);
 #endif
 }
\ No newline at end of file
diff --git a/source/module_lr/AX/AX_parallel.cpp b/source/module_lr/AX/AX_parallel.cpp
index 2b390e431a..221962416e 100644
--- a/source/module_lr/AX/AX_parallel.cpp
+++ b/source/module_lr/AX/AX_parallel.cpp
@@ -14,20 +14,17 @@ namespace LR
         const Parallel_2D& pmat,
         const psi::Psi<double>& c,
         const Parallel_2D& pc,
-        int naos,
-        int nocc,
-        int nvirt,
-        Parallel_2D& pX,
-        psi::Psi<double>& AX_istate,
+        const int& naos,
+        const int& nocc,
+        const int& nvirt,
+        const Parallel_2D& pX,
+        double* AX_istate,
         const bool add_on)
     {
         ModuleBase::TITLE("hamilt_lrtd", "cal_AX_pblas");
-        assert(pmat.comm() == pc.comm());
-        assert(pmat.blacs_ctxt == pc.blacs_ctxt);
-
-        if (pX.comm() != pmat.comm() || pX.blacs_ctxt != pmat.blacs_ctxt)
-            LR_Util::setup_2d_division(pX, pmat.get_block_size(), nvirt, nocc, pmat.blacs_ctxt);
-        else assert(pX.get_local_size() > 0 && AX_istate.get_nbasis() == pX.get_local_size());
+        assert(pmat.comm() == pc.comm() && pmat.comm() == pX.comm());
+        assert(pmat.blacs_ctxt == pc.blacs_ctxt && pmat.blacs_ctxt == pX.blacs_ctxt);
+        assert(pX.get_local_size() > 0);
 
         const int nks = V_istate.size();
 
@@ -35,7 +32,7 @@ namespace LR
         LR_Util::setup_2d_division(pVc, pmat.get_block_size(), naos, nocc, pmat.blacs_ctxt);
         for (int isk = 0;isk < nks;++isk)
         {
-            AX_istate.fix_k(isk);
+            const int ax_start = isk * pX.get_local_size();
             c.fix_k(isk);
 
             //Vc
@@ -60,7 +57,7 @@ namespace LR
             pdgemm_(&transa, &transb, &nvirt, &nocc, &naos,
                 &alpha, c.get_pointer(), &i1, &ivirt, pc.desc,
                 Vc.data<double>(), &i1, &i1, pVc.desc,
-                &beta, AX_istate.get_pointer(), &i1, &i1, pX.desc);
+                &beta, AX_istate + ax_start, &i1, &i1, pX.desc);
 
         }
     }
@@ -70,20 +67,17 @@ namespace LR
         const Parallel_2D& pmat,
         const psi::Psi<std::complex<double>>& c,
         const Parallel_2D& pc,
-        int naos,
-        int nocc,
-        int nvirt,
-        Parallel_2D& pX,
-        psi::Psi<std::complex<double>>& AX_istate,
+        const int& naos,
+        const int& nocc,
+        const int& nvirt,
+        const Parallel_2D& pX,
+        std::complex<double>* AX_istate,
         const bool add_on)
     {
         ModuleBase::TITLE("hamilt_lrtd", "cal_AX_plas");
-        assert(pmat.comm() == pc.comm());
-        assert(pmat.blacs_ctxt == pc.blacs_ctxt);
-
-        if (pX.comm() != pmat.comm() || pX.blacs_ctxt != pmat.blacs_ctxt)
-            LR_Util::setup_2d_division(pX, pmat.get_block_size(), nvirt, nocc, pmat.blacs_ctxt);
-        else assert(pX.get_local_size() > 0 && AX_istate.get_nbasis() == pX.get_local_size());
+        assert(pmat.comm() == pc.comm() && pmat.comm() == pX.comm());
+        assert(pmat.blacs_ctxt == pc.blacs_ctxt && pmat.blacs_ctxt == pX.blacs_ctxt);
+        assert(pX.get_local_size() > 0);
 
         const int nks = V_istate.size();
 
@@ -91,7 +85,7 @@ namespace LR
         LR_Util::setup_2d_division(pVc, pmat.get_block_size(), naos, nocc, pmat.blacs_ctxt);
         for (int isk = 0;isk < nks;++isk)
         {
-            AX_istate.fix_k(isk);
+            const int ax_start = isk * pX.get_local_size();
             c.fix_k(isk);
 
             //Vc
@@ -116,7 +110,7 @@ namespace LR
             pzgemm_(&transa, &transb, &nvirt, &nocc, &naos,
                 &alpha, c.get_pointer(), &i1, &ivirt, pc.desc,
                 Vc.data<std::complex<double>>(), &i1, &i1, pVc.desc,
-                &beta, AX_istate.get_pointer(), &i1, &i1, pX.desc);
+                &beta, AX_istate + ax_start, &i1, &i1, pX.desc);
         }
     }
 }
diff --git a/source/module_lr/AX/AX_serial.cpp b/source/module_lr/AX/AX_serial.cpp
index 019fd45928..abe990e5ff 100644
--- a/source/module_lr/AX/AX_serial.cpp
+++ b/source/module_lr/AX/AX_serial.cpp
@@ -9,18 +9,17 @@ namespace LR
         const psi::Psi<double>& c,
         const int& nocc,
         const int& nvirt,
-        psi::Psi<double>& AX_istate)
+        double* AX_istate)
     {
         ModuleBase::TITLE("hamilt_lrtd", "cal_AX_forloop");
         const int nks = V_istate.size();
         int naos = c.get_nbasis();
-        AX_istate.fix_k(0);
-        ModuleBase::GlobalFunc::ZEROS(AX_istate.get_pointer(), nks * nocc * nvirt);
+        ModuleBase::GlobalFunc::ZEROS(AX_istate, nks * nocc * nvirt);
 
         for (int isk = 0;isk < nks;++isk)
         {
             c.fix_k(isk);
-            AX_istate.fix_k(isk);
+            const int ax_start = isk * nocc * nvirt;
             for (int i = 0;i < nocc;++i)
             {
                 for (int a = 0;a < nvirt;++a)
@@ -29,7 +28,7 @@ namespace LR
                     {
                         for (int mu = 0;mu < naos;++mu)
                         {
-                            AX_istate(i * nvirt + a) += c(nocc + a, mu) * V_istate[isk].data<double>()[nu * naos + mu] * c(i, nu);
+                            AX_istate[ax_start + i * nvirt + a] += c(nocc + a, mu) * V_istate[isk].data<double>()[nu * naos + mu] * c(i, nu);
                         }
                     }
                 }
@@ -41,18 +40,17 @@ namespace LR
         const psi::Psi<std::complex<double>>& c,
         const int& nocc,
         const int& nvirt,
-        psi::Psi<std::complex<double>>& AX_istate)
+        std::complex<double>* AX_istate)
     {
         ModuleBase::TITLE("hamilt_lrtd", "cal_AX_forloop");
         const int nks = V_istate.size();
         int naos = c.get_nbasis();
-        AX_istate.fix_k(0);
-        ModuleBase::GlobalFunc::ZEROS(AX_istate.get_pointer(), nks * nocc * nvirt);
+        ModuleBase::GlobalFunc::ZEROS(AX_istate, nks * nocc * nvirt);
 
         for (int isk = 0;isk < nks;++isk)
         {
             c.fix_k(isk);
-            AX_istate.fix_k(isk);
+            const int ax_start = isk * nocc * nvirt;
             for (int i = 0;i < nocc;++i)
             {
                 for (int a = 0;a < nvirt;++a)
@@ -61,7 +59,7 @@ namespace LR
                     {
                         for (int mu = 0;mu < naos;++mu)
                         {
-                            AX_istate(i * nvirt + a) += std::conj(c(nocc + a, mu)) * V_istate[isk].data<std::complex<double>>()[nu * naos + mu] * c(i, nu);
+                            AX_istate[ax_start + i * nvirt + a] += std::conj(c(nocc + a, mu)) * V_istate[isk].data<std::complex<double>>()[nu * naos + mu] * c(i, nu);
                         }
                     }
                 }
@@ -74,7 +72,7 @@ namespace LR
         const psi::Psi<double>& c,
         const int& nocc,
         const int& nvirt,
-        psi::Psi<double>& AX_istate,
+        double* AX_istate,
         const bool add_on)
     {
         ModuleBase::TITLE("hamilt_lrtd", "cal_AX_blas");
@@ -84,7 +82,7 @@ namespace LR
         for (int isk = 0;isk < nks;++isk)
         {
             c.fix_k(isk);
-            AX_istate.fix_k(isk);
+            const int ax_start = isk * nocc * nvirt;
 
             // Vc[naos*nocc]
             container::Tensor Vc(DAT::DT_DOUBLE, DEV::CpuDevice, { nocc, naos });// (Vc)^T
@@ -101,7 +99,7 @@ namespace LR
             //AX_istate=c^TVc (nvirt major)
             dgemm_(&transa, &transb, &nvirt, &nocc, &naos, &alpha,
                 c.get_pointer(nocc), &naos, Vc.data<double>(), &naos, &beta,
-                AX_istate.get_pointer(), &nvirt);
+                AX_istate + ax_start, &nvirt);
         }
     }
     void cal_AX_blas(
@@ -109,7 +107,7 @@ namespace LR
         const psi::Psi<std::complex<double>>& c,
         const int& nocc,
         const int& nvirt,
-        psi::Psi<std::complex<double>>& AX_istate,
+        std::complex<double>* AX_istate,
         const bool add_on)
     {
         ModuleBase::TITLE("hamilt_lrtd", "cal_AX_blas");
@@ -119,7 +117,7 @@ namespace LR
         for (int isk = 0;isk < nks;++isk)
         {
             c.fix_k(isk);
-            AX_istate.fix_k(isk);
+            const int ax_start = isk * nocc * nvirt;
 
             // Vc[naos*nocc] (V is hermitian)
             container::Tensor Vc(DAT::DT_COMPLEX_DOUBLE, DEV::CpuDevice, { nocc, naos });// (Vc)^T
@@ -136,7 +134,7 @@ namespace LR
             //AX_istate=c^\dagger Vc (nvirt major)
             zgemm_(&transa, &transb, &nvirt, &nocc, &naos, &alpha,
                 c.get_pointer(nocc), &naos, Vc.data<std::complex<double>>(), &naos, &beta,
-                AX_istate.get_pointer(), &nvirt);
+                AX_istate + ax_start, &nvirt);
         }
     }
 }
\ No newline at end of file
diff --git a/source/module_lr/AX/test/AX_test.cpp b/source/module_lr/AX/test/AX_test.cpp
index 92ed30f7e9..d7b4b5115e 100644
--- a/source/module_lr/AX/test/AX_test.cpp
+++ b/source/module_lr/AX/test/AX_test.cpp
@@ -4,6 +4,7 @@
 
 #include "module_lr/utils/lr_util.h"
 
+#define rand01 (static_cast<double>(rand()) / static_cast<double>(RAND_MAX) - 0.5 )
 struct matsize
 {
     int nks = 1;
@@ -23,7 +24,7 @@ class AXTest : public testing::Test
     std::vector<matsize> sizes{
         // {2, 3, 2, 1},
         {2, 13, 7, 4},
-        {2, 14, 8, 5}
+        // {2, 14, 8, 5}
     };
     int nstate = 2;
     std::ofstream ofs_running;
@@ -41,18 +42,12 @@ class AXTest : public testing::Test
     }
 #endif
 
-    void set_ones(double* data, int size) { for (int i = 0;i < size;++i) { data[i] = 1.0; 
-}};
-    void set_int(double* data, int size) { for (int i = 0;i < size;++i) { data[i] = static_cast<double>(i + 1); 
-}};
-    void set_int(std::complex<double>* data, int size) { for (int i = 0;i < size;++i) { data[i] = std::complex<double>(i + 1, -i - 1); 
-}};
-    void set_rand(double* data, int size) { for (int i = 0;i < size;++i) { data[i] = double(rand()) / double(RAND_MAX) * 10.0 - 5.0; 
-}};
-    void set_rand(std::complex<double>* data, int size) { for (int i = 0;i < size;++i) { data[i] = std::complex<double>(rand(), rand()) / double(RAND_MAX) * 10.0 - 5.0; 
-}};
-    void check_eq(double* data1, double* data2, int size) { for (int i = 0;i < size;++i) { EXPECT_NEAR(data1[i], data2[i], 1e-10); 
-}};
+    void set_ones(double* data, int size) { for (int i = 0;i < size;++i) { data[i] = 1.0; } };
+    void set_int(double* data, int size) { for (int i = 0;i < size;++i) { data[i] = static_cast<double>(i + 1); } };
+    void set_int(std::complex<double>* data, int size) { for (int i = 0;i < size;++i) { data[i] = std::complex<double>(i + 1, -i - 1); } };
+    void set_rand(double* data, int size) { for (int i = 0;i < size;++i) { data[i] = rand01 * 10; } };
+    void set_rand(std::complex<double>* data, int size) { for (int i = 0;i < size;++i) { data[i] = std::complex<double>(rand01 * 10, rand01 * 10); } };
+    void check_eq(double* data1, double* data2, int size) { for (int i = 0;i < size;++i) { EXPECT_NEAR(data1[i], data2[i], 1e-10); } };
     void check_eq(std::complex<double>* data1, std::complex<double>* data2, int size)
     {
         for (int i = 0;i < size;++i)
@@ -75,17 +70,12 @@ TEST_F(AXTest, DoubleSerial)
         {
             psi::Psi<double> c(s.nks, s.nocc + s.nvirt, s.naos);
             std::vector<container::Tensor> V(s.nks, container::Tensor(DAT::DT_DOUBLE, DEV::CpuDevice, { s.naos, s.naos }));
-            set_rand(c.get_pointer(), size_c);
-            for (auto& v : V) {set_rand(v.data<double>(), size_v);
-}
-            AX_for.fix_b(istate);
-            AX_blas.fix_b(istate);
-            LR::cal_AX_forloop_serial(V, c, s.nocc, s.nvirt, AX_for);
-            LR::cal_AX_blas(V, c, s.nocc, s.nvirt, AX_blas, false);
+            set_rand(&c(0, 0, 0), size_c);
+            for (auto& v : V) { set_rand(v.data<double>(), size_v); }
+            LR::cal_AX_forloop_serial(V, c, s.nocc, s.nvirt, &AX_for(istate, 0, 0));
+            LR::cal_AX_blas(V, c, s.nocc, s.nvirt, &AX_blas(istate, 0, 0), false);
         }
-        AX_for.fix_b(0);
-        AX_blas.fix_b(0);
-        check_eq(AX_for.get_pointer(), AX_blas.get_pointer(), nstate * s.nks * s.nocc * s.nvirt);
+        check_eq(&AX_for(0, 0, 0), &AX_blas(0, 0, 0), nstate * s.nks * s.nocc * s.nvirt);
     }
 }
 
@@ -101,17 +91,12 @@ TEST_F(AXTest, ComplexSerial)
         {
             psi::Psi<std::complex<double>> c(s.nks, s.nocc + s.nvirt, s.naos);
             std::vector<container::Tensor> V(s.nks, container::Tensor(DAT::DT_COMPLEX_DOUBLE, DEV::CpuDevice, { s.naos, s.naos }));
-            set_rand(c.get_pointer(), size_c);
-            for (auto& v : V) {set_rand(v.data<std::complex<double>>(), size_v);
-}
-            AX_for.fix_b(istate);
-            AX_blas.fix_b(istate);
-            LR::cal_AX_forloop_serial(V, c, s.nocc, s.nvirt, AX_for);
-            LR::cal_AX_blas(V, c, s.nocc, s.nvirt, AX_blas, false);
+            set_rand(&c(0, 0, 0), size_c);
+            for (auto& v : V) { set_rand(v.data<std::complex<double>>(), size_v); }
+            LR::cal_AX_forloop_serial(V, c, s.nocc, s.nvirt, &AX_for(istate, 0, 0));
+            LR::cal_AX_blas(V, c, s.nocc, s.nvirt, &AX_blas(istate, 0, 0), false);
         }
-        AX_for.fix_b(0);
-        AX_blas.fix_b(0);
-        check_eq(AX_for.get_pointer(), AX_blas.get_pointer(), nstate * s.nks * s.nocc * s.nvirt);
+        check_eq(&AX_for(0, 0, 0), &AX_blas(0, 0, 0), nstate * s.nks * s.nocc * s.nvirt);
     }
 }
 #ifdef __MPI
@@ -135,25 +120,20 @@ TEST_F(AXTest, DoubleParallel)
         EXPECT_GE(s.nvirt, px.dim0);
         EXPECT_GE(s.nocc, px.dim1);
         EXPECT_GE(s.naos, pc.dim0);
-        psi::Psi<double> AX_pblas_loc(s.nks, nstate, px.get_local_size());
+        psi::Psi<double> AX_pblas_loc(s.nks, nstate, px.get_local_size(), nullptr, false);
         psi::Psi<double> AX_gather(s.nks, nstate, s.nocc * s.nvirt, nullptr, false);
         for (int istate = 0;istate < nstate;++istate)
         {
             for (int isk = 0;isk < s.nks;++isk)
             {
                 set_rand(V.at(isk).data<double>(), pV.get_local_size());
-                c.fix_k(isk);
-                set_rand(c.get_pointer(), pc.get_local_size());
+                set_rand(&c(isk, 0, 0), pc.get_local_size());
             }
-            AX_pblas_loc.fix_b(istate);
-            AX_gather.fix_b(istate);
-            LR::cal_AX_pblas(V, pV, c, pc, s.naos, s.nocc, s.nvirt, px, AX_pblas_loc, false);
+            LR::cal_AX_pblas(V, pV, c, pc, s.naos, s.nocc, s.nvirt, px, &AX_pblas_loc(istate, 0, 0), false);
             // gather AX and output
             for (int isk = 0;isk < s.nks;++isk)
             {
-                AX_pblas_loc.fix_k(isk);
-                AX_gather.fix_k(isk);
-                LR_Util::gather_2d_to_full(px, AX_pblas_loc.get_pointer(), AX_gather.get_pointer(), false/*pblas: row first*/, s.nvirt, s.nocc);
+                LR_Util::gather_2d_to_full(px, &AX_pblas_loc(istate, isk, 0), &AX_gather(istate, isk, 0), false/*pblas: row first*/, s.nvirt, s.nocc);
             }
             // compare to global AX
             std::vector<container::Tensor> V_full(s.nks, container::Tensor(DAT::DT_DOUBLE, DEV::CpuDevice, { s.naos, s.naos }));
@@ -161,17 +141,13 @@ TEST_F(AXTest, DoubleParallel)
             for (int isk = 0;isk < s.nks;++isk)
             {
                 LR_Util::gather_2d_to_full(pV, V.at(isk).data<double>(), V_full.at(isk).data<double>(), false, s.naos, s.naos);
-                c.fix_k(isk);
-                c_full.fix_k(isk);
-                LR_Util::gather_2d_to_full(pc, c.get_pointer(), c_full.get_pointer(), false, s.naos, s.nocc + s.nvirt);
+                LR_Util::gather_2d_to_full(pc, &c(isk, 0, 0), &c_full(isk, 0, 0), false, s.naos, s.nocc + s.nvirt);
             }
             if (my_rank == 0)
             {
                 psi::Psi<double>  AX_full_istate(s.nks, 1, s.nocc * s.nvirt, nullptr, false);
-                LR::cal_AX_blas(V_full, c_full, s.nocc, s.nvirt, AX_full_istate, false);
-                AX_full_istate.fix_b(0);
-                AX_gather.fix_b(istate);
-                check_eq(AX_full_istate.get_pointer(), AX_gather.get_pointer(), s.nks * s.nocc * s.nvirt);
+                LR::cal_AX_blas(V_full, c_full, s.nocc, s.nvirt, &AX_full_istate(0, 0, 0), false);
+                check_eq(&AX_full_istate(0, 0, 0), &AX_gather(istate, 0, 0), s.nks * s.nocc * s.nvirt);
             }
         }
     }
@@ -191,26 +167,21 @@ TEST_F(AXTest, ComplexParallel)
         Parallel_2D px;
         LR_Util::setup_2d_division(px, s.nb, s.nvirt, s.nocc, pV.blacs_ctxt);
 
-        psi::Psi<std::complex<double>> AX_pblas_loc(s.nks, nstate, px.get_local_size());
+        psi::Psi<std::complex<double>> AX_pblas_loc(s.nks, nstate, px.get_local_size(), nullptr, false);
         psi::Psi<std::complex<double>> AX_gather(s.nks, nstate, s.nocc * s.nvirt, nullptr, false);
         for (int istate = 0;istate < nstate;++istate)
         {
             for (int isk = 0;isk < s.nks;++isk)
             {
                 set_rand(V.at(isk).data<std::complex<double>>(), pV.get_local_size());
-                c.fix_k(isk);
-                set_rand(c.get_pointer(), pc.get_local_size());
+                set_rand(&c(isk, 0, 0), pc.get_local_size());
             }
-            AX_pblas_loc.fix_b(istate);
-            AX_gather.fix_b(istate);
-            LR::cal_AX_pblas(V, pV, c, pc, s.naos, s.nocc, s.nvirt, px, AX_pblas_loc, false);
+            LR::cal_AX_pblas(V, pV, c, pc, s.naos, s.nocc, s.nvirt, px, &AX_pblas_loc(istate, 0, 0), false);
 
             // gather AX and output
             for (int isk = 0;isk < s.nks;++isk)
             {
-                AX_pblas_loc.fix_k(isk);
-                AX_gather.fix_k(isk);
-                LR_Util::gather_2d_to_full(px, AX_pblas_loc.get_pointer(), AX_gather.get_pointer(), false/*pblas: row first*/, s.nvirt, s.nocc);
+                LR_Util::gather_2d_to_full(px, &AX_pblas_loc(istate, isk, 0), &AX_gather(istate, isk, 0), false/*pblas: row first*/, s.nvirt, s.nocc);
             }
             // compare to global AX
             std::vector<container::Tensor> V_full(s.nks, container::Tensor(DAT::DT_COMPLEX_DOUBLE, DEV::CpuDevice, { s.naos, s.naos }));
@@ -218,17 +189,13 @@ TEST_F(AXTest, ComplexParallel)
             for (int isk = 0;isk < s.nks;++isk)
             {
                 LR_Util::gather_2d_to_full(pV, V.at(isk).data<std::complex<double>>(), V_full.at(isk).data<std::complex<double>>(), false, s.naos, s.naos);
-                c.fix_k(isk);
-                c_full.fix_k(isk);
-                LR_Util::gather_2d_to_full(pc, c.get_pointer(), c_full.get_pointer(), false, s.naos, s.nocc + s.nvirt);
+                LR_Util::gather_2d_to_full(pc, &c(isk, 0, 0), &c_full(isk, 0, 0), false, s.naos, s.nocc + s.nvirt);
             }
             if (my_rank == 0)
             {
                 psi::Psi<std::complex<double>>  AX_full_istate(s.nks, 1, s.nocc * s.nvirt, nullptr, false);
-                LR::cal_AX_blas(V_full, c_full, s.nocc, s.nvirt, AX_full_istate, false);
-                AX_full_istate.fix_b(0);
-                AX_gather.fix_b(istate);
-                check_eq(AX_full_istate.get_pointer(), AX_gather.get_pointer(), s.nks * s.nocc * s.nvirt);
+                LR::cal_AX_blas(V_full, c_full, s.nocc, s.nvirt, &AX_full_istate(0, 0, 0), false);
+                check_eq(&AX_full_istate(0, 0, 0), &AX_gather(istate, 0, 0), s.nks * s.nocc * s.nvirt);
             }
         }
     }
diff --git a/source/module_lr/CMakeLists.txt b/source/module_lr/CMakeLists.txt
index 12743eef8f..4b816f09b3 100644
--- a/source/module_lr/CMakeLists.txt
+++ b/source/module_lr/CMakeLists.txt
@@ -15,7 +15,6 @@ if(ENABLE_LCAO)
     operator_casida/operator_lr_hxc.cpp
     operator_casida/operator_lr_exx.cpp
     potentials/pot_hxc_lrtd.cpp
-    hsolver_lrtd.cpp
     lr_spectrum.cpp
     hamilt_casida.cpp
     esolver_lrtd_lcao.cpp)
diff --git a/source/module_lr/dm_trans/dm_trans.h b/source/module_lr/dm_trans/dm_trans.h
index a3fdee2938..283d22149c 100644
--- a/source/module_lr/dm_trans/dm_trans.h
+++ b/source/module_lr/dm_trans/dm_trans.h
@@ -14,14 +14,14 @@ namespace LR
 /// \f[ \tilde{\rho}_{\mu_j\mu_b}=\sum_{jb}c_{j,\mu_j}X_{jb}c^*_{b,\mu_b} \f]
     template<typename T>
     std::vector<container::Tensor> cal_dm_trans_pblas(
-        const psi::Psi<T>& X_istate,
+        const T* X_istate,
         const Parallel_2D& px,
         const psi::Psi<T>& c,
         const Parallel_2D& pc,
         const int naos,
         const int nocc,
         const int nvirt,
-        Parallel_2D& pmat,
+        const Parallel_2D& pmat,
         const bool renorm_k = true,
         const int nspin = 1);
 #endif
@@ -29,7 +29,7 @@ namespace LR
     /// @brief calculate the 2d-block transition density matrix in AO basis using ?gemm
     template<typename T>
     std::vector<container::Tensor> cal_dm_trans_blas(
-        const psi::Psi<T>& X_istate,
+        const T* X_istate,
         const psi::Psi<T>& c,
         const int& nocc, const int& nvirt,
         const bool renorm_k = true,
@@ -39,7 +39,7 @@ namespace LR
     /// @brief calculate the 2d-block transition density matrix in AO basis using for loop (for test)
     template<typename T>
     std::vector<container::Tensor> cal_dm_trans_forloop_serial(
-        const psi::Psi<T>& X_istate,
+        const T* X_istate,
         const psi::Psi<T>& c,
         const int& nocc, const int& nvirt,
         const bool renorm_k = true,
diff --git a/source/module_lr/dm_trans/dm_trans_parallel.cpp b/source/module_lr/dm_trans/dm_trans_parallel.cpp
index 68123bf74c..a21a95c8ee 100644
--- a/source/module_lr/dm_trans/dm_trans_parallel.cpp
+++ b/source/module_lr/dm_trans/dm_trans_parallel.cpp
@@ -10,36 +10,30 @@ namespace LR
     // c: nao*nbands in para2d, nbands*nao in psi  (row-para and constructed: nao)
     // X: nvirt*nocc in para2d, nocc*nvirt in psi (row-para and constructed: nvirt)
 template <>
-std::vector<container::Tensor> cal_dm_trans_pblas(const psi::Psi<double>& X_istate,
-                                                  const Parallel_2D& px,
-                                                  const psi::Psi<double>& c,
-                                                  const Parallel_2D& pc,
-                                                  const int naos,
-                                                  const int nocc,
-                                                  const int nvirt,
-                                                  Parallel_2D& pmat,
-                                                  const bool renorm_k,
-                                                  const int nspin)
+std::vector<container::Tensor> cal_dm_trans_pblas(const double* X_istate,
+    const Parallel_2D& px,
+    const psi::Psi<double>& c,
+    const Parallel_2D& pc,
+    const int naos,
+    const int nocc,
+    const int nvirt,
+    const Parallel_2D& pmat,
+    const bool renorm_k,
+    const int nspin)
 {
     ModuleBase::TITLE("hamilt_lrtd", "cal_dm_trans_pblas");
-    assert(px.comm() == pc.comm());
-    assert(px.blacs_ctxt == pc.blacs_ctxt);
+    assert(px.comm() == pc.comm() && px.comm() == pmat.comm());
+    assert(px.blacs_ctxt == pc.blacs_ctxt && px.blacs_ctxt == pmat.blacs_ctxt);
+    assert(pmat.get_local_size() > 0);
 
-    if (pmat.comm() != px.comm() || pmat.blacs_ctxt != px.blacs_ctxt) {
-        LR_Util::setup_2d_division(pmat, px.get_block_size(), naos, naos, px.blacs_ctxt);
-    } else {
-        assert(pmat.get_local_size() > 0);
-}
-
-    const int nks = X_istate.get_nk();
+    const int nks = c.get_nk();
 
-    std::vector<container::Tensor> dm_trans(
-        nks,
-        container::Tensor(DAT::DT_DOUBLE, DEV::CpuDevice, {pmat.get_col_size(), pmat.get_row_size()}));
+    std::vector<container::Tensor> dm_trans(nks,
+        container::Tensor(DAT::DT_DOUBLE, DEV::CpuDevice, { pmat.get_col_size(), pmat.get_row_size() }));
     for (int isk = 0; isk < nks; ++isk)
     {
         c.fix_k(isk);
-        X_istate.fix_k(isk);
+        const int x_start = isk * px.get_local_size();
         int i1 = 1;
         int ivirt = nocc + 1;
         char transa = 'N';
@@ -54,79 +48,43 @@ std::vector<container::Tensor> cal_dm_trans_pblas(const psi::Psi<double>& X_ista
                              DEV::CpuDevice,
                              {pXc.get_col_size(), pXc.get_row_size()}); // row is "inside"(memory contiguity) for pblas
         Xc.zero();
-        pdgemm_(&transa,
-                &transb,
-                &naos,
-                &nvirt,
-                &nocc,
-                &alpha,
-                c.get_pointer(),
-                &i1,
-                &i1,
-                pc.desc,
-                X_istate.get_pointer(),
-                &i1,
-                &i1,
-                px.desc,
-                &beta,
-                Xc.data<double>(),
-                &i1,
-                &i1,
-                pXc.desc);
+        pdgemm_(&transa, &transb, &naos, &nvirt, &nocc,
+            &alpha, c.get_pointer(), &i1, &i1, pc.desc,
+            X_istate + x_start, &i1, &i1, px.desc,
+            &beta, Xc.data<double>(), &i1, &i1, pXc.desc);
 
         // 2. C_virt*[X*C_occ^T]
-        pdgemm_(&transa,
-                &transb,
-                &naos,
-                &naos,
-                &nvirt,
-                &alpha,
-                c.get_pointer(),
-                &i1,
-                &ivirt,
-                pc.desc,
-                Xc.data<double>(),
-                &i1,
-                &i1,
-                pXc.desc,
-                &beta,
-                dm_trans[isk].data<double>(),
-                &i1,
-                &i1,
-                pmat.desc);
+        pdgemm_(&transa, &transb, &naos, &naos, &nvirt,
+            &alpha, c.get_pointer(), &i1, &ivirt, pc.desc,
+            Xc.data<double>(), &i1, &i1, pXc.desc,
+            &beta, dm_trans[isk].data<double>(), &i1, &i1, pmat.desc);
     }
     return dm_trans;
 }
 template <>
-std::vector<container::Tensor> cal_dm_trans_pblas(const psi::Psi<std::complex<double>>& X_istate,
-                                                  const Parallel_2D& px,
-                                                  const psi::Psi<std::complex<double>>& c,
-                                                  const Parallel_2D& pc,
-                                                  const int naos,
-                                                  const int nocc,
-                                                  const int nvirt,
-                                                  Parallel_2D& pmat,
-                                                  const bool renorm_k,
-                                                  const int nspin)
+std::vector<container::Tensor> cal_dm_trans_pblas(const std::complex<double>* X_istate,
+    const Parallel_2D& px,
+    const psi::Psi<std::complex<double>>& c,
+    const Parallel_2D& pc,
+    const int naos,
+    const int nocc,
+    const int nvirt,
+    const Parallel_2D& pmat,
+    const bool renorm_k,
+    const int nspin)
 {
     ModuleBase::TITLE("hamilt_lrtd", "cal_dm_trans_pblas");
-    assert(px.comm() == pc.comm());
-    assert(px.blacs_ctxt == pc.blacs_ctxt);
-
-    if (pmat.comm() != px.comm() || pmat.blacs_ctxt != px.blacs_ctxt) {
-        LR_Util::setup_2d_division(pmat, px.get_block_size(), naos, naos, px.blacs_ctxt);
-    } else {
-        assert(pmat.get_local_size() > 0);
-}
-    const int nks = X_istate.get_nk();
+    assert(px.comm() == pc.comm() && px.comm() == pmat.comm());
+    assert(px.blacs_ctxt == pc.blacs_ctxt && px.blacs_ctxt == pmat.blacs_ctxt);
+    assert(pmat.get_local_size() > 0);
+    const int nks = c.get_nk();
 
-    std::vector<container::Tensor> dm_trans(
-        nks,
+    std::vector<container::Tensor> dm_trans(nks,
         container::Tensor(DAT::DT_COMPLEX_DOUBLE, DEV::CpuDevice, {pmat.get_col_size(), pmat.get_row_size()}));
     for (int isk = 0; isk < nks; ++isk)
     {
         c.fix_k(isk);
-        X_istate.fix_k(isk);
+        const int x_start = isk * px.get_local_size();
         int i1 = 1;
         int ivirt = nocc + 1;
 
@@ -141,7 +99,7 @@ std::vector<container::Tensor> cal_dm_trans_pblas(const psi::Psi<std::complex<do
         // const std::complex<double> beta(0.0, 0.0);
         // pzgemm_(&transa, &transb, &naos, &nvirt, &nocc,
         //     &alpha, c.get_pointer(), &i1, &i1, pc.desc,
-        //     X_istate.get_pointer(), &i1, &i1, px.desc,
+        //     X_istate + x_start, &i1, &i1, px.desc,
         //     &beta, Xc.data<std::complex<double>>(), &i1, &i1, pXc.desc);
 
         // // 2. C_virt*[X*C_occ^\dagger]
@@ -163,48 +121,18 @@ std::vector<container::Tensor> cal_dm_trans_pblas(const psi::Psi<std::complex<do
         Xc.zero();
         std::complex<double> alpha(1.0, 0.0);
         const std::complex<double> beta(0.0, 0.0);
-        pzgemm_(&transa,
-                &transb,
-                &nvirt,
-                &naos,
-                &nocc,
-                &alpha,
-                X_istate.get_pointer(),
-                &i1,
-                &i1,
-                px.desc,
-                c.get_pointer(),
-                &i1,
-                &i1,
-                pc.desc,
-                &beta,
-                Xc.data<std::complex<double>>(),
-                &i1,
-                &i1,
-                pXc.desc);
+        pzgemm_(&transa, &transb, &nvirt, &naos, &nocc, &alpha,
+            X_istate + x_start, &i1, &i1, px.desc,
+            c.get_pointer(), &i1, &i1, pc.desc,
+            &beta, Xc.data<std::complex<double>>(), &i1, &i1, pXc.desc);
 
         // 2. [X*C_occ^\dagger]^TC_virt^T
         alpha.real(renorm_k ? 1.0 / static_cast<double>(nks) : 1.0);
         transa = transb = 'T';
-        pzgemm_(&transa,
-                &transb,
-                &naos,
-                &naos,
-                &nvirt,
-                &alpha,
-                Xc.data<std::complex<double>>(),
-                &i1,
-                &i1,
-                pXc.desc,
-                c.get_pointer(),
-                &i1,
-                &ivirt,
-                pc.desc,
-                &beta,
-                dm_trans[isk].data<std::complex<double>>(),
-                &i1,
-                &i1,
-                pmat.desc);
+        pzgemm_(&transa, &transb, &naos, &naos, &nvirt,
+            &alpha, Xc.data<std::complex<double>>(), &i1, &i1, pXc.desc,
+            c.get_pointer(), &i1, &ivirt, pc.desc,
+            &beta, dm_trans[isk].data<std::complex<double>>(), &i1, &i1, pmat.desc);
     }
     return dm_trans;
 }
diff --git a/source/module_lr/dm_trans/dm_trans_serial.cpp b/source/module_lr/dm_trans/dm_trans_serial.cpp
index cbdd63a6c8..70dc6e6542 100644
--- a/source/module_lr/dm_trans/dm_trans_serial.cpp
+++ b/source/module_lr/dm_trans/dm_trans_serial.cpp
@@ -6,7 +6,7 @@
 namespace LR
 {
     template<> std::vector<container::Tensor> cal_dm_trans_forloop_serial(
-        const psi::Psi<double>& X_istate,
+        const double* X_istate,
         const psi::Psi<double>& c,
         const int& nocc,
         const int& nvirt,
@@ -15,16 +15,15 @@ namespace LR
     {
         // cxc_out_test(X_istate, c);
         ModuleBase::TITLE("hamilt_lrtd", "cal_dm_trans_forloop");
-        const int nks = X_istate.get_nk();
-        assert(nocc * nvirt == X_istate.get_nbasis());
-        int naos = c.get_nbasis();
+        const int nks = c.get_nk();
+        const int naos = c.get_nbasis();
         std::vector<container::Tensor> dm_trans(nks, container::Tensor(DAT::DT_DOUBLE, DEV::CpuDevice, { naos, naos }));
         for (auto& dm : dm_trans)ModuleBase::GlobalFunc::ZEROS(dm.data<double>(), naos * naos);
         // loop for AOs
         for (size_t isk = 0;isk < nks;++isk)
         {
             c.fix_k(isk);
-            X_istate.fix_k(isk);
+            const int x_start = isk * nocc * nvirt;
             for (size_t mu = 0;mu < naos;++mu)
             {
                 for (size_t nu = 0;nu < naos;++nu)
@@ -33,7 +32,7 @@ namespace LR
                     for (size_t j = 0;j < nocc;++j)
                     {
                         for (size_t b = 0; b < nvirt;++b)
-                            dm_trans[isk].data<double>()[mu * naos + nu] += c(j, mu) * X_istate(j * nvirt + b) * c(nocc + b, nu);
+                            dm_trans[isk].data<double>()[mu * naos + nu] += c(j, mu) * X_istate[x_start + j * nvirt + b] * c(nocc + b, nu);
                     }
                 }
             }
@@ -42,7 +41,7 @@ namespace LR
     }
 
     template<> std::vector<container::Tensor> cal_dm_trans_forloop_serial(
-        const psi::Psi<std::complex<double>>& X_istate,
+        const std::complex<double>* X_istate,
         const psi::Psi<std::complex<double>>& c,
         const int& nocc,
         const int& nvirt,
@@ -51,8 +50,7 @@ namespace LR
     {
         // cxc_out_test(X_istate, c);
         ModuleBase::TITLE("hamilt_lrtd", "cal_dm_trans_forloop");
-        const int nks = X_istate.get_nk();
-        assert(nocc * nvirt == X_istate.get_nbasis());
+        const int nks = c.get_nk();
         int naos = c.get_nbasis();
         std::vector<container::Tensor> dm_trans(nks, container::Tensor(DAT::DT_COMPLEX_DOUBLE, DEV::CpuDevice, { naos, naos }));
         for (auto& dm : dm_trans)ModuleBase::GlobalFunc::ZEROS(dm.data<std::complex<double>>(), naos * naos);
@@ -60,7 +58,7 @@ namespace LR
         for (size_t isk = 0;isk < nks;++isk)
         {
             c.fix_k(isk);
-            X_istate.fix_k(isk);
+            const int x_start = isk * nocc * nvirt;
             for (size_t mu = 0;mu < naos;++mu)
             {
                 for (size_t nu = 0;nu < naos;++nu)
@@ -70,7 +68,7 @@ namespace LR
                     {
                         for (size_t b = 0; b < nvirt;++b)
                             dm_trans[isk].data<std::complex<double>>()[nu * naos + mu] +=
-                            std::conj(c(j, mu)) * X_istate(j * nvirt + b) * c(nocc + b, nu) / static_cast<double>(nks / nspin);
+                            std::conj(c(j, mu)) * X_istate[x_start + j * nvirt + b] * c(nocc + b, nu) / static_cast<double>(nks / nspin);
                     }
                 }
             }
@@ -80,7 +78,7 @@ namespace LR
 
 
     template<> std::vector<container::Tensor> cal_dm_trans_blas(
-        const psi::Psi<double>& X_istate,
+        const double* X_istate,
         const psi::Psi<double>& c,
         const int& nocc,
         const int& nvirt,
@@ -88,14 +86,13 @@ namespace LR
         const int nspin)
     {
         ModuleBase::TITLE("hamilt_lrtd", "cal_dm_trans_blas");
-        const int nks = X_istate.get_nk();
-        assert(nocc * nvirt == X_istate.get_nbasis());
+        const int nks = c.get_nk();
         int naos = c.get_nbasis();
         std::vector<container::Tensor> dm_trans(nks, container::Tensor(DAT::DT_DOUBLE, DEV::CpuDevice, { naos, naos }));
         for (size_t isk = 0;isk < nks;++isk)
         {
             c.fix_k(isk);
-            X_istate.fix_k(isk);
+            const int x_start = isk * nocc * nvirt;
             // 1. [X*C_occ^T]^T=C_occ*X^T
             char transa = 'N';
             char transb = 'T';
@@ -103,7 +100,7 @@ namespace LR
             const double beta = 0.0;
             container::Tensor Xc(DAT::DT_DOUBLE, DEV::CpuDevice, { nvirt, naos });
             dgemm_(&transa, &transb, &naos, &nvirt, &nocc, &alpha,
-                c.get_pointer(), &naos, X_istate.get_pointer(), &nvirt,
+                c.get_pointer(), &naos, X_istate + x_start, &nvirt,
                 &beta, Xc.data<double>(), &naos);
             // 2. C_virt*[X*C_occ^T]
             dgemm_(&transa, &transb, &naos, &naos, &nvirt, &alpha,
@@ -115,7 +112,7 @@ namespace LR
 
 
     template<> std::vector<container::Tensor> cal_dm_trans_blas(
-        const psi::Psi<std::complex<double>>& X_istate,
+        const std::complex<double>* X_istate,
         const psi::Psi<std::complex<double>>& c,
         const int& nocc,
         const int& nvirt,
@@ -123,14 +120,13 @@ namespace LR
         const int nspin)
     {
         ModuleBase::TITLE("hamilt_lrtd", "cal_dm_trans_blas");
-        const int nks = X_istate.get_nk();
-        assert(nocc * nvirt == X_istate.get_nbasis());
+        const int nks = c.get_nk();
         int naos = c.get_nbasis();
         std::vector<container::Tensor> dm_trans(nks, container::Tensor(DAT::DT_COMPLEX_DOUBLE, DEV::CpuDevice, { naos, naos }));
         for (size_t isk = 0;isk < nks;++isk)
         {
             c.fix_k(isk);
-            X_istate.fix_k(isk);
+            const int x_start = isk * nocc * nvirt;
 
             char transa = 'N';
             char transb = 'C';
@@ -154,7 +150,7 @@ namespace LR
             // 1. X*C_occ^\dagger
             container::Tensor Xc(DAT::DT_COMPLEX_DOUBLE, DEV::CpuDevice, { naos, nvirt });
             zgemm_(&transa, &transb, &nvirt, &naos, &nocc, &alpha,
-                X_istate.get_pointer(), &nvirt, c.get_pointer(), &naos,
+                X_istate + x_start, &nvirt, c.get_pointer(), &naos,
                 &beta, Xc.data<std::complex<double>>(), &nvirt);
             // 2. [X*C_occ^\dagger]^TC_virt^T
             transa = transb = 'T';
diff --git a/source/module_lr/dm_trans/test/dm_trans_test.cpp b/source/module_lr/dm_trans/test/dm_trans_test.cpp
index 4e0a034f20..212db64cc8 100644
--- a/source/module_lr/dm_trans/test/dm_trans_test.cpp
+++ b/source/module_lr/dm_trans/test/dm_trans_test.cpp
@@ -68,8 +68,8 @@ TEST_F(DMTransTest, DoubleSerial)
             psi::Psi<double> c(s.nks, s.nocc + s.nvirt, s.naos);
             set_rand(c.get_pointer(), size_c);
             X.fix_b(istate);
-            const std::vector<container::Tensor>& dm_for = LR::cal_dm_trans_forloop_serial(X, c, s.nocc, s.nvirt);
-            const std::vector<container::Tensor>& dm_blas = LR::cal_dm_trans_blas(X, c, s.nocc, s.nvirt);
+            const std::vector<container::Tensor>& dm_for = LR::cal_dm_trans_forloop_serial(X.get_pointer(), c, s.nocc, s.nvirt);
+            const std::vector<container::Tensor>& dm_blas = LR::cal_dm_trans_blas(X.get_pointer(), c, s.nocc, s.nvirt);
             for (int isk = 0;isk < s.nks;++isk) check_eq(dm_for[isk].data<double>(), dm_blas[isk].data<double>(), s.naos * s.naos);
         }
 
@@ -87,8 +87,8 @@ TEST_F(DMTransTest, ComplexSerial)
             psi::Psi<std::complex<double>> c(s.nks, s.nocc + s.nvirt, s.naos);
             set_rand(c.get_pointer(), size_c);
             X.fix_b(istate);
-            const std::vector<container::Tensor>& dm_for = LR::cal_dm_trans_forloop_serial(X, c, s.nocc, s.nvirt);
-            const std::vector<container::Tensor>& dm_blas = LR::cal_dm_trans_blas(X, c, s.nocc, s.nvirt);
+            const std::vector<container::Tensor>& dm_for = LR::cal_dm_trans_forloop_serial(X.get_pointer(), c, s.nocc, s.nvirt);
+            const std::vector<container::Tensor>& dm_blas = LR::cal_dm_trans_blas(X.get_pointer(), c, s.nocc, s.nvirt);
             for (int isk = 0;isk < s.nks;++isk) check_eq(dm_for[isk].data<std::complex<double>>(), dm_blas[isk].data<std::complex<double>>(), s.naos * s.naos);
         }
 
@@ -109,6 +109,7 @@ TEST_F(DMTransTest, DoubleParallel)
         LR_Util::setup_2d_division(pc, s.nb, s.naos, s.nocc + s.nvirt, px.blacs_ctxt);
         psi::Psi<double> c(s.nks, pc.get_col_size(), pc.get_row_size());
         Parallel_2D pmat;
+        LR_Util::setup_2d_division(pmat, s.nb, s.naos, s.naos, px.blacs_ctxt);
 
         EXPECT_EQ(px.dim0, pc.dim0);
         EXPECT_EQ(px.dim1, pc.dim1);
@@ -137,7 +138,7 @@ TEST_F(DMTransTest, DoubleParallel)
             X.fix_b(istate);
             X_full.fix_b(istate);
 
-            std::vector<container::Tensor> dm_pblas_loc = LR::cal_dm_trans_pblas(X, px, c, pc, s.naos, s.nocc, s.nvirt, pmat);
+            std::vector<container::Tensor> dm_pblas_loc = LR::cal_dm_trans_pblas(X.get_pointer(), px, c, pc, s.naos, s.nocc, s.nvirt, pmat);
 
             // gather dm and output
             std::vector<container::Tensor> dm_gather(s.nks, container::Tensor(DAT::DT_DOUBLE, DEV::CpuDevice, { s.naos, s.naos }));
@@ -154,7 +155,7 @@ TEST_F(DMTransTest, DoubleParallel)
             }
             if (my_rank == 0)
             {
-                const std::vector<container::Tensor>& dm_full = LR::cal_dm_trans_blas(X_full, c_full, s.nocc, s.nvirt);
+                const std::vector<container::Tensor>& dm_full = LR::cal_dm_trans_blas(X_full.get_pointer(), c_full, s.nocc, s.nvirt);
                 for (int isk = 0;isk < s.nks;++isk) check_eq(dm_full[isk].data<double>(), dm_gather[isk].data<double>(), s.naos * s.naos);
             }
         }
@@ -173,6 +174,7 @@ TEST_F(DMTransTest, ComplexParallel)
         LR_Util::setup_2d_division(pc, s.nb, s.naos, s.nocc + s.nvirt, px.blacs_ctxt);
         psi::Psi<std::complex<double>> c(s.nks, pc.get_col_size(), pc.get_row_size());
         Parallel_2D pmat;
+        LR_Util::setup_2d_division(pmat, s.nb, s.naos, s.naos, px.blacs_ctxt);
 
         set_rand(X.get_pointer(), nstate * s.nks * px.get_local_size());        //set X and X_full
         psi::Psi<std::complex<double>> X_full(s.nks, nstate, s.nocc * s.nvirt, nullptr, false);        // allocate X_full
@@ -195,7 +197,7 @@ TEST_F(DMTransTest, ComplexParallel)
             X.fix_b(istate);
             X_full.fix_b(istate);
 
-            std::vector<container::Tensor> dm_pblas_loc = LR::cal_dm_trans_pblas(X, px, c, pc, s.naos, s.nocc, s.nvirt, pmat);
+            std::vector<container::Tensor> dm_pblas_loc = LR::cal_dm_trans_pblas(X.get_pointer(), px, c, pc, s.naos, s.nocc, s.nvirt, pmat);
 
             // gather dm and output
             std::vector<container::Tensor> dm_gather(s.nks, container::Tensor(DAT::DT_COMPLEX_DOUBLE, DEV::CpuDevice, { s.naos, s.naos }));
@@ -212,7 +214,7 @@ TEST_F(DMTransTest, ComplexParallel)
             }
             if (my_rank == 0)
             {
-                std::vector<container::Tensor> dm_full = LR::cal_dm_trans_blas(X_full, c_full, s.nocc, s.nvirt);
+                std::vector<container::Tensor> dm_full = LR::cal_dm_trans_blas(X_full.get_pointer(), c_full, s.nocc, s.nvirt);
                 for (int isk = 0;isk < s.nks;++isk) check_eq(dm_full[isk].data<std::complex<double>>(), dm_gather[isk].data<std::complex<double>>(), s.naos * s.naos);
             }
         }
diff --git a/source/module_lr/esolver_lrtd_lcao.cpp b/source/module_lr/esolver_lrtd_lcao.cpp
index f21a86d2c1..2f4f3b4b51 100644
--- a/source/module_lr/esolver_lrtd_lcao.cpp
+++ b/source/module_lr/esolver_lrtd_lcao.cpp
@@ -2,8 +2,9 @@
 #include "utils/gint_move.hpp"
 #include "utils/lr_util.h"
 #include "hamilt_casida.h"
+#include "hamilt_ulr.hpp"
 #include "module_lr/potentials/pot_hxc_lrtd.h"
-#include "module_lr/hsolver_lrtd.h"
+#include "module_lr/hsolver_lrtd.hpp"
 #include "module_lr/lr_spectrum.h"
 #include <memory>
 #include "module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.h"
@@ -67,6 +68,21 @@ inline void redirect_log(const bool& out_alllog)
     }
 }
 
+inline int cal_nupdown_form_occ(const ModuleBase::matrix& wg)
+{   // only for nspin=2
+    const int& nk = wg.nr / 2;
+    auto occ_sum_k = [&](const int& is, const int& ib)->double { double o = 0.0; for (int ik = 0;ik < nk;++ik) { o += wg(is * nk + ik, ib); } return o;};
+    int nupdown = 0;
+    for (int ib = 0;ib < wg.nc;++ib)
+    {
+        const int nu = static_cast<int>(std::lround(occ_sum_k(0, ib)));
+        const int nd = static_cast<int>(std::lround(occ_sum_k(1, ib)));
+        if ((nu + nd) == 0) { break; }
+        nupdown += nu - nd;
+    }
+    return nupdown;
+}
+
 template<typename T, typename TR>
 void LR::ESolver_LR<T, TR>::parameter_check()const
 {
@@ -87,32 +103,25 @@ template<typename T, typename TR>
 void LR::ESolver_LR<T, TR>::set_dimension()
 {
     this->nspin = PARAM.inp.nspin;
-    if (nspin == 2) { std::cout << "** Assuming the spin-up and spin-down states are degenerate. **" << std::endl;
-}
     this->nstates = input.lr_nstates;
     this->nbasis = PARAM.globalv.nlocal;
     // calculate the number of occupied and unoccupied states
     // which determines the basis size of the excited states
     this->nocc_max = LR_Util::cal_nocc(LR_Util::cal_nelec(ucell));
-    this->nocc = std::max(1, std::min(input.nocc, this->nocc_max));
-    this->nvirt = PARAM.inp.nbands - this->nocc_max;   //nbands-nocc
-    if (input.nvirt > this->nvirt) {
-        GlobalV::ofs_warning << "ESolver_LR: input nvirt is too large to cover by nbands, set nvirt = nbands - nocc = " << this->nvirt << std::endl;
-    } else if (input.nvirt > 0) { this->nvirt = input.nvirt;
-}
-    this->nbands = this->nocc + this->nvirt;
-    this->npairs = this->nocc * this->nvirt;
+    this->nocc_in = std::max(1, std::min(input.nocc, this->nocc_max));
+    this->nvirt_in = PARAM.inp.nbands - this->nocc_max;   //nbands-nocc
+    if (input.nvirt > this->nvirt_in) { GlobalV::ofs_warning << "ESolver_LR: input nvirt is too large to cover by nbands, set nvirt = nbands - nocc = " << this->nvirt_in << std::endl; }
+    else if (input.nvirt > 0) { this->nvirt_in = input.nvirt; }
+    this->nbands = this->nocc_in + this->nvirt_in;
     this->nk = this->kv.get_nks() / this->nspin;
-    if (this->nstates > this->nocc * this->nvirt * this->nk) {
-        throw std::invalid_argument("ESolver_LR: nstates > nocc*nvirt*nk");
-}
-
+    this->nocc.resize(nspin, nocc_in);
+    this->nvirt.resize(nspin, nvirt_in);
+    for (int is = 0;is < nspin;++is) { this->npairs.push_back(nocc[is] * nvirt[is]); }
     GlobalV::ofs_running << "Setting LR-TDDFT parameters: " << std::endl;
-    GlobalV::ofs_running << "number of occupied bands: " << this->nocc << std::endl;
-    GlobalV::ofs_running << "number of virtual bands: " << this->nvirt << std::endl;
+    GlobalV::ofs_running << "number of occupied bands: " << nocc_in << std::endl;
+    GlobalV::ofs_running << "number of virtual bands: " << nvirt_in << std::endl;
     GlobalV::ofs_running << "number of Atom orbitals (LCAO-basis size): " << this->nbasis << std::endl;
     GlobalV::ofs_running << "number of KS bands: " << this->eig_ks.nc << std::endl;
-    GlobalV::ofs_running << "number of electron-hole pairs (2-particle basis size): " << this->npairs << std::endl;
     GlobalV::ofs_running << "number of excited states to be solved: " << this->nstates << std::endl;
     if (input.ri_hartree_benchmark == "aims" && !input.aims_nbasis.empty())
     {
@@ -121,6 +130,23 @@ void LR::ESolver_LR<T, TR>::set_dimension()
     }
 }
 
+template<typename T, typename TR>
+void LR::ESolver_LR<T, TR>::reset_dim_spin2()
+{
+    if (nspin != 2) { return; }
+    if (nupdown == 0) { std::cout << "** Assuming the spin-up and spin-down states are degenerate. **" << std::endl; }
+    else
+    {
+        this->openshell = true;
+        nupdown > 0 ? ((nocc[1] -= nupdown) && (nvirt[1] += nupdown)) : ((nocc[0] += nupdown) && (nvirt[0] -= nupdown));
+        npairs = { nocc[0] * nvirt[0], nocc[1] * nvirt[1] };
+        std::cout << "** Solve the spin-up and spin-down states separately for open-shell system. **" << std::endl;
+    }
+    for (int is : {0, 1}) { if (npairs[is] <= 0) { throw std::invalid_argument(std::string("ESolver_LR: npairs (nocc*nvirt) <= 0 for spin") + std::string(is == 0 ? "up" : "down")); } }
+    if (nstates > (npairs[0] + npairs[1]) * nk) { throw std::invalid_argument("ESolver_LR: nstates > nocc*nvirt*nk"); }
+    if (input.lr_unrestricted) { this->openshell = true; }
+}
+
 template <typename T, typename TR>
 LR::ESolver_LR<T, TR>::ESolver_LR(ModuleESolver::ESolver_KS_LCAO<T, TR>&& ks_sol,
     const Input_para& inp, UnitCell& ucell)
@@ -171,7 +197,7 @@ LR::ESolver_LR<T, TR>::ESolver_LR(ModuleESolver::ESolver_KS_LCAO<T, TR>&& ks_sol
     {
         this->psi_ks = new psi::Psi<T>(this->kv.get_nks(), this->paraC_.get_col_size(), this->paraC_.get_row_size());
         this->eig_ks.create(this->kv.get_nks(), this->nbands);
-        const int start_band = this->nocc_max - this->nocc;
+        const int start_band = this->nocc_max - std::max(nocc[0], nocc[1]);
         for (int ik = 0;ik < this->kv.get_nks();++ik)
         {
             Cpxgemr2d(this->nbasis, this->nbands, &(*ks_sol.psi)(ik, 0, 0), 1, start_band + 1, ks_sol.pv.desc_wfc,
@@ -182,12 +208,18 @@ LR::ESolver_LR<T, TR>::ESolver_LR(ModuleESolver::ESolver_KS_LCAO<T, TR>&& ks_sol
 #else
     move_gs();
 #endif
+    if (nspin == 2)
+    {
+        this->nupdown = cal_nupdown_form_occ(ks_sol.pelec->wg);
+        reset_dim_spin2();
+    }
 
     //grid integration
     this->gt_ = std::move(ks_sol.GridT);
     if (std::is_same<T, double>::value) { this->gint_g_ = std::move(ks_sol.GG); }
     else { this->gint_k_ = std::move(ks_sol.GK); }
     this->set_gint();
+    this->gint_->reset_DMRGint(1);
 
     // move pw basis
     delete this->pw_rho;    // newed in ESolver_FP::ESolver_FP
@@ -275,6 +307,11 @@ LR::ESolver_LR<T, TR>::ESolver_LR(const Input_para& inp, UnitCell& ucell) : inpu
         this->paraMat_.ncol_bands,
         this->paraMat_.get_row_size());
     this->read_ks_wfc();
+    if (nspin == 2)
+    {
+        this->nupdown = cal_nupdown_form_occ(this->pelec->wg);
+        reset_dim_spin2();
+    }
 
     LR_Util::setup_2d_division(this->paraC_, 1, this->nbasis, this->nbands
 #ifdef __MPI
@@ -313,7 +350,6 @@ LR::ESolver_LR<T, TR>::ESolver_LR(const Input_para& inp, UnitCell& ucell) : inpu
         PARAM.inp.test_atom_input);
     this->set_gint();
     this->gint_->gridt = &this->gt_;
-    this->gint_->reset_DMRGint(1);
 
     // (3) Periodic condition search for each grid.
     double dr_uniform = 0.001;
@@ -399,35 +435,63 @@ void LR::ESolver_LR<T, TR>::runner(int istep, UnitCell& cell)
     this->setup_eigenvectors_X();
     this->pelec->ekb.create(nspin, this->nstates);
 
+    auto efile = [&](const std::string& label)->std::string {return PARAM.globalv.global_out_dir + "Excitation_Energy_" + label + ".dat";};
+    auto vfile = [&](const std::string& label)->std::string {return PARAM.globalv.global_out_dir + "Excitation_Amplitude_" + label + "_" + std::to_string(GlobalV::MY_RANK) + ".dat";};
     if (this->input.lr_solver != "spectrum")
     {
+        auto write_states = [&](const std::string& label, const Real<T>* e, const T* v, const int& dim, const int& nst, const int& prec = 8)->void
+            {
+                if (GlobalV::MY_RANK == 0) { assert(nst == LR_Util::write_value(efile(label), prec, e, nst)); }
+                assert(nst * dim == LR_Util::write_value(vfile(label), prec, v, nst, dim));
+            };
         // allocate and initialize A matrix and density matrix
-        std::vector<std::string> spin_type = { "Spin Singlet", "Spin Triplet" };
-        for (int is = 0;is < nspin;++is)
+        if (openshell)
         {
-            if (nspin == 2) { std::cout << "Calculating " << spin_type[is] << " excitations" << std::endl; }
-            hamilt::Hamilt<T>* phamilt = new HamiltCasidaLR<T>(xc_kernel, nspin, this->nbasis, this->nocc, this->nvirt, this->ucell, orb_cutoff_, GlobalC::GridD, this->psi_ks, this->eig_ks,
+            std::cout << "Solving spin-conserving excitation for open-shell system." << std::endl;
+            HamiltULR<T> hulr(xc_kernel, nspin, this->nbasis, this->nocc, this->nvirt, this->ucell, orb_cutoff_, GlobalC::GridD, *this->psi_ks, this->eig_ks,
 #ifdef __EXX
                 this->exx_lri, this->exx_info.info_global.hybrid_alpha,
 #endif
-                this->gint_, this->pot[is], this->kv, & this->paraX_, & this->paraC_, & this->paraMat_, 
-                spin_type[is], input.ri_hartree_benchmark, (input.ri_hartree_benchmark == "aims" ? input.aims_nbasis : std::vector<int>({})));
-            // solve the Casida equation
-            HSolverLR<T> hsol(nk, this->npairs, is, this->input.out_wfc_lr);
-            hsol.set_diagethr(hsol.diag_ethr, 0, 0, std::max(1e-13, this->input.lr_thr));
-            hsol.solve(phamilt, *this->X[is], this->pelec, this->input.lr_solver/*,
-                !std::set<std::string>({ "hf", "hse" }).count(this->xc_kernel)*/);  //whether the kernel is Hermitian
-            delete phamilt;
+                this->gint_, this->pot, this->kv, this->paraX_, this->paraC_, this->paraMat_);
+            LR::HSolver::solve(hulr, this->X[0].template data<T>(), nloc_per_band, nstates, this->pelec->ekb.c, this->input.lr_solver, this->input.lr_thr);
+            if (input.out_wfc_lr) { write_states("openshell", this->pelec->ekb.c, this->X[0].template data<T>(), nloc_per_band, nstates); }
+        }
+        else
+        {
+            auto spin_types = std::vector<std::string>({ "singlet", "triplet" });
+            for (int is = 0;is < nspin;++is)
+            {
+                std::cout << "Calculating " << spin_types[is] << " excitations" << std::endl;
+                HamiltLR<T> hlr(xc_kernel, nspin, this->nbasis, this->nocc, this->nvirt, this->ucell, orb_cutoff_, GlobalC::GridD, *this->psi_ks, this->eig_ks,
+#ifdef __EXX
+                    this->exx_lri, this->exx_info.info_global.hybrid_alpha,
+#endif
+                    this->gint_, this->pot[is], this->kv, this->paraX_, this->paraC_, this->paraMat_,
+                    spin_types[is], input.ri_hartree_benchmark, (input.ri_hartree_benchmark == "aims" ? input.aims_nbasis : std::vector<int>({})));
+                // solve the Casida equation
+                LR::HSolver::solve(hlr, this->X[is].template data<T>(), nloc_per_band, nstates,
+                    this->pelec->ekb.c + is * nstates, this->input.lr_solver, this->input.lr_thr/*,
+                        !std::set<std::string>({ "hf", "hse" }).count(this->xc_kernel)*/);  //whether the kernel is Hermitian
+                if (input.out_wfc_lr) { write_states(spin_types[is], this->pelec->ekb.c + is * nstates, this->X[is].template data<T>(), nloc_per_band, nstates); }
+            }
         }
     }
     else    // read the eigenvalues
     {
-        std::ifstream ifs(PARAM.globalv.global_readin_dir + "Excitation_Energy.dat");
-        std::cout << "reading the excitation energies from file: \n";
-        for (int is = 0;is < nspin;++is)
+        auto read_states = [&](const std::string& label, Real<T>* e, T* v, const int& dim, const int& nst)->void
+            {
+                if (GlobalV::MY_RANK == 0) { assert(nst == LR_Util::read_value(efile(label), e, nst)); }
+                assert(nst * dim == LR_Util::read_value(vfile(label), v, nst, dim));
+            };
+        std::cout << "reading the excitation amplitudes from file: \n";
+        if (openshell)
+        {
+            read_states("openshell", this->pelec->ekb.c, this->X[0].template data<T>(), nloc_per_band, nstates);
+        }
+        else
         {
-            for (int i = 0;i < this->nstates;++i) { ifs >> this->pelec->ekb(is, i); }
-            for (int i = 0;i < this->nstates;++i) { std::cout << this->pelec->ekb(is, i) << " "; }
+            auto spin_types = std::vector<std::string>({ "singlet", "triplet" });
+            for (int is = 0;is < nspin;++is) { read_states(spin_types[is], this->pelec->ekb.c + is * nstates, this->X[is].template data<T>(), nloc_per_band, nstates); }
         }
     }
     return;
@@ -441,99 +505,92 @@ void LR::ESolver_LR<T, TR>::after_all_runners()
     //cal spectrum
     std::vector<double> freq(100);
     std::vector<double> abs_wavelen_range({ 20, 200 });//default range
-    if (input.abs_wavelen_range.size() == 2 && std::abs(input.abs_wavelen_range[1] - input.abs_wavelen_range[0]) > 0.02)
+    if (input.abs_wavelen_range.size() >= 2 && std::abs(input.abs_wavelen_range[1] - input.abs_wavelen_range[0]) > 0.02)
     {
         abs_wavelen_range = input.abs_wavelen_range;
     }
     double lambda_diff = std::abs(abs_wavelen_range[1] - abs_wavelen_range[0]);
     double lambda_min = std::min(abs_wavelen_range[1], abs_wavelen_range[0]);
     for (int i = 0;i < freq.size();++i) { freq[i] = 91.126664 / (lambda_min + 0.01 * static_cast<double>(i + 1) * lambda_diff); }
-    for (int is = 0;is < this->nspin;++is)
+    auto spin_types = (nspin == 2 && !openshell) ? std::vector<std::string>({ "singlet", "triplet" }) : std::vector<std::string>({ "updown" });
+    for (int is = 0;is < this->X.size();++is)
     {
-        LR_Spectrum<T> spectrum(&this->pelec->ekb.c[is * this->nstates], *this->X[is],
-            this->nspin, this->nbasis, this->nocc, this->nvirt, this->gint_, *this->pw_rho, *this->psi_ks,
-            this->ucell, this->kv, this->paraX_, this->paraC_, this->paraMat_);
-        spectrum.oscillator_strength();
-        spectrum.transition_analysis(is);
-        spectrum.optical_absorption(freq, input.abs_broadening, is);
+        LR_Spectrum<T> spectrum(nspin, this->nbasis, this->nocc, this->nvirt, this->gint_, *this->pw_rho, *this->psi_ks,
+            this->ucell, this->kv, GlobalC::GridD, this->orb_cutoff_,
+            this->paraX_, this->paraC_, this->paraMat_,
+            &this->pelec->ekb.c[is * nstates], this->X[is].template data<T>(), nstates, openshell);
+        spectrum.transition_analysis(spin_types[is]);
+        spectrum.optical_absorption(freq, input.abs_broadening, spin_types[is]);
     }
 }
 
-
 template<typename T, typename TR>
 void LR::ESolver_LR<T, TR>::setup_eigenvectors_X()
 {
     ModuleBase::TITLE("ESolver_LR", "setup_eigenvectors_X");
-    LR_Util::setup_2d_division(this->paraX_, 1, this->nvirt, this->nocc
+    for (int is = 0;is < nspin;++is)
+    {
+        Parallel_2D px;
+        LR_Util::setup_2d_division(px, /*nb2d=*/1, this->nvirt[is], this->nocc[is]
 #ifdef __MPI
-        , this->paraC_.blacs_ctxt
+            , this->paraC_.blacs_ctxt
 #endif
-    );//nvirt - row, nocc - col 
-    this->X.resize(this->nspin);
-    const std::vector<std::string> spin_types = { "Spin Singlet", "Spin Triplet" };
-    // if spectrum-only, read the LR-eigenstates from file and return
-    if (this->input.lr_solver == "spectrum")
-    {
-        std::cout << "reading the excitation amplitudes from file: \n";
-        for (int is = 0; is < this->nspin; ++is)
-        {
-            this->X[is] = std::make_shared<psi::Psi<T>>(LR_Util::read_psi_bandfirst<T>(
-                PARAM.globalv.global_readin_dir + "Excitation_Amplitude_" + spin_types[is], GlobalV::MY_RANK));
-        }
-    }
-    else
-    {
-        for (int is = 0; is < this->nspin; ++is)
-        {
-            this->X[is] = std::make_shared<psi::Psi<T>>(this->nk,
-                this->nstates,
-                this->paraX_.get_local_size(),
-                nullptr,
-                false); // band(state)-first
-            this->X[is]->zero_out();
-        }
-        set_X_initial_guess();
+        );//nvirt - row, nocc - col 
+        this->paraX_.emplace_back(std::move(px));
     }
+    this->nloc_per_band = nk * (openshell ? paraX_[0].get_local_size() + paraX_[1].get_local_size() : paraX_[0].get_local_size());
+
+    this->X.resize(openshell ? 1 : nspin, LR_Util::newTensor<T>({ nstates, nloc_per_band }));
+    for (auto& x : X) { x.zero(); }
+
+    auto spin_types = (nspin == 2 && !openshell) ? std::vector<std::string>({ "singlet", "triplet" }) : std::vector<std::string>({ "updown" });
+    // if spectrum-only, read the LR-eigenstates from file and return
+    if (this->input.lr_solver != "spectrum") { set_X_initial_guess(); }
 }
 
 template<typename T, typename TR>
 void LR::ESolver_LR<T, TR>::set_X_initial_guess()
 {
     // set the initial guess of X
-  // if (E_{lumo}-E_{homo-1} < E_{lumo+1}-E{homo}), mode = 0, else 1(smaller first)
-    bool ix_mode = false;   //default
-    if (this->eig_ks.nc > nocc + 1 && nocc >= 2 && eig_ks(0, nocc) - eig_ks(0, nocc - 2) - 1e-5 > eig_ks(0, nocc + 1) - eig_ks(0, nocc - 1)) { ix_mode = true; }
-    GlobalV::ofs_running << "setting the initial guess of X: " << std::endl;
-    if (nocc >= 2 && eig_ks.nc > nocc) { GlobalV::ofs_running << "E_{lumo}-E_{homo-1}=" << eig_ks(0, nocc) - eig_ks(0, nocc - 2) << std::endl; }
-    if (nocc >= 1 && eig_ks.nc > nocc + 1) { GlobalV::ofs_running << "E_{lumo+1}-E{homo}=" << eig_ks(0, nocc + 1) - eig_ks(0, nocc - 1) << std::endl; }
-    GlobalV::ofs_running << "mode of X-index: " << ix_mode << std::endl;
-
-    /// global index map between (i,c) and ix
-    ModuleBase::matrix ioiv2ix;
-    std::vector<std::pair<int, int>> ix2ioiv;
-    std::pair<ModuleBase::matrix, std::vector<std::pair<int, int>>> indexmap =
-        LR_Util::set_ix_map_diagonal(ix_mode, nocc, nvirt);
-
-    ioiv2ix = std::move(std::get<0>(indexmap));
-    ix2ioiv = std::move(std::get<1>(indexmap));
-
-    // use unit vectors as the initial guess
-    // for (int i = 0; i < std::min(this->nstates * PARAM.inp.pw_diag_ndim, nocc * nvirt); i++)
     for (int is = 0;is < this->nspin;++is)
     {
-        for (int s = 0; s < nstates; ++s)
+        const int& no = this->nocc[is];
+        const int& nv = this->nvirt[is];
+        const int& np = this->npairs[is];
+        const Parallel_2D& px = this->paraX_[is];
+
+        // if (E_{lumo}-E_{homo-1} < E_{lumo+1}-E{homo}), mode = 0, else 1(smaller first)
+        bool ix_mode = false;   //default
+        if (this->eig_ks.nc > no + 1 && no >= 2 && eig_ks(is, no) - eig_ks(is, no - 2) - 1e-5 > eig_ks(is, no + 1) - eig_ks(is, no - 1)) { ix_mode = true; }
+        GlobalV::ofs_running << "setting the initial guess of X of spin" << is << std::endl;
+        if (no >= 2 && eig_ks.nc > no) { GlobalV::ofs_running << "E_{lumo}-E_{homo-1}=" << eig_ks(is, no) - eig_ks(is, no - 2) << std::endl; }
+        if (no >= 1 && eig_ks.nc > no + 1) { GlobalV::ofs_running << "E_{lumo+1}-E{homo}=" << eig_ks(is, no + 1) - eig_ks(is, no - 1) << std::endl; }
+        GlobalV::ofs_running << "mode of X-index: " << ix_mode << std::endl;
+
+        /// global index map between (i,c) and ix
+        ModuleBase::matrix ioiv2ix;
+        std::vector<std::pair<int, int>> ix2ioiv;
+        std::pair<ModuleBase::matrix, std::vector<std::pair<int, int>>> indexmap =
+            LR_Util::set_ix_map_diagonal(ix_mode, no, nv);
+
+        ioiv2ix = std::move(std::get<0>(indexmap));
+        ix2ioiv = std::move(std::get<1>(indexmap));
+
+        for (int ib = 0; ib < nstates; ++ib)
         {
-            this->X[is]->fix_b(s);
-            int ipair = s % (npairs);
-            int occ_global = std::get<0>(ix2ioiv[ipair]);   // occ
-            int virt_global = std::get<1>(ix2ioiv[ipair]);   // virt
-            int ik = s / (npairs);
-            if (this->paraX_.in_this_processor(virt_global, occ_global))
-                (*X[is])(ik, this->paraX_.global2local_col(occ_global) * this->paraX_.get_row_size()
-                    + this->paraX_.global2local_row(virt_global))
-                = (static_cast<T>(1.0) / static_cast<T>(nk));
+            const int ipair = ib % np;
+            const int occ_global = std::get<0>(ix2ioiv[ipair]);   // occ
+            const int virt_global = std::get<1>(ix2ioiv[ipair]);   // virt
+            const int ik = ib / np;
+            const int xstart_b = ib * nloc_per_band;    //start index of band ib
+            const int xstart_bs = (openshell && is == 1) ? xstart_b + nk * paraX_[0].get_local_size() : xstart_b;  // start index of band ib, spin is
+            const int is_in_x = openshell ? 0 : is;     // if openshell, spin-up and spin-down are put together
+            if (px.in_this_processor(virt_global, occ_global))
+            {
+                const int ipair_loc = px.global2local_col(occ_global) * px.get_row_size() + px.global2local_row(virt_global);
+                X[is_in_x].data<T>()[xstart_bs + ipair_loc] = (static_cast<T>(1.0) / static_cast<T>(nk));
+            }
         }
-        this->X[is]->fix_b(0);  //recover the pointer
     }
 }
 
@@ -544,12 +601,13 @@ void LR::ESolver_LR<T, TR>::init_pot(const Charge& chg_gs)
     if (this->input.ri_hartree_benchmark != "none") { return; } //no need to initialize potential for Hxc kernel in the RI-benchmark routine
     switch (nspin)
     {
+        using ST = PotHxcLR::SpinType;
     case 1:
-        this->pot[0] = std::make_shared<PotHxcLR>(xc_kernel, this->pw_rho, &ucell, &chg_gs, PotHxcLR::SpinType::S1);
+        this->pot[0] = std::make_shared<PotHxcLR>(xc_kernel, this->pw_rho, &ucell, &chg_gs, ST::S1);
         break;
     case 2:
-        this->pot[0] = std::make_shared<PotHxcLR>(xc_kernel, this->pw_rho, &ucell, &chg_gs, PotHxcLR::SpinType::S2_singlet);
-        this->pot[1] = std::make_shared<PotHxcLR>(xc_kernel, this->pw_rho, &ucell, &chg_gs, PotHxcLR::SpinType::S2_triplet);
+        this->pot[0] = std::make_shared<PotHxcLR>(xc_kernel, this->pw_rho, &ucell, &chg_gs, openshell ? ST::S2_updown : ST::S2_singlet);
+        this->pot[1] = std::make_shared<PotHxcLR>(xc_kernel, this->pw_rho, &ucell, &chg_gs, openshell ? ST::S2_updown : ST::S2_triplet);
         break;
     default:
         throw std::invalid_argument("ESolver_LR: nspin must be 1 or 2");
@@ -567,8 +625,8 @@ void LR::ESolver_LR<T, TR>::read_ks_wfc()
     {
 #ifdef __EXX
         int ncore = 0;
-        std::vector<double> eig_ks_vec = RI_Benchmark::read_aims_ebands<double>(PARAM.globalv.global_readin_dir + "band_out", nocc, nvirt, ncore);
-        std::cout << "ncore=" << ncore << ", nocc=" << nocc << ", nvirt=" << nvirt << ", nbands=" << this->nbands << std::endl;
+        std::vector<double> eig_ks_vec = RI_Benchmark::read_aims_ebands<double>(PARAM.globalv.global_readin_dir + "band_out", nocc_in, nvirt_in, ncore);
+        std::cout << "ncore=" << ncore << ", nocc=" << nocc_in << ", nvirt=" << nvirt_in << ", nbands=" << this->nbands << std::endl;
         std::cout << "eig_ks_vec.size()=" << eig_ks_vec.size() << std::endl;
         if(eig_ks_vec.size() != this->nbands) {ModuleBase::WARNING_QUIT("ESolver_LR", "read_aims_ebands failed.");};
         for (int i = 0;i < nbands;++i) { this->pelec->ekb(0, i) = eig_ks_vec[i]; }
@@ -578,7 +636,7 @@ void LR::ESolver_LR<T, TR>::read_ks_wfc()
 #endif
     }
     else if (!ModuleIO::read_wfc_nao(PARAM.globalv.global_readin_dir, this->paraMat_, *this->psi_ks, this->pelec,
-        /*skip_bands=*/this->nocc_max - this->nocc)) {
+        /*skip_bands=*/this->nocc_max - this->nocc_in)) {
         ModuleBase::WARNING_QUIT("ESolver_LR", "read ground-state wavefunction failed.");
     }
     this->eig_ks = std::move(this->pelec->ekb);
diff --git a/source/module_lr/esolver_lrtd_lcao.h b/source/module_lr/esolver_lrtd_lcao.h
index f9db98d765..62e27e4fa8 100644
--- a/source/module_lr/esolver_lrtd_lcao.h
+++ b/source/module_lr/esolver_lrtd_lcao.h
@@ -65,20 +65,28 @@ namespace LR
         /// @brief ground state bands, read from the file, or moved from ESolver_FP::pelec.ekb
         ModuleBase::matrix eig_ks;///< energy of ground state
 
-        /// @brief Excited state info. size: nstates * nks * (nocc(local) * nvirt (local))
-        std::vector<std::shared_ptr<psi::Psi<T>>> X;
-
-        int nocc = 1;
+        /// @brief Excited state wavefunction (locc, lvirt are local size of nocc and nvirt in each process)
+        /// size of X: [neq][{nstate, nloc_per_band}], namely:
+        /// - [nspin][{nstates, nk* (locc* lvirt}] for close- shell,
+        /// -  [1][{nstates, nk * (locc[0] * lvirt[0]) + nk * (locc[1] * lvirt[1])}] for open-shell
+        std::vector<ct::Tensor> X;
+        int nloc_per_band = 1;
+
+        std::vector<int> nocc;   ///< number of occupied orbitals for each spin used in the calculation
+        int nocc_in = 1;    ///< nocc read from input (adjusted by nelec): max(spin-up, spindown)
         int nocc_max = 1;   ///< nelec/2
-        int nvirt = 1;
+        std::vector<int> nvirt;   ///< number of virtual orbitals for each spin used in the calculation
+        int nvirt_in = 1;   ///< nvirt read from input (adjusted by nelec): min(spin-up, spindown)
         int nbands = 2;
         int nbasis = 2;
         /// n_occ*nvirt, the basis size of electron-hole pair representation
-        int npairs = 1;
+        std::vector<int> npairs;
         /// how many 2-particle states to be solved
         int nstates = 1;
         int nspin = 1;
         int nk = 1;
+        int nupdown = 0;
+        bool openshell = false;
         std::string xc_kernel;
 
         Grid_Technique gt_;
@@ -90,7 +98,7 @@ namespace LR
         /// @brief variables for parallel distribution of KS orbitals
         Parallel_2D paraC_;
         /// @brief variables for parallel distribution of excited states
-        Parallel_2D paraX_;
+        std::vector<Parallel_2D> paraX_;
         /// @brief variables for parallel distribution of matrix in AO representation
         Parallel_Orbitals paraMat_;
 
@@ -111,6 +119,8 @@ namespace LR
         void parameter_check() const;
         /// @brief set nocc, nvirt, nbasis, npairs and nstates
         void set_dimension();
+        /// reset nocc, nvirt, npairs after read ground-state wavefunction when nspin=2
+        void reset_dim_spin2();
 
 #ifdef __EXX
         /// Tdata of Exx_LRI is same as T, for the reason, see operator_lr_exx.h
diff --git a/source/module_lr/hamilt_casida.cpp b/source/module_lr/hamilt_casida.cpp
index 08a00bb09e..a29429d5be 100644
--- a/source/module_lr/hamilt_casida.cpp
+++ b/source/module_lr/hamilt_casida.cpp
@@ -1,58 +1,57 @@
 #include "hamilt_casida.h"
+#include "module_lr/utils/lr_util_print.h"
 namespace LR
 {
     template<typename T>
-    std::vector<T> HamiltCasidaLR<T>::matrix()
+    std::vector<T> HamiltLR<T>::matrix()const
     {
-        ModuleBase::TITLE("HamiltCasidaLR", "matrix");
-        int npairs = this->nocc * this->nvirt;
+        ModuleBase::TITLE("HamiltLR", "matrix");
+        const int no = this->nocc[0];
+        const int nv = this->nvirt[0];
+        const auto& px = this->pX[0];
+        const int ldim = nk * px.get_local_size();
+        int npairs = no * nv;
         std::vector<T> Amat_full(this->nk * npairs * this->nk * npairs, 0.0);
         for (int ik = 0;ik < this->nk;++ik)
-            for (int j = 0;j < nocc;++j)
-                for (int b = 0;b < nvirt;++b)
+            for (int j = 0;j < no;++j)
+                for (int b = 0;b < nv;++b)
                 {//calculate A^{ai} for each bj
-                    int bj = j * nvirt + b;
-                    int kbj = ik * npairs + bj;
-                    psi::Psi<T> X_bj(1, 1, this->nk * this->pX->get_local_size()); // k1-first, like in iterative solver
+                    int bj = j * nv + b;    //global
+                    int kbj = ik * npairs + bj; //global
+                    psi::Psi<T> X_bj(1, 1, this->nk * px.get_local_size()); // k1-first, like in iterative solver
                     X_bj.zero_out();
-                    // X_bj(0, 0, lj * this->pX->get_row_size() + lb) = this->one();
-                    int lj = this->pX->global2local_col(j);
-                    int lb = this->pX->global2local_row(b);
-                    if (this->pX->in_this_processor(b, j)) X_bj(0, 0, ik * this->pX->get_local_size() + lj * this->pX->get_row_size() + lb) = this->one();
-                    psi::Psi<T> A_aibj(1, 1, this->nk * this->pX->get_local_size()); // k1-first
+                    // X_bj(0, 0, lj * px.get_row_size() + lb) = this->one();
+                    int lj = px.global2local_col(j);
+                    int lb = px.global2local_row(b);
+                    if (px.in_this_processor(b, j)) { X_bj(0, 0, ik * px.get_local_size() + lj * px.get_row_size() + lb) = this->one(); }
+                    psi::Psi<T> A_aibj(1, 1, this->nk * px.get_local_size()); // k1-first
                     A_aibj.zero_out();
 
+                    this->cal_dm_trans(0, X_bj.get_pointer());
                     hamilt::Operator<T>* node(this->ops);
                     while (node != nullptr)
                     {   // act() on and return the k1-first type of psi
-                        node->act(X_bj, A_aibj, 1);
+                        node->act(1, ldim, /*npol=*/1, X_bj.get_pointer(), A_aibj.get_pointer());
                         node = (hamilt::Operator<T>*)(node->next_op);
                     }
                     // reduce ai for a fixed bj
                     A_aibj.fix_kb(0, 0);
 #ifdef __MPI
                     for (int ik_ai = 0;ik_ai < this->nk;++ik_ai)
-                        LR_Util::gather_2d_to_full(*this->pX, &A_aibj.get_pointer()[ik_ai * this->pX->get_local_size()],
+                        LR_Util::gather_2d_to_full(px, &A_aibj.get_pointer()[ik_ai * px.get_local_size()],
                             Amat_full.data() + kbj * this->nk * npairs /*col, bj*/ + ik_ai * npairs/*row, ai*/,
-                            false, this->nvirt, this->nocc);
+                            false, nv, no);
 #endif
                 }
         // output Amat
-        std::cout << "Amat_full:" << std::endl;
-        for (int i = 0;i < this->nk * npairs;++i)
-        {
-            for (int j = 0;j < this->nk * npairs;++j)
-            {
-                std::cout << Amat_full[i * this->nk * npairs + j] << " ";
-            }
-            std::cout << std::endl;
-        }
+        std::cout << "Full A matrix: (elements < 1e-10 is set to 0)" << std::endl;
+        LR_Util::print_value(Amat_full.data(), nk * npairs, nk * npairs);
         return Amat_full;
     }
 
-    template<> double HamiltCasidaLR<double>::one() { return 1.0; }
-    template<> std::complex<double> HamiltCasidaLR<std::complex<double>>::one() { return std::complex<double>(1.0, 0.0); }
+    template<> double HamiltLR<double>::one() const { return 1.0; }
+    template<> std::complex<double> HamiltLR<std::complex<double>>::one() const { return std::complex<double>(1.0, 0.0); }
 
-    template class HamiltCasidaLR<double>;
-    template class HamiltCasidaLR<std::complex<double>>;
+    template class HamiltLR<double>;
+    template class HamiltLR<std::complex<double>>;
 }
\ No newline at end of file
diff --git a/source/module_lr/hamilt_casida.h b/source/module_lr/hamilt_casida.h
index 27d637f7b9..ab40c82611 100644
--- a/source/module_lr/hamilt_casida.h
+++ b/source/module_lr/hamilt_casida.h
@@ -1,9 +1,11 @@
 #pragma once
+#include <typeinfo>
 #include "module_hamilt_general/hamilt.h"
 #include "module_elecstate/module_dm/density_matrix.h"
 #include "module_lr/operator_casida/operator_lr_diag.h"
 #include "module_lr/operator_casida/operator_lr_hxc.h"
 #include "module_basis/module_ao/parallel_orbitals.h"
+#include "module_lr/dm_trans/dm_trans.h"
 #ifdef __EXX
 #include "module_lr/operator_casida/operator_lr_exx.h"
 #include "module_lr/ri_benchmark/operator_ri_hartree.h"
@@ -12,19 +14,19 @@
 namespace LR
 {
     template<typename T>
-    class HamiltCasidaLR : public hamilt::Hamilt<T, base_device::DEVICE_CPU>
+    class HamiltLR
     {
     public:
         template<typename TGint>
-        HamiltCasidaLR(std::string& xc_kernel,
+        HamiltLR(std::string& xc_kernel,
             const int& nspin,
             const int& naos,
-            const int& nocc,
-            const int& nvirt,
+            const std::vector<int>& nocc,
+            const std::vector<int>& nvirt,
             const UnitCell& ucell_in,
             const std::vector<double>& orb_cutoff,
             Grid_Driver& gd_in,
-            const psi::Psi<T>* psi_ks_in,
+            const psi::Psi<T>& psi_ks_in,
             const ModuleBase::matrix& eig_ks,
 #ifdef __EXX
             std::weak_ptr<Exx_LRI<T>> exx_lri_in,
@@ -33,20 +35,22 @@ namespace LR
             TGint* gint_in,
             std::weak_ptr<PotHxcLR> pot_in,
             const K_Vectors& kv_in,
-            Parallel_2D* pX_in,
-            Parallel_2D* pc_in,
-            Parallel_Orbitals* pmat_in,
+            const std::vector<Parallel_2D>& pX_in,
+            const Parallel_2D& pc_in,
+            const Parallel_Orbitals& pmat_in,
             const std::string& spin_type,
             const std::string& ri_hartree_benchmark = "none",
-            const std::vector<int>& aims_nbasis = {}) : nocc(nocc), nvirt(nvirt), pX(pX_in), nk(kv_in.get_nks() / nspin)
+            const std::vector<int>& aims_nbasis = {}) : nspin(nspin), nocc(nocc), nvirt(nvirt), pX(pX_in), nk(kv_in.get_nks() / nspin)
         {
-            ModuleBase::TITLE("HamiltCasidaLR", "HamiltCasidaLR");
+            ModuleBase::TITLE("HamiltLR", "HamiltLR");
             if (ri_hartree_benchmark != "aims") { assert(aims_nbasis.empty()); }
-            this->classname = "HamiltCasidaLR";
-            this->DM_trans.resize(1);
-            this->DM_trans[0] = LR_Util::make_unique<elecstate::DensityMatrix<T, T>>(pmat_in, 1, kv_in.kvec_d, nk);
+            // always use nspin=1 for transition density matrix
+            this->DM_trans = LR_Util::make_unique<elecstate::DensityMatrix<T, T>>(&pmat_in, 1, kv_in.kvec_d, nk);
+            if (ri_hartree_benchmark == "none") { LR_Util::initialize_DMR(*this->DM_trans, pmat_in, ucell_in, gd_in, orb_cutoff); }
+            // this->DM_trans->init_DMR(&gd_in, &ucell_in); // too large due to not restricted by orb_cutoff
+
             // add the diag operator  (the first one)
-            this->ops = new OperatorLRDiag<T>(eig_ks, pX_in, nk, nocc, nvirt);
+            this->ops = new OperatorLRDiag<T>(eig_ks.c, pX[0], nk, nocc[0], nvirt[0]);
             //add Hxc operator
 #ifdef __EXX
             using TAC = std::pair<int, std::array<int, 3>>;
@@ -63,7 +67,7 @@ namespace LR
             if (ri_hartree_benchmark != "none")
             {
 #ifdef __EXX
-                if (spin_type == "Spin Singlet")
+                if (spin_type == "singlet")
                 {
                     if (ri_hartree_benchmark == "aims") 
                     { 
@@ -77,11 +81,11 @@ namespace LR
                     }
                     if (!std::set<std::string>({ "rpa", "hf" }).count(xc_kernel)) { throw std::runtime_error("ri_hartree_benchmark is only supported for xc_kernel rpa and hf"); }
                     RI_Benchmark::OperatorRIHartree<T>* ri_hartree_op
-                        = new RI_Benchmark::OperatorRIHartree<T>(ucell_in, naos, nocc, nvirt, *psi_ks_in,
+                        = new RI_Benchmark::OperatorRIHartree<T>(ucell_in, naos, nocc[0], nvirt[0], psi_ks_in,
                             Cs_read, Vs_read, ri_hartree_benchmark == "aims", aims_nbasis);
                     this->ops->add(ri_hartree_op);
                 }
-                else if (spin_type == "Spin Triplet") {std::cout<<"f_Hxc based on grid integral is not needed."<<std::endl;}
+                else if (spin_type == "triplet") { std::cout << "f_Hxc based on grid integral is not needed." << std::endl; }
 #else
                 ModuleBase::WARNING_QUIT("ESolver_LR", "RI benchmark is only supported when compile with LibRI.");
 #endif
@@ -96,40 +100,94 @@ namespace LR
 #ifdef __EXX
             if (xc_kernel == "hf" || xc_kernel == "hse")
             {   //add Exx operator
-                if (ri_hartree_benchmark != "none" && spin_type == "Spin Singlet")
+                if (ri_hartree_benchmark != "none" && spin_type == "singlet")
                 {
                     exx_lri_in.lock()->reset_Cs(Cs_read);
                     exx_lri_in.lock()->reset_Vs(Vs_read);
                 }
                 // std::cout << "exx_alpha=" << exx_alpha << std::endl; // the default value of exx_alpha is 0.25 when dft_functional is pbe or hse
-                hamilt::Operator<T>* lr_exx = new OperatorLREXX<T>(nspin, naos, nocc, nvirt, ucell_in, psi_ks_in,
-                    this->DM_trans, exx_lri_in, kv_in, pX_in, pc_in, pmat_in,
+                hamilt::Operator<T>* lr_exx = new OperatorLREXX<T>(nspin, naos, nocc[0], nvirt[0], ucell_in, psi_ks_in,
+                    this->DM_trans, exx_lri_in, kv_in, pX_in[0], pc_in, pmat_in,
                     xc_kernel == "hf" ? 1.0 : exx_alpha, //alpha
-                    ri_hartree_benchmark != "none"/*whether to cal_dm_trans first here*/,
                     aims_nbasis);
                 this->ops->add(lr_exx);
             }
 #endif
+
+            this->cal_dm_trans = [&, this](const int& is, const T* X)->void
+                {
+                    const auto psi_ks_is = LR_Util::get_psi_spin(psi_ks_in, is, nk);
+#ifdef __MPI
+                    std::vector<ct::Tensor>  dm_trans_2d = cal_dm_trans_pblas(X, pX[is], psi_ks_is, pc_in, naos, nocc[is], nvirt[is], pmat_in);
+                    if (this->tdm_sym) for (auto& t : dm_trans_2d) LR_Util::matsym(t.data<T>(), naos, pmat_in);
+#else
+                    std::vector<ct::Tensor>  dm_trans_2d = cal_dm_trans_blas(X, psi_ks_is, nocc[is], nvirt[is]);
+                    if (this->tdm_sym) for (auto& t : dm_trans_2d) LR_Util::matsym(t.data<T>(), naos);
+#endif
+                    // LR_Util::print_tensor<T>(dm_trans_2d[0], "dm_trans_2d[0]", &pmat_in);
+                    // tensor to vector, then set DMK
+                    for (int ik = 0;ik < nk;++ik) { this->DM_trans->set_DMK_pointer(ik, dm_trans_2d[ik].data<T>()); }
+                };
         }
-        ~HamiltCasidaLR()
+        ~HamiltLR() { delete this->ops; }
+
+        std::vector<T> matrix()const;
+
+        void hPsi(const T* psi_in, T* hpsi, const int ld_psi, const int& nband) const
         {
-            if (this->ops != nullptr)
+            assert(ld_psi == nk * pX[0].get_local_size());
+            for (int ib = 0;ib < nband;++ib)
             {
-                delete this->ops;
+                const int offset = ib * ld_psi;
+                this->cal_dm_trans(0, psi_in + offset);  // calculate transition density matrix here
+                hamilt::Operator<T>* node(this->ops);
+                while (node != nullptr)
+                {
+                    node->act(/*nband=*/1, ld_psi, /*npol=*/1, psi_in + offset, hpsi + offset);
+                    node = (hamilt::Operator<T>*)(node->next_op);
+                }
             }
-        };
+        }
 
-        virtual std::vector<T> matrix() override;
+        void global2local(T* lvec, const T* gvec, const int& nband) const
+        {
+            const int npairs = nocc[0] * nvirt[0];
+            for (int ib = 0;ib < nband;++ib)
+            {
+                const int loffset_b = ib * nk * pX[0].get_local_size();
+                const int goffset_b = ib * nk * npairs;
+                for (int ik = 0;ik < nk;++ik)
+                {
+                    const int loffset = loffset_b + ik * pX[0].get_local_size();
+                    const int goffset = goffset_b + ik * npairs;
+                    for (int lo = 0;lo < pX[0].get_col_size();++lo)
+                    {
+                        const int go = pX[0].local2global_col(lo);
+                        for (int lv = 0;lv < pX[0].get_row_size();++lv)
+                        {
+                            const int gv = pX[0].local2global_row(lv);
+                            lvec[loffset + lo * pX[0].get_row_size() + lv] = gvec[goffset + go * nvirt[0] + gv];
+                        }
+                    }
+                }
+            }
+        }
 
     private:
-        int nocc;
-        int nvirt;
-        int nk;
-        Parallel_2D* pX = nullptr;
-        T one();
+        const std::vector<int>& nocc;
+        const std::vector<int>& nvirt;
+        const int nspin = 1;
+        const int nk = 1;
+        const bool tdm_sym = false;     ///< whether to symmetrize the transition density matrix
+        const std::vector<Parallel_2D>& pX;
+        T one()const;
         /// transition density matrix in AO representation
-        /// Hxc only: size=1, calculate on the same address for each bands
-        /// Hxc+Exx: size=nbands, store the result of each bands for common use
-        std::vector<std::unique_ptr<elecstate::DensityMatrix<T, T>>> DM_trans;
+        /// calculate on the same address for each bands, and commonly used by all the operators
+        std::unique_ptr<elecstate::DensityMatrix<T, T>> DM_trans;
+
+        /// first node operator, add operations from each operators
+        hamilt::Operator<T, base_device::DEVICE_CPU>* ops = nullptr;
+
+        std::function<void(const int&, const T*)> cal_dm_trans;
     };
 }
diff --git a/source/module_lr/hamilt_ulr.hpp b/source/module_lr/hamilt_ulr.hpp
new file mode 100644
index 0000000000..77ee62ce73
--- /dev/null
+++ b/source/module_lr/hamilt_ulr.hpp
@@ -0,0 +1,230 @@
+#pragma once
+#include "module_hamilt_general/hamilt.h"
+#include "module_elecstate/module_dm/density_matrix.h"
+#include "module_lr/operator_casida/operator_lr_diag.h"
+#include "module_lr/operator_casida/operator_lr_hxc.h"
+#include "module_lr/utils/lr_util_print.h"
+#ifdef __EXX
+#include "module_lr/operator_casida/operator_lr_exx.h"
+#endif
+namespace LR
+{
+    /// Unristricted TDDFT (TDA) for open-shell systems
+    /// The A matrix is diveded by 4 blocks: uu, ud, du, dd
+    template<typename T>
+    class HamiltULR
+    {
+    public:
+        template<typename TGint>
+        HamiltULR(std::string& xc_kernel,
+            const int& nspin,
+            const int& naos,
+            const std::vector<int>& nocc,   ///< {up, down}
+            const std::vector<int>& nvirt,   ///< {up, down}
+            const UnitCell& ucell_in,
+            const std::vector<double>& orb_cutoff,
+            Grid_Driver& gd_in,
+            const psi::Psi<T>& psi_ks_in,
+            const ModuleBase::matrix& eig_ks,
+#ifdef __EXX
+            std::weak_ptr<Exx_LRI<T>> exx_lri_in,
+            const double& exx_alpha,
+#endif 
+            TGint* gint_in,
+            std::vector<std::shared_ptr<PotHxcLR>>& pot_in,
+            const K_Vectors& kv_in,
+            const std::vector<Parallel_2D>& pX_in,   ///< {up, down}
+            const Parallel_2D& pc_in,
+            const Parallel_Orbitals& pmat_in) :nocc(nocc), nvirt(nvirt), pX(pX_in), nk(kv_in.get_nks() / nspin),
+            ldim(nk* pX[0].get_local_size() + nk * pX[1].get_local_size()),
+            gdim(nk* std::inner_product(nocc.begin(), nocc.end(), nvirt.begin(), 0))
+        {
+            ModuleBase::TITLE("HamiltULR", "HamiltULR");
+            this->DM_trans = LR_Util::make_unique<elecstate::DensityMatrix<T, T>>(&pmat_in, 1, kv_in.kvec_d, nk);
+            LR_Util::initialize_DMR(*this->DM_trans, pmat_in, ucell_in, gd_in, orb_cutoff);
+            // this->DM_trans->init_DMR(&gd_in, &ucell_in); // too large due to not restricted by orb_cutoff
+            this->ops.resize(4);
+
+            this->ops[0] = new OperatorLRDiag<T>(eig_ks.c, pX_in[0], nk, nocc[0], nvirt[0]);
+            this->ops[3] = new OperatorLRDiag<T>(eig_ks.c + nk * (nocc[0] + nvirt[0]), pX_in[1], nk, nocc[1], nvirt[1]);
+
+            auto newHxc = [&](const int& sl, const int& sr) { return new OperatorLRHxc<T>(nspin, naos, nocc, nvirt, psi_ks_in,
+                this->DM_trans, gint_in, pot_in[sl], ucell_in, orb_cutoff, gd_in, kv_in, pX_in, pc_in, pmat_in, { sl,sr }); };
+            this->ops[0]->add(newHxc(0, 0));
+            this->ops[1] = newHxc(0, 1);
+            this->ops[2] = newHxc(1, 0);
+            this->ops[3]->add(newHxc(1, 1));
+
+#ifdef __EXX
+            if (xc_kernel == "hf" || xc_kernel == "hse")
+            {
+                std::vector<psi::Psi<T>> psi_ks_spin = { LR_Util::get_psi_spin(psi_ks_in, 0, nk), LR_Util::get_psi_spin(psi_ks_in, 1, nk) };
+                for (int is : {0, 1})
+                {
+                    this->ops[(is << 1) + is]->add(new OperatorLREXX<T>(nspin, naos, nocc[is], nvirt[is], ucell_in, psi_ks_spin[is],
+                        this->DM_trans, exx_lri_in, kv_in, pX_in[is], pc_in, pmat_in,
+                        xc_kernel == "hf" ? 1.0 : exx_alpha));
+                }
+            }
+#endif
+
+            this->cal_dm_trans = [&, this](const int& is, const T* X)->void
+                {
+                    const auto psi_ks_is = LR_Util::get_psi_spin(psi_ks_in, is, nk);
+                    // LR_Util::print_value(X, pX_in[is].get_local_size());
+#ifdef __MPI
+                    std::vector<ct::Tensor>  dm_trans_2d = cal_dm_trans_pblas(X, pX[is], psi_ks_is, pc_in, naos, nocc[is], nvirt[is], pmat_in);
+                    if (this->tdm_sym) for (auto& t : dm_trans_2d) LR_Util::matsym(t.data<T>(), naos, pmat_in);
+#else
+                    std::vector<ct::Tensor>  dm_trans_2d = cal_dm_trans_blas(X, psi_ks_is, nocc[is], nvirt[is]);
+                    if (this->tdm_sym) for (auto& t : dm_trans_2d) LR_Util::matsym(t.data<T>(), naos);
+#endif
+                    // LR_Util::print_tensor<T>(dm_trans_2d[0], "DMtrans(k=0)", &pmat_in);
+                    // tensor to vector, then set DMK
+                    for (int ik = 0;ik < nk;++ik) { this->DM_trans->set_DMK_pointer(ik, dm_trans_2d[ik].data<T>()); }
+                };
+        }
+        ~HamiltULR()
+        {
+            for (auto& op : ops) { delete op; }
+        }
+        void hPsi(const T* psi_in, T* hpsi, const int ld_psi, const int& nband) const
+        {
+            ModuleBase::TITLE("HamiltULR", "hPsi");
+            assert(ld_psi == this->ldim);
+            const std::vector<int64_t> xdim_is = { nk * pX[0].get_local_size(), nk * pX[1].get_local_size() };
+            /// band-wise act (also works for close-shell, but not efficient)
+            for (int ib = 0;ib < nband;++ib)
+            {
+                const int offset_band = ib * ld_psi;
+                for (int is_bj : {0, 1})
+                {
+                    const int offset_bj = offset_band + is_bj * xdim_is[0];
+                    cal_dm_trans(is_bj, psi_in + offset_bj);   // calculate transition density matrix here
+                    for (int is_ai : {0, 1})
+                    {
+                        const int offset_ai = offset_band + is_ai * xdim_is[0];
+                        hamilt::Operator<T>* node(this->ops[(is_ai << 1) + is_bj]);
+                        while (node != nullptr)
+                        {
+                            node->act(/*nband=*/1, xdim_is[is_bj], /*npol=*/1, psi_in + offset_bj, hpsi + offset_ai);
+                            node = (hamilt::Operator<T>*)(node->next_op);
+                        }
+                    }
+                }
+            }
+        }
+        std::vector<T> matrix()const
+        {
+            ModuleBase::TITLE("HamiltULR", "matrix");
+            const std::vector<int> npairs = { this->nocc[0] * this->nvirt[0], this->nocc[1] * this->nvirt[1] };
+            const std::vector<int64_t> ldim_is = { nk * pX[0].get_local_size(), nk * pX[1].get_local_size() };
+            const std::vector<int> gdim_is = { nk * npairs[0], nk * npairs[1] };
+            std::vector<T> Amat_full(gdim * gdim);
+            for (int is_bj : {0, 1})
+            {
+                const int no = this->nocc[is_bj];
+                const int nv = this->nvirt[is_bj];
+                const auto& px = this->pX[is_bj];
+                const int loffset_bj = is_bj * ldim_is[0];
+                const int goffset_bj = is_bj * gdim_is[0];
+                for (int ik_bj = 0;ik_bj < nk;++ik_bj)
+                {
+                    for (int j = 0;j < no;++j)
+                    {
+                        for (int b = 0;b < nv;++b)
+                        {
+                            const int gcol = goffset_bj + ik_bj * npairs[is_bj] + j * nv + b;//global
+                            std::vector<T> X_bj(this->ldim, T(0));
+                            const int lj = px.global2local_col(j);
+                            const int lb = px.global2local_row(b);
+                            const int lcol = loffset_bj + ik_bj * px.get_local_size() + lj * px.get_row_size() + lb;//local
+                            if (px.in_this_processor(b, j)) { X_bj[lcol] = T(1); }
+                            this->cal_dm_trans(is_bj, X_bj.data() + loffset_bj);
+                            std::vector<T> Aloc_col(this->ldim, T(0)); // a col of A matrix (local)
+                            for (int is_ai : {0, 1})
+                            {
+                                const int goffset_ai = is_ai * gdim_is[0];
+                                const int loffset_ai = is_ai * ldim_is[0];
+                                const auto& pax = this->pX[is_ai];
+                                hamilt::Operator<T>* node(this->ops[(is_ai << 1) + is_bj]);
+                                while (node != nullptr)
+                                {
+                                    node->act(1, ldim_is[is_bj], /*npol=*/1, X_bj.data() + loffset_bj, Aloc_col.data() + loffset_ai);
+                                    node = (hamilt::Operator<T>*)(node->next_op);
+                                }
+#ifdef __MPI
+                                for (int ik_ai = 0;ik_ai < this->nk;++ik_ai)
+                                {
+                                    LR_Util::gather_2d_to_full(pax, Aloc_col.data() + loffset_ai + ik_ai * pax.get_local_size(),
+                                        Amat_full.data() + gcol * gdim /*col, bj*/ + goffset_ai + ik_ai * npairs[is_ai]/*row, ai*/,
+                                        false, nv, no);
+                                }
+#else
+                                std::memcpy(Amat_full.data() + gcol * gdim + goffset_ai, Aloc_col.data() + goffset_ai, gdim_is[is_ai] * sizeof(T));
+#endif
+                            }
+                        }
+                    }
+                }
+            }
+            std::cout << "Full A matrix:" << std::endl;
+            LR_Util::print_value(Amat_full.data(), gdim, gdim);
+            return Amat_full;
+        }
+
+        /// copy global data (eigenvectors) to local memory
+        void global2local(T* lvec, const T* gvec, const int& nband) const
+        {
+            const std::vector<int> npairs = { this->nocc[0] * this->nvirt[0], this->nocc[1] * this->nvirt[1] };
+            const std::vector<int64_t> ldim_is = { nk * pX[0].get_local_size(), nk * pX[1].get_local_size() };
+            const std::vector<int> gdim_is = { nk * npairs[0], nk * npairs[1] };
+            for (int ib = 0;ib < nband;++ib)
+            {
+                const int loffset_b = ib * this->ldim;
+                const int goffset_b = ib * this->gdim;
+                for (int is : {0, 1})
+                {
+                    const int loffset_bs = loffset_b + is * ldim_is[0];
+                    const int goffset_bs = goffset_b + is * gdim_is[0];
+                    for (int ik = 0;ik < nk;++ik)
+                    {
+                        const int loffset = loffset_bs + ik * pX[is].get_local_size();
+                        const int goffset = goffset_bs + ik * npairs[is];
+                        for (int lo = 0;lo < pX[is].get_col_size();++lo)
+                        {
+                            const int go = pX[is].local2global_col(lo);
+                            for (int lv = 0;lv < pX[is].get_row_size();++lv)
+                            {
+                                const int gv = pX[is].local2global_row(lv);
+                                lvec[loffset + lo * pX[is].get_row_size() + lv] = gvec[goffset + go * nvirt[is] + gv];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+    private:
+        const std::vector<int>& nocc;
+        const std::vector<int>& nvirt;
+
+        const std::vector<Parallel_2D>& pX;
+
+
+        const int nk = 1;
+        const int ldim = 1;
+        const int gdim = 1;
+
+        /// 4 operator lists: uu, ud, du, dd
+        std::vector<hamilt::Operator<T>*> ops;
+
+        /// transition density matrix in AO representation
+        /// Hxc only: size=1, calculate on the same address for each bands
+        /// Hxc+Exx: size=nbands, store the result of each bands for common use
+        std::unique_ptr<elecstate::DensityMatrix<T, T>> DM_trans;
+
+        std::function<void(const int&, const T*)> cal_dm_trans;
+        const bool tdm_sym = false;     ///< whether to symmetrize the transition density matrix
+    };
+}
\ No newline at end of file
diff --git a/source/module_lr/hsolver_lrtd.cpp b/source/module_lr/hsolver_lrtd.cpp
deleted file mode 100644
index c8f2b9d5b6..0000000000
--- a/source/module_lr/hsolver_lrtd.cpp
+++ /dev/null
@@ -1,203 +0,0 @@
-#include "hsolver_lrtd.h"
-#include "module_parameter/parameter.h"
-#include "module_hsolver/diago_david.h"
-#include "module_hsolver/diago_dav_subspace.h"
-#include "module_hsolver/diago_cg.h"
-#include "module_lr/utils/lr_util.h"
-#include "module_lr/utils/lr_util_print.h"
-
-namespace LR
-{
-    inline double square(double x) { return x * x; };
-    inline double square(std::complex<double> x) { return x.real() * x.real() + x.imag() * x.imag(); };
-    template<typename T>
-    inline void print_eigs(const std::vector<T>& eigs, const std::string& label = "", const double factor = 1.0)
-    {
-        std::cout << label << std::endl;
-        for (auto& e : eigs) {std::cout << e * factor << " ";
-}
-        std::cout << std::endl;
-    }
-    template<typename T, typename Device>
-    void HSolverLR<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
-        psi::Psi<T, Device>& psi,
-        elecstate::ElecState* pes,
-        const std::string method_in,
-        const bool hermitian)
-    {
-        ModuleBase::TITLE("HSolverLR", "solve");
-        assert(psi.get_nk() == nk);
-        const std::vector<std::string> spin_types = { "Spin Singlet", "Spin Triplet" };
-        // note: if not TDA, the eigenvalues will be complex
-        // then we will need a new constructor of DiagoDavid
-
-        // 1. allocate precondition and eigenvalue
-        std::vector<Real> precondition(psi.get_nk() * psi.get_nbasis());
-        std::vector<Real> eigenvalue(psi.get_nbands());   //nstates
-        // 2. select the method
-        this->method = method_in;
-#ifdef __MPI
-        const hsolver::diag_comm_info comm_info = { POOL_WORLD, GlobalV::RANK_IN_POOL, GlobalV::NPROC_IN_POOL };
-#else
-        const hsolver::diag_comm_info comm_info = { GlobalV::RANK_IN_POOL, GlobalV::NPROC_IN_POOL };
-#endif
-
-        if (this->method == "lapack")
-        {
-            std::vector<T> Amat_full = pHamilt->matrix();
-            eigenvalue.resize(nk * npairs);
-            if (hermitian) { LR_Util::diag_lapack(nk * npairs, Amat_full.data(), eigenvalue.data()); }
-            else
-            {
-                std::vector<std::complex<double>> eig_complex(nk * npairs);
-                LR_Util::diag_lapack_nh(nk * npairs, Amat_full.data(), eig_complex.data());
-                print_eigs(eig_complex, "Right eigenvalues: of the non-Hermitian matrix: (Ry)");
-                for (int i = 0; i < nk * npairs; i++) { eigenvalue[i] = eig_complex[i].real(); }
-            }
-            psi.fix_kb(0, 0);
-            // copy eigenvectors
-            for (int i = 0;i < psi.size();++i) { psi.get_pointer()[i] = Amat_full[i];
-}
-        }
-        else
-        {
-            // 3. set precondition and diagethr
-            for (int i = 0; i < psi.get_nk() * psi.get_nbasis(); ++i) {
-                precondition[i] = static_cast<Real>(1.0);
-}
-
-            // wrap band-first psi as k1-first psi_k1_dav
-            psi::Psi<T> psi_k1_dav = LR_Util::bfirst_to_k1_wrapper(psi);
-            assert(psi_k1_dav.get_nbands() == psi.get_nbands());
-            assert(psi_k1_dav.get_nbasis() == psi.get_nbasis() * psi.get_nk());
-
-            const int david_maxiter = hsolver::DiagoIterAssist<T, Device>::PW_DIAG_NMAX;
-
-            if (this->method == "dav")
-            {
-                // Allow 5 tries at most. If ntry > ntry_max = 5, exit diag loop.
-                const int ntry_max = 5;
-                // In non-self consistent calculation, do until totally converged. Else allow 5 eigenvecs to be NOT
-                // converged.
-                const int notconv_max = ("nscf" == PARAM.inp.calculation) ? 0 : 5;
-                // do diag and add davidson iteration counts up to avg_iter
-
-                auto hpsi_func = [pHamilt](
-                    T* psi_in,
-                    T* hpsi_out,
-                    const int ld_psi,
-                    const int nvec)
-                    {
-                        auto psi_iter_wrapper = psi::Psi<T, Device>(psi_in, 1, nvec, ld_psi, nullptr);
-                        psi::Range bands_range(true, 0, 0, nvec-1);
-                        using hpsi_info = typename hamilt::Operator<T, Device>::hpsi_info;
-                        hpsi_info info(&psi_iter_wrapper, bands_range, hpsi_out);
-                        pHamilt->ops->hPsi(info);
-                    };
-                auto spsi_func = [pHamilt](const T* psi_in, T* spsi_out,
-                               const int ld_psi, const int nbands)
-                {
-                    // sPsi determines S=I or not by PARAM.globalv.use_uspp inside
-                    pHamilt->sPsi(psi_in, spsi_out, ld_psi, ld_psi, nbands);
-                };
-
-                const int& dim = psi_k1_dav.get_nbasis();   //equals to leading dimension here
-                const int& nband = psi_k1_dav.get_nbands();
-                hsolver::DiagoDavid<T, Device> david(precondition.data(), nband, dim, PARAM.inp.pw_diag_ndim, PARAM.inp.use_paw, comm_info);
-                hsolver::DiagoIterAssist<T, Device>::avg_iter += static_cast<double>(david.diag(hpsi_func, spsi_func,
-                    dim, psi_k1_dav.get_pointer(), eigenvalue.data(), this->diag_ethr, david_maxiter, ntry_max, 0));
-            }
-            else if (this->method == "dav_subspace") //need refactor
-            {
-                hsolver::Diago_DavSubspace<T, Device> dav_subspace(precondition,
-                    psi_k1_dav.get_nbands(),
-                    psi_k1_dav.get_nbasis(),
-                    PARAM.inp.pw_diag_ndim,
-                    this->diag_ethr,
-                    david_maxiter,
-                    false, //always do the subspace diag (check the implementation)
-                    comm_info);
-
-                auto hpsi_func = [pHamilt](
-                    T* psi_in,
-                    T* hpsi_out,
-                    const int ld_psi,
-                    const int nvec)
-                    {
-                        auto psi_iter_wrapper = psi::Psi<T, Device>(psi_in, 1, nvec, ld_psi, nullptr);
-                        psi::Range bands_range(true, 0, 0, nvec-1);
-                        using hpsi_info = typename hamilt::Operator<T, Device>::hpsi_info;
-                        hpsi_info info(&psi_iter_wrapper, bands_range, hpsi_out);
-                        pHamilt->ops->hPsi(info);
-                    };
-                auto subspace_func = [pHamilt](T* psi_out,
-                    T* psi_in,
-                    Real* eigenvalue_in_hsolver,
-                    const int nband_in,
-                    const int nbasis_max_in) {
-                        // Convert "pointer data stucture" to a psi::Psi object
-                        auto psi_in_wrapper = psi::Psi<T, Device>(psi_in, 1, nband_in, nbasis_max_in, nullptr);
-                        auto psi_out_wrapper = psi::Psi<T, Device>(psi_out, 1, nband_in, nbasis_max_in, nullptr);
-
-                        hsolver::DiagoIterAssist<T, Device>::diagH_subspace(pHamilt,
-                            psi_in_wrapper,
-                            psi_out_wrapper,
-                            eigenvalue_in_hsolver,
-                            nband_in);
-                    };
-
-                std::vector<double> ethr_band(psi_k1_dav.get_nbands(), this->diag_ethr);
-                hsolver::DiagoIterAssist<T, Device>::avg_iter
-                    += static_cast<double>(dav_subspace.diag(
-                        hpsi_func, psi_k1_dav.get_pointer(),
-                        psi_k1_dav.get_nbasis(),
-                        eigenvalue.data(),
-                        ethr_band.data(),
-                        false /*scf*/));
-            }
-            else {throw std::runtime_error("HSolverLR::solve: method not implemented");}
-        }
-
-        // 5. copy eigenvalue to pes
-        for (int ist = 0;ist < psi.get_nbands();++ist) { pes->ekb(ispin_solve, ist) = eigenvalue[ist];}
-
-
-        // 6. output eigenvalues and eigenvectors
-        print_eigs(eigenvalue, "eigenvalues: (Ry)");
-        print_eigs(eigenvalue, "eigenvalues: (eV)", ModuleBase::Ry_to_eV);
-        if (out_wfc_lr)
-        {
-            if (GlobalV::MY_RANK == 0)
-            {
-                std::ofstream ofs(PARAM.globalv.global_out_dir + "Excitation_Energy_" + spin_types[ispin_solve] + ".dat");
-                ofs << std::setprecision(8) << std::scientific;
-                for (auto& e : eigenvalue) {ofs << e << " ";}
-                ofs.close();
-            }
-            LR_Util::write_psi_bandfirst(psi, PARAM.globalv.global_out_dir + "Excitation_Amplitude_" + spin_types[ispin_solve], GlobalV::MY_RANK);
-        }
-
-        // normalization is already satisfied
-        // std::cout << "check normalization of eigenvectors:" << std::endl;
-        // for (int ist = 0;ist < psi.get_nbands();++ist)
-        // {
-        //     double norm2 = 0;
-        //     for (int ik = 0;ik < psi.get_nk();++ik)
-        //     {
-        //         for (int ib = 0;ib < psi.get_nbasis();++ib)
-        //         {
-        //             norm2 += square(psi(ist, ik, ib));
-        //             // std::cout << "norm2_now=" << norm2 << std::endl;
-        //         }
-        //     }
-        //     std::cout << "state " << ist << ", norm2=" << norm2 << std::endl;
-        // }
-
-        // output iters
-        std::cout << "Average iterative diagonalization steps: " << hsolver::DiagoIterAssist<T, Device>::avg_iter
-            << " ; where current threshold is: " << hsolver::DiagoIterAssist<T, Device>::PW_DIAG_THR << " . " << std::endl;
-        // castmem_2d_2h_op()(cpu_ctx, cpu_ctx, pes->ekb.c, eigenvalues.data(), pes->ekb.nr * pes->ekb.nc);
-    }
-    template class HSolverLR<double>;
-    template class HSolverLR<std::complex<double>>;
-};
\ No newline at end of file
diff --git a/source/module_lr/hsolver_lrtd.h b/source/module_lr/hsolver_lrtd.h
deleted file mode 100644
index 7248a5a4f1..0000000000
--- a/source/module_lr/hsolver_lrtd.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#pragma once
-#include "module_hsolver/hsolver.h"
-#include "module_hsolver/diago_iter_assist.h"
-#include "module_psi/psi.h"
-namespace LR
-{
-    template<typename T, typename Device = base_device::DEVICE_CPU>
-    class HSolverLR
-    {
-        using Real = typename GetTypeReal<T>::type;
-        const int& nk;
-        const int& npairs;
-        const int& ispin_solve;
-        const bool out_wfc_lr = false;
-    public:
-        HSolverLR(const int& nk_in, const int& npairs_in, const int& ispin_solve_in = 0, const bool& out_wfc_lr_in = false)
-            :nk(nk_in), npairs(npairs_in), out_wfc_lr(out_wfc_lr_in), ispin_solve(ispin_solve_in) {};
-        Real set_diagethr(Real diag_ethr_in, const int istep, const int iter, const Real ethr)
-        {
-            this->diag_ethr = ethr;
-            return ethr;
-        }
-        void solve(hamilt::Hamilt<T, Device>* pHamilt,
-            psi::Psi<T, Device>& psi,
-            elecstate::ElecState* pes,
-            const std::string method_in,
-            const bool hermitian = true);
-
-        Real diag_ethr = 0.0; // threshold for diagonalization
-
-      private:
-        std::string method = "none";
-    };
-};
\ No newline at end of file
diff --git a/source/module_lr/hsolver_lrtd.hpp b/source/module_lr/hsolver_lrtd.hpp
new file mode 100644
index 0000000000..107621fa08
--- /dev/null
+++ b/source/module_lr/hsolver_lrtd.hpp
@@ -0,0 +1,139 @@
+#pragma once
+#include "module_parameter/parameter.h"
+#include "module_hsolver/diago_david.h"
+#include "module_hsolver/diago_dav_subspace.h"
+#include "module_hsolver/diago_cg.h"
+#include "module_hsolver/diago_iter_assist.h"
+#include "module_lr/utils/lr_util.h"
+#include "module_lr/utils/lr_util_print.h"
+
+namespace LR
+{
+    template<typename T> using Real = typename GetTypeReal<T>::type;
+
+    namespace HSolver
+    {
+        template<typename T>
+        inline void print_eigs(const std::vector<T>& eigs, const std::string& label = "", const double factor = 1.0)
+        {
+            std::cout << label << std::endl;
+            for (auto& e : eigs) { std::cout << e * factor << " "; }
+            std::cout << std::endl;
+        }
+
+        /// eigensolver for common Hamilt
+        template<typename T, typename THamilt>
+        void solve(const THamilt& hm,
+            T* psi,
+            const int& dim, ///< local leading dimension (or nbasis)
+            const int& nband,   ///< nstates in LR-TDDFT, not (nocc+nvirt)
+            double* eig,
+            const std::string method,
+            const Real<T>& diag_ethr, ///< threshold for diagonalization
+            const bool hermitian = true)
+        {
+            ModuleBase::TITLE("HSolverLR", "solve");
+            const std::vector<std::string> spin_types = { "singlet", "triplet" };
+            // note: if not TDA, the eigenvalues will be complex
+            // then we will need a new constructor of DiagoDavid
+
+            // 1. allocate precondition and eigenvalue
+            std::vector<Real<T>> precondition(dim);
+            std::vector<Real<T>> eigenvalue(nband);   //nstates
+            // 2. select the method
+#ifdef __MPI
+            const hsolver::diag_comm_info comm_info = { POOL_WORLD, GlobalV::RANK_IN_POOL, GlobalV::NPROC_IN_POOL };
+#else
+            const hsolver::diag_comm_info comm_info = { GlobalV::RANK_IN_POOL, GlobalV::NPROC_IN_POOL };
+#endif
+
+            if (method == "lapack")
+            {
+                std::vector<T> Amat_full = hm.matrix();
+                const int gdim = std::sqrt(Amat_full.size());
+                eigenvalue.resize(gdim);
+                if (hermitian) { LR_Util::diag_lapack(gdim, Amat_full.data(), eigenvalue.data()); }
+                else
+                {
+                    std::vector<std::complex<double>> eig_complex(gdim);
+                    LR_Util::diag_lapack_nh(gdim, Amat_full.data(), eig_complex.data());
+                    print_eigs(eig_complex, "Right eigenvalues: of the non-Hermitian matrix: (Ry)");
+                    for (int i = 0; i < gdim; i++) { eigenvalue[i] = eig_complex[i].real(); }
+                }
+                // copy eigenvectors
+                hm.global2local(psi, Amat_full.data(), nband);
+            }
+            else
+            {
+                // 3. set precondition and diagethr
+                for (int i = 0; i < dim; ++i) { precondition[i] = static_cast<Real<T>>(1.0); }
+
+                const int david_maxiter = hsolver::DiagoIterAssist<T>::PW_DIAG_NMAX;
+
+                auto hpsi_func = [&hm](T* psi_in, T* hpsi, const int ld_psi, const int nvec) {hm.hPsi(psi_in, hpsi, ld_psi, nvec);};
+                auto spsi_func = [&hm](const T* psi_in, T* spsi, const int ld_psi, const int nvec)
+                    { std::memcpy(spsi, psi_in, sizeof(T) * ld_psi * nvec); };
+
+                if (method == "dav")
+                {
+                    // Allow 5 tries at most. If ntry > ntry_max = 5, exit diag loop.
+                    const int ntry_max = 5;
+                    // In non-self consistent calculation, do until totally converged. Else allow 5 eigenvecs to be NOT
+                    // converged.
+                    const int notconv_max = ("nscf" == PARAM.inp.calculation) ? 0 : 5;
+                    // do diag and add davidson iteration counts up to avg_iter
+                    hsolver::DiagoDavid<T> david(precondition.data(), nband, dim, PARAM.inp.pw_diag_ndim, PARAM.inp.use_paw, comm_info);
+                    hsolver::DiagoIterAssist<T>::avg_iter += static_cast<double>(david.diag(hpsi_func, spsi_func,
+                        dim, psi, eigenvalue.data(), diag_ethr, david_maxiter, ntry_max, 0));
+                }
+                else if (method == "dav_subspace") //need refactor
+                {
+                    hsolver::Diago_DavSubspace<T> dav_subspace(precondition,
+                        nband,
+                        dim,
+                        PARAM.inp.pw_diag_ndim,
+                        diag_ethr,
+                        david_maxiter,
+                        false, //always do the subspace diag (check the implementation)
+                        comm_info);
+                    std::vector<double> ethr_band(nband, diag_ethr);
+                    hsolver::DiagoIterAssist<T>::avg_iter
+                        += static_cast<double>(dav_subspace.diag(
+                            hpsi_func, psi,
+                            dim,
+                            eigenvalue.data(),
+                            ethr_band.data(),
+                            false /*scf*/));
+                }
+                else { throw std::runtime_error("HSolverLR::solve: method not implemented"); }
+            }
+
+            // 5. copy eigenvalues
+            for (int ist = 0;ist < nband;++ist) { eig[ist] = eigenvalue[ist]; }
+
+            // 6. output eigenvalues and eigenvectors
+            print_eigs(eigenvalue, "eigenvalues: (Ry)");
+            print_eigs(eigenvalue, "eigenvalues: (eV)", ModuleBase::Ry_to_eV);
+
+            // normalization is already satisfied
+            // std::cout << "check normalization of eigenvectors:" << std::endl;
+            // for (int ist = 0;ist < nband;++ist)
+            // {
+            //     double norm2 = 0;
+            //     for (int ik = 0;ik < psi.get_nk();++ik)
+            //     {
+            //         for (int ib = 0;ib < psi.get_nbasis();++ib)
+            //         {
+            //             norm2 += std::norm(psi(ist, ik, ib));
+            //             // std::cout << "norm2_now=" << norm2 << std::endl;
+            //         }
+            //     }
+            //     std::cout << "state " << ist << ", norm2=" << norm2 << std::endl;
+            // }
+
+            // output iters
+            std::cout << "Average iterative diagonalization steps: " << hsolver::DiagoIterAssist<T>::avg_iter
+                << " ; where current threshold is: " << hsolver::DiagoIterAssist<T>::PW_DIAG_THR << " . " << std::endl;
+        }
+    }
+}
\ No newline at end of file
diff --git a/source/module_lr/lr_spectrum.cpp b/source/module_lr/lr_spectrum.cpp
index d889817cfb..5cd9acfdb3 100644
--- a/source/module_lr/lr_spectrum.cpp
+++ b/source/module_lr/lr_spectrum.cpp
@@ -7,11 +7,9 @@
 #include "module_lr/utils/lr_util_hcontainer.h"
 #include "module_lr/utils/lr_util_print.h"
 template<typename T>
-void LR::LR_Spectrum<T>::cal_gint_rho(double** rho, const int& nspin_solve, const int& nrxx)
+void LR::LR_Spectrum<T>::cal_gint_rho(double** rho, const int& nrxx)
 {
-    for (int is = 0;is < nspin_solve;++is) {
-        ModuleBase::GlobalFunc::ZEROS(rho[is], nrxx);
-}
+    ModuleBase::GlobalFunc::ZEROS(rho[0], nrxx);
     Gint_inout inout_rho(rho, Gint_Tools::job_type::rho, 1, false);
     this->gint->cal_gint(&inout_rho);
 }
@@ -26,53 +24,52 @@ inline void check_sum_rule(const double& osc_tot)
 }
 
 template<>
-void LR::LR_Spectrum<double>::oscillator_strength()
+void LR::LR_Spectrum<double>::oscillator_strength(Grid_Driver& gd, const std::vector<double>& orb_cutoff)
 {
     ModuleBase::TITLE("LR::LR_Spectrum", "oscillator_strength");
     std::vector<double>& osc = this->oscillator_strength_;  // unit: Ry
-    osc.resize(X.get_nbands(), 0.0);
-    // const int nspin0 = (this->nspin == 2) ? 2 : 1;   use this in NSPIN=4 implementation
+    osc.resize(nstate, 0.0);
     double osc_tot = 0.0;
     elecstate::DensityMatrix<double, double> DM_trans(&this->pmat, 1, this->kv.kvec_d, this->nk);
-    DM_trans.init_DMR(&GlobalC::GridD, &this->ucell);
-    this->transition_dipole_.resize(X.get_nbands(), ModuleBase::Vector3<double>(0.0, 0.0, 0.0));
-    for (int istate = 0;istate < X.get_nbands();++istate)
+    LR_Util::initialize_DMR(DM_trans, this->pmat, this->ucell, gd, orb_cutoff);
+    this->transition_dipole_.resize(nstate, ModuleBase::Vector3<double>(0.0, 0.0, 0.0));
+    for (int istate = 0;istate < nstate;++istate)
     {
-        X.fix_b(istate);
-
-        // LR_Util::print_psi_bandfirst(X, "final X", istate);
-
-        //1. transition density 
+        const int offset_b = istate * ldim;    //start index of band istate
+        for (int is = 0;is < this->nspin_x;++is)
+        {
+            const int offset_x = offset_b + is * nk * pX[0].get_local_size();
+            //1. transition density 
 #ifdef __MPI
-        std::vector<container::Tensor>  dm_trans_2d = cal_dm_trans_pblas(X, this->pX, this->psi_ks, this->pc, this->naos, this->nocc, this->nvirt, this->pmat);
-        // if (this->tdm_sym) for (auto& t : dm_trans_2d) LR_Util::matsym(t.data<T>(), naos, pmat);
+            std::vector<container::Tensor>  dm_trans_2d = cal_dm_trans_pblas(X + offset_x, this->pX[is], psi_ks[is], this->pc, this->naos, this->nocc[is], this->nvirt[is], this->pmat);
+            // if (this->tdm_sym) for (auto& t : dm_trans_2d) LR_Util::matsym(t.data<T>(), naos, pmat);
 #else
-        std::vector<container::Tensor>  dm_trans_2d = cal_dm_trans_blas(X, this->psi_ks, this->nocc, this->nvirt);
-        // if (this->tdm_sym) for (auto& t : dm_trans_2d) LR_Util::matsym(t.data<T>(), naos);
+            std::vector<container::Tensor>  dm_trans_2d = cal_dm_trans_blas(X + offset_x, this->psi_ks[is], this->nocc[is], this->nvirt[is]);
+            // if (this->tdm_sym) for (auto& t : dm_trans_2d) LR_Util::matsym(t.data<T>(), naos);
 #endif
-        for (int ik = 0;ik < this->nk;++ik) { DM_trans.set_DMK_pointer(ik, dm_trans_2d[ik].data<double>()); }
-        DM_trans.cal_DMR();
-        this->gint->transfer_DM2DtoGrid(DM_trans.get_DMR_vector());
+            for (int ik = 0;ik < this->nk;++ik) { DM_trans.set_DMK_pointer(ik, dm_trans_2d[ik].data<double>()); }
+            DM_trans.cal_DMR();
+            this->gint->transfer_DM2DtoGrid(DM_trans.get_DMR_vector());
 
-        // 2. transition density
-        double** rho_trans;
-        // LR_Util::new_p2(rho_trans, nspin_solve, this->rho_basis.nrxx);
-        LR_Util::new_p2(rho_trans, nspin, this->rho_basis.nrxx); // currently gint_kernel_rho uses PARAM.inp.nspin, it needs refactor
-        this->cal_gint_rho(rho_trans, nspin_solve, this->rho_basis.nrxx);
+            // 2. transition density
+            double** rho_trans;
+            LR_Util::new_p2(rho_trans, 1, this->rho_basis.nrxx);
+            this->cal_gint_rho(rho_trans, this->rho_basis.nrxx);
 
-        // 3. transition dipole moment
-        for (int ir = 0; ir < rho_basis.nrxx; ++ir)
-        {
-            int i = ir / (rho_basis.ny * rho_basis.nplane);
-            int j = ir / rho_basis.nplane - i * rho_basis.ny;
-            int k = ir % rho_basis.nplane + rho_basis.startz_current;
-            ModuleBase::Vector3<double> rd(static_cast<double>(i) / rho_basis.nx, static_cast<double>(j) / rho_basis.ny, static_cast<double>(k) / rho_basis.nz);  //+1/2 better?
-            rd -= ModuleBase::Vector3<double>(0.5, 0.5, 0.5);   //shift to the center of the grid (need ?)
-            ModuleBase::Vector3<double> rc = rd * ucell.latvec * ucell.lat0; // real coordinate
-            for (int is = 0;is < nspin_solve;++is) transition_dipole_[istate] += rc * rho_trans[is][ir];
+            // 3. transition dipole moment
+            for (int ir = 0; ir < rho_basis.nrxx; ++ir)
+            {
+                int i = ir / (rho_basis.ny * rho_basis.nplane);
+                int j = ir / rho_basis.nplane - i * rho_basis.ny;
+                int k = ir % rho_basis.nplane + rho_basis.startz_current;
+                ModuleBase::Vector3<double> rd(static_cast<double>(i) / rho_basis.nx, static_cast<double>(j) / rho_basis.ny, static_cast<double>(k) / rho_basis.nz);  //+1/2 better?
+                rd -= ModuleBase::Vector3<double>(0.5, 0.5, 0.5);   //shift to the center of the grid (need ?)
+                ModuleBase::Vector3<double> rc = rd * ucell.latvec * ucell.lat0; // real coordinate
+                transition_dipole_[istate] += rc * rho_trans[0][ir];
+            }
+            LR_Util::delete_p2(rho_trans, 1);
         }
         transition_dipole_[istate] *= (ucell.omega / static_cast<double>(gint->get_ncxyz()));   // dv
-        LR_Util::delete_p2(rho_trans, nspin_solve);
         Parallel_Reduce::reduce_all(transition_dipole_[istate].x);
         Parallel_Reduce::reduce_all(transition_dipole_[istate].y);
         Parallel_Reduce::reduce_all(transition_dipole_[istate].z);
@@ -83,73 +80,70 @@ void LR::LR_Spectrum<double>::oscillator_strength()
 }
 
 template<>
-void LR::LR_Spectrum<std::complex<double>>::oscillator_strength()
+void LR::LR_Spectrum<std::complex<double>>::oscillator_strength(Grid_Driver& gd, const std::vector<double>& orb_cutoff)
 {
     ModuleBase::TITLE("LR::LR_Spectrum", "oscillator_strength");
     std::vector<double>& osc = this->oscillator_strength_;  // unit: Ry
-    osc.resize(X.get_nbands(), 0.0);
-    // const int nspin0 = (this->nspin == 2) ? 2 : 1;   use this in NSPIN=4 implementation
+    osc.resize(nstate, 0.0);
     double osc_tot = 0.0;
     elecstate::DensityMatrix<std::complex<double>, std::complex<double>> DM_trans(&this->pmat, 1, this->kv.kvec_d, this->nk);
-    DM_trans.init_DMR(&GlobalC::GridD, &this->ucell);
+    LR_Util::initialize_DMR(DM_trans, this->pmat, this->ucell, gd, orb_cutoff);
     elecstate::DensityMatrix<std::complex<double>, double> DM_trans_real_imag(&this->pmat, 1, this->kv.kvec_d, this->nk);
-    DM_trans_real_imag.init_DMR(&GlobalC::GridD, &this->ucell);
+    LR_Util::initialize_DMR(DM_trans_real_imag, this->pmat, this->ucell, gd, orb_cutoff);
 
-    this->transition_dipole_.resize(X.get_nbands(), ModuleBase::Vector3<std::complex<double>>(0.0, 0.0, 0.0));
-    for (int istate = 0;istate < X.get_nbands();++istate)
+    this->transition_dipole_.resize(nstate, ModuleBase::Vector3<std::complex<double>>(0.0, 0.0, 0.0));
+    for (int istate = 0;istate < nstate;++istate)
     {
-        X.fix_b(istate);
-        // LR_Util::print_psi_bandfirst(X, "final X", istate);
-
-        //1. transition density 
+        const int offset_b = istate * ldim;    //start index of band istate
+        for (int is = 0;is < this->nspin_x;++is)
+        {
+            const int offset_x = offset_b + is * nk * pX[0].get_local_size();
+            //1. transition density 
 #ifdef __MPI
-        std::vector<container::Tensor>  dm_trans_2d = cal_dm_trans_pblas(X, this->pX, this->psi_ks, this->pc, this->naos, this->nocc, this->nvirt, this->pmat, /*renorm_k=*/false, this->nspin_solve);
-        // if (this->tdm_sym) for (auto& t : dm_trans_2d) LR_Util::matsym(t.data<T>(), naos, pmat);
+            std::vector<container::Tensor>  dm_trans_2d = cal_dm_trans_pblas(X + offset_x, this->pX[is], psi_ks[is], this->pc, this->naos, this->nocc[is], this->nvirt[is], this->pmat, /*renorm_k=*/false, 1);
+            // if (this->tdm_sym) for (auto& t : dm_trans_2d) LR_Util::matsym(t.data<T>(), naos, pmat);
 #else
-        std::vector<container::Tensor>  dm_trans_2d = cal_dm_trans_blas(X, this->psi_ks, this->nocc, this->nvirt,/*renorm_k=*/false, this->nspin_solve);
-        // if (this->tdm_sym) for (auto& t : dm_trans_2d) LR_Util::matsym(t.data<T>(), naos);
+            std::vector<container::Tensor>  dm_trans_2d = cal_dm_trans_blas(X + offset_x, psi_ks[is], this->nocc[is], this->nvirt[is],/*renorm_k=*/false, 1);
+            // if (this->tdm_sym) for (auto& t : dm_trans_2d) LR_Util::matsym(t.data<T>(), naos);
 #endif
-        for (int ik = 0;ik < this->nk;++ik) { DM_trans.set_DMK_pointer(ik, dm_trans_2d[ik].data<std::complex<double>>()); }
-        // for (int ik = 0;ik < this->nk;++ik)
-        //     LR_Util::print_tensor<std::complex<double>>(dm_trans_2d[ik], "1.DMK[ik=" + std::to_string(ik) + "]", dynamic_cast<Parallel_2D*>(&this->pmat));
-
-        DM_trans.cal_DMR();
-
-        // 2. transition density
-        double** rho_trans_real;
-        double** rho_trans_imag;
-        LR_Util::new_p2(rho_trans_real, nspin_solve, this->rho_basis.nrxx);
-        LR_Util::new_p2(rho_trans_imag, nspin_solve, this->rho_basis.nrxx);
-        // real part
-        LR_Util::get_DMR_real_imag_part(DM_trans, DM_trans_real_imag, ucell.nat, 'R');
-        this->gint->transfer_DM2DtoGrid(DM_trans_real_imag.get_DMR_vector());
-        this->cal_gint_rho(rho_trans_real, nspin_solve, this->rho_basis.nrxx);
-        // LR_Util::print_grid_nonzero(rho_trans_real[0], this->rho_basis.nrxx, 10, "rho_trans");
-
-        // imag part
-        LR_Util::get_DMR_real_imag_part(DM_trans, DM_trans_real_imag, ucell.nat, 'I');
-        this->gint->transfer_DM2DtoGrid(DM_trans_real_imag.get_DMR_vector());
-        this->cal_gint_rho(rho_trans_imag, nspin_solve, this->rho_basis.nrxx);
-        // LR_Util::print_grid_nonzero(rho_trans_imag[0], this->rho_basis.nrxx, 10, "rho_trans");
-
-
-        // 3. transition dipole moment
-        for (int ir = 0; ir < rho_basis.nrxx; ++ir)
-        {
-            int i = ir / (rho_basis.ny * rho_basis.nplane);
-            int j = ir / rho_basis.nplane - i * rho_basis.ny;
-            int k = ir % rho_basis.nplane + rho_basis.startz_current;
-            ModuleBase::Vector3<double> rd(static_cast<double>(i) / rho_basis.nx, static_cast<double>(j) / rho_basis.ny, static_cast<double>(k) / rho_basis.nz);  //+1/2 better?
-            rd -= ModuleBase::Vector3<double>(0.5, 0.5, 0.5);   //shift to the center of the grid (need ?)
-            ModuleBase::Vector3<double> rc = rd * ucell.latvec * ucell.lat0; // real coordinate
-            ModuleBase::Vector3<std::complex<double>> rc_complex(rc.x, rc.y, rc.z);
-            for (int is = 0;is < nspin_solve;++is)
-                transition_dipole_[istate] += rc_complex *
-                std::complex<double>(rho_trans_real[is][ir], rho_trans_imag[is][ir]);
+            for (int ik = 0;ik < this->nk;++ik) { DM_trans.set_DMK_pointer(ik, dm_trans_2d[ik].data<std::complex<double>>()); }
+            // for (int ik = 0;ik < this->nk;++ik)
+            //     LR_Util::print_tensor<std::complex<double>>(dm_trans_2d[ik], "1.DMK[ik=" + std::to_string(ik) + "]", dynamic_cast<Parallel_2D*>(&this->pmat));
+            DM_trans.cal_DMR();
+
+            // 2. transition density
+            double** rho_trans_real;
+            double** rho_trans_imag;
+            LR_Util::new_p2(rho_trans_real, 1, this->rho_basis.nrxx);
+            LR_Util::new_p2(rho_trans_imag, 1, this->rho_basis.nrxx);
+            // real part
+            LR_Util::get_DMR_real_imag_part(DM_trans, DM_trans_real_imag, ucell.nat, 'R');
+            this->gint->transfer_DM2DtoGrid(DM_trans_real_imag.get_DMR_vector());
+            this->cal_gint_rho(rho_trans_real, this->rho_basis.nrxx);
+            // LR_Util::print_grid_nonzero(rho_trans_real[0], this->rho_basis.nrxx, 10, "rho_trans");
+
+            // imag part
+            LR_Util::get_DMR_real_imag_part(DM_trans, DM_trans_real_imag, ucell.nat, 'I');
+            this->gint->transfer_DM2DtoGrid(DM_trans_real_imag.get_DMR_vector());
+            this->cal_gint_rho(rho_trans_imag, this->rho_basis.nrxx);
+            // LR_Util::print_grid_nonzero(rho_trans_imag[0], this->rho_basis.nrxx, 10, "rho_trans");
+
+            // 3. transition dipole moment
+            for (int ir = 0; ir < rho_basis.nrxx; ++ir)
+            {
+                int i = ir / (rho_basis.ny * rho_basis.nplane);
+                int j = ir / rho_basis.nplane - i * rho_basis.ny;
+                int k = ir % rho_basis.nplane + rho_basis.startz_current;
+                ModuleBase::Vector3<double> rd(static_cast<double>(i) / rho_basis.nx, static_cast<double>(j) / rho_basis.ny, static_cast<double>(k) / rho_basis.nz);  //+1/2 better?
+                rd -= ModuleBase::Vector3<double>(0.5, 0.5, 0.5);   //shift to the center of the grid (need ?)
+                ModuleBase::Vector3<double> rc = rd * ucell.latvec * ucell.lat0; // real coordinate
+                ModuleBase::Vector3<std::complex<double>> rc_complex(rc.x, rc.y, rc.z);
+                transition_dipole_[istate] += rc_complex * std::complex<double>(rho_trans_real[0][ir], rho_trans_imag[0][ir]);
+            }
+            LR_Util::delete_p2(rho_trans_real, 1);
+            LR_Util::delete_p2(rho_trans_imag, 1);
         }
         transition_dipole_[istate] *= (ucell.omega / static_cast<double>(gint->get_ncxyz()));   // dv
-        LR_Util::delete_p2(rho_trans_real, nspin_solve);
-        LR_Util::delete_p2(rho_trans_imag, nspin_solve);
         Parallel_Reduce::reduce_all(transition_dipole_[istate].x);
         Parallel_Reduce::reduce_all(transition_dipole_[istate].y);
         Parallel_Reduce::reduce_all(transition_dipole_[istate].z);
@@ -164,11 +158,11 @@ void LR::LR_Spectrum<std::complex<double>>::oscillator_strength()
     check_sum_rule(osc_tot);
 }
 template<typename T>
-void LR::LR_Spectrum<T>::optical_absorption(const std::vector<double>& freq, const double eta, const int ispin)
+void LR::LR_Spectrum<T>::optical_absorption(const std::vector<double>& freq, const double eta, const std::string& spintype)
 {
     ModuleBase::TITLE("LR::LR_Spectrum", "optical_absorption");
     std::vector<double>& osc = this->oscillator_strength_;
-    std::ofstream ofs(PARAM.globalv.global_out_dir + "absorption_" + this->spin_types[ispin] + ".dat");
+    std::ofstream ofs(PARAM.globalv.global_out_dir + "absorption_" + spintype + ".dat");
     if (GlobalV::MY_RANK == 0) { ofs << "Frequency (eV) | wave length(nm) | Absorption (a.u.)" << std::endl; }
     double FourPI_div_c = ModuleBase::FOUR_PI / 137.036;
     for (int f = 0;f < freq.size();++f)
@@ -182,17 +176,17 @@ void LR::LR_Spectrum<T>::optical_absorption(const std::vector<double>& freq, con
 }
 
 template<typename T>
-void LR::LR_Spectrum<T>::transition_analysis(const int ispin)
+void LR::LR_Spectrum<T>::transition_analysis(const std::string& spintype)
 {
     ModuleBase::TITLE("LR::LR_Spectrum", "transition_analysis");
     std::ofstream& ofs = GlobalV::ofs_running;
     ofs << "==================================================================== " << std::endl;
-    ofs << std::setw(40) << this->spin_types[ispin] << std::endl;
+    ofs << std::setw(40) << spintype << std::endl;
     ofs << "==================================================================== " << std::endl;
     ofs << std::setw(8) << "State" << std::setw(30) << "Excitation Energy (Ry, eV)" <<
         std::setw(45) << "Transition dipole x, y, z (a.u.)" << std::setw(30) << "Oscillator strength(a.u.)" << std::endl;
     ofs << "------------------------------------------------------------------------------------ " << std::endl;
-    for (int istate = 0;istate < X.get_nbands();++istate)
+    for (int istate = 0;istate < nstate;++istate)
         ofs << std::setw(8) << istate << std::setw(15) << std::setprecision(6) << eig[istate] << std::setw(15) << eig[istate] * ModuleBase::Ry_to_eV
         << std::setw(15) << transition_dipole_[istate].x << std::setw(15) << transition_dipole_[istate].y << std::setw(15) << transition_dipole_[istate].z
         << std::setw(30) << oscillator_strength_[istate] << std::endl;
@@ -202,38 +196,54 @@ void LR::LR_Spectrum<T>::transition_analysis(const int ispin)
         << std::setw(30) << "Excitation rate"
         << std::setw(10) << "k-point" << std::endl;
     ofs << "------------------------------------------------------------------------------------ " << std::endl;
-    for (int istate = 0;istate < X.get_nbands();++istate)
+    for (int istate = 0;istate < nstate;++istate)
     {
         /// find the main contributions (> 0.5)
-        X.fix_b(istate);
-        psi::Psi<T> X_full(X.get_nk(), 1, nocc * nvirt, nullptr, false);// one-band
-        X_full.zero_out();
-        for (int ik = 0;ik < X.get_nk();++ik)
+        const int loffset_b = istate * ldim;
+        std::vector<T> X_full(gdim, T(0));// one-band, global
+        for (int is = 0;is < nspin_x;++is)
         {
-            X.fix_k(ik);
-            X_full.fix_k(ik);
+            const int loffset_bs = loffset_b + is * nk * pX[0].get_local_size();
+            const int goffset_s = is * nk * nocc[0] * nvirt[0];
+            for (int ik = 0;ik < nk;++ik)
+            {
+                const int loffset_x = loffset_bs + ik * pX[is].get_local_size();
+                const int goffset_x = goffset_s + ik * nocc[is] * nvirt[is];
 #ifdef __MPI
-            LR_Util::gather_2d_to_full(this->pX, X.get_pointer(), X_full.get_pointer(), false, nvirt, nocc);
+                LR_Util::gather_2d_to_full(this->pX[is], X + loffset_x, X_full.data() + goffset_x, false, nvirt[is], nocc[is]);
 #endif
+            }
         }
         std::map<double, int, std::greater<double>> abs_order;
-        X_full.fix_k(0);
-        for (int i = 0;i < X.get_nk() * nocc * nvirt;++i) { double abs = std::abs(X_full.get_pointer()[i]);if (abs > 0.3) { abs_order[abs] = i; } }
+        for (int i = 0;i < gdim;++i) { double abs = std::abs(X_full.at(i));if (abs > ana_thr) { abs_order[abs] = i; } }
         if (abs_order.size() > 0) {
             for (auto it = abs_order.cbegin();it != abs_order.cend();++it)
             {
-                int ik = it->second / (nocc * nvirt);
-                int ipair = it->second - ik * nocc * nvirt;
+                auto pair_info = get_pair_info(it->second);
+                const int& is = pair_info["ispin"];
+                const std::string s = nspin_x == 2 ? (is == 0 ? "a" : "b") : "";
                 ofs << std::setw(8) << (it == abs_order.cbegin() ? std::to_string(istate) : " ")
-                    << std::setw(20) << ipair / nvirt + 1 << std::setw(20) << ipair % nvirt + nocc + 1// iocc and ivirt
-                    << std::setw(30) << X_full(ik, ipair)
-                    << std::setw(30) << std::norm(X_full(ik, ipair))
-                    << std::setw(10) << ik << std::endl;
+                    << std::setw(20) << std::to_string(pair_info["iocc"] + 1) + s << std::setw(20) << std::to_string(pair_info["ivirt"] + nocc[is] + 1) + s// iocc and ivirt
+                    << std::setw(30) << X_full.at(it->second)
+                    << std::setw(30) << std::norm(X_full.at(it->second))
+                    << std::setw(10) << pair_info["ik"] + 1 << std::endl;
             }
         }
     }
     ofs << "==================================================================== " << std::endl;
-    X.fix_kb(0, 0);
+}
+
+template<typename T>
+std::map<std::string, int> LR::LR_Spectrum<T>::get_pair_info(const int i)
+{
+    assert(i >= 0 && i < gdim);
+    const int dim_spin0 = nk * nocc[0] * nvirt[0];
+    const int ispin = (nspin_x == 2 && i >= dim_spin0) ? 1 : 0;
+    const int ik = (i - ispin*dim_spin0) / (nocc[ispin] * nvirt[ispin]);
+    const int ipair = (i - ispin*dim_spin0) - ik * nocc[ispin] * nvirt[ispin];
+    const int iocc = ipair / nvirt[ispin];
+    const int ivirt = ipair % nvirt[ispin];
+    return  { {"ispin", ispin}, {"ik", ik}, {"iocc", iocc}, {"ivirt", ivirt} };
 }
 
 template class LR::LR_Spectrum<double>;
diff --git a/source/module_lr/lr_spectrum.h b/source/module_lr/lr_spectrum.h
index 673a6b46ef..4b521cd564 100644
--- a/source/module_lr/lr_spectrum.h
+++ b/source/module_lr/lr_spectrum.h
@@ -2,6 +2,7 @@
 #include "module_lr/utils/gint_template.h"
 #include "module_psi/psi.h"
 #include "module_elecstate/module_dm/density_matrix.h"
+#include "module_lr/utils/lr_util.h"
 
 namespace LR
 {
@@ -9,38 +10,50 @@ namespace LR
     class LR_Spectrum
     {
     public:
-        LR_Spectrum(const double* eig, const psi::Psi<T>& X, const int& nspin, const int& naos, const int& nocc, const int& nvirt,
-            typename TGint<T>::type* gint, const ModulePW::PW_Basis& rho_basis, psi::Psi<T>& psi_ks,
-            const UnitCell& ucell, const K_Vectors& kv_in, Parallel_2D& pX_in, Parallel_2D& pc_in, Parallel_Orbitals& pmat_in) :
-            eig(eig), X(X), nspin(nspin), naos(naos), nocc(nocc), nvirt(nvirt), nk(kv_in.get_nks() / nspin),
-            gint(gint), rho_basis(rho_basis), psi_ks(psi_ks),
-            ucell(ucell), kv(kv_in), pX(pX_in), pc(pc_in), pmat(pmat_in) {};
-        /// $$2/3\Omega\sum_{ia\sigma} |\braket{\psi_{i}|\mathbf{r}|\psi_{a}} |^2\int \rho_{\alpha\beta}(\mathbf{r}) \mathbf{r} d\mathbf{r}$$
-        void oscillator_strength();
+        LR_Spectrum(const int& nspin_global, const int& naos, const std::vector<int>& nocc, const std::vector<int>& nvirt,
+            typename TGint<T>::type* gint, const ModulePW::PW_Basis& rho_basis, psi::Psi<T>& psi_ks_in,
+            const UnitCell& ucell, const K_Vectors& kv_in, Grid_Driver& gd, const std::vector<double>& orb_cutoff,
+            const std::vector<Parallel_2D>& pX_in, const Parallel_2D& pc_in, const Parallel_Orbitals& pmat_in,
+            const double* eig, const T* X, const int& nstate, const bool& openshell) :
+            nspin_x(openshell ? 2 : 1), naos(naos), nocc(nocc), nvirt(nvirt), nk(kv_in.get_nks() / nspin_global),
+            gint(gint), rho_basis(rho_basis), ucell(ucell), kv(kv_in),
+            pX(pX_in), pc(pc_in), pmat(pmat_in),
+            eig(eig), X(X), nstate(nstate),
+            ldim(nk* (nspin_x == 2 ? pX_in[0].get_local_size() + pX_in[1].get_local_size() : pX_in[0].get_local_size())),
+            gdim(nk* std::inner_product(nocc.begin(), nocc.end(), nvirt.begin(), 0))
+        {
+            for (int is = 0;is < nspin_global;++is) { psi_ks.emplace_back(LR_Util::get_psi_spin(psi_ks_in, is, nk)); }
+            this->oscillator_strength(gd, orb_cutoff);
+        };
         /// @brief calculate the optical absorption spectrum
-        void optical_absorption(const std::vector<double>& freq, const double eta, const int ispin = 0);
+        void optical_absorption(const std::vector<double>& freq, const double eta, const std::string& spintype);
         /// @brief print out the transition dipole moment and the main contributions to the transition amplitude
-        void transition_analysis(const int ispin = 0);
+        void transition_analysis(const std::string& spintype);
     private:
-        const int& nspin;
-        const int nspin_solve = 1;
-        const int& naos;
-        const int& nocc;
-        const int& nvirt;
+        /// $$2/3\Omega\sum_{ia\sigma} |\braket{\psi_{i}|\mathbf{r}|\psi_{a}} |^2\int \rho_{\alpha\beta}(\mathbf{r}) \mathbf{r} d\mathbf{r}$$
+        void oscillator_strength(Grid_Driver& gd, const std::vector<double>& orb_cutoff);
+        const int nspin_x = 1;   ///< 1 for singlet/triplet, 2 for updown(openshell)
+        const int naos = 1;
+        const std::vector<int>& nocc;
+        const std::vector<int>& nvirt;
         const int nk = 1;
+        const int nstate = 1;
+        const int ldim = 1;///< local leading dimension of X, or the data size of each state
+        const int gdim = 1;///< global leading dimension of X
+        const double ana_thr = 0.3;     ///< {abs(X) > thr} will appear in the transition analysis log
         const double* eig;
-        const psi::Psi<T>& X;
+        const T* X;
         const K_Vectors& kv;
-        const psi::Psi<T>& psi_ks;
-        Parallel_2D& pX;
-        Parallel_2D& pc;
-        Parallel_Orbitals& pmat;
+        std::vector<psi::Psi<T>> psi_ks;
+        const std::vector<Parallel_2D>& pX;
+        const Parallel_2D& pc;
+        const Parallel_Orbitals& pmat;
         typename TGint<T>::type* gint = nullptr;
         const ModulePW::PW_Basis& rho_basis;
         const UnitCell& ucell;
-        const std::vector<std::string> spin_types = { "Spin Singlet", "Spin Triplet" };
 
-        void cal_gint_rho(double** rho, const int& nspin, const int& nrxx);
+        void cal_gint_rho(double** rho, const int& nrxx);
+        std::map<std::string, int> get_pair_info(const int i);  ///< given the index in X, return its ispin, ik, iocc, ivirt
 
         std::vector<ModuleBase::Vector3<T>> transition_dipole_;   ///< $\braket{ \psi_{i} | \mathbf{r} | \psi_{a} }$
         std::vector<double> oscillator_strength_;///< $2/3\Omega |\sum_{ia\sigma} \braket{\psi_{i}|\mathbf{r}|\psi_{a}} |^2$
diff --git a/source/module_lr/operator_casida/operator_lr_diag.h b/source/module_lr/operator_casida/operator_lr_diag.h
index fb2910c1e7..b18343e04a 100644
--- a/source/module_lr/operator_casida/operator_lr_diag.h
+++ b/source/module_lr/operator_casida/operator_lr_diag.h
@@ -12,50 +12,48 @@ namespace LR
     class OperatorLRDiag : public hamilt::Operator<T, Device>
     {
     public:
-        OperatorLRDiag(const ModuleBase::matrix& eig_ks_in, const Parallel_2D* pX_in, const int& nk_in, const int& nocc_in, const int& nvirt_in)
-            : eig_ks(eig_ks_in), pX(pX_in), nk(nk_in), nocc(nocc_in), nvirt(nvirt_in)
+        OperatorLRDiag(const double* eig_ks, const Parallel_2D& pX_in, const int& nk_in, const int& nocc_in, const int& nvirt_in)
+            : pX(pX_in), nk(nk_in), nocc(nocc_in), nvirt(nvirt_in)
         {   // calculate the difference of eigenvalues
             ModuleBase::TITLE("OperatorLRDiag", "OperatorLRDiag");
-#ifdef __MPI
-            Parallel_Common::bcast_double(eig_ks.c, eig_ks.nr * eig_ks.nc);
-#endif
-            this->act_type = 2;
+            const int nbands = nocc + nvirt;
             this->cal_type = hamilt::calculation_type::no;
-            this->eig_ks_diff.create(nk, pX->get_local_size(), false);
+            this->eig_ks_diff.create(nk, pX.get_local_size(), false);
             for (int ik = 0;ik < nk;++ik)
-                for (int io = 0;io < pX->get_col_size();++io)    //nocc_local
-                    for (int iv = 0;iv < pX->get_row_size();++iv)    //nvirt_local
+            {
+                const int& istart = ik * nbands;
+                for (int io = 0;io < pX.get_col_size();++io)    //nocc_local
+                {
+                    for (int iv = 0;iv < pX.get_row_size();++iv)    //nvirt_local
                     {
-                        int io_g = pX->local2global_col(io);
-                        int iv_g = pX->local2global_row(iv);
-                        this->eig_ks_diff(ik, io * pX->get_row_size() + iv) = eig_ks(ik, nocc + iv_g) - eig_ks(ik, io_g);
+                        int io_g = pX.local2global_col(io);
+                        int iv_g = pX.local2global_row(iv);
+                        this->eig_ks_diff(ik, io * pX.get_row_size() + iv) = eig_ks[istart + nocc + iv_g] - eig_ks[istart + io_g];
                     }
+                }
+            }
         };
         void init(const int ik_in) override {};
 
         /// caution: put this operator at the head of the operator list,
         /// because vector_mul_vector_op directly assign to (rather than add on) psi_out.
-        virtual void act(const psi::Psi<T>& psi_in, psi::Psi<T>& psi_out, const int nbands) const override
+        virtual void  act(const int nbands,
+            const int nbasis,
+            const int npol,
+            const T* psi_in,
+            T* hpsi,
+            const int ngk_ik = 0)const override
         {
             ModuleBase::TITLE("OperatorLRDiag", "act");
-            assert(nbands <= psi_in.get_nbands());
-
-            psi::Psi<T> psi_in_bfirst = LR_Util::k1_to_bfirst_wrapper(psi_in, this->nk, this->pX->get_local_size());
-            psi::Psi<T> psi_out_bfirst = LR_Util::k1_to_bfirst_wrapper(psi_out, this->nk, this->pX->get_local_size());
-            for (int ib = 0;ib < nbands;++ib)
-            {
-                psi_in_bfirst.fix_b(ib);
-                psi_out_bfirst.fix_b(ib);
-                hsolver::vector_mul_vector_op<T, Device>()(this->ctx,
-                    psi_in_bfirst.get_nk() * psi_in_bfirst.get_nbasis(),
-                    psi_out_bfirst.get_pointer(),
-                    psi_in_bfirst.get_pointer(),
-                    this->eig_ks_diff.c);
-            }
+            const int nlocal_ph = nk * pX.get_local_size();   // local size of particle-hole basis
+            hsolver::vector_mul_vector_op<T, Device>()(this->ctx,
+                nk * pX.get_local_size(),
+                hpsi,
+                psi_in,
+                this->eig_ks_diff.c);
         }
     private:
-        const ModuleBase::matrix& eig_ks;
-        const Parallel_2D* pX;
+        const Parallel_2D& pX;
         ModuleBase::matrix eig_ks_diff;
         const int& nk;
         const int& nocc;
diff --git a/source/module_lr/operator_casida/operator_lr_exx.cpp b/source/module_lr/operator_casida/operator_lr_exx.cpp
index 3ed5f06353..fba9b5f898 100644
--- a/source/module_lr/operator_casida/operator_lr_exx.cpp
+++ b/source/module_lr/operator_casida/operator_lr_exx.cpp
@@ -10,24 +10,21 @@ namespace LR
     void OperatorLREXX<T>::allocate_Ds_onebase()
     {
         ModuleBase::TITLE("OperatorLREXX", "allocate_Ds_onebase");
-        this->Ds_onebase.resize(this->nspin_solve);
-        for (int is = 0;is < this->nspin_solve;++is) {
-            for (int iat1 = 0;iat1 < ucell.nat;++iat1) {
-                const int it1 = ucell.iat2it[iat1];
-                for (int iat2 = 0;iat2 < ucell.nat;++iat2) {
-                    const int it2=ucell.iat2it[iat2];
-                    for (auto cell : this->BvK_cells) {
-                        this->Ds_onebase[is][iat1][std::make_pair(iat2, cell)] = aims_nbasis.empty() ?
-                            RI::Tensor<T>({ static_cast<size_t>(ucell.atoms[it1].nw),  static_cast<size_t>(ucell.atoms[it2].nw) }) :
-                            RI::Tensor<T>({ static_cast<size_t>( aims_nbasis[it1]),  static_cast<size_t>( aims_nbasis[it2]) });
-                    }
+        for (int iat1 = 0;iat1 < ucell.nat;++iat1) {
+            const int it1 = ucell.iat2it[iat1];
+            for (int iat2 = 0;iat2 < ucell.nat;++iat2) {
+                const int it2 = ucell.iat2it[iat2];
+                for (auto cell : this->BvK_cells) {
+                    this->Ds_onebase[iat1][std::make_pair(iat2, cell)] = aims_nbasis.empty() ?
+                        RI::Tensor<T>({ static_cast<size_t>(ucell.atoms[it1].nw),  static_cast<size_t>(ucell.atoms[it2].nw) }) :
+                        RI::Tensor<T>({ static_cast<size_t>(aims_nbasis[it1]),  static_cast<size_t>(aims_nbasis[it2]) });
                 }
             }
         }
     }
 
     template<>
-    void OperatorLREXX<double>::cal_DM_onebase(const int io, const int iv, const int ik, const int is) const
+    void OperatorLREXX<double>::cal_DM_onebase(const int io, const int iv, const int ik) const
     {
         ModuleBase::TITLE("OperatorLREXX", "cal_DM_onebase");
         assert(ik == 0);
@@ -40,19 +37,19 @@ namespace LR
                         {
                             int iat1 = ucell.itia2iat(it1, ia1);
                             int iat2 = ucell.itia2iat(it2, ia2);
-                            auto& D2d = this->Ds_onebase[is][iat1][std::make_pair(iat2, cell)];
+                            auto& D2d = this->Ds_onebase[iat1][std::make_pair(iat2, cell)];
                             const int nw1 = aims_nbasis.empty() ? ucell.atoms[it1].nw : aims_nbasis[it1];
                             const int nw2 = aims_nbasis.empty() ? ucell.atoms[it2].nw : aims_nbasis[it2];
                             for (int iw1 = 0;iw1 < nw1;++iw1)
                                 for (int iw2 = 0;iw2 < nw2;++iw2)
-                                    if(this->pmat->in_this_processor(ucell.itiaiw2iwt(it1, ia1, iw1), ucell.itiaiw2iwt(it2, ia2, iw2)))
+                                    if (this->pmat.in_this_processor(ucell.itiaiw2iwt(it1, ia1, iw1), ucell.itiaiw2iwt(it2, ia2, iw2)))
                                         D2d(iw1, iw2) = this->psi_ks_full(ik, io, ucell.itiaiw2iwt(it1, ia1, iw1)) * this->psi_ks_full(ik, nocc + iv, ucell.itiaiw2iwt(it2, ia2, iw2));
                         }
         }
     }
 
     template<>
-    void OperatorLREXX<std::complex<double>>::cal_DM_onebase(const int io, const int iv, const int ik, const int is) const
+    void OperatorLREXX<std::complex<double>>::cal_DM_onebase(const int io, const int iv, const int ik) const
     {
         ModuleBase::TITLE("OperatorLREXX", "cal_DM_onebase");
         for (auto cell : this->BvK_cells)
@@ -66,92 +63,74 @@ namespace LR
                         {
                             int iat1 = ucell.itia2iat(it1, ia1);
                             int iat2 = ucell.itia2iat(it2, ia2);
-                            auto& D2d = this->Ds_onebase[is][iat1][std::make_pair(iat2, cell)];
+                            auto& D2d = this->Ds_onebase[iat1][std::make_pair(iat2, cell)];
                             const int nw1 = aims_nbasis.empty() ? ucell.atoms[it1].nw : aims_nbasis[it1];
                             const int nw2 = aims_nbasis.empty() ? ucell.atoms[it2].nw : aims_nbasis[it2];
                             for (int iw1 = 0;iw1 < nw1;++iw1)
                                 for (int iw2 = 0;iw2 < nw2;++iw2)
-                                    if(this->pmat->in_this_processor(ucell.itiaiw2iwt(it1, ia1, iw1), ucell.itiaiw2iwt(it2, ia2, iw2)))
+                                    if (this->pmat.in_this_processor(ucell.itiaiw2iwt(it1, ia1, iw1), ucell.itiaiw2iwt(it2, ia2, iw2)))
                                         D2d(iw1, iw2) = frac * std::conj(this->psi_ks_full(ik, io, ucell.itiaiw2iwt(it1, ia1, iw1))) * this->psi_ks_full(ik, nocc + iv, ucell.itiaiw2iwt(it2, ia2, iw2));
                         }
         }
     }
 
     template<typename T>
-    void OperatorLREXX<T>::act(const psi::Psi<T>& psi_in, psi::Psi<T>& psi_out, const int nbands) const
+    void OperatorLREXX<T>::act(const int nbands, const int nbasis, const int npol, const T* psi_in, T* hpsi, const int ngk_ik)const
     {
         ModuleBase::TITLE("OperatorLREXX", "act");
 
-        assert(nbands <= psi_in.get_nbands());
         const int& nk = this->kv.get_nks() / this->nspin;
-        psi::Psi<T> psi_in_bfirst = LR_Util::k1_to_bfirst_wrapper(psi_in, nk, this->pX->get_local_size());
-        psi::Psi<T> psi_out_bfirst = LR_Util::k1_to_bfirst_wrapper(psi_out, nk, this->pX->get_local_size());
 
         // convert parallel info to LibRI interfaces
-        std::vector<std::tuple<std::set<TA>, std::set<TA>>> judge = RI_2D_Comm::get_2D_judge(*this->pmat);
-        for (int ib = 0;ib < nbands;++ib)
-        {
-            psi_out_bfirst.fix_b(ib);
-            // suppose Cs，Vs, have already been calculated in the ion-step of ground state, 
-            // DM_trans(k) and DM_trans(R) has already been calculated from psi_in in OperatorLRHxc::act
-            // but int RI_benchmark, DM_trans(k) should be first calculated here
-            if (cal_dm_trans)
-            {
-#ifdef __MPI
-                std::vector<container::Tensor>  dm_trans_2d = cal_dm_trans_pblas(psi_in_bfirst, *pX, *psi_ks, *pc, naos, nocc, nvirt, *pmat);
-                if (this->tdm_sym) for (auto& t : dm_trans_2d) LR_Util::matsym(t.data<T>(), naos, *pmat);
-#else
-                std::vector<container::Tensor>  dm_trans_2d = cal_dm_trans_blas(psi_in_bfirst, *psi_ks, nocc, nvirt);
-                if (this->tdm_sym) for (auto& t : dm_trans_2d) LR_Util::matsym(t.data<T>(), naos);
-#endif
-                // tensor to vector, then set DMK
-                for (int ik = 0;ik < nk;++ik) { this->DM_trans[ib]->set_DMK_pointer(ik, dm_trans_2d[ik].data<T>()); }
-            }
+        std::vector<std::tuple<std::set<TA>, std::set<TA>>> judge = RI_2D_Comm::get_2D_judge(this->pmat);
 
-            // 1. set_Ds (once)
-            // convert to vector<T*> for the interface of RI_2D_Comm::split_m2D_ktoR (interface will be unified to ct::Tensor)
-            std::vector<std::vector<T>> DMk_trans_vector = this->DM_trans[ib]->get_DMK_vector();
-            // assert(DMk_trans_vector.size() == nk);
-            std::vector<const std::vector<T>*> DMk_trans_pointer(nk);
-            for (int ik = 0;ik < nk;++ik) {DMk_trans_pointer[ik] = &DMk_trans_vector[ik];}
-            // if multi-k, DM_trans(TR=double) -> Ds_trans(TR=T=complex<double>)
-            std::vector<std::map<TA, std::map<TAC, RI::Tensor<T>>>> Ds_trans =
-                aims_nbasis.empty() ? 
-                RI_2D_Comm::split_m2D_ktoR<T>(this->kv, DMk_trans_pointer, *this->pmat, this->nspin_solve)
-                : RI_Benchmark::split_Ds(DMk_trans_vector, aims_nbasis, ucell); //0.5 will be multiplied
-            // LR_Util::print_CV(Ds_trans[0], "Ds_trans in OperatorLREXX", 1e-10);
-            // 2. cal_Hs
-            auto lri = this->exx_lri.lock();
-            for (int is = 0;is < nspin_solve;++is)
-            {
-                // LR_Util::print_CV(Ds_trans[is], "Ds_trans in OperatorLREXX", 1e-10);
-                lri->exx_lri.set_Ds(std::move(Ds_trans[is]), lri->info.dm_threshold);
-                lri->exx_lri.cal_Hs();
-                lri->Hexxs[is] = RI::Communicate_Tensors_Map_Judge::comm_map2_first(
-                    lri->mpi_comm, std::move(lri->exx_lri.Hs), std::get<0>(judge[is]), std::get<1>(judge[is]));
-                lri->post_process_Hexx(lri->Hexxs[is]);
-            }
+        // suppose Cs，Vs, have already been calculated in the ion-step of ground state
+        // and DM_trans has been calculated in hPsi() outside.
 
-            // 3. set [AX]_iak = DM_onbase * Hexxs for each occ-virt pair and each k-point
-            // caution: parrallel
+        // 1. set_Ds (once)
+        // convert to vector<T*> for the interface of RI_2D_Comm::split_m2D_ktoR (interface will be unified to ct::Tensor)
+        std::vector<std::vector<T>> DMk_trans_vector = this->DM_trans->get_DMK_vector();
+        // assert(DMk_trans_vector.size() == nk);
+        std::vector<const std::vector<T>*> DMk_trans_pointer(nk);
+        for (int ik = 0;ik < nk;++ik) { DMk_trans_pointer[ik] = &DMk_trans_vector[ik]; }
+        // if multi-k, DM_trans(TR=double) -> Ds_trans(TR=T=complex<double>)
+        std::vector<std::map<TA, std::map<TAC, RI::Tensor<T>>>> Ds_trans =
+            aims_nbasis.empty() ?
+            RI_2D_Comm::split_m2D_ktoR<T>(this->kv, DMk_trans_pointer, this->pmat, 1)
+            : RI_Benchmark::split_Ds(DMk_trans_vector, aims_nbasis, ucell); //0.5 will be multiplied
+        // LR_Util::print_CV(Ds_trans[0], "Ds_trans in OperatorLREXX", 1e-10);
+        // 2. cal_Hs
+        auto lri = this->exx_lri.lock();
 
-            for (int io = 0;io < this->nocc;++io) {
-                for (int iv = 0;iv < this->nvirt;++iv) {
-                    for (int ik = 0;ik < nk;++ik) {
-                        for (int is = 0;is < this->nspin_solve;++is)
-                            {
-                                this->cal_DM_onebase(io, iv, ik, is);       //set Ds_onebase for all e-h pairs (not only on this processor)
-                                // LR_Util::print_CV(Ds_onebase[is], "Ds_onebase of occ " + std::to_string(io) + ", virtual " + std::to_string(iv) + " in OperatorLREXX", 1e-10);
-                                const T& ene= 2 * alpha * //minus for exchange(but here plus is right, why?), 2 for Hartree to Ry
-                                    lri->exx_lri.post_2D.cal_energy(this->Ds_onebase[is], lri->Hexxs[is]);
-                                if(this->pX->in_this_processor(iv, io)) { 
-                                    psi_out_bfirst(ik, this->pX->global2local_col(io) * this->pX->get_row_size() + this->pX->global2local_row(iv)) += ene;
-}
-                            }
-}
-}
-}
+        // LR_Util::print_CV(Ds_trans[is], "Ds_trans in OperatorLREXX", 1e-10);
+        lri->exx_lri.set_Ds(std::move(Ds_trans[0]), lri->info.dm_threshold);
+        lri->exx_lri.cal_Hs();
+        lri->Hexxs[0] = RI::Communicate_Tensors_Map_Judge::comm_map2_first(
+            lri->mpi_comm, std::move(lri->exx_lri.Hs), std::get<0>(judge[0]), std::get<1>(judge[0]));
+        lri->post_process_Hexx(lri->Hexxs[0]);
+
+        // 3. set [AX]_iak = DM_onbase * Hexxs for each occ-virt pair and each k-point
+        // caution: parrallel
+
+        for (int io = 0;io < this->nocc;++io)
+        {
+            for (int iv = 0;iv < this->nvirt;++iv)
+            {
+                for (int ik = 0;ik < nk;++ik)
+                {
+                    const int xstart_bk = ik * pX.get_local_size();
+                    this->cal_DM_onebase(io, iv, ik);       //set Ds_onebase for all e-h pairs (not only on this processor)
+                    // LR_Util::print_CV(Ds_onebase[is], "Ds_onebase of occ " + std::to_string(io) + ", virtual " + std::to_string(iv) + " in OperatorLREXX", 1e-10);
+                    const T& ene = 2 * alpha * //minus for exchange(but here plus is right, why?), 2 for Hartree to Ry
+                        lri->exx_lri.post_2D.cal_energy(this->Ds_onebase, lri->Hexxs[0]);
+                    if (this->pX.in_this_processor(iv, io))
+                    {
+                        hpsi[xstart_bk + ik * pX.get_local_size() + this->pX.global2local_col(io) * this->pX.get_row_size() + this->pX.global2local_row(iv)] += ene;
+                    }
+                }
+            }
         }
+
     }
     template class OperatorLREXX<double>;
     template class OperatorLREXX<std::complex<double>>;
diff --git a/source/module_lr/operator_casida/operator_lr_exx.h b/source/module_lr/operator_casida/operator_lr_exx.h
index 1ebea0c9f2..ac052d2655 100644
--- a/source/module_lr/operator_casida/operator_lr_exx.h
+++ b/source/module_lr/operator_casida/operator_lr_exx.h
@@ -22,47 +22,44 @@ namespace LR
             const int& nocc,
             const int& nvirt,
             const UnitCell& ucell_in,
-            const psi::Psi<T>* psi_ks_in,
-            std::vector<std::unique_ptr<elecstate::DensityMatrix<T, T>>>& DM_trans_in,
+            const psi::Psi<T>& psi_ks_in,
+            std::unique_ptr<elecstate::DensityMatrix<T, T>>& DM_trans_in,
             // HContainer<double>* hR_in,
             std::weak_ptr<Exx_LRI<T>> exx_lri_in,
             const K_Vectors& kv_in,
-            Parallel_2D* pX_in,
-            Parallel_2D* pc_in,
-            Parallel_Orbitals* pmat_in,
+            const Parallel_2D& pX_in,
+            const Parallel_2D& pc_in,
+            const Parallel_Orbitals& pmat_in,
             const double& alpha = 1.0,
-            const bool& cal_dm_trans = false,
             const std::vector<int>& aims_nbasis = {})
             : nspin(nspin), naos(naos), nocc(nocc), nvirt(nvirt),
             psi_ks(psi_ks_in), DM_trans(DM_trans_in), exx_lri(exx_lri_in), kv(kv_in),
-            pX(pX_in), pc(pc_in), pmat(pmat_in), ucell(ucell_in), alpha(alpha), cal_dm_trans(cal_dm_trans),
+            pX(pX_in), pc(pc_in), pmat(pmat_in), ucell(ucell_in), alpha(alpha),
             aims_nbasis(aims_nbasis)
         {
             ModuleBase::TITLE("OperatorLREXX", "OperatorLREXX");
             this->cal_type = hamilt::calculation_type::lcao_exx;
-            this->act_type = 2;
             this->is_first_node = false;
 
             // reduce psi_ks for later use
             this->psi_ks_full.resize(this->kv.get_nks(), nocc + nvirt, this->naos);
-            LR_Util::gather_2d_to_full(*this->pc, this->psi_ks->get_pointer(), this->psi_ks_full.get_pointer(), false, this->naos, nocc + nvirt);
+            LR_Util::gather_2d_to_full(this->pc, this->psi_ks.get_pointer(), this->psi_ks_full.get_pointer(), false, this->naos, nocc + nvirt);
 
             // get cells in BvK supercell
             const TC period = RI_Util::get_Born_vonKarmen_period(kv_in);
             this->BvK_cells = RI_Util::get_Born_von_Karmen_cells(period);
 
             this->allocate_Ds_onebase();
-            this->exx_lri.lock()->Hexxs.resize(this->nspin_solve);
+            this->exx_lri.lock()->Hexxs.resize(1);
         };
 
         void init(const int ik_in) override {};
 
-        // virtual psi::Psi<T> act(const psi::Psi<T>& psi_in) const override;
-        virtual void act(const psi::Psi<T>& psi_in, psi::Psi<T>& psi_out, const int nbands) const override;
+        virtual void act(const int nbands, const int nbasis, const int npol, const T* psi_in, T* hpsi, const int ngk_ik = 0)const override;
+
     private:
         //global sizes
         const int& nspin;
-        const int nspin_solve = 1;
         const int& naos;
         const int& nocc;
         const int& nvirt;
@@ -71,18 +68,18 @@ namespace LR
         const bool tdm_sym = false; ///< whether transition density matrix is symmetric
         const K_Vectors& kv;
         /// ground state wavefunction
-        const psi::Psi<T>* psi_ks = nullptr;
+        const psi::Psi<T>& psi_ks = nullptr;
         psi::Psi<T> psi_ks_full;
         const std::vector<int> aims_nbasis={};    ///< number of basis functions for each type of atom in FHI-aims
 
         /// transition density matrix 
-        std::vector<std::unique_ptr<elecstate::DensityMatrix<T, T>>>& DM_trans;
+        std::unique_ptr<elecstate::DensityMatrix<T, T>>& DM_trans;
 
         /// density matrix of a certain (i, a, k), with full naos*naos size for each key
         /// D^{iak}_{\mu\nu}(k): 1/N_k * c^*_{ak,\mu} c_{ik,\nu}
         /// D^{iak}_{\mu\nu}(R): D^{iak}_{\mu\nu}(k)e^{-ikR}
         // elecstate::DensityMatrix<T, double>* DM_onebase;
-        mutable std::vector<std::map<TA, std::map<TAC, RI::Tensor<T>>>> Ds_onebase;
+        mutable std::map<TA, std::map<TAC, RI::Tensor<T>>> Ds_onebase;
 
         // cells in the Born von Karmen supercell (direct)
         std::vector<std::array<int, Ndim>> BvK_cells;
@@ -99,15 +96,15 @@ namespace LR
         const UnitCell& ucell;
 
         ///parallel info
-        Parallel_2D* pc = nullptr;
-        Parallel_2D* pX = nullptr;
-        Parallel_Orbitals* pmat = nullptr;
+        const Parallel_2D& pc;
+        const Parallel_2D& pX;
+        const Parallel_Orbitals& pmat;
 
 
         // allocate Ds_onebase
         void allocate_Ds_onebase();
 
-        void cal_DM_onebase(const int io, const int iv, const int ik, const int is) const;
+        void cal_DM_onebase(const int io, const int iv, const int ik) const;
 
     };
 }
diff --git a/source/module_lr/operator_casida/operator_lr_hxc.cpp b/source/module_lr/operator_casida/operator_lr_hxc.cpp
index a2ddfe616c..58f01932b4 100644
--- a/source/module_lr/operator_casida/operator_lr_hxc.cpp
+++ b/source/module_lr/operator_casida/operator_lr_hxc.cpp
@@ -8,7 +8,6 @@
 #include "module_lr/utils/lr_util_print.h"
 // #include "module_hamilt_lcao/hamilt_lcaodft/DM_gamma_2d_to_grid.h"
 #include "module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h"
-#include "module_lr/dm_trans/dm_trans.h"
 #include "module_lr/AX/AX.h"
 #include "module_hamilt_pw/hamilt_pwdft/global.h"
 
@@ -18,127 +17,83 @@ inline std::complex<double> conj(std::complex<double> a) { return std::conj(a);
 namespace LR
 {
     template<typename T, typename Device>
-    void OperatorLRHxc<T, Device>::act(const psi::Psi<T>& psi_in, psi::Psi<T>& psi_out, const int nbands) const
+    void OperatorLRHxc<T, Device>::act(const int nbands, const int nbasis, const int npol, const T* psi_in, T* hpsi, const int ngk_ik)const
     {
         ModuleBase::TITLE("OperatorLRHxc", "act");
-        assert(nbands <= psi_in.get_nbands());
-        const int& nk = this->kv.get_nks() / this->nspin;
-
-        //print 
-        // if (this->first_print) LR_Util::print_psi_kfirst(*psi_ks, "psi_ks");
+        const int& sl = ispin_ks[0];
+        const auto psil_ks = LR_Util::get_psi_spin(psi_ks, sl, nk);
+        const int& lgd = gint->gridt->lgd;
 
-        this->init_DM_trans(nbands, this->DM_trans);    // initialize transion density matrix
+        this->DM_trans->cal_DMR();  //DM_trans->get_DMR_vector() is 2d-block parallized
+        // LR_Util::print_DMR(*DM_trans, ucell.nat, "DMR");
 
-        psi::Psi<T> psi_in_bfirst = LR_Util::k1_to_bfirst_wrapper(psi_in, nk, this->pX->get_local_size());
-        psi::Psi<T> psi_out_bfirst = LR_Util::k1_to_bfirst_wrapper(psi_out, nk, this->pX->get_local_size());
+        // ========================= begin grid calculation=========================
+        this->grid_calculation(nbands);   //DM(R) to H(R)
+        // ========================= end grid calculation =========================
 
-        const int& lgd = gint->gridt->lgd;
-        for (int ib = 0;ib < nbands;++ib)
-        {
-            // if (this->first_print) LR_Util::print_psi_bandfirst(psi_in_bfirst, "psi_in_bfirst", ib);
+        // V(R)->V(k) 
+        std::vector<ct::Tensor> v_hxc_2d(nk, LR_Util::newTensor<T>({ pmat.get_col_size(), pmat.get_row_size() }));
+        for (auto& v : v_hxc_2d) v.zero();
+        int nrow = ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver) ? this->pmat.get_row_size() : this->pmat.get_col_size();
+        for (int ik = 0;ik < nk;++ik) { folding_HR(*this->hR, v_hxc_2d[ik].data<T>(), this->kv.kvec_d[ik], nrow, 1); }  // V(R) -> V(k)
+        // LR_Util::print_HR(*this->hR, this->ucell.nat, "4.VR");
+        // if (this->first_print)
+        // for (int ik = 0;ik < nk;++ik)
+        //     LR_Util::print_tensor<T>(v_hxc_2d[ik], "4.V(k)[ik=" + std::to_string(ik) + "]", &this->pmat);
 
-            // if Hxc-only, the memory of single-band DM_trans is enough.
-            // if followed by EXX, we need to allocate memory for all bands.
-            int ib_dm = (this->next_op == nullptr) ? 0 : ib;
-            psi_in_bfirst.fix_b(ib);
-            psi_out_bfirst.fix_b(ib);
-
-            // 1. transition density matrix
+        // 5. [AX]^{Hxc}_{ai}=\sum_{\mu,\nu}c^*_{a,\mu,}V^{Hxc}_{\mu,\nu}c_{\nu,i}
 #ifdef __MPI
-            std::vector<container::Tensor>  dm_trans_2d = cal_dm_trans_pblas(psi_in_bfirst, *pX, *psi_ks, *pc, naos, nocc, nvirt, *pmat);
-            if (this->tdm_sym) for (auto& t : dm_trans_2d) LR_Util::matsym(t.data<T>(), naos, *pmat);
+        cal_AX_pblas(v_hxc_2d, this->pmat, psil_ks, this->pc, naos, nocc[sl], nvirt[sl], this->pX[sl], hpsi);
 #else
-            std::vector<container::Tensor>  dm_trans_2d = cal_dm_trans_blas(psi_in_bfirst, *psi_ks, nocc, nvirt);
-            if (this->tdm_sym) for (auto& t : dm_trans_2d) LR_Util::matsym(t.data<T>(), naos);
+        cal_AX_blas(v_hxc_2d, psil_ks, nocc[sl], nvirt[sl], hpsi);
 #endif
-            // tensor to vector, then set DMK
-            for (int ik = 0;ik < nk;++ik) { this->DM_trans[ib_dm]->set_DMK_pointer(ik, dm_trans_2d[ik].data<T>()); }
-
-            // if (this->first_print)
-            //     for (int ik = 0;ik < nk;++ik)
-            //         LR_Util::print_tensor<std::complex<double>>(dm_trans_2d[ik], "1.DMK[ik=" + std::to_string(ik) + "]", this->pmat);
-
-            // use cal_DMR to get DMR form DMK by FT
-            this->DM_trans[ib_dm]->cal_DMR();  //DM_trans->get_DMR_vector() is 2d-block parallized
-            // LR_Util::print_DMR(*this->DM_trans[0], ucell.nat, "DM(R) (complex)");
-
-            // ========================= begin grid calculation=========================
-            this->grid_calculation(nbands, ib_dm);   //DM(R) to H(R)
-            // ========================= end grid calculation =========================
-
-            // V(R)->V(k)
-            std::vector<ct::Tensor> v_hxc_2d(this->kv.get_nks(),
-                ct::Tensor(ct::DataTypeToEnum<T>::value, ct::DeviceTypeToEnum<base_device::DEVICE_CPU>::value,
-                    { pmat->get_col_size(), pmat->get_row_size() }));
-            for (auto& v : v_hxc_2d) v.zero();
-            int nrow = ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(PARAM.inp.ks_solver) ? this->pmat->get_row_size() : this->pmat->get_col_size();
-            for (int ik = 0;ik < nk;++ik) { folding_HR(*this->hR, v_hxc_2d[ik].data<T>(), this->kv.kvec_d[ik], nrow, 1); }  // V(R) -> V(k)
-            // LR_Util::print_HR(*this->hR, this->ucell.nat, "4.VR");
-            // if (this->first_print)
-            //     for (int ik = 0;ik < nk;++ik)
-            //         LR_Util::print_tensor<T>(v_hxc_2d[ik], "4.V(k)[ik=" + std::to_string(ik) + "]", this->pmat);
-
-            // 5. [AX]^{Hxc}_{ai}=\sum_{\mu,\nu}c^*_{a,\mu,}V^{Hxc}_{\mu,\nu}c_{\nu,i}
-#ifdef __MPI
-            cal_AX_pblas(v_hxc_2d, *this->pmat, *this->psi_ks, *this->pc, naos, nocc, nvirt, *this->pX, psi_out_bfirst);
-#else
-            cal_AX_blas(v_hxc_2d, *this->psi_ks, nocc, nvirt, psi_out_bfirst);
-#endif
-            // if (this->first_print) LR_Util::print_psi_bandfirst(psi_out_bfirst, "5.AX", ib);
-        }
     }
 
 
     template<>
-    void OperatorLRHxc<double, base_device::DEVICE_CPU>::grid_calculation(const int& nbands, const int& iband_dm) const
+    void OperatorLRHxc<double, base_device::DEVICE_CPU>::grid_calculation(const int& nbands) const
     {
         ModuleBase::TITLE("OperatorLRHxc", "grid_calculation(real)");
         ModuleBase::timer::tick("OperatorLRHxc", "grid_calculation");
-        this->gint->transfer_DM2DtoGrid(this->DM_trans[iband_dm]->get_DMR_vector());     // 2d block to grid
+        this->gint->transfer_DM2DtoGrid(this->DM_trans->get_DMR_vector());     // 2d block to grid
 
         // 2. transition electron density
         // \f[ \tilde{\rho}(r)=\sum_{\mu_j, \mu_b}\tilde{\rho}_{\mu_j,\mu_b}\phi_{\mu_b}(r)\phi_{\mu_j}(r) \f]
         double** rho_trans;
         const int& nrxx = this->pot.lock()->nrxx;
-        // LR_Util::new_p2(rho_trans, nspin_solve, nrxx);
         LR_Util::new_p2(rho_trans, 1, nrxx); // currently gint_kernel_rho uses PARAM.inp.nspin, it needs refactor
-        for (int is = 0;is < nspin_solve;++is)ModuleBase::GlobalFunc::ZEROS(rho_trans[is], nrxx);
+        ModuleBase::GlobalFunc::ZEROS(rho_trans[0], nrxx);
         Gint_inout inout_rho(rho_trans, Gint_Tools::job_type::rho, 1, false);
         this->gint->cal_gint(&inout_rho);
 
         // 3. v_hxc = f_hxc * rho_trans
-        ModuleBase::matrix vr_hxc(nspin_solve, nrxx);   //grid
-        this->pot.lock()->cal_v_eff(rho_trans, &GlobalC::ucell, vr_hxc);
-        LR_Util::delete_p2(rho_trans, nspin_solve);
+        ModuleBase::matrix vr_hxc(1, nrxx);   //grid
+        this->pot.lock()->cal_v_eff(rho_trans, &GlobalC::ucell, vr_hxc, ispin_ks);
+        LR_Util::delete_p2(rho_trans, 1);
 
         // 4. V^{Hxc}_{\mu,\nu}=\int{dr} \phi_\mu(r) v_{Hxc}(r) \phi_\mu(r)
-        // V(R) for each spin
-        for (int is = 0;is < nspin_solve;++is)
-        {
-            double* vr_hxc_is = &vr_hxc.c[is * nrxx];   //v(r) at current spin
-            Gint_inout inout_vlocal(vr_hxc_is, is, Gint_Tools::job_type::vlocal);
-            this->gint->get_hRGint()->set_zero();
-            this->gint->cal_gint(&inout_vlocal);
-        }
+        Gint_inout inout_vlocal(vr_hxc.c, 0, Gint_Tools::job_type::vlocal);
+        this->gint->get_hRGint()->set_zero();
+        this->gint->cal_gint(&inout_vlocal);
         this->hR->set_zero();   // clear hR for each bands
         this->gint->transfer_pvpR(&*this->hR, &GlobalC::ucell);    //grid to 2d block
         ModuleBase::timer::tick("OperatorLRHxc", "grid_calculation");
     }
 
     template<>
-    void OperatorLRHxc<std::complex<double>, base_device::DEVICE_CPU>::grid_calculation(const int& nbands, const int& iband_dm) const
+    void OperatorLRHxc<std::complex<double>, base_device::DEVICE_CPU>::grid_calculation(const int& nbands) const
     {
         ModuleBase::TITLE("OperatorLRHxc", "grid_calculation(complex)");
         ModuleBase::timer::tick("OperatorLRHxc", "grid_calculation");
 
-        elecstate::DensityMatrix<std::complex<double>, double> DM_trans_real_imag(pmat, 1, kv.kvec_d, kv.get_nks() / nspin);
+        elecstate::DensityMatrix<std::complex<double>, double> DM_trans_real_imag(&pmat, 1, kv.kvec_d, kv.get_nks() / nspin);
         DM_trans_real_imag.init_DMR(*this->hR);
-        hamilt::HContainer<double> HR_real_imag(GlobalC::ucell, this->pmat);
-        this->initialize_HR(HR_real_imag, ucell, gd, this->pmat);
+        hamilt::HContainer<double> HR_real_imag(GlobalC::ucell, &this->pmat);
+        LR_Util::initialize_HR<std::complex<double>, double>(HR_real_imag, ucell, gd, orb_cutoff_);
 
-        auto dmR_to_hR = [&, this](const int& iband_dm, const char& type) -> void
+        auto dmR_to_hR = [&, this](const char& type) -> void
             {
-                LR_Util::get_DMR_real_imag_part(*this->DM_trans[iband_dm], DM_trans_real_imag, ucell.nat, type);
+                LR_Util::get_DMR_real_imag_part(*this->DM_trans, DM_trans_real_imag, ucell.nat, type);
                 // if (this->first_print)LR_Util::print_DMR(DM_trans_real_imag, ucell.nat, "DMR(2d, real)");
 
                 this->gint->transfer_DM2DtoGrid(DM_trans_real_imag.get_DMR_vector());
@@ -147,28 +102,25 @@ namespace LR
                 // 2. transition electron density
                 double** rho_trans;
                 const int& nrxx = this->pot.lock()->nrxx;
-                // LR_Util::new_p2(rho_trans, nspin_solve, nrxx);
-                LR_Util::new_p2(rho_trans, nspin, nrxx); // currently gint_kernel_rho uses PARAM.inp.nspin, it needs refactor
-                for (int is = 0;is < nspin_solve;++is)ModuleBase::GlobalFunc::ZEROS(rho_trans[is], nrxx);
+
+                LR_Util::new_p2(rho_trans, 1, nrxx); // nspin=1 for transition density
+                ModuleBase::GlobalFunc::ZEROS(rho_trans[0], nrxx);
                 Gint_inout inout_rho(rho_trans, Gint_Tools::job_type::rho, 1, false);
                 this->gint->cal_gint(&inout_rho);
                 // print_grid_nonzero(rho_trans[0], nrxx, 10, "rho_trans");
 
                 // 3. v_hxc = f_hxc * rho_trans
-                ModuleBase::matrix vr_hxc(nspin_solve, nrxx);   //grid
-                this->pot.lock()->cal_v_eff(rho_trans, &GlobalC::ucell, vr_hxc);
+                ModuleBase::matrix vr_hxc(1, nrxx);   //grid
+                this->pot.lock()->cal_v_eff(rho_trans, &GlobalC::ucell, vr_hxc, ispin_ks);
                 // print_grid_nonzero(vr_hxc.c, this->poticab->nrxx, 10, "vr_hxc");
 
-                LR_Util::delete_p2(rho_trans, nspin_solve);
+                LR_Util::delete_p2(rho_trans, 1);
 
                 // 4. V^{Hxc}_{\mu,\nu}=\int{dr} \phi_\mu(r) v_{Hxc}(r) \phi_\mu(r)
-                for (int is = 0;is < nspin_solve;++is)
-                {
-                    double* vr_hxc_is = &vr_hxc.c[is * nrxx];   //v(r) at current spin
-                    Gint_inout inout_vlocal(vr_hxc_is, is, Gint_Tools::job_type::vlocal);
-                    this->gint->get_hRGint()->set_zero();
-                    this->gint->cal_gint(&inout_vlocal);
-                }
+                Gint_inout inout_vlocal(vr_hxc.c, 0, Gint_Tools::job_type::vlocal);
+                this->gint->get_hRGint()->set_zero();
+                this->gint->cal_gint(&inout_vlocal);
+
                 // LR_Util::print_HR(*this->gint->get_hRGint(), this->ucell.nat, "VR(grid)");
                 HR_real_imag.set_zero();
                 this->gint->transfer_pvpR(&HR_real_imag, &GlobalC::ucell, &GlobalC::GridD);
@@ -176,8 +128,8 @@ namespace LR
                 LR_Util::set_HR_real_imag_part(HR_real_imag, *this->hR, GlobalC::ucell.nat, type);
             };
         this->hR->set_zero();
-        dmR_to_hR(iband_dm, 'R');   //real
-        if (kv.get_nks() / this->nspin > 1) { dmR_to_hR(iband_dm, 'I'); }   //imag for multi-k
+        dmR_to_hR('R');   //real
+        if (kv.get_nks() / this->nspin > 1) { dmR_to_hR('I'); }   //imag for multi-k
         ModuleBase::timer::tick("OperatorLRHxc", "grid_calculation");
     }
 
diff --git a/source/module_lr/operator_casida/operator_lr_hxc.h b/source/module_lr/operator_casida/operator_lr_hxc.h
index 13a3fb7efd..e9ea332f88 100644
--- a/source/module_lr/operator_casida/operator_lr_hxc.h
+++ b/source/module_lr/operator_casida/operator_lr_hxc.h
@@ -17,104 +17,63 @@ namespace LR
         //when nspin=2, nks is 2 times of real number of k-points. else (nspin=1 or 4), nks is the real number of k-points
         OperatorLRHxc(const int& nspin,
             const int& naos,
-            const int& nocc,
-            const int& nvirt,
-            const psi::Psi<T, Device>* psi_ks_in,
-            std::vector<std::unique_ptr<elecstate::DensityMatrix<T, T>>>& DM_trans_in,
+            const std::vector<int>& nocc,
+            const std::vector<int>& nvirt,
+            const psi::Psi<T, Device>& psi_ks_in,
+            std::unique_ptr<elecstate::DensityMatrix<T, T>>& DM_trans_in,
             typename TGint<T>::type* gint_in,
             std::weak_ptr<PotHxcLR> pot_in,
             const UnitCell& ucell_in,
             const std::vector<double>& orb_cutoff,
             Grid_Driver& gd_in,
             const K_Vectors& kv_in,
-            Parallel_2D* pX_in,
-            Parallel_2D* pc_in,
-            Parallel_Orbitals* pmat_in)
-            : nspin(nspin), naos(naos), nocc(nocc), nvirt(nvirt),
+            const std::vector<Parallel_2D>& pX_in,
+            const Parallel_2D& pc_in,
+            const Parallel_Orbitals& pmat_in,
+            const std::vector<int>& ispin_ks = { 0 })
+            : nspin(nspin), naos(naos), nocc(nocc), nvirt(nvirt), nk(kv_in.get_nks() / nspin),
             psi_ks(psi_ks_in), DM_trans(DM_trans_in), gint(gint_in), pot(pot_in),
             ucell(ucell_in), orb_cutoff_(orb_cutoff), gd(gd_in), kv(kv_in),
-            pX(pX_in), pc(pc_in), pmat(pmat_in)
+            pX(pX_in), pc(pc_in), pmat(pmat_in), ispin_ks(ispin_ks)
         {
             ModuleBase::TITLE("OperatorLRHxc", "OperatorLRHxc");
             this->cal_type = hamilt::calculation_type::lcao_gint;
-            this->act_type = 2;
             this->is_first_node = true;
-            this->hR = std::unique_ptr<hamilt::HContainer<T>>(new hamilt::HContainer<T>(pmat_in));
-            this->initialize_HR(*this->hR, ucell_in, gd_in, pmat_in);
-            this->DM_trans[0]->init_DMR(*this->hR);
+            this->hR = std::unique_ptr<hamilt::HContainer<T>>(new hamilt::HContainer<T>(&pmat_in));
+            LR_Util::initialize_HR<T, T>(*this->hR, ucell_in, gd_in, orb_cutoff);
+            assert(&pmat_in == this->hR->get_paraV());
         };
         ~OperatorLRHxc() { };
 
         void init(const int ik_in) override {};
 
-        // virtual psi::Psi<T> act(const psi::Psi<T>& psi_in) const override;
-        virtual void act(const psi::Psi<T>& psi_in, psi::Psi<T>& psi_out, const int nbands) const override;
+        virtual void act(const int nbands, const int nbasis, const int npol, const T* psi_in, T* hpsi, const int ngk_ik = 0)const override;
+
     private:
-        template<typename TR>   //T=double, TR=double; T=std::complex<double>, TR=std::complex<double>/double
-        void initialize_HR(hamilt::HContainer<TR>& hR, const UnitCell& ucell, Grid_Driver& gd, const Parallel_Orbitals* pmat) const
-        {
-            for (int iat1 = 0; iat1 < ucell.nat; iat1++)
-            {
-                auto tau1 = ucell.get_tau(iat1);
-                int T1, I1;
-                ucell.iat2iait(iat1, &I1, &T1);
-                AdjacentAtomInfo adjs;
-                gd.Find_atom(ucell, tau1, T1, I1, &adjs);
-                for (int ad = 0; ad < adjs.adj_num + 1; ++ad)
-                {
-                    const int T2 = adjs.ntype[ad];
-                    const int I2 = adjs.natom[ad];
-                    int iat2 = this->ucell.itia2iat(T2, I2);
-                    if (pmat->get_row_size(iat1) <= 0 || pmat->get_col_size(iat2) <= 0) { continue; }
-                    const ModuleBase::Vector3<int>& R_index = adjs.box[ad];
-                    if (ucell.cal_dtau(iat1, iat2, R_index).norm() * this->ucell.lat0 >= orb_cutoff_[T1] + orb_cutoff_[T2]) { continue; }
-                    hamilt::AtomPair<TR> tmp(iat1, iat2, R_index.x, R_index.y, R_index.z, pmat);
-                    hR.insert_pair(tmp);
-                }
-            }
-            hR.allocate(nullptr, true);
-            hR.set_paraV(pmat);
-            if (std::is_same<T, double>::value) { hR.fix_gamma(); }
-        }
-        template<typename TR>
-        void init_DM_trans(const int& nbands, std::vector<std::unique_ptr<elecstate::DensityMatrix<T, TR>>>& DM_trans)const
-        {
-            // LR_Util::print_DMR(*this->DM_trans[0], ucell.nat, "DMR[ib=" + std::to_string(0) + "]");
-            if (this->next_op != nullptr)
-            {
-                int prev_size = DM_trans.size();
-                if (prev_size > nbands) { for (int ib = nbands;ib < prev_size;++ib) { DM_trans[ib].reset(); } }
-                DM_trans.resize(nbands);
-                for (int ib = prev_size;ib < nbands;++ib)
-                {
-                    // the first dimenstion of DensityMatrix is nk=nks/nspin 
-                    DM_trans[ib] = LR_Util::make_unique<elecstate::DensityMatrix<T, TR>>(this->pmat, 1, this->kv.kvec_d, this->kv.get_nks() / nspin);
-                    DM_trans[ib]->init_DMR(*this->hR);
-                }
-            }
-        }
-        void grid_calculation(const int& nbands, const int& iband_dm)const;
+        void grid_calculation(const int& nbands)const;
 
         //global sizes
         const int& nspin;
-        const int nspin_solve = 1;    ///< in singlet-triplet calculation, the Casida equation is solved respectively so nspin_solve in a single problem is 1
         const int& naos;
-        const int& nocc;
-        const int& nvirt;
+        const int nk = 1;
+        // const int nloc_per_band = 1;    ///< local size of each state of X  (passed by nbasis in act())
+        const std::vector<int>& nocc;
+        const std::vector<int>& nvirt;
+        const std::vector<int> ispin_ks = { 0 };  ///< the index of spin of psi_ks used in {AX, DM_trans}
         const K_Vectors& kv;
         /// ground state wavefunction
-        const psi::Psi<T, Device>* psi_ks = nullptr;
+        const psi::Psi<T, Device>& psi_ks = nullptr;
 
         /// transition density matrix
-        std::vector<std::unique_ptr<elecstate::DensityMatrix<T, T>>>& DM_trans;
+        std::unique_ptr<elecstate::DensityMatrix<T, T>>& DM_trans;
 
         /// transition hamiltonian in AO representation
         std::unique_ptr<hamilt::HContainer<T>> hR = nullptr;
 
         /// parallel info
-        Parallel_2D* pc = nullptr;
-        Parallel_2D* pX = nullptr;
-        Parallel_Orbitals* pmat = nullptr;
+        const Parallel_2D& pc;
+        const std::vector<Parallel_2D>& pX;
+        const Parallel_Orbitals& pmat;
 
         std::weak_ptr<PotHxcLR> pot;
 
@@ -124,8 +83,6 @@ namespace LR
         std::vector<double> orb_cutoff_;
         Grid_Driver& gd;
 
-        bool tdm_sym = false; ///< whether transition density matrix is symmetric
-
         /// test
         mutable bool first_print = true;
     };
diff --git a/source/module_lr/potentials/kernel_xc.cpp b/source/module_lr/potentials/kernel_xc.cpp
index 2990ce5d52..dee51ef3c5 100644
--- a/source/module_lr/potentials/kernel_xc.cpp
+++ b/source/module_lr/potentials/kernel_xc.cpp
@@ -120,8 +120,8 @@ void LR::KernelXC::f_xc_libxc(const int& nspin, const double& omega, const doubl
 
         for (xc_func_type& func : funcs)
         {
-            constexpr double rho_threshold = 1E-6;
-            constexpr double grho_threshold = 1E-10;
+            const double rho_threshold = 1E-6;
+            const double grho_threshold = 1E-10;
 
             xc_func_set_dens_threshold(&func, rho_threshold);
 
diff --git a/source/module_lr/potentials/pot_hxc_lrtd.cpp b/source/module_lr/potentials/pot_hxc_lrtd.cpp
index b2713c8670..ebdb8747c5 100644
--- a/source/module_lr/potentials/pot_hxc_lrtd.cpp
+++ b/source/module_lr/potentials/pot_hxc_lrtd.cpp
@@ -6,11 +6,13 @@
 #include "module_hamilt_general/module_xc/xc_functional.h"
 #include <set>
 #include "module_lr/utils/lr_util.h"
+
+#define FXC_PARA_TYPE const double* const rho, ModuleBase::matrix& v_eff, const std::vector<int>& ispin_op = { 0,0 }
 namespace LR
 {
     // constructor for exchange-correlation kernel
     PotHxcLR::PotHxcLR(const std::string& xc_kernel_in, const ModulePW::PW_Basis* rho_basis_in, const UnitCell* ucell_in,
-        const Charge* chg_gs/*ground state*/, SpinType st_in)
+        const Charge* chg_gs/*ground state*/, const SpinType& st_in)
         :xc_kernel(xc_kernel_in), tpiba_(ucell_in->tpiba), spin_type_(st_in)
     {
         this->rho_basis_ = rho_basis_in;
@@ -32,29 +34,28 @@ namespace LR
         }
     }
 
-    void PotHxcLR::cal_v_eff(double** rho, const UnitCell* ucell, ModuleBase::matrix& v_eff)
+    void PotHxcLR::cal_v_eff(double** rho, const UnitCell* ucell, ModuleBase::matrix& v_eff, const std::vector<int>& ispin_op)
     {
         ModuleBase::TITLE("PotHxcLR", "cal_v_eff");
         ModuleBase::timer::tick("PotHxcLR", "cal_v_eff");
         auto& fxc = this->xc_kernel_components_;
 
-        const int& nspin_solve = v_eff.nr;
-        assert(nspin_solve == 1);
         // Hartree
         switch (this->spin_type_)
         {
-        case SpinType::S1:   // check the coefficient:  does S1 already include the factor 2?
-            v_eff += elecstate::H_Hartree_pw::v_hartree(*ucell, const_cast<ModulePW::PW_Basis*>(this->rho_basis_), nspin_solve, rho);
+        case SpinType::S1: case SpinType::S2_updown:
+            v_eff += elecstate::H_Hartree_pw::v_hartree(*ucell, const_cast<ModulePW::PW_Basis*>(this->rho_basis_), 1, rho);
             break;
         case SpinType::S2_singlet:
-            v_eff += 2 * elecstate::H_Hartree_pw::v_hartree(*ucell, const_cast<ModulePW::PW_Basis*>(this->rho_basis_), nspin_solve, rho);
+            v_eff += 2 * elecstate::H_Hartree_pw::v_hartree(*ucell, const_cast<ModulePW::PW_Basis*>(this->rho_basis_), 1, rho);
+            break;
         default:
             break;
         }
         // XC
         if (xc_kernel == "rpa" || xc_kernel == "hf") { return; }    // no xc
 #ifdef USE_LIBXC
-        this->kernel_to_potential_[spin_type_](rho[0], v_eff);
+        this->kernel_to_potential_[spin_type_](rho[0], v_eff, ispin_op);
 #else
         throw std::domain_error("GlobalV::XC_Functional::get_func_type() =" + std::to_string(XC_Functional::get_func_type())
             + " unfinished in " + std::string(__FILE__) + " line " + std::to_string(__LINE__));
@@ -70,13 +71,13 @@ namespace LR
         if (xc == XCType::LDA) switch (s)
         {
         case SpinType::S1:
-            funcs[s] = [this, &fxc](const double* const rho, ModuleBase::matrix& v_eff)->void
+            funcs[s] = [this, &fxc](FXC_PARA_TYPE)->void
                 {
                     for (int ir = 0;ir < nrxx;++ir) { v_eff(0, ir) += ModuleBase::e2 * fxc.get_kernel("v2rho2").at(ir) * rho[ir]; }
                 };
             break;
         case SpinType::S2_singlet:
-            funcs[s] = [this, &fxc](const double* const rho, ModuleBase::matrix& v_eff)->void
+            funcs[s] = [this, &fxc](FXC_PARA_TYPE)->void
                 {
                     for (int ir = 0;ir < nrxx;++ir)
                     {
@@ -87,7 +88,7 @@ namespace LR
                 };
             break;
         case SpinType::S2_triplet:
-            funcs[s] = [this, &fxc](const double* const rho, ModuleBase::matrix& v_eff)->void
+            funcs[s] = [this, &fxc](FXC_PARA_TYPE)->void
                 {
                     for (int ir = 0;ir < nrxx;++ir)
                     {
@@ -97,6 +98,14 @@ namespace LR
                     }
                 };
             break;
+        case SpinType::S2_updown:
+            funcs[s] = [this, &fxc](FXC_PARA_TYPE)->void
+                {
+                    assert(ispin_op.size() >= 2);
+                    const int is = ispin_op[0] + ispin_op[1];
+                    for (int ir = 0;ir < nrxx;++ir) { v_eff(0, ir) += ModuleBase::e2 * fxc.get_kernel("v2rho2").at(3 * ir + is) * rho[ir]; }
+                };
+            break;
         default:
             throw std::domain_error("SpinType =" + std::to_string(static_cast<int>(s))
                 + " unfinished in " + std::string(__FILE__) + " line " + std::to_string(__LINE__));
@@ -105,7 +114,7 @@ namespace LR
         else if (xc == XCType::GGA || xc == XCType::HYB_GGA) switch (s)
         {
         case SpinType::S1:
-            funcs[s] = [this, &fxc](const double* const rho, ModuleBase::matrix& v_eff)->void
+            funcs[s] = [this, &fxc](FXC_PARA_TYPE)->void
                 {
                     // test: output drho
                     // double thr = 1e-1;
diff --git a/source/module_lr/potentials/pot_hxc_lrtd.h b/source/module_lr/potentials/pot_hxc_lrtd.h
index 6a4df0fd19..a04ab42bb9 100644
--- a/source/module_lr/potentials/pot_hxc_lrtd.h
+++ b/source/module_lr/potentials/pot_hxc_lrtd.h
@@ -12,14 +12,14 @@ namespace LR
         /// S1: K^Hartree + K^xc
         /// S2_singlet: 2*K^Hartree + K^xc_{upup} + K^xc_{updown}
         /// S2_triplet: K^xc_{upup} - K^xc_{updown}
-        enum SpinType { S1 = 0, S2_singlet = 1, S2_triplet = 2 };
+        enum SpinType { S1 = 0, S2_singlet = 1, S2_triplet = 2, S2_updown = 3 };
         enum XCType { None = 0, LDA = 1, GGA = 2, HYB_GGA = 4 };
         /// constructor for exchange-correlation kernel
         PotHxcLR(const std::string& xc_kernel_in, const ModulePW::PW_Basis* rho_basis_in, const UnitCell* ucell_in, const Charge* chg_gs/*ground state*/,
-            SpinType st_in = SpinType::S1);
+            const SpinType& st_in = SpinType::S1);
         ~PotHxcLR() {}
         void cal_v_eff(const Charge* chg/*excited state*/, const UnitCell* ucell, ModuleBase::matrix& v_eff) override {};
-        void cal_v_eff(double** rho, const UnitCell* ucell, ModuleBase::matrix& v_eff);
+        void cal_v_eff(double** rho, const UnitCell* ucell, ModuleBase::matrix& v_eff, const std::vector<int>& ispin_op = { 0,0 });
         int nrxx;
     private:
         int nspin;
@@ -31,14 +31,15 @@ namespace LR
         KernelXC xc_kernel_components_;
         const std::string xc_kernel;
         const double& tpiba_;
-        SpinType spin_type_ = SpinType::S1;
+        const SpinType spin_type_ = SpinType::S1;
         XCType xc_type_ = XCType::None;
 
         // enum class as key for unordered_map is not supported in C++11 sometimes
         // https://github.com/llvm/llvm-project/issues/49601
         // struct SpinHash { std::size_t operator()(const SpinType& s) const { return std::hash<int>()(static_cast<int>(s)); } };
         using Tfunc = std::function<void(const double* const /**<[in] rho*/,
-            ModuleBase::matrix& /**<[out] v_eff */)>;
+            ModuleBase::matrix& /**<[out] v_eff */,
+            const std::vector<int>& ispin_op) >;
         // std::unordered_map<SpinType, Tfunc, SpinHash> kernel_to_potential_;
         std::map<SpinType, Tfunc> kernel_to_potential_;
 
diff --git a/source/module_lr/ri_benchmark/operator_ri_hartree.h b/source/module_lr/ri_benchmark/operator_ri_hartree.h
index 311f15cd53..cc398be90c 100644
--- a/source/module_lr/ri_benchmark/operator_ri_hartree.h
+++ b/source/module_lr/ri_benchmark/operator_ri_hartree.h
@@ -1,3 +1,4 @@
+#pragma once
 #include "module_hamilt_general/operator.h"
 #include "module_lr/ri_benchmark/ri_benchmark.h"
 #include "module_lr/utils/lr_util_print.h"
@@ -49,22 +50,18 @@ namespace RI_Benchmark
             }
         };
         ~OperatorRIHartree() {}
-        void act(const psi::Psi<T>& X_in, psi::Psi<T>& X_out, const int nbands) const override
+        void act(const int nbands, const int nbasis, const int npol, const T* psi_in, T* hpsi, const int ngk_ik = 0)const override
         {
             assert(GlobalV::MY_RANK == 0);  // only serial now
-            const int nk = 1;
-            const psi::Psi<T>& X = LR_Util::k1_to_bfirst_wrapper(X_in, nk, npairs);
-            psi::Psi<T> AX = LR_Util::k1_to_bfirst_wrapper(X_out, nk, npairs);
-            for (int ib = 0;ib < nbands;++ib)
-            {
-                TLRIX<T> CsX_vo = cal_CsX(Cs_vo_mo, &X(ib, 0, 0));
-                TLRIX<T> CsX_ov = cal_CsX(Cs_ov_mo, &X(ib, 0, 0));
-                // LR_Util::print_CsX(Cs_bX, nvirt, "Cs_bX of state " + std::to_string(ib));
-                cal_AX(CV_vo, CsX_vo, &AX(ib, 0, 0), 4.);
-                cal_AX(CV_vo, CsX_ov, &AX(ib, 0, 0), 4.);
-                cal_AX(CV_ov, CsX_vo, &AX(ib, 0, 0), 4.);
-                cal_AX(CV_ov, CsX_ov, &AX(ib, 0, 0), 4.);
-            }
+            assert(nbasis == npairs);
+            TLRIX<T> CsX_vo = cal_CsX(Cs_vo_mo, psi_in);
+            TLRIX<T> CsX_ov = cal_CsX(Cs_ov_mo, psi_in);
+            // LR_Util::print_CsX(Cs_bX, nvirt, "Cs_bX of state " + std::to_string(ib));
+            // 4 for 4 terms in the expansion of local RI
+            cal_AX(CV_vo, CsX_vo, hpsi, 4.);
+            cal_AX(CV_vo, CsX_ov, hpsi, 4.);
+            cal_AX(CV_ov, CsX_vo, hpsi, 4.);
+            cal_AX(CV_ov, CsX_ov, hpsi, 4.);
         }
     protected:
         const int& naos;
diff --git a/source/module_lr/ri_benchmark/ri_benchmark.hpp b/source/module_lr/ri_benchmark/ri_benchmark.hpp
index 671ccd7774..82ab9249ac 100644
--- a/source/module_lr/ri_benchmark/ri_benchmark.hpp
+++ b/source/module_lr/ri_benchmark/ri_benchmark.hpp
@@ -153,7 +153,7 @@ namespace RI_Benchmark
         return Amat_full;
     }
     template <typename TK>
-    TLRIX<TK> cal_CsX(const TLRI<TK>& Cs_mo, TK* X)
+    TLRIX<TK> cal_CsX(const TLRI<TK>& Cs_mo, const TK* X)
     {
         TLRIX<TK> CsX;
         for (auto& it1 : Cs_mo)
diff --git a/source/module_lr/utils/gint_move.hpp b/source/module_lr/utils/gint_move.hpp
index acdbdad997..7d21ca6037 100644
--- a/source/module_lr/utils/gint_move.hpp
+++ b/source/module_lr/utils/gint_move.hpp
@@ -9,15 +9,15 @@
 
 template <typename T>
 using D2 = void(*) (T**, size_t);
-template <typename T>
-using D3 = void(*) (T***, size_t, size_t);
+// template <typename T>
+// using D3 = void(*) (T***, size_t, size_t);
 // template <typename T>
 // D2<T> d2 = LR_Util::delete_p2<T>;
 // template <typename T>
 // D3<T> d3 = LR_Util::delete_p3<T>;
 // Change to C++ 11
 D2<double> d2 = LR_Util::delete_p2<double>;
-D3<double> d3 = LR_Util::delete_p3<double>;
+// D3<double> d3 = LR_Util::delete_p3<double>;
 
 
 Gint& Gint::operator=(Gint&& rhs)
diff --git a/source/module_lr/utils/lr_util.cpp b/source/module_lr/utils/lr_util.cpp
index 7634fe1574..a8964d5c97 100644
--- a/source/module_lr/utils/lr_util.cpp
+++ b/source/module_lr/utils/lr_util.cpp
@@ -212,7 +212,7 @@ namespace LR_Util
         int info = 0;
         char jobz = 'V', uplo = 'U';
         double work_tmp;
-        constexpr int minus_one = -1;
+        const int minus_one = -1;
         dsyev_(&jobz, &uplo, &n, mat, &n, eig, &work_tmp, &minus_one, &info);		// get best lwork
         const int lwork = work_tmp;
         double* work2 = new double[lwork];
@@ -241,7 +241,7 @@ namespace LR_Util
         int info = 0;
         char jobvl = 'N', jobvr = 'V';  //calculate right eigenvectors
         double work_tmp;
-        constexpr int minus_one = -1;
+        const int minus_one = -1;
         std::vector<double> eig_real(n);
         std::vector<double> eig_imag(n);
         const int ldvl = 1, ldvr = n;
diff --git a/source/module_lr/utils/lr_util.h b/source/module_lr/utils/lr_util.h
index 136b9481f1..93f26be2a6 100644
--- a/source/module_lr/utils/lr_util.h
+++ b/source/module_lr/utils/lr_util.h
@@ -50,21 +50,17 @@ namespace LR_Util
     /// =================ALGORITHM====================
 
     //====== newers and deleters========
-    //(arbitrary dimention will be supported in the future)
-    /// @brief  delete 2d pointer 
-    /// @tparam T 
-    /// @param p2 
-    /// @param size 
+    /// @brief  delete 2d pointer  
     template <typename T>
     void delete_p2(T** p2, size_t size);
-    
-    /// @brief  delete 3d pointer 
-    /// @tparam T 
-    /// @param p2 
-    /// @param size1
-    /// @param size2
+    /// @brief  new 2d pointer  
     template <typename T>
-    void delete_p3(T*** p3, size_t size1, size_t size2);
+    void new_p2(T**& p2, size_t size1, size_t size2);
+
+    template<typename T> ct::Tensor newTensor(const ct::TensorShape& shape)
+    {
+        return ct::Tensor(ct::DataTypeToEnum<T>::value, ct::DeviceTypeToEnum<base_device::DEVICE_CPU>::value, shape);
+    }
 
     ///================ BLAS ======================
     /// calculate (A+A^T)/2
diff --git a/source/module_lr/utils/lr_util.hpp b/source/module_lr/utils/lr_util.hpp
index eb91617111..97a5113eb7 100644
--- a/source/module_lr/utils/lr_util.hpp
+++ b/source/module_lr/utils/lr_util.hpp
@@ -27,8 +27,6 @@ namespace LR_Util
     }
 
     //====== newers and deleters========
-    //(arbitrary dimention will be supported in the future)
-
     /// @brief  new 2d pointer
     /// @tparam T
     /// @param size1
@@ -43,21 +41,6 @@ namespace LR_Util
         }
     };
 
-    /// @brief  new 3d pointer
-    /// @tparam T
-    /// @param size1
-    /// @param size2
-    /// @param size3
-    template <typename T>
-    void new_p3(T***& p3, size_t size1, size_t size2, size_t size3)
-    {
-        p3 = new T * *[size1];
-        for (size_t i = 0; i < size1; ++i)
-        {
-            new_p2(p3[i], size2, size3);
-        }
-    };
-
     /// @brief  delete 2d pointer 
     /// @tparam T 
     /// @param p2 
@@ -69,31 +52,12 @@ namespace LR_Util
         {
             for (size_t i = 0; i < size; ++i)
             {
-                if (p2[i] != nullptr) { delete[] p2[i];
-}
+                if (p2[i] != nullptr) { delete[] p2[i]; }
             }
             delete[] p2;
         }
     };
 
-    /// @brief  delete 3d pointer 
-    /// @tparam T 
-    /// @param p2 
-    /// @param size1
-    /// @param size2
-    template <typename T>
-    void delete_p3(T*** p3, size_t size1, size_t size2)
-    {
-        if (p3 != nullptr)
-        {
-            for (size_t i = 0; i < size1; ++i)
-            {
-                delete_p2(p3[i], size2);
-            }
-            delete[] p3;
-        }
-    };
-
     inline double get_conj(const double& x)
     {
         return x;
@@ -131,6 +95,13 @@ namespace LR_Util
 }
     }
 
+    /// get the Psi wrapper of the selected spin from the Psi object
+    template<typename T>
+    psi::Psi<T> get_psi_spin(const psi::Psi<T>& psi_in, const int& is, const int& nk)
+    {
+        return psi::Psi<T>(&psi_in(is * nk, 0, 0), psi_in, nk, psi_in.get_nbands());
+    }
+
     /// psi(nk=1, nbands=nb, nk * nbasis) -> psi(nb, nk, nbasis) without memory copy
     template<typename T, typename Device>
     psi::Psi<T, Device> k1_to_bfirst_wrapper(const psi::Psi<T, Device>& psi_kfirst, int nk_in, int nbasis_in)
diff --git a/source/module_lr/utils/lr_util_hcontainer.h b/source/module_lr/utils/lr_util_hcontainer.h
index 0e0b87d124..4ac4ccb1f3 100644
--- a/source/module_lr/utils/lr_util_hcontainer.h
+++ b/source/module_lr/utils/lr_util_hcontainer.h
@@ -44,4 +44,39 @@ namespace LR_Util
         hamilt::HContainer<std::complex<double>>& HR,
         const int& nat,
         const char& type = 'R');
+
+    template<typename T, typename TR>
+    void initialize_HR(hamilt::HContainer<TR>& hR, const UnitCell& ucell, Grid_Driver& gd, const std::vector<double>& orb_cutoff)
+    {
+        const auto& pmat = *hR.get_paraV();
+        for (int iat1 = 0; iat1 < ucell.nat; iat1++)
+        {
+            auto tau1 = ucell.get_tau(iat1);
+            int T1, I1;
+            ucell.iat2iait(iat1, &I1, &T1);
+            AdjacentAtomInfo adjs;
+            gd.Find_atom(ucell, tau1, T1, I1, &adjs);
+            for (int ad = 0; ad < adjs.adj_num + 1; ++ad)
+            {
+                const int T2 = adjs.ntype[ad];
+                const int I2 = adjs.natom[ad];
+                int iat2 = ucell.itia2iat(T2, I2);
+                if (pmat.get_row_size(iat1) <= 0 || pmat.get_col_size(iat2) <= 0) { continue; }
+                const ModuleBase::Vector3<int>& R_index = adjs.box[ad];
+                if (ucell.cal_dtau(iat1, iat2, R_index).norm() * ucell.lat0 >= orb_cutoff[T1] + orb_cutoff[T2]) { continue; }
+                hamilt::AtomPair<TR> tmp(iat1, iat2, R_index.x, R_index.y, R_index.z, &pmat);
+                hR.insert_pair(tmp);
+            }
+        }
+        hR.allocate(nullptr, true);
+        // hR.set_paraV(&pmat);
+        if (std::is_same<T, double>::value) { hR.fix_gamma(); }
+    }
+    template<typename T, typename TR>
+    void initialize_DMR(elecstate::DensityMatrix<T, TR>& dm, const Parallel_Orbitals& pmat, const UnitCell& ucell, Grid_Driver& gd, const std::vector<double>& orb_cutoff)
+    {
+        hamilt::HContainer<TR> hR_tmp(&pmat);
+        initialize_HR<T, TR>(hR_tmp, ucell, gd, orb_cutoff);
+        dm.init_DMR(hR_tmp);
+    }
 }
\ No newline at end of file
diff --git a/source/module_lr/utils/lr_util_print.h b/source/module_lr/utils/lr_util_print.h
index 8ef6981f97..80f6a704ec 100644
--- a/source/module_lr/utils/lr_util_print.h
+++ b/source/module_lr/utils/lr_util_print.h
@@ -12,6 +12,64 @@ namespace LR_Util
         return (std::abs(v) > threshold ? v : 0);
     }
 
+    template<typename T>
+    int read_value(std::ifstream& ifs, T* ptr, const int& size) { for (int i = 0;i < size;++i) { ifs >> ptr[i]; } return size; }
+    template<typename T, typename... Args>
+    int read_value(std::ifstream& ifs, T* ptr, const int& size, Args&&... args)
+    {
+        int size_now = 0;
+        for (int i = 0;i < size;++i) { size_now += read_value(ifs, ptr + size_now, args...); }
+        return size_now;
+    }
+    template<typename T, typename... Args>
+    int read_value(const std::string& file, T* ptr, const int& size, Args&&... args)
+    {
+        std::ifstream ifs(file);
+        const int res = read_value(ifs, ptr, size, args...);
+        ifs.close();
+        return res;
+    }
+
+    template<typename T>
+    int write_value(std::ofstream& ofs, const T* ptr, const int& size)
+    {
+        for (int i = 0;i < size;++i) { ofs << filter(ptr[i]) << " "; }
+        ofs << std::endl;
+        return size;
+    }
+    template<typename T, typename... Args>
+    int write_value(std::ofstream& ofs, const T* ptr, const int& size, Args&&... args)
+    {
+        int size_now = 0;
+        for (int i = 0;i < size;++i) { size_now += write_value(ofs, ptr + size_now, args...); }
+        ofs << std::endl;
+        return size_now;
+    }
+    template<typename T, typename... Args>
+    int write_value(const std::string& file, const int& prec, const T* ptr, const int& size, Args&&... args)
+    {
+        std::ofstream ofs(file);
+        ofs << std::setprecision(prec) << std::scientific;
+        const int res = write_value(ofs, ptr, size, args...);
+        ofs.close();
+        return res;
+    }
+    template<typename T>
+    int print_value(const T* ptr, const int& size)
+    {
+        for (int i = 0;i < size;++i) { std::cout << filter(ptr[i]) << " "; }
+        std::cout << std::endl;
+        return size;
+    }
+    template<typename T, typename... Args>
+    int print_value(const T* ptr, const int& size, Args&&... args)
+    {
+        int size_now = 0;
+        for (int i = 0;i < size;++i) { size_now += print_value(ptr + size_now, args...); }
+        std::cout << std::endl;
+        return size_now;
+    }
+
     template<typename T>
     void print_psi_bandfirst(const psi::Psi<T>& psi, const std::string& label, const int& ib, const double& threshold = 1e-10)
     {
@@ -34,17 +92,17 @@ namespace LR_Util
         std::ofstream ofs(filename + "_" + std::to_string(rank) + ".dat");
         ofs << std::setprecision(precision) << std::scientific;
         ofs << psi.get_nbands() << " " << psi.get_nk() << " " << psi.get_nbasis() << "\n";
-        for (int ib = 0;ib < psi.get_nbands();++ib)
-        {
-            for (int ik = 0;ik < psi.get_nk();++ik)
-            {
-                for (int i = 0;i < psi.get_nbasis();++i)
-                {
-                    ofs << filter(psi(ib, ik, i)) << " ";
-                }
-                ofs << "\n";
-            }
-        }
+        assert(psi.size() == write_value(ofs, &psi(0, 0, 0), psi.get_nbands(), psi.get_nk(), psi.get_nbasis()));
+        ofs.close();
+    }
+    template<typename T>
+    void write_psi_bandfirst(const T* psi, const int& nband, const int& nk, const int& nbasis,
+        const std::string& filename, const int& rank, const double& threshold = 1e-10, const int& precision = 8)
+    {
+        std::ofstream ofs(filename + "_" + std::to_string(rank) + ".dat");
+        ofs << std::setprecision(precision) << std::scientific;
+        ofs << nband << " " << nk << " " << nbasis << "\n";
+        assert(nband * nk * nbasis == write_value(ofs, psi, nband, nk, nbasis));
         ofs.close();
     }
     template<typename T>
@@ -54,13 +112,7 @@ namespace LR_Util
         int nbands, nks, nbasis;
         ifs >> nbands >> nks >> nbasis;
         psi::Psi<T> psi(nks, nbands, nbasis, nullptr, false);
-        for (int ib = 0;ib < psi.get_nbands();++ib) {
-            for (int ik = 0;ik < psi.get_nk();++ik) {
-                for (int i = 0;i < psi.get_nbasis();++i) {
-                    ifs >> psi(ib, ik, i);
-}
-}
-}
+        assert(psi.size() == read_value(ifs, psi.get_pointer(), nbands, nks, nbasis));
         ifs.close();
         return psi;
     }
@@ -101,9 +153,7 @@ namespace LR_Util
     {
         std::cout << "first " << nnz << " non-zero elements of " << label << "\n";
         int inz = 0;int i = 0;
-        while (inz < nnz && i < nrxx) {
-            if (rho[++i] - 0.0 > threshold) { std::cout << rho[i] << " ";++inz; }
-};
+        while (inz < nnz && i < nrxx) { if (rho[++i] - 0.0 > threshold) { std::cout << rho[i] << " ";++inz; } };
     }
 
 
diff --git a/source/module_lr/utils/test/lr_util_algorithms_test.cpp b/source/module_lr/utils/test/lr_util_algorithms_test.cpp
index f47f46e941..f33105d32b 100644
--- a/source/module_lr/utils/test/lr_util_algorithms_test.cpp
+++ b/source/module_lr/utils/test/lr_util_algorithms_test.cpp
@@ -1,6 +1,7 @@
 #include <gtest/gtest.h>
 
 #include "../lr_util.h"
+#include "../lr_util_print.h"
 
 TEST(LR_Util, PsiWrapper)
 {
@@ -130,6 +131,27 @@ TEST(LR_Util, MatSymComplex)
     }
 }
 
+TEST(LR_Util, RWValue)
+{
+    const std::string file = "RWValue.txt";
+    std::ofstream ofs(file);
+    std::vector<int> vec(2 * 3 * 4 * 5, 0);
+    for (int i = 0;i < vec.size();++i) vec[i] = i;
+    LR_Util::write_value(ofs, vec.data(), 2, 3, 4, 5);
+    ofs.close();
+
+    std::vector<int> vec1(2 * 3 * 4 * 5, 0);
+    std::ifstream ifs1(file);
+    EXPECT_EQ(LR_Util::read_value(ifs1, vec1.data(), 2, 3, 4, 5), 120);
+    ifs1.close();
+    for (int i = 0;i < vec1.size();++i) { EXPECT_EQ(vec1[i], vec[i]); };
+    std::vector<int> vec2(2 * 3 * 4 * 5, 0);
+    std::ifstream ifs2(file);
+    EXPECT_EQ(LR_Util::read_value(ifs2, vec2.data(), 2 * 3, 4 * 5), 120);
+    ifs2.close();
+    for (int i = 0;i < vec2.size();++i) { EXPECT_EQ(vec2[i], vec[i]); };
+}
+
 int main(int argc, char** argv)
 {
     srand(time(NULL));  // for random number generator
diff --git a/source/module_parameter/input_parameter.h b/source/module_parameter/input_parameter.h
index a4c7cfbe8c..17284d4bb9 100644
--- a/source/module_parameter/input_parameter.h
+++ b/source/module_parameter/input_parameter.h
@@ -302,7 +302,8 @@ struct Input_para
     std::string lr_solver = "dav"; ///< the eigensolver for LR-TDDFT
     double lr_thr = 1e-2;          ///< convergence threshold of the LR-TDDFT eigensolver
     bool out_wfc_lr = false; ///< whether to output the eigenvectors (excitation amplitudes) in the particle-hole basis
-    std::vector<double> abs_wavelen_range = {0., 0.}; ///< the range of wavelength(nm) to output the absorption spectrum
+    bool lr_unrestricted = false; ///< whether to use the unrestricted construction for LR-TDDFT
+    std::vector<double> abs_wavelen_range = {}; ///< the range of wavelength(nm) to output the absorption spectrum
     double abs_broadening = 0.01;                     ///< the broadening (eta) for LR-TDDFT absorption spectrum
     std::string ri_hartree_benchmark = "none"; ///< whether to use the RI approximation for the Hartree potential in LR-TDDFT for benchmark (with FHI-aims/ABACUS read-in style)
     std::vector<int> aims_nbasis = {};  ///< the number of basis functions for each atom type used in FHI-aims (for benchmark)
diff --git a/tests/integrate/291_NO_KP_LR/INPUT b/tests/integrate/291_NO_KP_LR/INPUT
index df499ddc16..1826d4f618 100644
--- a/tests/integrate/291_NO_KP_LR/INPUT
+++ b/tests/integrate/291_NO_KP_LR/INPUT
@@ -6,6 +6,7 @@ orbital_dir                 ../../../tests/PP_ORB
 calculation             scf
 nbands                6
 symmetry               	-1
+nspin                   2
 
 #Parameters (2.Iteration)
 ecutwfc                 20
@@ -26,12 +27,12 @@ mixing_beta             0.4
 
 lr_nstates 2 # for test/debug, you can try a smaller one like 2
 xc_kernel lda
-lr_solver dav
+lr_solver dav_subspace
 lr_thr 1e-2
 pw_diag_ndim 4
 
 esolver_type ks-lr
 
 nvirt 2
-abs_wavelen_range  100 175
+abs_wavelen_range  200 600
 
diff --git a/tests/integrate/291_NO_KP_LR/result.ref b/tests/integrate/291_NO_KP_LR/result.ref
index bcd8824485..3c5a8347b0 100644
--- a/tests/integrate/291_NO_KP_LR/result.ref
+++ b/tests/integrate/291_NO_KP_LR/result.ref
@@ -1,2 +1 @@
-totexcitationenergyref 0.391462
-totaltimeref 
+totexcitationenergyref 0.784963
\ No newline at end of file
diff --git a/tests/integrate/391_NO_GO_LR/INPUT b/tests/integrate/391_NO_GO_LR/INPUT
index 62a139c69d..38476bd933 100644
--- a/tests/integrate/391_NO_GO_LR/INPUT
+++ b/tests/integrate/391_NO_GO_LR/INPUT
@@ -6,6 +6,7 @@ orbital_dir                 ../../../tests/PP_ORB
 calculation             scf
 nbands                6
 symmetry               	-1
+nspin                   2
 
 #Parameters (2.Iteration)
 ecutwfc                 10
diff --git a/tests/integrate/391_NO_GO_LR/result.ref b/tests/integrate/391_NO_GO_LR/result.ref
index e40418c026..8d6a343a51 100644
--- a/tests/integrate/391_NO_GO_LR/result.ref
+++ b/tests/integrate/391_NO_GO_LR/result.ref
@@ -1,2 +1 @@
-totexcitationenergyref 1.313946
-totaltimeref 
+totexcitationenergyref 2.641295
\ No newline at end of file
diff --git a/tests/integrate/392_NO_GO_LR_HF/INPUT b/tests/integrate/392_NO_GO_LR_HF/INPUT
index 15c3503dfc..e48285b572 100644
--- a/tests/integrate/392_NO_GO_LR_HF/INPUT
+++ b/tests/integrate/392_NO_GO_LR_HF/INPUT
@@ -6,6 +6,7 @@ orbital_dir                 ../../../tests/PP_ORB
 calculation             scf
 nbands                6
 symmetry               	-1
+nspin                   2
 
 #Parameters (2.Iteration)
 ecutwfc                10
diff --git a/tests/integrate/392_NO_GO_LR_HF/result.ref b/tests/integrate/392_NO_GO_LR_HF/result.ref
index 7c37d7c788..e5caf54572 100644
--- a/tests/integrate/392_NO_GO_LR_HF/result.ref
+++ b/tests/integrate/392_NO_GO_LR_HF/result.ref
@@ -1 +1 @@
-totexcitationenergyref 1.587504
\ No newline at end of file
+totexcitationenergyref 3.067979
\ No newline at end of file
diff --git a/tests/integrate/393_NO_GO_ULR_HF/INPUT b/tests/integrate/393_NO_GO_ULR_HF/INPUT
new file mode 100644
index 0000000000..e9acbcda6d
--- /dev/null
+++ b/tests/integrate/393_NO_GO_ULR_HF/INPUT
@@ -0,0 +1,44 @@
+INPUT_PARAMETERS
+#Parameters (1.General)
+suffix                  autotest
+pseudo_dir              ../../../tests/PP_ORB
+orbital_dir                 ../../../tests/PP_ORB
+calculation             scf
+nbands                6
+symmetry               	-1
+nspin                   2
+
+#Parameters (2.Iteration)
+ecutwfc                10
+scf_thr                     1e-2
+scf_nmax                   20
+exx_hybrid_step 1
+
+#Parameters (3.Basis)
+basis_type              lcao 
+gamma_only              1
+
+#Parameters (4.Smearing)
+smearing_method                gaussian
+smearing_sigma                   0.02
+
+#Parameters (5.Mixing)
+mixing_type             pulay
+mixing_beta             0.4
+mixing_gg0 0
+
+dft_functional lda
+exx_real_number 1
+exx_ccp_rmesh_times 1
+
+lr_nstates 3
+xc_kernel lda
+lr_solver lapack
+lr_thr 1e-2
+lr_unrestricted 1
+pw_diag_ndim 2
+
+esolver_type ks-lr
+
+nvirt 2
+nocc 2
\ No newline at end of file
diff --git a/tests/integrate/393_NO_GO_ULR_HF/STRU b/tests/integrate/393_NO_GO_ULR_HF/STRU
new file mode 100644
index 0000000000..ceb8aaa84c
--- /dev/null
+++ b/tests/integrate/393_NO_GO_ULR_HF/STRU
@@ -0,0 +1,29 @@
+ATOMIC_SPECIES
+H 1.008 H_ONCV_PBE-1.0.upf
+O 15.9994 O_ONCV_PBE-1.0.upf
+
+NUMERICAL_ORBITAL
+H_gga_8au_60Ry_2s1p.orb
+O_gga_7au_60Ry_2s2p1d.orb
+
+
+LATTICE_CONSTANT
+1
+
+LATTICE_VECTORS
+28 0 0
+0 28 0
+0 0  28
+
+ATOMIC_POSITIONS
+Cartesian
+
+H
+0
+2
+-12.046787058887078 18.76558614676448 8.395247471328744 1 1 1
+-14.228868795885418 20.61549300274637 7.611989524516571 1 1 1
+O
+0
+1
+-13.486789117423204 19.684192208418636 8.958321352749174 1 1 1
diff --git a/tests/integrate/393_NO_GO_ULR_HF/jd b/tests/integrate/393_NO_GO_ULR_HF/jd
new file mode 100644
index 0000000000..ef8a41364a
--- /dev/null
+++ b/tests/integrate/393_NO_GO_ULR_HF/jd
@@ -0,0 +1 @@
+test for linear response TDHF using open-shell solver
\ No newline at end of file
diff --git a/tests/integrate/393_NO_GO_ULR_HF/result.ref b/tests/integrate/393_NO_GO_ULR_HF/result.ref
new file mode 100644
index 0000000000..e395c0fb9d
--- /dev/null
+++ b/tests/integrate/393_NO_GO_ULR_HF/result.ref
@@ -0,0 +1 @@
+totexcitationenergyref 1.940703
\ No newline at end of file
diff --git a/tests/integrate/CASES_CPU.txt b/tests/integrate/CASES_CPU.txt
index 62baa154ce..4c1929ce32 100644
--- a/tests/integrate/CASES_CPU.txt
+++ b/tests/integrate/CASES_CPU.txt
@@ -244,6 +244,7 @@
 386_NO_GO_MD_S1_HSE
 391_NO_GO_LR
 392_NO_GO_LR_HF
+393_NO_GO_ULR_HF
 401_NP_KP_sp
 401_NP_KP_spd
 #501_NO_neighboring_GaAs512
diff --git a/tests/integrate/tools/catch_properties.sh b/tests/integrate/tools/catch_properties.sh
index 403330ad98..13af75d5df 100755
--- a/tests/integrate/tools/catch_properties.sh
+++ b/tests/integrate/tools/catch_properties.sh
@@ -539,7 +539,7 @@ if [ $is_lr == 1 ]; then
 	lr_path=OUT.autotest/running_lr.log
 	lrns=$(get_input_key_value "lr_nstates" "INPUT")
 	lrns1=`echo "$lrns + 1" |bc`
-	grep -A$lrns1 "Excitation Energy" $lr_path | tail -$lrns | awk '{print $2}' > lr_eig.txt
+	grep -A$lrns1 "Excitation Energy" $lr_path | awk 'NR > 2 && $2 ~ /^[0-9]+\.[0-9]+$/ {print $2}' > lr_eig.txt
 	lreig_tot=`sum_file lr_eig.txt`
 	echo "totexcitationenergyref $lreig_tot" >>$1
 fi