diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 65a5f4f3d88..69af1c9530f 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -9,7 +9,8 @@ set(SOURCES preconditioner/block_jacobi.cpp solver/bicgstab.cpp solver/fcg.cpp - solver/cg.cpp) + solver/cg.cpp + solver/tfqmr.cpp) add_subdirectory(devices) # basic device functionalities, always compiled add_subdirectory(device_hooks) # placeholders for disabled modules diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index c48d04d92bb..96d32f8b593 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -40,6 +40,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/solver/bicgstab_kernels.hpp" #include "core/solver/cg_kernels.hpp" #include "core/solver/fcg_kernels.hpp" +#include "core/solver/tfqmr_kernels.hpp" #ifndef GKO_HOOK_MODULE @@ -198,6 +199,53 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_STEP_3_KERNEL); } // namespace bicgstab +namespace tfqmr { + + +template +GKO_DECLARE_TFQMR_INITIALIZE_KERNEL(ValueType) +NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_INITIALIZE_KERNEL); + +template +GKO_DECLARE_TFQMR_STEP_1_KERNEL(ValueType) +NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_STEP_1_KERNEL); + +template +GKO_DECLARE_TFQMR_STEP_2_KERNEL(ValueType) +NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_STEP_2_KERNEL); + +template +GKO_DECLARE_TFQMR_STEP_3_KERNEL(ValueType) +NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_STEP_3_KERNEL); + +template +GKO_DECLARE_TFQMR_STEP_4_KERNEL(ValueType) +NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_STEP_4_KERNEL); + +template +GKO_DECLARE_TFQMR_STEP_5_KERNEL(ValueType) +NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_STEP_5_KERNEL); + +template +GKO_DECLARE_TFQMR_STEP_6_KERNEL(ValueType) +NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_STEP_6_KERNEL); + +template +GKO_DECLARE_TFQMR_STEP_7_KERNEL(ValueType) +NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_STEP_7_KERNEL); + + +} // namespace tfqmr + + namespace csr { diff --git a/core/solver/tfqmr.cpp b/core/solver/tfqmr.cpp new file mode 100644 index 00000000000..1b15612ca99 --- /dev/null +++ b/core/solver/tfqmr.cpp @@ -0,0 +1,248 @@ +/************************************************************* +Copyright 2017-2018 + +Karlsruhe Institute of Technology +Universitat Jaume I +University of Tennessee + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+#include "core/solver/tfqmr.hpp"
+
+
+#include "core/base/exception.hpp"
+#include "core/base/exception_helpers.hpp"
+#include "core/base/executor.hpp"
+#include "core/base/math.hpp"
+#include "core/base/utils.hpp"
+#include "core/solver/tfqmr_kernels.hpp"
+
+
+namespace gko {
+namespace solver {
+namespace {
+
+
+template <typename ValueType>
+struct TemplatedOperation {
+    GKO_REGISTER_OPERATION(initialize, tfqmr::initialize);
+    GKO_REGISTER_OPERATION(step_1, tfqmr::step_1);
+    GKO_REGISTER_OPERATION(step_2, tfqmr::step_2);
+    GKO_REGISTER_OPERATION(step_3, tfqmr::step_3);
+    GKO_REGISTER_OPERATION(step_4, tfqmr::step_4);
+    GKO_REGISTER_OPERATION(step_5, tfqmr::step_5);
+    GKO_REGISTER_OPERATION(step_6, tfqmr::step_6);
+    GKO_REGISTER_OPERATION(step_7, tfqmr::step_7);
+};
+
+
+/**
+ * Checks whether the required residual goal has been reached.
+ *
+ * @param tau Residual of the iteration.
+ * @param orig_tau Original residual.
+ * @param r Relative residual goal.
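+ *
+ * @note The check requires abs(tau) < r * abs(orig_tau) to hold for every
+ *       entry that is compared; if any entry still violates this bound the
+ *       function returns false and the solver keeps iterating.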
+ */
+template <typename ValueType>
+bool has_converged(const matrix::Dense<ValueType> *tau,
+                   const matrix::Dense<ValueType> *orig_tau,
+                   remove_complex<ValueType> r)
+{
+    using std::abs;
+    for (size_type i = 0; i < tau->get_num_rows(); ++i) {
+        if (!(abs(tau->at(i, 0)) < r * abs(orig_tau->at(i, 0)))) {
+            return false;
+        }
+    }
+    return true;
+}
+
+
+}  // namespace
+
+
+template <typename ValueType>
+void Tfqmr<ValueType>::apply(const LinOp *b, LinOp *x) const
+{
+    using std::swap;
+    using Vector = matrix::Dense<ValueType>;
+    ASSERT_CONFORMANT(system_matrix_, b);
+    ASSERT_EQUAL_DIMENSIONS(b, x);
+
+    auto exec = this->get_executor();
+
+    auto one_op = initialize<Vector>({one<ValueType>()}, exec);
+    auto neg_one_op = initialize<Vector>({-one<ValueType>()}, exec);
+
+    auto dense_b = as<const Vector>(b);
+    auto dense_x = as<Vector>(x);
+    auto r = Vector::create_with_config_of(dense_b);
+    auto r0 = Vector::create_with_config_of(dense_b);
+    auto d = Vector::create_with_config_of(dense_b);
+    auto v = Vector::create_with_config_of(dense_b);
+    auto u_m = Vector::create_with_config_of(dense_b);
+    auto u_mp1 = Vector::create_with_config_of(dense_b);
+    auto w = Vector::create_with_config_of(dense_b);
+    auto Ad = Vector::create_with_config_of(dense_b);
+    auto Au = Vector::create_with_config_of(dense_b);
+    auto Au_new = Vector::create_with_config_of(dense_b);
+    auto pu_m = Vector::create_with_config_of(dense_b);
+
+    auto alpha = Vector::create(exec, 1, dense_b->get_num_cols());
+    auto beta = Vector::create_with_config_of(alpha.get());
+    auto sigma = Vector::create_with_config_of(alpha.get());
+    auto rho_old = Vector::create_with_config_of(alpha.get());
+    auto rho = Vector::create_with_config_of(alpha.get());
+    auto taut = Vector::create_with_config_of(alpha.get());
+    auto tau = Vector::create_with_config_of(alpha.get());
+    auto nomw = Vector::create_with_config_of(alpha.get());
+    auto theta = Vector::create_with_config_of(alpha.get());
+    auto eta = Vector::create_with_config_of(alpha.get());
+    auto rov = Vector::create_with_config_of(alpha.get());
+
+    auto master_tau =
+        Vector::create(exec->get_master(), 1, dense_b->get_num_cols());
+    auto starting_tau = Vector::create_with_config_of(master_tau.get());
+
+    // TODO: replace this with automatic merged kernel generator
+    exec->run(TemplatedOperation<ValueType>::make_initialize_operation(
+        dense_b, r.get(), r0.get(), u_m.get(), u_mp1.get(), pu_m.get(),
+        Au.get(), Ad.get(), w.get(), v.get(), d.get(), taut.get(),
+        rho_old.get(), rho.get(), alpha.get(), beta.get(), tau.get(),
+        sigma.get(), rov.get(), eta.get(), nomw.get(), theta.get()));
+    // r = dense_b
+    // r0 = u_m = w = r
+    // Ad = d = 0
+    // theta = eta = rov = alpha = beta = sigma = 1.0
+
+    system_matrix_->apply(neg_one_op.get(), dense_x, one_op.get(), r.get());
+    r0->copy_from(r.get());
+    r->compute_dot(r.get(), tau.get());
+    w->copy_from(r.get());
+    u_m->copy_from(r.get());
+    starting_tau->copy_from(tau.get());
+    rho->copy_from(tau.get());
+    rho_old->copy_from(tau.get());
+    taut->copy_from(tau.get());
+    preconditioner_->apply(u_m.get(), pu_m.get());
+    system_matrix_->apply(pu_m.get(), v.get());
+    Au->copy_from(v.get());
+    Ad->copy_from(d.get());
+
+    for (int iter = 0; iter < max_iters_; ++iter) {
+        if (iter % 2 == 0) {
+            r0->compute_dot(v.get(), rov.get());
+            exec->run(TemplatedOperation<ValueType>::make_step_1_operation(
+                alpha.get(), rov.get(), rho.get(), v.get(), u_m.get(),
+                u_mp1.get()));
+            // alpha = rho / rov
+            // u_mp1 = u_m - alpha * v
+        }
+        exec->run(TemplatedOperation<ValueType>::make_step_2_operation(
+            theta.get(), alpha.get(), eta.get(), sigma.get(), Au.get(),
+            pu_m.get(), w.get(), d.get(), Ad.get()));
+        // sigma = (theta^2 / alpha) * eta;
+        // w = w - alpha * Au
+        // d = pu_m + sigma * d
+        // Ad = Au + sigma * Ad
+        w->compute_dot(w.get(), nomw.get());
+        exec->run(TemplatedOperation<ValueType>::make_step_3_operation(
+            theta.get(), nomw.get(), taut.get(), eta.get(), alpha.get()));
+        // theta = nomw / taut
+        // c_mp1 = 1 / (1 + theta)
+        // taut = taut * sqrt(theta) * c_mp1;
+        // eta = c_mp1 * c_mp1 * alpha;
+        exec->run(TemplatedOperation<ValueType>::make_step_4_operation(
+            eta.get(), d.get(), Ad.get(), dense_x, r.get()));
+        // x = x + eta * d
+        // r = r - eta * Ad
+        r->compute_dot(r.get(), tau.get());
+        master_tau->copy_from(tau.get());
+        if (has_converged(master_tau.get(), starting_tau.get(),
+                          rel_residual_goal_)) {
+            break;
+        }
+        if (iter % 2 != 0) {
+            r0->compute_dot(w.get(), rho.get());
+            exec->run(TemplatedOperation<ValueType>::make_step_5_operation(
+                beta.get(), rho_old.get(), rho.get(), w.get(), u_m.get(),
+                u_mp1.get()));
+            // beta = rho / rho_old
+            // u_mp1 = w + beta * u_m
+            // this is equivalent to step_1, only with different inputs
+            rho_old->copy_from(rho.get());
+        }
+        preconditioner_->apply(u_mp1.get(), pu_m.get());
+        system_matrix_->apply(pu_m.get(), Au_new.get());
+        if (iter % 2 != 0) {
+            exec->run(TemplatedOperation<ValueType>::make_step_6_operation(
+                beta.get(), Au_new.get(), Au.get(), v.get()));
+            // v = Au_new + beta * (Au + beta * v)
+        }
+        exec->run(TemplatedOperation<ValueType>::make_step_7_operation(
+            Au_new.get(), u_mp1.get(), Au.get(), u_m.get()));
+        // Au = Au_new
+        // u_m = u_mp1
+    }
+}
+
+
+template <typename ValueType>
+void Tfqmr<ValueType>::apply(const LinOp *alpha, const LinOp *b,
+                             const LinOp *beta, LinOp *x) const
+{
+    auto dense_x = as<matrix::Dense<ValueType>>(x);
+    auto x_clone = dense_x->clone();
+    this->apply(b, x_clone.get());
+    dense_x->scale(beta);
+    dense_x->add_scaled(alpha, x_clone.get());
+}
+
+
+template <typename ValueType>
+std::unique_ptr<LinOp> TfqmrFactory<ValueType>::generate(
+    std::shared_ptr<const LinOp> base) const
+{
+    ASSERT_EQUAL_DIMENSIONS(base,
+                            size(base->get_num_cols(), base->get_num_rows()));
+    auto tfqmr = std::unique_ptr<Tfqmr<ValueType>>(Tfqmr<ValueType>::create(
+        this->get_executor(), max_iters_, rel_residual_goal_, base));
+    tfqmr->set_preconditioner(precond_factory_->generate(base));
+    return std::move(tfqmr);
+}
+
+
+#define GKO_DECLARE_TFQMR(_type) class Tfqmr<_type>
+#define GKO_DECLARE_TFQMR_FACTORY(_type) class TfqmrFactory<_type>
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_FACTORY);
+#undef GKO_DECLARE_TFQMR
+#undef GKO_DECLARE_TFQMR_FACTORY
+
+
+}  // namespace solver
+}  // namespace gko
diff --git a/core/solver/tfqmr.hpp b/core/solver/tfqmr.hpp
new file mode 100644
index 00000000000..a45f4f2c5ad
--- /dev/null
+++ b/core/solver/tfqmr.hpp
@@ -0,0 +1,193 @@
+/*************************************************************
+Copyright 2017-2018
+
+Karlsruhe Institute of Technology
+Universitat Jaume I
+University of Tennessee
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+   may be used to endorse or promote products derived from this software
+   without specific prior written permission.
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_SOLVER_TFQMR_HPP_ +#define GKO_CORE_SOLVER_TFQMR_HPP_ + + +#include "core/base/array.hpp" +#include "core/base/lin_op.hpp" +#include "core/base/math.hpp" +#include "core/base/types.hpp" +#include "core/matrix/identity.hpp" + + +namespace gko { +namespace solver { + + +template +class TfqmrFactory; + +/** + * TFQMR or the Transpose-Free Quasi-Minimal Residual is a Krylov subspace + * solver that derives from QMR by avoiding the need of transposing the system + * matrix. + * + * Being a generic solver, it is capable of solving general matrices, including + * non-s.p.d matrices. + * + * @tparam ValueType precision of the elements of the system matrix. + */ +template +class Tfqmr : public BasicLinOp>, public PreconditionedMethod { + friend class BasicLinOp; + friend class TfqmrFactory; + +public: + using BasicLinOp::convert_to; + using BasicLinOp::move_to; + + using value_type = ValueType; + + void apply(const LinOp *b, LinOp *x) const override; + + void apply(const LinOp *alpha, const LinOp *b, const LinOp *beta, + LinOp *x) const override; + + /** + * Gets the system matrix of the linear system. + * + * @return The system matrix. + */ + std::shared_ptr get_system_matrix() const + { + return system_matrix_; + } + + /** + * Gets the maximum number of iterations of the TFQMR solver. + * + * @return The maximum number of iterations. + */ + int get_max_iters() const { return max_iters_; } + + /** + * Gets the relative residual goal of the solver. + * + * @return The relative residual goal. + */ + remove_complex get_rel_residual_goal() const + { + return rel_residual_goal_; + } + +protected: + using BasicLinOp::create; + + explicit Tfqmr(std::shared_ptr exec) + : BasicLinOp(exec, 0, 0, 0) + {} + + Tfqmr(std::shared_ptr exec, int max_iters, + remove_complex rel_residual_goal, + std::shared_ptr system_matrix) + : BasicLinOp( + exec, system_matrix->get_num_cols(), + system_matrix->get_num_rows(), + system_matrix->get_num_rows() * system_matrix->get_num_cols()), + system_matrix_(std::move(system_matrix)), + max_iters_(max_iters), + rel_residual_goal_(rel_residual_goal) + {} + +private: + std::shared_ptr system_matrix_{}; + int max_iters_{}; + remove_complex rel_residual_goal_{}; +}; + + +/** + * Creates the TFQMR solver. + * + * @param exec The executor on which the TFQMR solver is to be created. + * @param max_iters The maximum number of iterations to be pursued. + * @param rel_residual_goal The relative residual required for + * convergence. + * + * @return The newly created TFQMR solver. 
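+ *
+ * A minimal usage sketch, mirroring the pattern used in the unit tests; the
+ * executor `exec`, the system matrix `A`, the vectors `b` and `x`, and the
+ * iteration limit / residual goal values are placeholders, not part of this
+ * patch:
+ * @code
+ * auto tfqmr_factory =
+ *     gko::solver::TfqmrFactory<>::create(exec, 100, 1e-10);
+ * auto solver = tfqmr_factory->generate(A);
+ * // overwrites x with the solution of A * x = b, using the initial
+ * // contents of x as the starting guess
+ * solver->apply(b.get(), x.get());
+ * @endcode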
+ */ +template +class TfqmrFactory : public LinOpFactory, public PreconditionedMethodFactory { +public: + using value_type = ValueType; + + static std::unique_ptr create( + std::shared_ptr exec, int max_iters, + remove_complex rel_residual_goal) + { + return std::unique_ptr( + new TfqmrFactory(std::move(exec), max_iters, rel_residual_goal)); + } + + std::unique_ptr generate( + std::shared_ptr base) const override; + + /** + * Gets the maximum number of iterations of the TFQMR solver. + * + * @return The maximum number of iterations. + */ + int get_max_iters() const { return max_iters_; } + + /** + * Gets the relative residual goal of the solver. + * + * @return The relative residual goal. + */ + remove_complex get_rel_residual_goal() const + { + return rel_residual_goal_; + } + +protected: + TfqmrFactory(std::shared_ptr exec, int max_iters, + remove_complex rel_residual_goal) + : LinOpFactory(exec), + PreconditionedMethodFactory( + matrix::IdentityFactory::create(std::move(exec))), + max_iters_(max_iters), + rel_residual_goal_(rel_residual_goal) + {} + + int max_iters_; + remove_complex rel_residual_goal_; +}; + + +} // namespace solver +} // namespace gko + + +#endif // GKO_CORE_SOLVER_TFQMR_HPP diff --git a/core/solver/tfqmr_kernels.hpp b/core/solver/tfqmr_kernels.hpp new file mode 100644 index 00000000000..e4806d50c59 --- /dev/null +++ b/core/solver/tfqmr_kernels.hpp @@ -0,0 +1,170 @@ +/************************************************************* +Copyright 2017-2018 + +Karlsruhe Institute of Technology +Universitat Jaume I +University of Tennessee + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_CORE_SOLVER_TFQMR_KERNELS_HPP_ +#define GKO_CORE_SOLVER_TFQMR_KERNELS_HPP_ + + +#include "core/base/types.hpp" +#include "core/matrix/dense.hpp" + + +namespace gko { +namespace kernels { +namespace tfqmr { + + +#define GKO_DECLARE_TFQMR_INITIALIZE_KERNEL(_type) \ + void initialize(std::shared_ptr exec, \ + const matrix::Dense<_type> *b, matrix::Dense<_type> *r, \ + matrix::Dense<_type> *r0, matrix::Dense<_type> *u_m, \ + matrix::Dense<_type> *u_mp1, matrix::Dense<_type> *pu_m, \ + matrix::Dense<_type> *Au, matrix::Dense<_type> *Ad, \ + matrix::Dense<_type> *w, matrix::Dense<_type> *v, \ + matrix::Dense<_type> *d, matrix::Dense<_type> *taut, \ + matrix::Dense<_type> *rho_old, matrix::Dense<_type> *rho, \ + matrix::Dense<_type> *alpha, matrix::Dense<_type> *beta, \ + matrix::Dense<_type> *tau, matrix::Dense<_type> *sigma, \ + matrix::Dense<_type> *rov, matrix::Dense<_type> *eta, \ + matrix::Dense<_type> *nomw, matrix::Dense<_type> *theta) + + +#define GKO_DECLARE_TFQMR_STEP_1_KERNEL(_type) \ + void step_1(std::shared_ptr exec, \ + matrix::Dense<_type> *alpha, const matrix::Dense<_type> *rov, \ + const matrix::Dense<_type> *rho, \ + const matrix::Dense<_type> *v, \ + const matrix::Dense<_type> *u_m, matrix::Dense<_type> *u_mp1) + + +#define GKO_DECLARE_TFQMR_STEP_2_KERNEL(_type) \ + void step_2(std::shared_ptr exec, \ + const matrix::Dense<_type> *theta, \ + const matrix::Dense<_type> *alpha, \ + const matrix::Dense<_type> *eta, matrix::Dense<_type> *sigma, \ + const matrix::Dense<_type> *Au, \ + const matrix::Dense<_type> *pu_m, matrix::Dense<_type> *w, \ + matrix::Dense<_type> *d, matrix::Dense<_type> *Ad) + + +#define GKO_DECLARE_TFQMR_STEP_3_KERNEL(_type) \ + void step_3(std::shared_ptr exec, \ + matrix::Dense<_type> *theta, const matrix::Dense<_type> *nomw, \ + matrix::Dense<_type> *taut, matrix::Dense<_type> *eta, \ + const matrix::Dense<_type> *alpha) + + +#define GKO_DECLARE_TFQMR_STEP_4_KERNEL(_type) \ + void step_4(std::shared_ptr exec, \ + const matrix::Dense<_type> *eta, \ + const matrix::Dense<_type> *d, const matrix::Dense<_type> *Ad, \ + matrix::Dense<_type> *x, matrix::Dense<_type> *r) + + +#define GKO_DECLARE_TFQMR_STEP_5_KERNEL(_type) \ + void step_5( \ + std::shared_ptr exec, \ + matrix::Dense<_type> *beta, const matrix::Dense<_type> *rho_old, \ + const matrix::Dense<_type> *rho, const matrix::Dense<_type> *w, \ + const matrix::Dense<_type> *u_m, matrix::Dense<_type> *u_mp1) + + +#define GKO_DECLARE_TFQMR_STEP_6_KERNEL(_type) \ + void step_6(std::shared_ptr exec, \ + const matrix::Dense<_type> *beta, \ + const matrix::Dense<_type> *Au_new, \ + const matrix::Dense<_type> *Au, matrix::Dense<_type> *v) + +#define GKO_DECLARE_TFQMR_STEP_7_KERNEL(_type) \ + void step_7(std::shared_ptr exec, \ + const matrix::Dense<_type> *Au_new, \ + const matrix::Dense<_type> *u_mp1, matrix::Dense<_type> *Au, \ + matrix::Dense<_type> *u_m) + +#define DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_TFQMR_INITIALIZE_KERNEL(ValueType); \ + template \ + GKO_DECLARE_TFQMR_STEP_1_KERNEL(ValueType); \ + template \ + GKO_DECLARE_TFQMR_STEP_2_KERNEL(ValueType); \ + template \ + GKO_DECLARE_TFQMR_STEP_3_KERNEL(ValueType); \ + template \ + GKO_DECLARE_TFQMR_STEP_4_KERNEL(ValueType); \ + template \ + GKO_DECLARE_TFQMR_STEP_5_KERNEL(ValueType); \ + template \ + GKO_DECLARE_TFQMR_STEP_6_KERNEL(ValueType); \ + template \ + GKO_DECLARE_TFQMR_STEP_7_KERNEL(ValueType) + + +} // namespace tfqmr + + +namespace cpu { +namespace 
tfqmr { + +DECLARE_ALL_AS_TEMPLATES; + +} // namespace tfqmr +} // namespace cpu + + +namespace gpu { +namespace tfqmr { + +DECLARE_ALL_AS_TEMPLATES; + +} // namespace tfqmr +} // namespace gpu + + +namespace reference { +namespace tfqmr { + +DECLARE_ALL_AS_TEMPLATES; + +} // namespace tfqmr +} // namespace reference + + +#undef DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + + +#endif // GKO_CORE_SOLVER_TFQMR_KERNELS_HPP_ diff --git a/core/test/solver/CMakeLists.txt b/core/test/solver/CMakeLists.txt index 2c94d727025..5ec3e2dbb81 100644 --- a/core/test/solver/CMakeLists.txt +++ b/core/test/solver/CMakeLists.txt @@ -1,3 +1,4 @@ create_test(cg) create_test(fcg) create_test(bicgstab) +create_test(tfqmr) diff --git a/core/test/solver/tfqmr.cpp b/core/test/solver/tfqmr.cpp new file mode 100644 index 00000000000..160969fe5ff --- /dev/null +++ b/core/test/solver/tfqmr.cpp @@ -0,0 +1,190 @@ +/************************************************************* +Copyright 2017-2018 + +Karlsruhe Institute of Technology +Universitat Jaume I +University of Tennessee + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include +#include + + +namespace { + + +class Tfqmr : public ::testing::Test { +protected: + using Mtx = gko::matrix::Dense<>; + using Solver = gko::solver::Tfqmr<>; + + Tfqmr() + : exec(gko::ReferenceExecutor::create()), + mtx(gko::initialize( + {{2, -1.0, 0.0}, {-1.0, 2, -1.0}, {0.0, -1.0, 2}}, exec)), + tfqmr_factory(gko::solver::TfqmrFactory<>::create(exec, 3, 1e-6)), + solver(tfqmr_factory->generate(mtx)) + {} + + std::shared_ptr exec; + std::shared_ptr mtx; + std::unique_ptr> tfqmr_factory; + std::unique_ptr solver; + + static void assert_same_matrices(const Mtx *m1, const Mtx *m2) + { + ASSERT_EQ(m1->get_num_rows(), m2->get_num_rows()); + ASSERT_EQ(m1->get_num_cols(), m2->get_num_cols()); + for (gko::size_type i = 0; i < m1->get_num_rows(); ++i) { + for (gko::size_type j = 0; j < m2->get_num_cols(); ++j) { + EXPECT_EQ(m1->at(i, j), m2->at(i, j)); + } + } + } +}; + + +TEST_F(Tfqmr, TfqmrFactoryKnowsItsExecutor) +{ + ASSERT_EQ(tfqmr_factory->get_executor(), exec); +} + + +TEST_F(Tfqmr, TfqmrFactoryKnowsItsIterationLimit) +{ + ASSERT_EQ(tfqmr_factory->get_max_iters(), 3); +} + + +TEST_F(Tfqmr, TfqmrFactoryKnowsItsRelResidualGoal) +{ + ASSERT_EQ(tfqmr_factory->get_rel_residual_goal(), 1e-6); +} + + +TEST_F(Tfqmr, TfqmrFactoryCreatesCorrectSolver) +{ + ASSERT_EQ(solver->get_num_rows(), 3); + ASSERT_EQ(solver->get_num_cols(), 3); + ASSERT_EQ(solver->get_num_stored_elements(), 9); + auto tfqmr_solver = static_cast(solver.get()); + ASSERT_EQ(tfqmr_solver->get_max_iters(), 3); + ASSERT_EQ(tfqmr_solver->get_rel_residual_goal(), 1e-6); + ASSERT_NE(tfqmr_solver->get_system_matrix(), nullptr); + ASSERT_EQ(tfqmr_solver->get_system_matrix(), mtx); +} + + +TEST_F(Tfqmr, CanBeCopied) +{ + auto copy = tfqmr_factory->generate(Mtx::create(exec)); + + copy->copy_from(solver.get()); + + ASSERT_EQ(copy->get_num_rows(), 3); + ASSERT_EQ(copy->get_num_cols(), 3); + ASSERT_EQ(copy->get_num_stored_elements(), 9); + auto copy_mtx = static_cast(copy.get())->get_system_matrix(); + assert_same_matrices(static_cast(copy_mtx.get()), mtx.get()); +} + + +TEST_F(Tfqmr, CanBeMoved) +{ + auto copy = tfqmr_factory->generate(Mtx::create(exec)); + + copy->copy_from(std::move(solver)); + + ASSERT_EQ(copy->get_num_rows(), 3); + ASSERT_EQ(copy->get_num_cols(), 3); + ASSERT_EQ(copy->get_num_stored_elements(), 9); + auto copy_mtx = static_cast(copy.get())->get_system_matrix(); + assert_same_matrices(static_cast(copy_mtx.get()), mtx.get()); +} + + +TEST_F(Tfqmr, CanBeCloned) +{ + auto clone = solver->clone(); + + ASSERT_EQ(clone->get_num_rows(), 3); + ASSERT_EQ(clone->get_num_cols(), 3); + ASSERT_EQ(clone->get_num_stored_elements(), 9); + auto clone_mtx = static_cast(clone.get())->get_system_matrix(); + assert_same_matrices(static_cast(clone_mtx.get()), mtx.get()); +} + + +TEST_F(Tfqmr, CanBeCleared) +{ + solver->clear(); + + ASSERT_EQ(solver->get_num_rows(), 0); + ASSERT_EQ(solver->get_num_cols(), 0); + ASSERT_EQ(solver->get_num_stored_elements(), 0); + auto solver_mtx = static_cast(solver.get())->get_system_matrix(); + ASSERT_EQ(solver_mtx, nullptr); +} + + +TEST_F(Tfqmr, CanSetPreconditioner) +{ + std::shared_ptr precond = + gko::initialize({{1.0, 0.0, 0.0, 0.0, 1.0, 0.0}}, exec); + auto tfqmr_solver = static_cast *>(solver.get()); + + tfqmr_solver->set_preconditioner(precond); + + ASSERT_EQ(tfqmr_solver->get_preconditioner(), precond); +} + + +TEST_F(Tfqmr, CanSetPreconditionerGenertor) +{ + tfqmr_factory->set_preconditioner( + 
gko::solver::TfqmrFactory<>::create(exec, 3, 0.0)); + auto solver = tfqmr_factory->generate(mtx); + auto precond = dynamic_cast *>( + static_cast *>(solver.get()) + ->get_preconditioner() + .get()); + + ASSERT_NE(precond, nullptr); + ASSERT_EQ(precond->get_num_rows(), 3); + ASSERT_EQ(precond->get_num_cols(), 3); + ASSERT_EQ(precond->get_system_matrix(), mtx); +} + + +} // namespace diff --git a/cpu/solver/tfqmr_kernels.cpp b/cpu/solver/tfqmr_kernels.cpp new file mode 100644 index 00000000000..82bc75264dd --- /dev/null +++ b/cpu/solver/tfqmr_kernels.cpp @@ -0,0 +1,98 @@ +/************************************************************* +Copyright 2017-2018 + +Karlsruhe Institute of Technology +Universitat Jaume I +University of Tennessee + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/solver/tfqmr_kernels.hpp" + + +#include "core/base/exception_helpers.hpp" + + +namespace gko { +namespace kernels { +namespace cpu { +namespace tfqmr { + + +template +void initialize(std::shared_ptr exec, + const matrix::Dense *b, matrix::Dense *r, + matrix::Dense *rr, matrix::Dense *y, + matrix::Dense *s, matrix::Dense *t, + matrix::Dense *z, matrix::Dense *v, + matrix::Dense *p, matrix::Dense *prev_rho, + matrix::Dense *rho, matrix::Dense *alpha, + matrix::Dense *beta, matrix::Dense *gamma, + matrix::Dense *omega) NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_INITIALIZE_KERNEL); + + +template +void step_1(std::shared_ptr exec, + const matrix::Dense *r, matrix::Dense *p, + const matrix::Dense *v, + const matrix::Dense *rho, + const matrix::Dense *prev_rho, + const matrix::Dense *alpha, + const matrix::Dense *omega) NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_STEP_1_KERNEL); + + +template +void step_2(std::shared_ptr exec, + const matrix::Dense *r, matrix::Dense *s, + const matrix::Dense *v, + const matrix::Dense *rho, + matrix::Dense *alpha, + const matrix::Dense *beta) NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_STEP_2_KERNEL); + + +template +void step_3( + std::shared_ptr exec, matrix::Dense *x, + matrix::Dense *r, const matrix::Dense *s, + const matrix::Dense *t, const matrix::Dense *y, + const matrix::Dense *z, const matrix::Dense *alpha, + const matrix::Dense *beta, const matrix::Dense *gamma, + matrix::Dense *omega) NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_STEP_3_KERNEL); + + +} // namespace tfqmr +} // namespace cpu +} // namespace kernels +} // namespace gko diff --git a/gpu/CMakeLists.txt b/gpu/CMakeLists.txt index 5436c0c9790..73edc0ad710 100644 --- a/gpu/CMakeLists.txt +++ b/gpu/CMakeLists.txt @@ -8,7 +8,8 @@ set(SOURCES preconditioner/block_jacobi_kernels.cu solver/bicgstab_kernels.cu solver/fcg_kernels.cu - solver/cg_kernels.cu) + solver/cg_kernels.cu + solver/tfqmr_kernels.cu) set(CUDA_INCLUDE_DIRS ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) find_library(CUDA_RUNTIME_LIBS cudart diff --git a/gpu/solver/tfqmr_kernels.cu b/gpu/solver/tfqmr_kernels.cu new file mode 100644 index 00000000000..2e72b6f13af --- /dev/null +++ b/gpu/solver/tfqmr_kernels.cu @@ -0,0 +1,325 @@ +/************************************************************* +Copyright 2017-2018 + +Karlsruhe Institute of Technology +Universitat Jaume I +University of Tennessee + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/solver/tfqmr_kernels.hpp" + + +#include "core/base/exception_helpers.hpp" +#include "core/base/math.hpp" +#include "gpu/base/types.hpp" + + +namespace gko { +namespace kernels { +namespace gpu { +namespace tfqmr { + + +constexpr int default_block_size = 512; + + +template +__global__ __launch_bounds__(default_block_size) void initialize_kernel( + size_type num_rows, size_type num_cols, size_type stride, + const ValueType *__restrict__ b, ValueType *__restrict__ r, + ValueType *__restrict__ rr, ValueType *__restrict__ y, + ValueType *__restrict__ s, ValueType *__restrict__ t, + ValueType *__restrict__ z, ValueType *__restrict__ v, + ValueType *__restrict__ p, ValueType *__restrict__ prev_rho, + ValueType *__restrict__ rho, ValueType *__restrict__ alpha, + ValueType *__restrict__ beta, ValueType *__restrict__ gamma, + ValueType *__restrict__ omega) +{ + const auto tidx = + static_cast(blockDim.x) * blockIdx.x + threadIdx.x; + + if (tidx < num_cols) { + prev_rho[tidx] = one(); + rho[tidx] = one(); + alpha[tidx] = one(); + beta[tidx] = one(); + gamma[tidx] = one(); + omega[tidx] = one(); + } + + if (tidx < num_rows * stride) { + r[tidx] = b[tidx]; + rr[tidx] = zero(); + y[tidx] = zero(); + s[tidx] = zero(); + t[tidx] = zero(); + z[tidx] = zero(); + v[tidx] = zero(); + p[tidx] = zero(); + } +} + + +template +void initialize(std::shared_ptr exec, + const matrix::Dense *b, matrix::Dense *r, + matrix::Dense *r0, matrix::Dense *u_m, + matrix::Dense *u_mp1, matrix::Dense *pu_m, + matrix::Dense *Au, matrix::Dense *Ad, + matrix::Dense *w, matrix::Dense *v, + matrix::Dense *d, matrix::Dense *taut, + matrix::Dense *rho_old, + matrix::Dense *rho, matrix::Dense *alpha, + matrix::Dense *beta, matrix::Dense *tau, + matrix::Dense *sigma, matrix::Dense *rov, + matrix::Dense *eta, matrix::Dense *nomw, + matrix::Dense *theta) NOT_IMPLEMENTED + /* + { + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size( + ceildiv(b->get_num_rows() * b->get_stride(), block_size.x), 1, 1); + + initialize_kernel<<>>( + b->get_num_rows(), b->get_num_cols(), b->get_stride(), + as_cuda_type(b->get_const_values()), as_cuda_type(r->get_values()), + as_cuda_type(rr->get_values()), as_cuda_type(y->get_values()), + as_cuda_type(s->get_values()), as_cuda_type(t->get_values()), + as_cuda_type(z->get_values()), as_cuda_type(v->get_values()), + as_cuda_type(p->get_values()), as_cuda_type(prev_rho->get_values()), + as_cuda_type(rho->get_values()), as_cuda_type(alpha->get_values()), + as_cuda_type(beta->get_values()), as_cuda_type(gamma->get_values()), + as_cuda_type(omega->get_values())); + } + */ + GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_INITIALIZE_KERNEL); + + +template +__global__ __launch_bounds__(default_block_size) void step_1_kernel( + size_type num_rows, size_type num_cols, size_type stride, + const ValueType *__restrict__ r, ValueType *__restrict__ p, + const ValueType *__restrict__ v, const ValueType 
*__restrict__ rho, + const ValueType *__restrict__ prev_rho, const ValueType *__restrict__ alpha, + const ValueType *__restrict__ omega) +{ + const auto tidx = + static_cast(blockDim.x) * blockIdx.x + threadIdx.x; + const auto col = tidx % stride; + if (col >= num_cols || tidx >= num_rows * stride) { + return; + } + auto res = r[tidx]; + if (prev_rho[col] * omega[col] != zero()) { + const auto tmp = (rho[col] / prev_rho[col]) * (alpha[col] / omega[col]); + res += tmp * (p[tidx] - omega[col] * v[tidx]); + } + p[tidx] = res; +} + + +template +void step_1(std::shared_ptr exec, + matrix::Dense *alpha, + const matrix::Dense *rov, + const matrix::Dense *rho, + const matrix::Dense *v, + const matrix::Dense *u_m, + matrix::Dense *u_mp1) NOT_IMPLEMENTED + /* + { + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size( + ceildiv(r->get_num_rows() * r->get_stride(), block_size.x), 1, 1); + + step_1_kernel<<>>( + r->get_num_rows(), r->get_num_cols(), r->get_stride(), + as_cuda_type(r->get_const_values()), as_cuda_type(p->get_values()), + as_cuda_type(v->get_const_values()), + as_cuda_type(rho->get_const_values()), + as_cuda_type(prev_rho->get_const_values()), + as_cuda_type(alpha->get_const_values()), + as_cuda_type(omega->get_const_values())); + } + */ + GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_STEP_1_KERNEL); + + +template +__global__ __launch_bounds__(default_block_size) void step_2_kernel( + size_type num_rows, size_type num_cols, size_type stride, + const ValueType *__restrict__ r, ValueType *__restrict__ s, + const ValueType *__restrict__ v, const ValueType *__restrict__ rho, + ValueType *__restrict__ alpha, const ValueType *__restrict__ beta) +{ + const size_type tidx = + static_cast(blockDim.x) * blockIdx.x + threadIdx.x; + const size_type col = tidx % stride; + if (col >= num_cols || tidx >= num_rows * stride) { + return; + } + auto t_alpha = zero(); + auto t_s = r[tidx]; + if (beta[col] != zero()) { + t_alpha = rho[col] / beta[col]; + t_s -= t_alpha * v[tidx]; + } + alpha[col] = t_alpha; + s[tidx] = t_s; +} + + +template +void step_2(std::shared_ptr exec, + const matrix::Dense *theta, + const matrix::Dense *alpha, + const matrix::Dense *eta, + matrix::Dense *sigma, const matrix::Dense *Au, + const matrix::Dense *pu_m, matrix::Dense *w, + matrix::Dense *d, + matrix::Dense *Ad) NOT_IMPLEMENTED + /* + { + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size( + ceildiv(r->get_num_rows() * r->get_stride(), block_size.x), 1, 1); + + step_2_kernel<<>>( + r->get_num_rows(), r->get_num_cols(), r->get_stride(), + as_cuda_type(r->get_const_values()), as_cuda_type(s->get_values()), + as_cuda_type(v->get_const_values()), + as_cuda_type(rho->get_const_values()), + as_cuda_type(alpha->get_values()), + as_cuda_type(beta->get_const_values())); + } + */ + GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_STEP_2_KERNEL); + + +template +__global__ __launch_bounds__(default_block_size) void step_3_kernel( + size_type num_rows, size_type num_cols, size_type stride, + size_type x_stride, ValueType *__restrict__ x, ValueType *__restrict__ r, + const ValueType *__restrict__ s, const ValueType *__restrict__ t, + const ValueType *__restrict__ y, const ValueType *__restrict__ z, + const ValueType *__restrict__ alpha, const ValueType *__restrict__ beta, + const ValueType *__restrict__ gamma, ValueType *__restrict__ omega) +{ + const auto tidx = + static_cast(blockDim.x) * blockIdx.x + threadIdx.x; + const auto row = tidx / stride; + const auto col = tidx % stride; + 
if (col >= num_cols || tidx >= num_rows * stride) { + return; + } + const auto x_pos = row * x_stride + col; + auto t_omega = zero(); + auto t_x = x[x_pos] + alpha[col] * y[tidx]; + auto t_r = s[tidx]; + if (beta[col] != zero()) { + t_omega = gamma[col] / beta[col]; + t_x += t_omega * z[tidx]; + t_r -= t_omega * t[tidx]; + } + omega[col] = t_omega; + x[x_pos] = t_x; + r[tidx] = t_r; +} + +template +void step_3(std::shared_ptr exec, + matrix::Dense *theta, + const matrix::Dense *nomw, + matrix::Dense *taut, matrix::Dense *eta, + const matrix::Dense *alpha) NOT_IMPLEMENTED + /* + { + const dim3 block_size(default_block_size, 1, 1); + const dim3 grid_size( + ceildiv(r->get_num_rows() * r->get_stride(), block_size.x), 1, 1); + + step_3_kernel<<>>( + r->get_num_rows(), r->get_num_cols(), r->get_stride(), + x->get_stride(), as_cuda_type(x->get_values()), + as_cuda_type(r->get_values()), as_cuda_type(s->get_const_values()), + as_cuda_type(t->get_const_values()), + as_cuda_type(y->get_const_values()), + as_cuda_type(z->get_const_values()), + as_cuda_type(alpha->get_const_values()), + as_cuda_type(beta->get_const_values()), + as_cuda_type(gamma->get_const_values()), + as_cuda_type(omega->get_values())); + } + */ + GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_STEP_3_KERNEL); + + +template +void step_4(std::shared_ptr exec, + const matrix::Dense *eta, + const matrix::Dense *d, + const matrix::Dense *Ad, matrix::Dense *x, + matrix::Dense *r) NOT_IMPLEMENTED + + GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_STEP_4_KERNEL); + + +template +void step_5(std::shared_ptr exec, + matrix::Dense *beta, + const matrix::Dense *rho_old, + const matrix::Dense *rho, + const matrix::Dense *w, + const matrix::Dense *u_m, + matrix::Dense *u_mp1) NOT_IMPLEMENTED + + GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_STEP_5_KERNEL); + + +template +void step_6(std::shared_ptr exec, + const matrix::Dense *beta, + const matrix::Dense *Au_new, + const matrix::Dense *Au, + matrix::Dense *v) NOT_IMPLEMENTED + + GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_STEP_6_KERNEL); + +template +void step_7(std::shared_ptr exec, + const matrix::Dense *Au_new, + const matrix::Dense *u_mp1, matrix::Dense *Au, + matrix::Dense *u_m) NOT_IMPLEMENTED + + GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_STEP_7_KERNEL); + + +} // namespace tfqmr +} // namespace gpu +} // namespace kernels +} // namespace gko diff --git a/gpu/test/solver/tfqmr_kernels.cpp b/gpu/test/solver/tfqmr_kernels.cpp new file mode 100644 index 00000000000..8d6cd653e54 --- /dev/null +++ b/gpu/test/solver/tfqmr_kernels.cpp @@ -0,0 +1,320 @@ +/************************************************************* +Copyright 2017-2018 + +Karlsruhe Institute of Technology +Universitat Jaume I +University of Tennessee + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include +#include +#include +#include + + +namespace { + + +class Tfqmr : public ::testing::Test { +protected: + using Mtx = gko::matrix::Dense<>; + Tfqmr() : rand_engine(30) {} + + void SetUp() + { + ASSERT_GT(gko::GpuExecutor::get_num_devices(), 0); + ref = gko::ReferenceExecutor::create(); + gpu = gko::GpuExecutor::create(0, ref); + + mtx = gen_mtx(123, 123); + make_diag_dominant(mtx.get()); + d_mtx = Mtx::create(gpu); + d_mtx->copy_from(mtx.get()); + gpu_tfqmr_factory = + gko::solver::TfqmrFactory<>::create(gpu, 246, 1e-15); + ref_tfqmr_factory = + gko::solver::TfqmrFactory<>::create(ref, 246, 1e-15); + } + + void TearDown() + { + if (gpu != nullptr) { + ASSERT_NO_THROW(gpu->synchronize()); + } + } + + std::unique_ptr gen_mtx(int num_rows, int num_cols) + { + return gko::test::generate_random_matrix( + ref, num_rows, num_cols, + std::uniform_int_distribution<>(num_cols, num_cols), + std::normal_distribution<>(0.0, 1.0), rand_engine); + } + + void initialize_data() + { + int m = 48; + int n = 17; + x = gen_mtx(m, n); + b = gen_mtx(m, n); + r = gen_mtx(m, n); + z = gen_mtx(m, n); + p = gen_mtx(m, n); + rr = gen_mtx(m, n); + s = gen_mtx(m, n); + t = gen_mtx(m, n); + y = gen_mtx(m, n); + v = gen_mtx(m, n); + prev_rho = gen_mtx(1, n); + rho = gen_mtx(1, n); + alpha = gen_mtx(1, n); + beta = gen_mtx(1, n); + gamma = gen_mtx(1, n); + omega = gen_mtx(1, n); + + d_x = Mtx::create(gpu); + d_b = Mtx::create(gpu); + d_r = Mtx::create(gpu); + d_z = Mtx::create(gpu); + d_p = Mtx::create(gpu); + d_t = Mtx::create(gpu); + d_s = Mtx::create(gpu); + d_y = Mtx::create(gpu); + d_v = Mtx::create(gpu); + d_rr = Mtx::create(gpu); + d_prev_rho = Mtx::create(gpu); + d_rho = Mtx::create(gpu); + d_alpha = Mtx::create(gpu); + d_beta = Mtx::create(gpu); + d_gamma = Mtx::create(gpu); + d_omega = Mtx::create(gpu); + d_x->copy_from(x.get()); + d_b->copy_from(b.get()); + d_r->copy_from(r.get()); + d_z->copy_from(z.get()); + d_p->copy_from(p.get()); + d_v->copy_from(v.get()); + d_y->copy_from(y.get()); + d_t->copy_from(t.get()); + d_s->copy_from(s.get()); + d_rr->copy_from(rr.get()); + d_prev_rho->copy_from(prev_rho.get()); + d_rho->copy_from(rho.get()); + d_alpha->copy_from(alpha.get()); + d_beta->copy_from(beta.get()); + d_gamma->copy_from(gamma.get()); + d_omega->copy_from(omega.get()); + } + + void make_diag_dominant(Mtx *mtx) + { + using std::abs; + for (int i = 0; i < mtx->get_num_rows(); ++i) { + auto sum = gko::zero(); + for (int j = 0; j < mtx->get_num_cols(); ++j) { + sum += abs(mtx->at(i, j)); + } + mtx->at(i, i) = sum; + } + } + + std::shared_ptr ref; + std::shared_ptr gpu; + + std::ranlux48 rand_engine; + + std::shared_ptr mtx; + 
std::shared_ptr d_mtx; + std::unique_ptr> gpu_tfqmr_factory; + std::unique_ptr> ref_tfqmr_factory; + + std::unique_ptr x; + std::unique_ptr b; + std::unique_ptr r; + std::unique_ptr z; + std::unique_ptr p; + std::unique_ptr rr; + std::unique_ptr s; + std::unique_ptr t; + std::unique_ptr y; + std::unique_ptr v; + std::unique_ptr prev_rho; + std::unique_ptr rho; + std::unique_ptr alpha; + std::unique_ptr beta; + std::unique_ptr gamma; + std::unique_ptr omega; + + std::unique_ptr d_x; + std::unique_ptr d_b; + std::unique_ptr d_r; + std::unique_ptr d_z; + std::unique_ptr d_p; + std::unique_ptr d_t; + std::unique_ptr d_s; + std::unique_ptr d_y; + std::unique_ptr d_v; + std::unique_ptr d_rr; + std::unique_ptr d_prev_rho; + std::unique_ptr d_rho; + std::unique_ptr d_alpha; + std::unique_ptr d_beta; + std::unique_ptr d_gamma; + std::unique_ptr d_omega; +}; + + +TEST_F(Tfqmr, GpuTfqmrInitializeIsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::tfqmr::initialize( + ref, b.get(), r.get(), rr.get(), y.get(), s.get(), t.get(), z.get(), + v.get(), p.get(), prev_rho.get(), rho.get(), alpha.get(), beta.get(), + gamma.get(), omega.get()); + gko::kernels::gpu::tfqmr::initialize( + gpu, d_b.get(), d_r.get(), d_rr.get(), d_y.get(), d_s.get(), d_t.get(), + d_z.get(), d_v.get(), d_p.get(), d_prev_rho.get(), d_rho.get(), + d_alpha.get(), d_beta.get(), d_gamma.get(), d_omega.get()); + + EXPECT_MTX_NEAR(d_r, r, 1e-14); + EXPECT_MTX_NEAR(d_z, z, 1e-14); + EXPECT_MTX_NEAR(d_p, p, 1e-14); + EXPECT_MTX_NEAR(d_y, y, 1e-14); + EXPECT_MTX_NEAR(d_t, t, 1e-14); + EXPECT_MTX_NEAR(d_s, s, 1e-14); + EXPECT_MTX_NEAR(d_rr, rr, 1e-14); + EXPECT_MTX_NEAR(d_v, v, 1e-14); + EXPECT_MTX_NEAR(d_prev_rho, prev_rho, 1e-14); + EXPECT_MTX_NEAR(d_rho, rho, 1e-14); + EXPECT_MTX_NEAR(d_alpha, alpha, 1e-14); + EXPECT_MTX_NEAR(d_beta, beta, 1e-14); + EXPECT_MTX_NEAR(d_gamma, gamma, 1e-14); + EXPECT_MTX_NEAR(d_omega, omega, 1e-14); +} + + +TEST_F(Tfqmr, GpuTfqmrStep1IsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::tfqmr::step_1(ref, r.get(), p.get(), v.get(), + rho.get(), prev_rho.get(), + alpha.get(), omega.get()); + gko::kernels::gpu::tfqmr::step_1(gpu, d_r.get(), d_p.get(), d_v.get(), + d_rho.get(), d_prev_rho.get(), + d_alpha.get(), d_omega.get()); + + ASSERT_MTX_NEAR(d_p, p, 1e-14); +} + + +TEST_F(Tfqmr, GpuTfqmrStep2IsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::tfqmr::step_2(ref, r.get(), s.get(), v.get(), + rho.get(), alpha.get(), beta.get()); + gko::kernels::gpu::tfqmr::step_2(gpu, d_r.get(), d_s.get(), d_v.get(), + d_rho.get(), d_alpha.get(), d_beta.get()); + + ASSERT_MTX_NEAR(d_alpha, alpha, 1e-14); + ASSERT_MTX_NEAR(d_s, s, 1e-14); +} + + +TEST_F(Tfqmr, GpuTfqmrStep3IsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::tfqmr::step_3( + ref, x.get(), r.get(), s.get(), t.get(), y.get(), z.get(), alpha.get(), + beta.get(), gamma.get(), omega.get()); + gko::kernels::gpu::tfqmr::step_3( + gpu, d_x.get(), d_r.get(), d_s.get(), d_t.get(), d_y.get(), d_z.get(), + d_alpha.get(), d_beta.get(), d_gamma.get(), d_omega.get()); + + ASSERT_MTX_NEAR(d_omega, omega, 1e-14); + ASSERT_MTX_NEAR(d_x, x, 1e-14); + ASSERT_MTX_NEAR(d_r, r, 1e-14); +} + + +TEST_F(Tfqmr, GpuTfqmrApplyOneRHSIsEquivalentToRef) +{ + int m = 123; + int n = 1; + auto ref_solver = ref_tfqmr_factory->generate(mtx); + auto gpu_solver = gpu_tfqmr_factory->generate(d_mtx); + auto b = gen_mtx(m, n); + auto x = gen_mtx(m, n); + auto d_b = Mtx::create(gpu); + auto d_x = Mtx::create(gpu); + 
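+    // mirror the host-side b and x on the GPU so that the reference and GPU
+    // solvers start from identical data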
d_b->copy_from(b.get()); + d_x->copy_from(x.get()); + + ref_solver->apply(b.get(), x.get()); + gpu_solver->apply(d_b.get(), d_x.get()); + + ASSERT_MTX_NEAR(d_b, b, 1e-13); + ASSERT_MTX_NEAR(d_x, x, 1e-13); +} + + +TEST_F(Tfqmr, GpuTfqmrApplyMultipleRHSIsEquivalentToRef) +{ + int m = 123; + int n = 16; + auto gpu_solver = gpu_tfqmr_factory->generate(d_mtx); + auto ref_solver = ref_tfqmr_factory->generate(mtx); + auto b = gen_mtx(m, n); + auto x = gen_mtx(m, n); + auto d_b = Mtx::create(gpu); + auto d_x = Mtx::create(gpu); + d_b->copy_from(b.get()); + d_x->copy_from(x.get()); + + ref_solver->apply(b.get(), x.get()); + gpu_solver->apply(d_b.get(), d_x.get()); + + ASSERT_MTX_NEAR(d_b, b, 1e-13); + ASSERT_MTX_NEAR(d_x, x, 1e-13); +} + + +} // namespace diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index fc138e7c01c..3795ab9c9ce 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -6,7 +6,8 @@ set(SOURCES preconditioner/block_jacobi_kernels.cpp solver/bicgstab_kernels.cpp solver/fcg_kernels.cpp - solver/cg_kernels.cpp) + solver/cg_kernels.cpp + solver/tfqmr_kernels.cpp) add_library(ginkgo_reference SHARED $ diff --git a/reference/solver/tfqmr_kernels.cpp b/reference/solver/tfqmr_kernels.cpp new file mode 100644 index 00000000000..e6c98222126 --- /dev/null +++ b/reference/solver/tfqmr_kernels.cpp @@ -0,0 +1,248 @@ +/************************************************************* +Copyright 2017-2018 + +Karlsruhe Institute of Technology +Universitat Jaume I +University of Tennessee + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/solver/tfqmr_kernels.hpp" + +#include "core/base/exception_helpers.hpp" +#include "core/base/math.hpp" + + +namespace gko { +namespace kernels { +namespace reference { +namespace tfqmr { + + +template +void initialize(std::shared_ptr exec, + const matrix::Dense *b, matrix::Dense *r, + matrix::Dense *r0, matrix::Dense *u_m, + matrix::Dense *u_mp1, matrix::Dense *pu_m, + matrix::Dense *Au, matrix::Dense *Ad, + matrix::Dense *w, matrix::Dense *v, + matrix::Dense *d, matrix::Dense *taut, + matrix::Dense *rho_old, + matrix::Dense *rho, matrix::Dense *alpha, + matrix::Dense *beta, matrix::Dense *tau, + matrix::Dense *sigma, matrix::Dense *rov, + matrix::Dense *eta, matrix::Dense *nomw, + matrix::Dense *theta) +{ + for (size_type j = 0; j < b->get_num_cols(); ++j) { + taut->at(j) = one(); + rho_old->at(j) = zero(); + rho->at(j) = zero(); + alpha->at(j) = zero(); + beta->at(j) = zero(); + tau->at(j) = zero(); + sigma->at(j) = zero(); + rov->at(j) = zero(); + eta->at(j) = one(); + nomw->at(j) = one(); + theta->at(j) = zero(); + } + for (size_type i = 0; i < b->get_num_rows(); ++i) { + for (size_type j = 0; j < b->get_num_cols(); ++j) { + r->at(i, j) = b->at(i, j); + r0->at(i, j) = b->at(i, j); + u_m->at(i, j) = b->at(i, j); + w->at(i, j) = b->at(i, j); + v->at(i, j) = b->at(i, j); + d->at(i, j) = zero(); + u_mp1->at(i, j) = zero(); + pu_m->at(i, j) = zero(); + Au->at(i, j) = zero(); + Ad->at(i, j) = zero(); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_INITIALIZE_KERNEL); + + +template +void step_1(std::shared_ptr exec, + matrix::Dense *alpha, + const matrix::Dense *rov, + const matrix::Dense *rho, + const matrix::Dense *v, + const matrix::Dense *u_m, + matrix::Dense *u_mp1) +{ + for (size_type j = 0; j < v->get_num_cols(); ++j) { + if (rov->at(j) != zero()) { + alpha->at(j) = rho->at(j) / rov->at(j); + } else { + alpha->at(j) = zero(); + } + } + for (size_type i = 0; i < v->get_num_rows(); ++i) { + for (size_type j = 0; j < v->get_num_cols(); ++j) { + u_mp1->at(i, j) = u_m->at(i, j) - alpha->at(j) * v->at(i, j); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_STEP_1_KERNEL); + + +template +void step_2(std::shared_ptr exec, + const matrix::Dense *theta, + const matrix::Dense *alpha, + const matrix::Dense *eta, + matrix::Dense *sigma, const matrix::Dense *Au, + const matrix::Dense *pu_m, matrix::Dense *w, + matrix::Dense *d, matrix::Dense *Ad) +{ + for (size_type j = 0; j < d->get_num_cols(); ++j) { + if (alpha->at(j) != zero()) { + sigma->at(j) = theta->at(j) / alpha->at(j) * eta->at(j); + } else { + sigma->at(j) = zero(); + } + } + for (size_type i = 0; i < d->get_num_rows(); ++i) { + for (size_type j = 0; j < d->get_num_cols(); ++j) { + w->at(i, j) = w->at(i, j) - alpha->at(j) * Au->at(i, j); + d->at(i, j) = pu_m->at(i, j) + sigma->at(j) * d->at(i, j); + Ad->at(i, j) = Au->at(i, j) + sigma->at(j) * Ad->at(i, j); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_STEP_2_KERNEL); + + +template +void step_3(std::shared_ptr exec, + matrix::Dense *theta, + const matrix::Dense *nomw, + matrix::Dense *taut, matrix::Dense *eta, + const matrix::Dense *alpha) +{ + for (size_type j = 0; j < alpha->get_num_rows(); ++j) { + if (taut->at(j) != zero()) { + theta->at(j) = nomw->at(j) / taut->at(j); + } else { + theta->at(j) = one(); + } + auto tmp = one() / sqrt(one() + theta->at(j)); + taut->at(j) = taut->at(j) * sqrt(theta->at(j)) * tmp; + eta->at(j) = alpha->at(j) * tmp 
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_STEP_3_KERNEL);
+
+
+template <typename ValueType>
+void step_4(std::shared_ptr<const ReferenceExecutor> exec,
+            const matrix::Dense<ValueType> *eta,
+            const matrix::Dense<ValueType> *d,
+            const matrix::Dense<ValueType> *Ad, matrix::Dense<ValueType> *x,
+            matrix::Dense<ValueType> *r)
+{
+    // x = x + eta * d, r = r - eta * Ad
+    for (size_type i = 0; i < d->get_num_rows(); ++i) {
+        for (size_type j = 0; j < d->get_num_cols(); ++j) {
+            x->at(i, j) = x->at(i, j) + eta->at(j) * d->at(i, j);
+            r->at(i, j) = r->at(i, j) - eta->at(j) * Ad->at(i, j);
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_STEP_4_KERNEL);
+
+
+template <typename ValueType>
+void step_5(std::shared_ptr<const ReferenceExecutor> exec,
+            matrix::Dense<ValueType> *beta,
+            const matrix::Dense<ValueType> *rho_old,
+            const matrix::Dense<ValueType> *rho,
+            const matrix::Dense<ValueType> *w,
+            const matrix::Dense<ValueType> *u_m,
+            matrix::Dense<ValueType> *u_mp1)
+{
+    // beta = rho / rho_old (guarded against division by zero)
+    for (size_type j = 0; j < w->get_num_cols(); ++j) {
+        if (rho_old->at(j) != zero<ValueType>()) {
+            beta->at(j) = rho->at(j) / rho_old->at(j);
+        } else {
+            beta->at(j) = zero<ValueType>();
+        }
+    }
+    // u_mp1 = w + beta * u_m
+    for (size_type i = 0; i < w->get_num_rows(); ++i) {
+        for (size_type j = 0; j < w->get_num_cols(); ++j) {
+            u_mp1->at(i, j) = w->at(i, j) + beta->at(j) * u_m->at(i, j);
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_STEP_5_KERNEL);
+
+
+template <typename ValueType>
+void step_6(std::shared_ptr<const ReferenceExecutor> exec,
+            const matrix::Dense<ValueType> *beta,
+            const matrix::Dense<ValueType> *Au_new,
+            const matrix::Dense<ValueType> *Au, matrix::Dense<ValueType> *v)
+{
+    // v = Au_new + beta * (Au + beta * v)
+    for (size_type i = 0; i < v->get_num_rows(); ++i) {
+        for (size_type j = 0; j < v->get_num_cols(); ++j) {
+            v->at(i, j) =
+                Au_new->at(i, j) +
+                beta->at(j) * (Au->at(i, j) + beta->at(j) * v->at(i, j));
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_STEP_6_KERNEL);
+
+
+template <typename ValueType>
+void step_7(std::shared_ptr<const ReferenceExecutor> exec,
+            const matrix::Dense<ValueType> *Au_new,
+            const matrix::Dense<ValueType> *u_mp1,
+            matrix::Dense<ValueType> *Au, matrix::Dense<ValueType> *u_m)
+{
+    // Au = Au_new, u_m = u_mp1
+    for (size_type i = 0; i < u_m->get_num_rows(); ++i) {
+        for (size_type j = 0; j < u_m->get_num_cols(); ++j) {
+            Au->at(i, j) = Au_new->at(i, j);
+            u_m->at(i, j) = u_mp1->at(i, j);
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TFQMR_STEP_7_KERNEL);
+
+
+} // namespace tfqmr
+} // namespace reference
+} // namespace kernels
+} // namespace gko
diff --git a/reference/test/solver/CMakeLists.txt b/reference/test/solver/CMakeLists.txt
index 7008b0afc9c..ce655857cba 100644
--- a/reference/test/solver/CMakeLists.txt
+++ b/reference/test/solver/CMakeLists.txt
@@ -1,3 +1,4 @@
 create_test(bicgstab_kernels)
 create_test(cg_kernels)
 create_test(fcg_kernels)
+create_test(tfqmr_kernels)
diff --git a/reference/test/solver/tfqmr_kernels.cpp b/reference/test/solver/tfqmr_kernels.cpp
new file mode 100644
index 00000000000..ff27d554b42
--- /dev/null
+++ b/reference/test/solver/tfqmr_kernels.cpp
@@ -0,0 +1,121 @@
+/*************************************************************
+Copyright 2017-2018
+
+Karlsruhe Institute of Technology
+Universitat Jaume I
+University of Tennessee
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+   may be used to endorse or promote products derived from this software
+   without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+#include <core/solver/tfqmr.hpp>
+
+
+#include <gtest/gtest.h>
+
+
+#include <core/base/exception.hpp>
+#include <core/base/executor.hpp>
+#include <core/matrix/dense.hpp>
+#include <core/test/utils/assertions.hpp>
+
+
+namespace {
+
+
+class Tfqmr : public ::testing::Test {
+protected:
+    using Mtx = gko::matrix::Dense<>;
+    Tfqmr()
+        : exec(gko::ReferenceExecutor::create()),
+          mtx(gko::initialize<Mtx>(
+              {{1.0, -3.0, 0.0}, {-4.0, 1.0, -3.0}, {2.0, -1.0, 2.0}}, exec)),
+          tfqmr_factory(gko::solver::TfqmrFactory<>::create(exec, 8, 1e-15))
+    {}
+
+    std::shared_ptr<const gko::Executor> exec;
+    std::shared_ptr<Mtx> mtx;
+    std::unique_ptr<gko::solver::TfqmrFactory<>> tfqmr_factory;
+};
+
+
+TEST_F(Tfqmr, SolvesDenseSystem)
+{
+    auto solver = tfqmr_factory->generate(mtx);
+    auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, exec);
+    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, exec);
+
+    solver->apply(b.get(), x.get());
+
+    ASSERT_MTX_NEAR(x, l({-4.0, -1.0, 4.0}), 1e-8);
+}
+
+
+TEST_F(Tfqmr, SolvesMultipleDenseSystems)
+{
+    auto solver = tfqmr_factory->generate(mtx);
+    auto b =
+        gko::initialize<Mtx>({{-1.0, -5.0}, {3.0, 1.0}, {1.0, -2.0}}, exec);
+    auto x =
+        gko::initialize<Mtx>({{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, exec);
+
+    solver->apply(b.get(), x.get());
+
+    ASSERT_MTX_NEAR(x, l({{-4.0, 1.0}, {-1.0, 2.0}, {4.0, -1.0}}), 1e-8);
+}
+
+
+TEST_F(Tfqmr, SolvesDenseSystemUsingAdvancedApply)
+{
+    auto solver = tfqmr_factory->generate(mtx);
+    auto alpha = gko::initialize<Mtx>({2.0}, exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, exec);
+    auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, exec);
+    auto x = gko::initialize<Mtx>({0.5, 1.0, 2.0}, exec);
+
+    solver->apply(alpha.get(), b.get(), beta.get(), x.get());
+
+    ASSERT_MTX_NEAR(x, l({-8.5, -3.0, 6.0}), 1e-8);
+}
+
+
+TEST_F(Tfqmr, SolvesMultipleDenseSystemsUsingAdvancedApply)
+{
+    auto solver = tfqmr_factory->generate(mtx);
+    auto alpha = gko::initialize<Mtx>({2.0}, exec);
+    auto beta = gko::initialize<Mtx>({-1.0}, exec);
+    auto b =
+        gko::initialize<Mtx>({{-1.0, -5.0}, {3.0, 1.0}, {1.0, -2.0}}, exec);
+    auto x =
+        gko::initialize<Mtx>({{0.5, 1.0}, {1.0, 2.0}, {2.0, 3.0}}, exec);
+
+    solver->apply(alpha.get(), b.get(), beta.get(), x.get());
+
+    ASSERT_MTX_NEAR(x, l({{-8.5, 1.0}, {-3.0, 2.0}, {6.0, -5.0}}), 1e-8);
+}
+
+
+} // namespace
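
Not part of the patch, but for orientation: a minimal sketch of how the new solver is driven, assuming only the factory interface exercised by the tests above (executor, maximum iteration count, relative residual goal). The main() wrapper, include set, and the comment on the expected result are illustrative, not part of the library.

#include <core/base/executor.hpp>
#include <core/matrix/dense.hpp>
#include <core/solver/tfqmr.hpp>

#include <memory>

int main()
{
    using Mtx = gko::matrix::Dense<>;
    auto exec = gko::ReferenceExecutor::create();
    // same 3x3 system as in the tests above
    std::shared_ptr<Mtx> A = gko::initialize<Mtx>(
        {{1.0, -3.0, 0.0}, {-4.0, 1.0, -3.0}, {2.0, -1.0, 2.0}}, exec);
    auto b = gko::initialize<Mtx>({-1.0, 3.0, 1.0}, exec);
    auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, exec);

    // at most 8 iterations, relative residual goal of 1e-15
    auto tfqmr_factory = gko::solver::TfqmrFactory<>::create(exec, 8, 1e-15);
    auto solver = tfqmr_factory->generate(A);
    solver->apply(b.get(), x.get());  // x should now be close to (-4, -1, 4)
}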