From b298cfb4945e12f1a3d666225c5efa0376bc0fac Mon Sep 17 00:00:00 2001
From: marty1885 <marty188586@gmail.com>
Date: Fri, 19 Jul 2019 14:01:39 +0800
Subject: [PATCH 01/25] add initial float16 code and type system tests

---
 Etaler/Core/DType.hpp  | 11 ++++++++++-
 Etaler/Core/Half.hpp   | 16 ++++++++++++++++
 tests/common_tests.cpp | 19 +++++++++++++++++++
 3 files changed, 45 insertions(+), 1 deletion(-)
 create mode 100644 Etaler/Core/Half.hpp
diff --git a/Etaler/Core/DType.hpp b/Etaler/Core/DType.hpp
index a201924..1d05445 100644
--- a/Etaler/Core/DType.hpp
+++ b/Etaler/Core/DType.hpp
@@ -4,6 +4,8 @@
 #include <limits>
 #include <string>
 
+#include "Half.hpp"
+
 namespace et
 {
 
@@ -13,6 +15,7 @@ enum class DType
 	Bool = 0,
 	Int32,
 	Float,
+	Half,
 };
 
 template <typename T>
@@ -24,11 +27,13 @@ constexpr inline DType typeToDType()
 		return DType::Int32;
 	else if constexpr(std::is_same<T, bool>::value || std::is_same<T, uint8_t>::value)
 		return DType::Bool;
+	else if constexpr(std::is_same<T, float16>::value)
+		return DType::Half;
 	else
 		return DType::Unknown;
 }
 
-inline size_t dtypeToSize(DType dtype)
+inline constexpr size_t dtypeToSize(DType dtype)
 {
 	if(dtype == DType::Bool)
 		return sizeof(bool);
@@ -36,6 +41,8 @@ inline size_t dtypeToSize(DType dtype)
 		return sizeof(int32_t);
 	else if(dtype == DType::Float)
 		return sizeof(float);
+	else if(dtype == DType::Half)
+		return sizeof(float16);
 	return std::numeric_limits<size_t>::max();
 }
 
@@ -47,6 +54,8 @@ inline std::string to_ctype_string(DType dtype)
 		return "int";
 	else if(dtype == DType::Float)
 		return "float";
+	else if(dtype == DType::Half)
+		return "half";
 	return "Unknown";
 }
 
diff --git a/Etaler/Core/Half.hpp b/Etaler/Core/Half.hpp
new file mode 100644
index 0000000..3f50529
--- /dev/null
+++ b/Etaler/Core/Half.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include <cstdint>
+
+namespace et
+{
+
+//A dummp float16 structure
+struct half
+{
+	uint16_t dummy_;
+};
+
+using float16 = half;
+
+}
\ No newline at end of file
diff --git a/tests/common_tests.cpp b/tests/common_tests.cpp
index a0e144b..209af54 100644
--- a/tests/common_tests.cpp
+++ b/tests/common_tests.cpp
@@ -17,6 +17,25 @@ TEST_CASE("default backend sanity")
 	REQUIRE(defaultBackend() != nullptr);
 }
 
+TEST_CASE("Type system")
+{
+	SECTION("Type sizes") {
+		REQUIRE(dtypeToSize(DType::Bool) == 1);
+		REQUIRE(dtypeToSize(DType::Int32) == 4);
+		REQUIRE(dtypeToSize(DType::Float) == 4);
+		REQUIRE(dtypeToSize(DType::Half) == 2);
+	}
+
+	SECTION("type to dtype") {
+		REQUIRE(typeToDType<int32_t>() == DType::Int32);
+		REQUIRE(typeToDType<float>() == DType::Float);
+		REQUIRE(typeToDType<bool>() == DType::Bool);
+		REQUIRE(typeToDType<half>() == DType::Half);
+	}
+
+	//TODO: Add type check for tensor operations
+}
+
 TEST_CASE("Testing Shape", "[Shape]")
 {
 	Shape s = {3,5};

From bfb4393dc48872c66212b4b4377e5c47aaaf1f1f Mon Sep 17 00:00:00 2001
From: marty1885 <marty188586@gmail.com>
Date: Fri, 19 Jul 2019 17:07:13 +0800
Subject: [PATCH 02/25] implement initial float16 support for CPU

---
 .gitmodules                    | 3 +++
 Etaler/3rdparty/half_precision | 1 +
 Etaler/Backends/CPUBackend.cpp | 4 +++-
 Etaler/Backends/CPUBackend.hpp | 4 +++-
 Etaler/CMakeLists.txt          | 1 +
 Etaler/Core/Half.hpp           | 7 ++-----
 Etaler/Core/Tensor.cpp         | 6 ++++++
 7 files changed, 19 insertions(+), 7 deletions(-)
 create mode 160000 Etaler/3rdparty/half_precision

diff --git a/.gitmodules b/.gitmodules
index 2f2130f..7c12992 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "Etaler/3rdparty/pcg-cpp"]
 	path = Etaler/3rdparty/pcg-cpp
 	url = https://github.com/imneme/pcg-cpp
+[submodule "Etaler/3rdparty/half_precision"]
+	path = Etaler/3rdparty/half_precision
+	url = https://github.com/marty1885/half_precision
diff --git a/Etaler/3rdparty/half_precision b/Etaler/3rdparty/half_precision
new file mode 160000
index 0000000..b7ef0d8
--- /dev/null
+++ b/Etaler/3rdparty/half_precision
@@ -0,0 +1 @@
+Subproject commit b7ef0d8735888f2aa9bfa4448913c86a000114fd
diff --git a/Etaler/Backends/CPUBackend.cpp b/Etaler/Backends/CPUBackend.cpp
index 4f9bab8..bcb09c4 100644
--- a/Etaler/Backends/CPUBackend.cpp
+++ b/Etaler/Backends/CPUBackend.cpp
@@ -449,8 +449,10 @@ void dispatch(DType dtype, Func f)
 		f(float{});
 	else if(dtype == DType::Bool)
 		f(bool{});
+	else if(dtype == DType::Half)
+		f(half{});
 	else
-		throw EtError("Cannot realize such dtype");
+		throw EtError("Cannot dispatch such dtype");
 }
 
 template <typename T2, typename T1>
diff --git a/Etaler/Backends/CPUBackend.hpp b/Etaler/Backends/CPUBackend.hpp
index 7777270..d5dd946 100644
--- a/Etaler/Backends/CPUBackend.hpp
+++ b/Etaler/Backends/CPUBackend.hpp
@@ -23,6 +23,8 @@ struct CPUBuffer : public BufferImpl
 			storage_ = new int32_t[shape.volume()];
 		else if(dtype == DType::Float)
 			storage_ = new float[shape.volume()];
+		else if(dtype == DType::Half)
+			storage_ = new half[shape.volume()];
 		else
 			std::cerr << "Critical Warning: CPUBuffer Initialize failed. Unknown DType" << std::endl;
 	}
@@ -42,7 +44,7 @@ struct CPUBuffer : public BufferImpl
 	virtual void* data() const override;
 
 protected:
-	std::variant<bool*, int32_t*, float*> storage_;
+	std::variant<bool*, int32_t*, float*, half*> storage_;
 };
 
 struct CPUBackend : public Backend
diff --git a/Etaler/CMakeLists.txt b/Etaler/CMakeLists.txt
index 24d01a4..6d6f2ea 100644
--- a/Etaler/CMakeLists.txt
+++ b/Etaler/CMakeLists.txt
@@ -15,6 +15,7 @@ generate_export_header(Etaler EXPORT_FILE_NAME Etaler_export.h)
 
 target_include_directories(Etaler PRIVATE 3rdparty/pcg-cpp/include)
 target_include_directories(Etaler PRIVATE 3rdparty/cereal/include)
+target_include_directories(Etaler PRIVATE 3rdparty/half_precision)
 target_include_directories(Etaler PRIVATE 3rdparty)
 
 if(${TBB_FOUND})
diff --git a/Etaler/Core/Half.hpp b/Etaler/Core/Half.hpp
index 3f50529..8404891 100644
--- a/Etaler/Core/Half.hpp
+++ b/Etaler/Core/Half.hpp
@@ -1,15 +1,12 @@
 #pragma once
 
 #include <cstdint>
+#include <Etaler/3rdparty/half_precision/half.hpp>
 
 namespace et
 {
 
-//A dummp float16 structure
-struct half
-{
-	uint16_t dummy_;
-};
+using half = half_precision::half;
 
 using float16 = half;
 
diff --git a/Etaler/Core/Tensor.cpp b/Etaler/Core/Tensor.cpp
index 1346707..eae8a52 100644
--- a/Etaler/Core/Tensor.cpp
+++ b/Etaler/Core/Tensor.cpp
@@ -114,6 +114,8 @@ static void printNDArray(std::ostream& os, const void* ptr, Shape shape, DType d
 		prettyPrintTensor(os, (int32_t*)ptr, shape, 0, shape.size(), 0, truncate);
 	else if(dtype == DType::Bool)
 		prettyPrintTensor(os, (bool*)ptr, shape, 0, shape.size(), 0, truncate);
+	else if(dtype == DType::Half)
+		prettyPrintTensor(os, (half*)ptr, shape, 0, shape.size(), 0, truncate);
 	else
 		throw EtError("Printing tensor of this type is not supported.");
 }
@@ -220,6 +222,8 @@ Tensor et::zeros(const Shape& shape, DType dtype, Backend* backend)
 		return constant<int32_t>(shape, 0, backend);
 	else if(dtype == DType::Float)
 		return constant<float>(shape, 0, backend);
+	else if(dtype == DType::Half)
+		return constant<half>(shape, half(0), backend);
 	else
 		throw EtError("Cannot creatr a tensor of zeros of type " + to_ctype_string(dtype));
 }
@@ -232,6 +236,8 @@ Tensor et::ones(const Shape& shape, DType dtype, Backend* backend)
 		return constant<int32_t>(shape, 1, backend);
 	else if(dtype == DType::Float)
 		return constant<float>(shape, 1, backend);
+	else if(dtype == DType::Half)
+		return constant<half>(shape, half(1), backend);
 	else
 		throw EtError("Cannot creatr a tensor of ones of type " + to_ctype_string(dtype));
 }

From 82fe5f61f60c435f36453902e3f11f96503f2b58 Mon Sep 17 00:00:00 2001
From: marty1885 <marty188586@gmail.com>
Date: Fri, 19 Jul 2019 17:13:04 +0800
Subject: [PATCH 03/25] Add fp16 support to OpenCLBackend

---
 Etaler/Backends/OpenCLBackend.cpp | 3 +++
 Etaler/Backends/OpenCLBackend.hpp | 1 +
 2 files changed, 4 insertions(+)

diff --git a/Etaler/Backends/OpenCLBackend.cpp b/Etaler/Backends/OpenCLBackend.cpp
index 56d9787..d7e6450 100644
--- a/Etaler/Backends/OpenCLBackend.cpp
+++ b/Etaler/Backends/OpenCLBackend.cpp
@@ -104,11 +104,14 @@ void OpenCLBackend::init(cl::Context context, cl::Platform platform, cl::Device
 	std::string device_name = device_.getInfo<CL_DEVICE_NAME>();
 	et_assert(isExtentionSupported("cl_khr_local_int32_base_atomics"), "cl_khr_local_int32_base_atomics is not supported by " + device_name);
 	et_assert(isExtentionSupported("cl_khr_local_int32_extended_atomics"), "cl_khr_local_int32_extended_atomics is not supported by " + device_name);
+
+	have_fp16_ = isExtentionSupported("cl_khr_fp16");
 }
 
 std::shared_ptr<TensorImpl> OpenCLBackend::createTensor(const Shape& shape, DType dtype, const void* data)
 {
 	et_assert(dtype != DType::Unknown);
+	et_assert(dtype != DType::Half || have_fp16_ == true, "Creating half(fp16) tensor but device have no fp16 capablity.");
 	size_t buf_size = shape.volume()*dtypeToSize(dtype);
 	cl::Buffer buf = allocBuffer(buf_size);
 
diff --git a/Etaler/Backends/OpenCLBackend.hpp b/Etaler/Backends/OpenCLBackend.hpp
index ca64106..75dc60a 100644
--- a/Etaler/Backends/OpenCLBackend.hpp
+++ b/Etaler/Backends/OpenCLBackend.hpp
@@ -187,6 +187,7 @@ struct ETALER_EXPORT OpenCLBackend : public Backend
 	cl_uint num_compute_units_;
 
 	std::vector<std::string> supported_extentions_;
+	bool have_fp16_ = false;
 };
 
 }

From f6bee2502b1f4a5a21ba311736d1fcc337db5d6c Mon Sep 17 00:00:00 2001
From: marty1885 <marty188586@gmail.com>
Date: Fri, 19 Jul 2019 21:48:27 +0800
Subject: [PATCH 04/25] fix g_backend_hold linking error on Linux

---
 Etaler/Core/DefaultBackend.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Etaler/Core/DefaultBackend.hpp b/Etaler/Core/DefaultBackend.hpp
index 73ac070..0a21ad5 100644
--- a/Etaler/Core/DefaultBackend.hpp
+++ b/Etaler/Core/DefaultBackend.hpp
@@ -8,7 +8,7 @@ namespace et
 {
 
 extern ETALER_EXPORT Backend* g_default_backend;
-extern std::shared_ptr<Backend> g_default_backend_hold;
+extern ETALER_EXPORT std::shared_ptr<Backend> g_default_backend_hold;
 
 inline void setDefaultBackend(Backend* backend) {g_default_backend = backend;}
 inline void setDefaultBackend(std::shared_ptr<Backend> backend) {g_default_backend_hold = backend; g_default_backend = backend.get();}

From 9e4dc537964fd18701484a4959feefc7d14bd858 Mon Sep 17 00:00:00 2001
From: marty1885 <marty188586@gmail.com>
Date: Fri, 19 Jul 2019 22:14:50 +0800
Subject: [PATCH 05/25] templatize dispatch()

---
 Etaler/Backends/CPUBackend.cpp | 23 ++++++++++++-----------
 Etaler/Core/TypeList.hpp       | 30 ++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 11 deletions(-)
 create mode 100644 Etaler/Core/TypeList.hpp

diff --git a/Etaler/Backends/CPUBackend.cpp b/Etaler/Backends/CPUBackend.cpp
index bcb09c4..78bf627 100644
--- a/Etaler/Backends/CPUBackend.cpp
+++ b/Etaler/Backends/CPUBackend.cpp
@@ -1,6 +1,7 @@
 #include "CPUBackend.hpp"
 #include "Etaler/Core/Views.hpp"
 #include "Etaler/Core/Random.hpp"
+#include "Etaler/Core/TypeList.hpp"
 
 #include <numeric>
 #include <cmath>
@@ -440,19 +441,19 @@ const T* getPtrToValue(size_t parent_idx, const TensorImpl* t)
 	return ((const T*)t->data())+offset;
 }
 
-template <typename Func>
-void dispatch(DType dtype, Func f)
+template <typename Func, typename TypeList = type_list_t<int32_t, float, bool, half>>
+inline void dispatch(DType dtype, Func f)
 {
-	if(dtype == DType::Int32)
-		f(int32_t{});
-	else if(dtype == DType::Float)
-		f(float{});
-	else if(dtype == DType::Bool)
-		f(bool{});
-	else if(dtype == DType::Half)
-		f(half{});
+	if constexpr(std::is_same_v<TypeList, null_t> == false) {
+		using T = typename TypeList::head;
+		if(typeToDType<T>() == dtype) {
+			f(T());
+			return;
+		}
+		dispatch<Func, typename TypeList::tail>(dtype, f);
+	}
 	else
-		throw EtError("Cannot dispatch such dtype");
+		throw EtError("Cannot dispatch such dtype ID: " + (int)dtype + to_ctype_string(dtype));
 }
 
 template <typename T2, typename T1>
diff --git a/Etaler/Core/TypeList.hpp b/Etaler/Core/TypeList.hpp
new file mode 100644
index 0000000..57d9f75
--- /dev/null
+++ b/Etaler/Core/TypeList.hpp
@@ -0,0 +1,30 @@
+#include <type_traits>
+
+namespace et
+{
+//Using STL coding style in this file
+struct null_t {};
+
+template <typename T, typename U>
+struct type_list_node
+{
+    using head = T;
+    using tail = U;
+};
+
+template <typename ... Ts> struct type_list;
+
+// Case: Normal recursion. Consume one type per call.
+template <typename T, typename ... REST>
+struct type_list<T, REST...> { 
+    using type = type_list_node<T, typename type_list<REST...>::type>;
+};
+
+// Case: Recursion abort, because the list of types ran empty
+template <>
+struct type_list<> { using type = null_t; };
+
+template <typename ... Ts>
+using type_list_t = typename type_list<Ts...>::type;
+
+}
\ No newline at end of file

From 760851716778258fe65894a2e0f856578119f060 Mon Sep 17 00:00:00 2001
From: marty1885 <marty188586@gmail.com>
Date: Fri, 19 Jul 2019 22:39:19 +0800
Subject: [PATCH 06/25] add test for result of exp()

---
 tests/common_tests.cpp | 45 ++++++++++++++++++++++++------------------
 1 file changed, 26 insertions(+), 19 deletions(-)

diff --git a/tests/common_tests.cpp b/tests/common_tests.cpp
index 209af54..64cdf2a 100644
--- a/tests/common_tests.cpp
+++ b/tests/common_tests.cpp
@@ -17,25 +17,6 @@ TEST_CASE("default backend sanity")
 	REQUIRE(defaultBackend() != nullptr);
 }
 
-TEST_CASE("Type system")
-{
-	SECTION("Type sizes") {
-		REQUIRE(dtypeToSize(DType::Bool) == 1);
-		REQUIRE(dtypeToSize(DType::Int32) == 4);
-		REQUIRE(dtypeToSize(DType::Float) == 4);
-		REQUIRE(dtypeToSize(DType::Half) == 2);
-	}
-
-	SECTION("type to dtype") {
-		REQUIRE(typeToDType<int32_t>() == DType::Int32);
-		REQUIRE(typeToDType<float>() == DType::Float);
-		REQUIRE(typeToDType<bool>() == DType::Bool);
-		REQUIRE(typeToDType<half>() == DType::Half);
-	}
-
-	//TODO: Add type check for tensor operations
-}
-
 TEST_CASE("Testing Shape", "[Shape]")
 {
 	Shape s = {3,5};
@@ -602,6 +583,32 @@ TEST_CASE("SDRClassifer")
 		CHECK(classifer.compute(encoder::category(i, num_category, num_bits),0) == i);
 }
 
+TEST_CASE("Type system")
+{
+	SECTION("Type sizes") {
+		REQUIRE(dtypeToSize(DType::Bool) == 1);
+		REQUIRE(dtypeToSize(DType::Int32) == 4);
+		REQUIRE(dtypeToSize(DType::Float) == 4);
+		REQUIRE(dtypeToSize(DType::Half) == 2);
+	}
+
+	SECTION("type to dtype") {
+		REQUIRE(typeToDType<int32_t>() == DType::Int32);
+		REQUIRE(typeToDType<float>() == DType::Float);
+		REQUIRE(typeToDType<bool>() == DType::Bool);
+		REQUIRE(typeToDType<half>() == DType::Half);
+	}
+
+	SECTION("type of Tensor operatoins") {
+		SECTION("exp") {
+			CHECK(exp(ones({1}, DType::Bool)).dtype() == DType::Float);
+			CHECK(exp(ones({1}, DType::Int32)).dtype() == DType::Float);
+			CHECK(exp(ones({1}, DType::Float)).dtype() == DType::Float);
+			CHECK(exp(ones({1}, DType::Half)).dtype() == DType::Half);
+		}
+	}
+}
+
 // TEST_CASE("Serealize")
 // {
 // 	using namespace et;

From e15029f529447639f67e60f8e4e1f977142234ce Mon Sep 17 00:00:00 2001
From: marty1885 <marty188586@gmail.com>
Date: Fri, 19 Jul 2019 22:40:12 +0800
Subject: [PATCH 07/25] fix wrong resulting type of uniary op on CPU

---
 Etaler/Backends/CPUBackend.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Etaler/Backends/CPUBackend.cpp b/Etaler/Backends/CPUBackend.cpp
index 78bf627..2f65349 100644
--- a/Etaler/Backends/CPUBackend.cpp
+++ b/Etaler/Backends/CPUBackend.cpp
@@ -473,7 +473,8 @@ static std::shared_ptr<TensorImpl> uniaryOp(const TensorImpl* src, Op op)
 			auto res = op(*ptr);
 			using ResType = decltype(res);
 			//We don't have support to double percition now. Cast it to float
-			using StoreType = typename std::conditional<std::is_same<ResType, double>::value, float, ResType>::type;
+			using StoreType = typename std::conditional_t<std::is_same_v<T, half>, half
+				, typename std::conditional_t<std::is_same_v<ResType, double>, float, ResType>>;
 			if(i == 0)
 				dest = src->backend()->createTensor(src->shape(), typeToDType<StoreType>());
 

From 65e7d9eab978ab8361a3350f62719a6a2cba7214 Mon Sep 17 00:00:00 2001
From: marty1885 <marty188586@gmail.com>
Date: Fri, 19 Jul 2019 22:40:44 +0800
Subject: [PATCH 08/25] Enable fp16 for GPU4

---
 Etaler/Backends/OpenCLBackend.cpp | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/Etaler/Backends/OpenCLBackend.cpp b/Etaler/Backends/OpenCLBackend.cpp
index d7e6450..d637e00 100644
--- a/Etaler/Backends/OpenCLBackend.cpp
+++ b/Etaler/Backends/OpenCLBackend.cpp
@@ -851,7 +851,11 @@ kernel void op(global T0* restrict x, global ResType* restrict y)
 
 )";
 
-	std::string res = f + "\n" + jitStridedView(x, 0) + "\n" + kernel;
+	std::string extention_decl;
+	if(x->dtype() == DType::Half)
+		extention_decl = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable";
+
+	std::string res = extention_decl + "\n" + f + "\n" + jitStridedView(x, 0) + "\n" + kernel;
 	replaceAll(res, "$SIZE", std::to_string(x->size()));
 	return res;
 }
@@ -872,7 +876,11 @@ kernel void op(global T0* restrict x1, global T1* restrict x2, global ResType* r
 }
 )";
 
-	std::string res = f + "\n" + jitStridedView(x1, 0) + "\n"  + jitStridedView(x2, 1) + "\n" + kernel;
+	std::string extention_decl;
+	if(x1->dtype() == DType::Half || x2->dtype() == DType::Half)
+		extention_decl = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable";
+
+	std::string res = extention_decl + "\n" + f + "\n" + jitStridedView(x1, 0) + "\n"  + jitStridedView(x2, 1) + "\n" + kernel;
 	replaceAll(res, "$SIZE", std::to_string(x1->size()));
 	return res;
 }
@@ -938,22 +946,26 @@ std::shared_ptr<TensorImpl> OpenCLBackend::applyBinaryOp(const TensorImpl* x1, c
 
 std::shared_ptr<TensorImpl> OpenCLBackend::exp(const TensorImpl* x)
 {
-	return applyUnaryOp(x, "#define f(x) (exp((float)x))", DType::Float);
+	DType result_type = x->dtype() == DType::Half ? DType::Half : DType::Float;
+	return applyUnaryOp(x, "#define f(x) (exp((float)x))", result_type);
 }
 
 std::shared_ptr<TensorImpl> OpenCLBackend::negate(const TensorImpl* x)
 {
-	return applyUnaryOp(x, "#define f(x) (-x)", x->dtype()==DType::Float? DType::Float : DType::Int32);
+	DType result_type = x->dtype() == DType::Bool ? DType::Int32 : x->dtype();
+	return applyUnaryOp(x, "#define f(x) (-x)", result_type);
 }
 
 std::shared_ptr<TensorImpl> OpenCLBackend::inverse(const TensorImpl* x)
 {
-	return applyUnaryOp(x, "#define f(x) (1.0f/(float)x)", DType::Float);
+	DType result_type = x->dtype() == DType::Half ? DType::Half : DType::Float;
+	return applyUnaryOp(x, "#define f(x) (1.0f/(float)x)", result_type);
 }
 
 std::shared_ptr<TensorImpl> OpenCLBackend::log(const TensorImpl* x)
 {
-	return applyUnaryOp(x, "#define f(x) (log((float)x))", DType::Float);
+	DType result_type = x->dtype() == DType::Half ? DType::Half : DType::Float;
+	return applyUnaryOp(x, "#define f(x) (log((float)x))", result_type);
 }
 
 std::shared_ptr<TensorImpl> OpenCLBackend::logical_not(const TensorImpl* x)

From ff0b0ceab0155e91dda5a088a1b08bd3a892b74e Mon Sep 17 00:00:00 2001
From: marty1885 <marty188586@gmail.com>
Date: Sat, 20 Jul 2019 08:58:17 +0800
Subject: [PATCH 09/25] add half percition info to deviceInfo

---
 Etaler/Backends/OpenCLBackend.cpp | 1 +
 Etaler/Backends/OpenCLBackend.hpp | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/Etaler/Backends/OpenCLBackend.cpp b/Etaler/Backends/OpenCLBackend.cpp
index d637e00..dfc580d 100644
--- a/Etaler/Backends/OpenCLBackend.cpp
+++ b/Etaler/Backends/OpenCLBackend.cpp
@@ -164,6 +164,7 @@ std::string OpenCLBackend::deviceInfo() const
 	res += "Local memory size: " + std::to_string(localMemorySize()/1024) + " KB\n";
 	res += "Local memory type: " + local_type[localMemoryType()] + "\n";
 	res += "Prefered work group size: " + std::to_string(kernel_manager_.kernel("__etaler_dummy__").getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(device_)) + "\n";
+	res += "Half percision: " + std::string(isExtentionSupported("cl_khr_fp16") ? "Yes" : "No");
 	return res;
 }
 
diff --git a/Etaler/Backends/OpenCLBackend.hpp b/Etaler/Backends/OpenCLBackend.hpp
index 75dc60a..03fe640 100644
--- a/Etaler/Backends/OpenCLBackend.hpp
+++ b/Etaler/Backends/OpenCLBackend.hpp
@@ -138,7 +138,7 @@ struct ETALER_EXPORT OpenCLBackend : public Backend
 
 	inline cl::Context context() {return context_;}
 
-	inline bool isExtentionSupported(std::string ext)
+	inline bool isExtentionSupported(std::string ext) const
 	{
 		return (std::find(supported_extentions_.begin(), supported_extentions_.end(), ext)
 			!= supported_extentions_.end());

From 6b6259955b56c5c199f49b52a44d824918bc4ade Mon Sep 17 00:00:00 2001
From: marty1885 <marty188586@gmail.com>
Date: Sat, 20 Jul 2019 09:17:20 +0800
Subject: [PATCH 10/25] add tests for uniary operation types and make backend
 complicant

---
 Etaler/Backends/CPUBackend.cpp    |  5 +++--
 Etaler/Backends/OpenCLBackend.cpp |  3 ++-
 tests/common_tests.cpp            | 31 ++++++++++++++++++++++++++++++-
 3 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/Etaler/Backends/CPUBackend.cpp b/Etaler/Backends/CPUBackend.cpp
index 2f65349..f803755 100644
--- a/Etaler/Backends/CPUBackend.cpp
+++ b/Etaler/Backends/CPUBackend.cpp
@@ -473,8 +473,9 @@ static std::shared_ptr<TensorImpl> uniaryOp(const TensorImpl* src, Op op)
 			auto res = op(*ptr);
 			using ResType = decltype(res);
 			//We don't have support to double percition now. Cast it to float
-			using StoreType = typename std::conditional_t<std::is_same_v<T, half>, half
-				, typename std::conditional_t<std::is_same_v<ResType, double>, float, ResType>>;
+			using StoreType = typename std::conditional_t<std::is_same_v<ResType, bool>, bool
+				, typename std::conditional_t<std::is_same_v<T, half>, half
+				, typename std::conditional_t<std::is_same_v<ResType, double>, float, ResType>>>;
 			if(i == 0)
 				dest = src->backend()->createTensor(src->shape(), typeToDType<StoreType>());
 
diff --git a/Etaler/Backends/OpenCLBackend.cpp b/Etaler/Backends/OpenCLBackend.cpp
index dfc580d..1c0b479 100644
--- a/Etaler/Backends/OpenCLBackend.cpp
+++ b/Etaler/Backends/OpenCLBackend.cpp
@@ -111,7 +111,6 @@ void OpenCLBackend::init(cl::Context context, cl::Platform platform, cl::Device
 std::shared_ptr<TensorImpl> OpenCLBackend::createTensor(const Shape& shape, DType dtype, const void* data)
 {
 	et_assert(dtype != DType::Unknown);
-	et_assert(dtype != DType::Half || have_fp16_ == true, "Creating half(fp16) tensor but device have no fp16 capablity.");
 	size_t buf_size = shape.volume()*dtypeToSize(dtype);
 	cl::Buffer buf = allocBuffer(buf_size);
 
@@ -131,6 +130,8 @@ std::shared_ptr<TensorImpl> OpenCLBackend::createTensor(const Shape& shape, DTyp
 
 std::shared_ptr<TensorImpl> OpenCLBackend::createTensor(const Shape& shape, DType dtype, cl::Buffer buf)
 {
+	if(dtype == DType::Half && have_fp16_ == false)
+		throw EtError("Creating half(fp16) tensor but device have no fp16 capablity.");
 	auto ptr = std::shared_ptr<OpenCLBuffer>(new OpenCLBuffer(shape, dtype, buf, shared_from_this()), [this](OpenCLBuffer* ptr){releaseTensor(ptr);});
 	return std::make_shared<TensorImpl>(ptr, shape, shapeToStride(shape));
 }
diff --git a/tests/common_tests.cpp b/tests/common_tests.cpp
index 64cdf2a..d0f4b22 100644
--- a/tests/common_tests.cpp
+++ b/tests/common_tests.cpp
@@ -604,7 +604,36 @@ TEST_CASE("Type system")
 			CHECK(exp(ones({1}, DType::Bool)).dtype() == DType::Float);
 			CHECK(exp(ones({1}, DType::Int32)).dtype() == DType::Float);
 			CHECK(exp(ones({1}, DType::Float)).dtype() == DType::Float);
-			CHECK(exp(ones({1}, DType::Half)).dtype() == DType::Half);
+			//Disabled for now due to some GPU not supporting FP16
+			// CHECK(exp(ones({1}, DType::Half)).dtype() == DType::Half);
+		}
+
+		SECTION("negation") {
+			CHECK((-ones({1}, DType::Bool)).dtype() == DType::Int32);
+			CHECK((-ones({1}, DType::Int32)).dtype() == DType::Int32);
+			CHECK((-ones({1}, DType::Float)).dtype() == DType::Float);
+			// CHECK((-ones({1}, DType::Half)).dtype() == DType::Half);
+		}
+
+		SECTION("inverse") {
+			CHECK(inverse(ones({1}, DType::Bool)).dtype() == DType::Float);
+			CHECK(inverse(ones({1}, DType::Int32)).dtype() == DType::Float);
+			CHECK(inverse(ones({1}, DType::Float)).dtype() == DType::Float);
+			// CHECK(inverse(ones({1}, DType::Half)).dtype() == DType::Half);
+		}
+
+		SECTION("log") {
+			CHECK(log(ones({1}, DType::Bool)).dtype() == DType::Float);
+			CHECK(log(ones({1}, DType::Int32)).dtype() == DType::Float);
+			CHECK(log(ones({1}, DType::Float)).dtype() == DType::Float);
+			// CHECK(log(ones({1}, DType::Half)).dtype() == DType::Half);
+		}
+
+		SECTION("logical_not") {
+			CHECK(logical_not(ones({1}, DType::Bool)).dtype() == DType::Bool);
+			CHECK(logical_not(ones({1}, DType::Int32)).dtype() == DType::Bool);
+			CHECK(logical_not(ones({1}, DType::Float)).dtype() == DType::Bool);
+			// CHECK(logical_not(ones({1}, DType::Half)).dtype() == DType::Bool);
 		}
 	}
 }

From c3792bda15e77540bd8e1c72d2903647d33e917d Mon Sep 17 00:00:00 2001
From: marty1885 <marty188586@gmail.com>
Date: Sat, 20 Jul 2019 09:41:48 +0800
Subject: [PATCH 11/25] enable FP16 tests for supported cases

---
 tests/common_tests.cpp | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/tests/common_tests.cpp b/tests/common_tests.cpp
index d0f4b22..667a5db 100644
--- a/tests/common_tests.cpp
+++ b/tests/common_tests.cpp
@@ -600,40 +600,50 @@ TEST_CASE("Type system")
 	}
 
 	SECTION("type of Tensor operatoins") {
+		bool support_fp16 = [&](){
+			try {ones({1}, DType::Half);}
+			catch(const EtError&) {return false;}
+			return true;
+		}();
+
 		SECTION("exp") {
 			CHECK(exp(ones({1}, DType::Bool)).dtype() == DType::Float);
 			CHECK(exp(ones({1}, DType::Int32)).dtype() == DType::Float);
 			CHECK(exp(ones({1}, DType::Float)).dtype() == DType::Float);
-			//Disabled for now due to some GPU not supporting FP16
-			// CHECK(exp(ones({1}, DType::Half)).dtype() == DType::Half);
+			if(support_fp16)
+				CHECK(exp(ones({1}, DType::Half)).dtype() == DType::Half);
 		}
 
 		SECTION("negation") {
 			CHECK((-ones({1}, DType::Bool)).dtype() == DType::Int32);
 			CHECK((-ones({1}, DType::Int32)).dtype() == DType::Int32);
 			CHECK((-ones({1}, DType::Float)).dtype() == DType::Float);
-			// CHECK((-ones({1}, DType::Half)).dtype() == DType::Half);
+			if(support_fp16)
+				CHECK((-ones({1}, DType::Half)).dtype() == DType::Half);
 		}
 
 		SECTION("inverse") {
 			CHECK(inverse(ones({1}, DType::Bool)).dtype() == DType::Float);
 			CHECK(inverse(ones({1}, DType::Int32)).dtype() == DType::Float);
 			CHECK(inverse(ones({1}, DType::Float)).dtype() == DType::Float);
-			// CHECK(inverse(ones({1}, DType::Half)).dtype() == DType::Half);
+			if(support_fp16)
+				CHECK(inverse(ones({1}, DType::Half)).dtype() == DType::Half);
 		}
 
 		SECTION("log") {
 			CHECK(log(ones({1}, DType::Bool)).dtype() == DType::Float);
 			CHECK(log(ones({1}, DType::Int32)).dtype() == DType::Float);
 			CHECK(log(ones({1}, DType::Float)).dtype() == DType::Float);
-			// CHECK(log(ones({1}, DType::Half)).dtype() == DType::Half);
+			if(support_fp16)
+				CHECK(log(ones({1}, DType::Half)).dtype() == DType::Half);
 		}
 
 		SECTION("logical_not") {
 			CHECK(logical_not(ones({1}, DType::Bool)).dtype() == DType::Bool);
 			CHECK(logical_not(ones({1}, DType::Int32)).dtype() == DType::Bool);
 			CHECK(logical_not(ones({1}, DType::Float)).dtype() == DType::Bool);
-			// CHECK(logical_not(ones({1}, DType::Half)).dtype() == DType::Bool);
+			if(support_fp16)
+				CHECK(logical_not(ones({1}, DType::Half)).dtype() == DType::Bool);
 		}
 	}
 }

From fdf9ccf875fb405b51bbc57b56c381db44e34cbf Mon Sep 17 00:00:00 2001
From: marty1885 <marty188586@gmail.com>
Date: Sat, 20 Jul 2019 09:54:51 +0800
Subject: [PATCH 12/25] add type test for sum

---
 Etaler/Backends/CPUBackend.cpp    | 2 ++
 Etaler/Backends/OpenCLBackend.cpp | 4 ++++
 tests/common_tests.cpp            | 8 ++++++++
 3 files changed, 14 insertions(+)

diff --git a/Etaler/Backends/CPUBackend.cpp b/Etaler/Backends/CPUBackend.cpp
index f803755..ee9c5a3 100644
--- a/Etaler/Backends/CPUBackend.cpp
+++ b/Etaler/Backends/CPUBackend.cpp
@@ -572,6 +572,8 @@ std::shared_ptr<TensorImpl> CPUBackend::sum(const TensorImpl* x, size_t chunk_si
 			DType dtype = x->dtype();
 			if(dtype == DType::Bool || dtype == DType::Int32)
 				return DType::Int32;
+			else if(dtype == DType::Half)
+				return DType::Half;
 			else
 				return DType::Float;
 		}();
diff --git a/Etaler/Backends/OpenCLBackend.cpp b/Etaler/Backends/OpenCLBackend.cpp
index 1c0b479..7cf2e6d 100644
--- a/Etaler/Backends/OpenCLBackend.cpp
+++ b/Etaler/Backends/OpenCLBackend.cpp
@@ -775,6 +775,8 @@ std::shared_ptr<TensorImpl> OpenCLBackend::sum(const TensorImpl* x, size_t chunk
 			DType dtype = x->dtype();
 			if(dtype == DType::Bool || dtype == DType::Int32)
 				return DType::Int32;
+			else if(dtype == DType::Half)
+				return DType::Half;
 			else
 				return DType::Float;
 		}();
@@ -783,6 +785,8 @@ std::shared_ptr<TensorImpl> OpenCLBackend::sum(const TensorImpl* x, size_t chunk
 	DType intermid_type = [](DType in, DType out) {
 		if(in == DType::Float)
 			return DType::Float;
+		if(out == DType::Half)
+			return DType::Half;
 		return DType::Int32;
 	}(x->dtype(), result_dtype);
 
diff --git a/tests/common_tests.cpp b/tests/common_tests.cpp
index 667a5db..625f701 100644
--- a/tests/common_tests.cpp
+++ b/tests/common_tests.cpp
@@ -645,6 +645,14 @@ TEST_CASE("Type system")
 			if(support_fp16)
 				CHECK(logical_not(ones({1}, DType::Half)).dtype() == DType::Bool);
 		}
+
+		SECTION("sum") {
+			CHECK(ones({1}, DType::Bool).sum().dtype() == DType::Int32);
+			CHECK(ones({1}, DType::Int32).sum().dtype() == DType::Int32);
+			CHECK(ones({1}, DType::Float).sum().dtype() == DType::Float);
+			if(support_fp16)
+				CHECK(ones({1}, DType::Half).sum().dtype() == DType::Half);
+		}
 	}
 }
 

From 6a72bf873bc1b3beaf70bfcc3c873d4a1d44d8f0 Mon Sep 17 00:00:00 2001
From: marty1885 <marty188586@gmail.com>
Date: Sat, 20 Jul 2019 10:29:36 +0800
Subject: [PATCH 13/25] add test for binray operation result type

---
 tests/common_tests.cpp | 58 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/tests/common_tests.cpp b/tests/common_tests.cpp
index 625f701..99649ff 100644
--- a/tests/common_tests.cpp
+++ b/tests/common_tests.cpp
@@ -653,6 +653,64 @@ TEST_CASE("Type system")
 			if(support_fp16)
 				CHECK(ones({1}, DType::Half).sum().dtype() == DType::Half);
 		}
+
+		auto solve_binary_op_type = [](DType self, DType other)->DType {
+			// Implement a C++ like type promotion rule
+			if(other == DType::Float || self == DType::Float)
+					return DType::Float;
+				else if(other == DType::Half || self == DType::Half)
+					return DType::Half;
+				return DType::Int32; //Even bool is promoted to int in operation
+		};
+
+		std::vector<DType> types = {DType::Int32, DType::Bool, DType::Float, DType::Half};
+		SECTION("add") {
+			for(auto t1 : types) {
+				for(auto t2 : types) {
+					if((t1 == DType::Half || t2 == DType::Half) && support_fp16 == false)
+						continue;
+					Tensor t = ones({1}, t1);
+					Tensor q = ones({1}, t2);
+					CHECK((t+q).dtype() == solve_binary_op_type(t1, t2));
+				}
+			}
+		}
+
+		SECTION("subtract") {
+			for(auto t1 : types) {
+				for(auto t2 : types) {
+					if((t1 == DType::Half || t2 == DType::Half) && support_fp16 == false)
+						continue;
+					Tensor t = ones({1}, t1);
+					Tensor q = ones({1}, t2);
+					CHECK((t-q).dtype() == solve_binary_op_type(t1, t2));
+				}
+			}
+		}
+
+		SECTION("mul") {
+			for(auto t1 : types) {
+				for(auto t2 : types) {
+					if((t1 == DType::Half || t2 == DType::Half) && support_fp16 == false)
+						continue;
+					Tensor t = ones({1}, t1);
+					Tensor q = ones({1}, t2);
+					CHECK((t*q).dtype() == solve_binary_op_type(t1, t2));
+				}
+			}
+		}
+
+		SECTION("div") {
+			for(auto t1 : types) {
+				for(auto t2 : types) {
+					if((t1 == DType::Half || t2 == DType::Half) && support_fp16 == false)
+						continue;
+					Tensor t = ones({1}, t1);
+					Tensor q = ones({1}, t2);
+					CHECK((t/q).dtype() == solve_binary_op_type(t1, t2));
+				}
+			}
+		}
 	}
 }
 

From 6083d375b354d8be98ab4039741cbf2c8479008d Mon Sep 17 00:00:00 2001
From: marty1885 <marty188586@gmail.com>
Date: Sat, 20 Jul 2019 10:32:59 +0800
Subject: [PATCH 14/25] make OpenCLBackend binary op return correct type

---
 Etaler/Backends/OpenCLBackend.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Etaler/Backends/OpenCLBackend.cpp b/Etaler/Backends/OpenCLBackend.cpp
index 7cf2e6d..9b9c628 100644
--- a/Etaler/Backends/OpenCLBackend.cpp
+++ b/Etaler/Backends/OpenCLBackend.cpp
@@ -983,6 +983,8 @@ static DType solveBinaryOpDType(DType t1, DType t2)
 {
 	if(t1 == DType::Float || t2 == DType::Float)
 		return DType::Float;
+	else if(t1 == DType::Half || t2 == DType::Half)
+		return DType::Half;
 	return DType::Int32;
 }
 

From a08d5f67cde04430830ab5ae1078ac726858ea6f Mon Sep 17 00:00:00 2001
From: marty1885 <marty188586@gmail.com>
Date: Sat, 20 Jul 2019 10:35:15 +0800
Subject: [PATCH 15/25] make some type tests static

---
 tests/common_tests.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/common_tests.cpp b/tests/common_tests.cpp
index 99649ff..b2eb1df 100644
--- a/tests/common_tests.cpp
+++ b/tests/common_tests.cpp
@@ -586,17 +586,17 @@ TEST_CASE("SDRClassifer")
 TEST_CASE("Type system")
 {
 	SECTION("Type sizes") {
-		REQUIRE(dtypeToSize(DType::Bool) == 1);
-		REQUIRE(dtypeToSize(DType::Int32) == 4);
-		REQUIRE(dtypeToSize(DType::Float) == 4);
-		REQUIRE(dtypeToSize(DType::Half) == 2);
+		STATIC_REQUIRE(dtypeToSize(DType::Bool) == 1);
+		STATIC_REQUIRE(dtypeToSize(DType::Int32) == 4);
+		STATIC_REQUIRE(dtypeToSize(DType::Float) == 4);
+		STATIC_REQUIRE(dtypeToSize(DType::Half) == 2);
 	}
 
 	SECTION("type to dtype") {
-		REQUIRE(typeToDType<int32_t>() == DType::Int32);
-		REQUIRE(typeToDType<float>() == DType::Float);
-		REQUIRE(typeToDType<bool>() == DType::Bool);
-		REQUIRE(typeToDType<half>() == DType::Half);
+		STATIC_REQUIRE(typeToDType<int32_t>() == DType::Int32);
+		STATIC_REQUIRE(typeToDType<float>() == DType::Float);
+		STATIC_REQUIRE(typeToDType<bool>() == DType::Bool);
+		STATIC_REQUIRE(typeToDType<half>() == DType::Half);
 	}
 
 	SECTION("type of Tensor operatoins") {

From 077525d63b6bcc8bdb6f48b23b4137afa177580a Mon Sep 17 00:00:00 2001
From: marty1885 <marty188586@gmail.com>
Date: Sat, 20 Jul 2019 10:53:50 +0800
Subject: [PATCH 16/25] serialize half

---
 Etaler/Core/Serialize.cpp | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/Etaler/Core/Serialize.cpp b/Etaler/Core/Serialize.cpp
index b0b04e3..b920128 100644
--- a/Etaler/Core/Serialize.cpp
+++ b/Etaler/Core/Serialize.cpp
@@ -30,6 +30,15 @@ void load(Archive & archive , Shape & s)
 	s = Shape(vec.begin(), vec.end());
 }
 
+template<class Archive>
+void serialize(Archive & archive,
+	half & m)
+{
+	archive(m.storage_);
+}
+
+
+
 template <class Archive>
 void save(Archive & archive, Tensor const & t)
 {
@@ -40,6 +49,8 @@ void save(Archive & archive, Tensor const & t)
 			return "float";
 		if(t.dtype() == DType::Int32)
 			return "int32";
+		if(t.dtype() == DType::Half)
+			return "half";
 
 		throw EtError("Cannot handle such dtype()");
 	}();
@@ -59,6 +70,10 @@ void save(Archive & archive, Tensor const & t)
 		std::vector<int32_t> arr = t.toHost<int32_t>();
 		archive(make_nvp("data", arr));
 	}
+	else if(t.dtype() == DType::Half) {
+		std::vector<half> arr = t.toHost<half>();
+		archive(make_nvp("data", arr));
+	}
 }
 
 template <class Archive>
@@ -85,6 +100,11 @@ void load(Archive & archive, Tensor & t)
 		archive(make_nvp("data", d));
 		t = Tensor(s, d.data());
 	}
+	else if(dtype == "half") {
+		std::vector<half> d(s.volume());
+		archive(make_nvp("data", d));
+		t = Tensor(s, d.data());
+	}
 }
 
 template <class Archive>
@@ -137,6 +157,8 @@ void save(Archive & archive ,StateDict const & item)
 			types.push_back("std::vector<int>");
 		else if(v.type() == typeid(std::vector<float>))
 			types.push_back("std::vector<float>");
+		else if(v.type() == typeid(std::vector<half>))
+			types.push_back("std::vector<half>");
 		else
 			throw EtError("Cannot save (mangled name:) type " + std::string(v.type().name()) + ", key " + k);
 	}
@@ -163,6 +185,8 @@ void save(Archive & archive ,StateDict const & item)
 			archive(std::any_cast<std::vector<int>>(v));
 		else if(v.type() == typeid(std::vector<float>))
 			archive(std::any_cast<std::vector<float>>(v));
+		else if(v.type() == typeid(std::vector<half>))
+			archive(std::any_cast<std::vector<half>>(v));
 		else
 			throw EtError("Cannot save type " + std::string(typeid(decltype(v)).name()) + ", key " + k);
 
@@ -211,6 +235,8 @@ void load(Archive & archive ,StateDict & item)
 			read_archive<std::vector<int>>(archive, item, key);
 		else if(type == "std::vector<float>")
 			read_archive<std::vector<float>>(archive, item, key);
+		else if(type == "std::vector<half>")
+			read_archive<std::vector<half>>(archive, item, key);
 		else
 			throw EtError("Cannot serealize type " + type);
 

From 5e828745f4d11b3b61b945b871495a2dbb218601 Mon Sep 17 00:00:00 2001
From: marty1885 <marty188586@gmail.com>
Date: Sun, 21 Jul 2019 13:43:32 +0800
Subject: [PATCH 17/25] fix OpenCL sum not working with half

---
 Etaler/Backends/OpenCLBackend.cpp | 3 ++-
 kernels/sum.cl                    | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/Etaler/Backends/OpenCLBackend.cpp b/Etaler/Backends/OpenCLBackend.cpp
index 9b9c628..52d088c 100644
--- a/Etaler/Backends/OpenCLBackend.cpp
+++ b/Etaler/Backends/OpenCLBackend.cpp
@@ -790,7 +790,8 @@ std::shared_ptr<TensorImpl> OpenCLBackend::sum(const TensorImpl* x, size_t chunk
 		return DType::Int32;
 	}(x->dtype(), result_dtype);
 
-	std::string args = "-DInType=" + to_ctype_string(x->dtype()) + " -DOutType=" + to_ctype_string(result_dtype) + " -DIntermidType=" + to_ctype_string(intermid_type);
+	std::string args = "-DInType=" + to_ctype_string(x->dtype()) + " -DOutType=" + to_ctype_string(result_dtype) + " -DIntermidType=" + to_ctype_string(intermid_type)
+		+ (intermid_type==DType::Half? " -DIntermidIsHalf" : "");
 	std::string program_name = "sum" + hash_string(args);
 	kernel_manager_.compileFromFile("sum.cl", program_name, {"sum"}, false, args);
 
diff --git a/kernels/sum.cl b/kernels/sum.cl
index 7de6f02..8ac8bab 100644
--- a/kernels/sum.cl
+++ b/kernels/sum.cl
@@ -10,6 +10,10 @@
         #error IntermidType not defined
 #endif
 
+#ifdef IntermidIsHalf
+        #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+
 //InType: Input Data type
 //OutType: Output Data type
 //in_size: number of elements of the input

From 13a2e1f138d4ef04b64f18915a44cfbb2dfbc676 Mon Sep 17 00:00:00 2001
From: marty1885 <marty188586@gmail.com>
Date: Sun, 21 Jul 2019 13:52:28 +0800
Subject: [PATCH 18/25] fix cast not supporting half

---
 Etaler/Backends/OpenCLBackend.cpp | 3 ++-
 kernels/cast.cl                   | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/Etaler/Backends/OpenCLBackend.cpp b/Etaler/Backends/OpenCLBackend.cpp
index 52d088c..19d3f34 100644
--- a/Etaler/Backends/OpenCLBackend.cpp
+++ b/Etaler/Backends/OpenCLBackend.cpp
@@ -349,7 +349,8 @@ std::shared_ptr<TensorImpl> OpenCLBackend::cast(const TensorImpl* x, DType toTyp
 {
 	et_assert(x->backend() == this);
 	et_assert(x->iscontiguous());
-	auto args = "-DInType="+to_ctype_string(x->dtype())+" -DOutType="+to_ctype_string(toType);
+	auto args = "-DInType="+to_ctype_string(x->dtype())+" -DOutType="+to_ctype_string(toType)
+		+ (x->dtype() == DType::Half || toType == DType::Half ? " -DHalfSupport" : "");
 	auto hash = hash_string(args);
 	auto program_name = "cast"+hash;
 	kernel_manager_.compileFromFile("cast.cl", program_name, {"cast"}, false, args);
diff --git a/kernels/cast.cl b/kernels/cast.cl
index 912149d..6105f35 100644
--- a/kernels/cast.cl
+++ b/kernels/cast.cl
@@ -6,6 +6,10 @@
 	#error OutType not defined
 #endif
 
+#ifdef HalfSupport
+        #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+
 //InType: Input Type
 //OutType: OutputType
 //global_size: arbitrary

From ebead29e8b07a6213ed232d3e210d4ee67f0f9e7 Mon Sep 17 00:00:00 2001
From: marty1885 <marty188586@gmail.com>
Date: Mon, 22 Jul 2019 12:51:39 +0800
Subject: [PATCH 19/25] clean up tensor property check

---
 Etaler/Backends/CPUBackend.cpp    |  97 ++++++++--------------------
 Etaler/Backends/OpenCLBackend.cpp | 103 +++++++++---------------------
 Etaler/Core/TensorImpl.hpp        |  48 ++++++++++++++
 3 files changed, 104 insertions(+), 144 deletions(-)

diff --git a/Etaler/Backends/CPUBackend.cpp b/Etaler/Backends/CPUBackend.cpp
index ee9c5a3..13adfff 100644
--- a/Etaler/Backends/CPUBackend.cpp
+++ b/Etaler/Backends/CPUBackend.cpp
@@ -59,16 +59,9 @@ std::shared_ptr<TensorImpl> CPUBackend::cellActivity(const TensorImpl* x, const
 	, float connected_permeance, size_t active_threshold, bool has_unconnected_synapse)
 {
 	//Checks the input are sane
-	et_assert(x->backend() == this);
-	et_assert(connections->backend() == this);
-	et_assert(permeances->backend() == this);
-	et_assert(x->iscontiguous());
-	et_assert(connections->iscontiguous());
-	et_assert(permeances->iscontiguous());
-
-	et_assert(x->dtype() == DType::Bool);
-	et_assert(connections->dtype() == DType::Int32);
-	et_assert(permeances->dtype() == DType::Float);
+	requireProperties(x, this, DType::Bool, IsContingous());
+	requireProperties(connections, this, DType::Int32, IsContingous());
+	requireProperties(permeances, this, DType::Float, IsContingous());
 	et_assert(connections->shape() == permeances->shape());
 	et_assert(connections->dimentions() >= 2);
 
@@ -119,21 +112,13 @@ std::shared_ptr<TensorImpl> CPUBackend::cellActivity(const TensorImpl* x, const
 void CPUBackend::learnCorrilation(const TensorImpl* x, const TensorImpl* learn, const TensorImpl* connections, TensorImpl* permeances
 	, float perm_inc, float perm_dec, bool has_unconnected_synapse)
 {
-	et_assert(x->backend() == this);
-	et_assert(connections->backend() == this);
-	et_assert(permeances->backend() == this);
-	et_assert(learn->backend() == this);
-	et_assert(x->iscontiguous());
-	et_assert(learn->iscontiguous());
-	et_assert(connections->iscontiguous());
-	et_assert(permeances->iscontiguous());
+	requireProperties(x, this, DType::Bool, IsContingous());
+	requireProperties(learn, this, DType::Bool, IsContingous());
+	requireProperties(connections, this, DType::Int32, IsContingous());
+	requireProperties(permeances, this, DType::Float, IsContingous());
 
 	et_assert(connections->shape() == permeances->shape());
 	et_assert(x->shape() == learn->shape());
-	et_assert(x->dtype() == DType::Bool);
-	et_assert(learn->dtype() == DType::Bool);
-	et_assert(connections->dtype() == DType::Int32);
-	et_assert(permeances->dtype() == DType::Float);
 
 	const bool* input = (const bool*)x->data();
 	const bool* learning = (const bool*)learn->data();
@@ -167,10 +152,7 @@ void CPUBackend::learnCorrilation(const TensorImpl* x, const TensorImpl* learn,
 
 std::shared_ptr<TensorImpl> CPUBackend::globalInhibition(const TensorImpl* x, float fraction)
 {
-	et_assert(x->backend() == this);
-	et_assert(x->iscontiguous());
-
-	et_assert(x->dtype() == DType::Int32);
+	requireProperties(x, this, DType::Int32, IsContingous());
 
 	auto y = createTensor(x->shape(), DType::Bool);
 
@@ -217,8 +199,7 @@ static std::vector<To> castData(const From* ptr, size_t n)
 
 std::shared_ptr<TensorImpl> CPUBackend::cast(const TensorImpl* x, DType toType)
 {
-	et_assert(x->backend() == this);
-	et_assert(x->iscontiguous());
+	requireProperties(x, this, IsContingous());
 	const CPUBuffer* p = dynamic_cast<const CPUBuffer*>(x->buffer().get());
 	const CPUBuffer& t = *p;
 	return run<std::shared_ptr<TensorImpl>>(t, [&x, toType, this](const auto* ptr){
@@ -242,27 +223,21 @@ std::shared_ptr<TensorImpl> CPUBackend::cast(const TensorImpl* x, DType toType)
 
 void CPUBackend::copyToHost(const TensorImpl* t, void* ptr)
 {
-	et_assert(points_to<CPUBuffer>(t->buffer().get()));
-	et_assert(t->iscontiguous());
+	requireProperties(t, this, IsContingous());
 	memcpy(ptr, t->data(), t->size()*dtypeToSize(t->dtype()));
 }
 
 std::shared_ptr<TensorImpl> CPUBackend::copy(const TensorImpl* x)
 {
-	et_assert(x->backend() == this);
-	et_assert(x->iscontiguous());
+	requireProperties(x, this, IsContingous());
 	return createTensor(x->shape(), x->dtype(), x->data());
 }
 
 void CPUBackend::sortSynapse(TensorImpl* connections, TensorImpl* permeances)
 {
+	requireProperties(connections, this, DType::Int32, IsContingous());
+	requireProperties(permeances, this, DType::Float, IsContingous());
 	et_assert(connections->shape() == permeances->shape());
-	et_assert(connections->backend() == this);
-	et_assert(permeances->backend() == this);
-	et_assert(connections->dtype() == DType::Int32);
-	et_assert(permeances->dtype() == DType::Float);
-	et_assert(connections->iscontiguous());
-	et_assert(permeances->iscontiguous());
 
 	size_t max_synapse_per_cell = connections->shape().back();
 	size_t num_cells = connections->size()/max_synapse_per_cell;
@@ -287,12 +262,8 @@ void CPUBackend::sortSynapse(TensorImpl* connections, TensorImpl* permeances)
 
 std::shared_ptr<TensorImpl> CPUBackend::burst(const TensorImpl* x, const TensorImpl* s)
 {
-	et_assert(x->backend() == this);
-	et_assert(s->backend() == this);
-	et_assert(x->dtype() == DType::Bool);
-	et_assert(s->dtype() == DType::Bool);
-	et_assert(x->iscontiguous());
-	et_assert(s->iscontiguous());
+	requireProperties(x, this, DType::Bool, IsContingous());
+	requireProperties(s, this, DType::Bool, IsContingous());
 
 	Shape shape = s->shape();
 	shape.pop_back();
@@ -320,9 +291,7 @@ std::shared_ptr<TensorImpl> CPUBackend::burst(const TensorImpl* x, const TensorI
 
 std::shared_ptr<TensorImpl> CPUBackend::reverseBurst(const TensorImpl* x)
 {
-	et_assert(x->backend() == this);
-	et_assert(x->dtype() == DType::Bool);
-	et_assert(x->iscontiguous());
+	requireProperties(x, this, DType::Bool, IsContingous());
 
 	size_t cells_per_column = x->shape().back();
 	size_t num_columns = x->size()/cells_per_column;
@@ -349,19 +318,10 @@ std::shared_ptr<TensorImpl> CPUBackend::reverseBurst(const TensorImpl* x)
 void CPUBackend::growSynapses(const TensorImpl* x, const TensorImpl* y, TensorImpl* connections
 	, TensorImpl* permeances, float initial_perm)
 {
-	et_assert(x->backend() == this);
-	et_assert(y->backend() == this);
-	et_assert(connections->backend() == this);
-	et_assert(permeances->backend() == this);
-	et_assert(x->iscontiguous());
-	et_assert(y->iscontiguous());
-	et_assert(connections->iscontiguous());
-	et_assert(permeances->iscontiguous());
-
-	et_assert(x->dtype() == DType::Bool);
-	et_assert(y->dtype() == DType::Bool);
-	et_assert(connections->dtype() == DType::Int32);
-	et_assert(permeances->dtype() == DType::Float);
+	requireProperties(x, this, DType::Bool, IsContingous());
+	requireProperties(y, this, DType::Bool, IsContingous());
+	requireProperties(connections, this, DType::Int32, IsContingous());
+	requireProperties(permeances, this, DType::Float, IsContingous());
 
 	et_assert(connections->shape() == permeances->shape());
 	Shape s = connections->shape();
@@ -520,7 +480,7 @@ static std::shared_ptr<TensorImpl> binaryOp(const TensorImpl* src, const TensorI
 
 std::shared_ptr<TensorImpl> CPUBackend::realize(const TensorImpl* x)
 {
-	et_assert(x->backend() == this);
+	requireProperties(x, this);
 	et_assert(x->data() != nullptr);
 	auto res = createTensor(x->shape(), x->dtype());
 
@@ -537,8 +497,8 @@ std::shared_ptr<TensorImpl> CPUBackend::realize(const TensorImpl* x)
 
 void CPUBackend::assign(TensorImpl* dest, const TensorImpl* src)
 {
-	et_assert(dest->backend() == this);
-	et_assert(src->backend() == this);
+	requireProperties(dest, this);
+	requireProperties(src, this);
 
 	if(dest->shape() != src->shape())
 		throw EtError("Shape mismatch in tensor assignment. Shape "
@@ -561,9 +521,8 @@ void CPUBackend::assign(TensorImpl* dest, const TensorImpl* src)
 
 std::shared_ptr<TensorImpl> CPUBackend::sum(const TensorImpl* x, size_t chunk_size, DType dtype)
 {
-	et_assert(x->backend() == this);
+	requireProperties(x, this, IsContingous());
 	et_assert(x->size() % chunk_size == 0);
-	et_assert(x->iscontiguous());
 
 	DType result_dtype = dtype;
 
@@ -600,13 +559,9 @@ std::shared_ptr<TensorImpl> CPUBackend::sum(const TensorImpl* x, size_t chunk_si
 
 void CPUBackend::decaySynapses(TensorImpl* connections, TensorImpl* permeances, float threshold)
 {
+	requireProperties(connections, this, DType::Int32, IsContingous());
+	requireProperties(permeances, this, DType::Float, IsContingous());
 	et_assert(connections->shape() == permeances->shape());
-	et_assert(connections->backend() == this);
-	et_assert(permeances->backend() == this);
-	et_assert(connections->dtype() == DType::Int32);
-	et_assert(permeances->dtype() == DType::Float);
-	et_assert(permeances->iscontiguous());
-	et_assert(connections->iscontiguous());
 
 	float* perms = (float*)permeances->data();
 	uint32_t* conns = (uint32_t*)connections->data();
diff --git a/Etaler/Backends/OpenCLBackend.cpp b/Etaler/Backends/OpenCLBackend.cpp
index 19d3f34..8bdfe72 100644
--- a/Etaler/Backends/OpenCLBackend.cpp
+++ b/Etaler/Backends/OpenCLBackend.cpp
@@ -264,16 +264,9 @@ void KernelManager::addSearchPath(const std::string& path)
 std::shared_ptr<TensorImpl> OpenCLBackend::cellActivity(const TensorImpl* x, const TensorImpl* connections,
 	const TensorImpl* permeances, float connected_permeance, size_t active_threshold, bool has_unconnected_synapse)
 {
-	et_assert(x->backend() == this);
-	et_assert(connections->backend() == this);
-	et_assert(permeances->backend() == this);
-	et_assert(x->iscontiguous());
-	et_assert(connections->iscontiguous());
-	et_assert(permeances->iscontiguous());
-
-	et_assert(x->dtype() == DType::Bool);
-	et_assert(connections->dtype() == DType::Int32);
-	et_assert(permeances->dtype() == DType::Float);
+	requireProperties(x, this, DType::Bool, IsContingous());
+	requireProperties(connections, this, DType::Int32, IsContingous());
+	requireProperties(permeances, this, DType::Float, IsContingous());
 	et_assert(connections->shape() == permeances->shape());
 	et_assert(connections->dimentions() >= 2);
 
@@ -312,10 +305,7 @@ std::shared_ptr<TensorImpl> OpenCLBackend::cellActivity(const TensorImpl* x, con
 
 std::shared_ptr<TensorImpl> OpenCLBackend::globalInhibition(const TensorImpl* x, float fraction)
 {
-	et_assert(x->backend() == this);
-	et_assert(x->iscontiguous());
-
-	et_assert(x->dtype() == DType::Int32);
+	requireProperties(x, this, DType::Int32, IsContingous());
 
 	auto y = createTensor(x->shape(), DType::Bool);
 
@@ -347,8 +337,7 @@ std::shared_ptr<TensorImpl> OpenCLBackend::globalInhibition(const TensorImpl* x,
 
 std::shared_ptr<TensorImpl> OpenCLBackend::cast(const TensorImpl* x, DType toType)
 {
-	et_assert(x->backend() == this);
-	et_assert(x->iscontiguous());
+	requireProperties(x, this, IsContingous());
 	auto args = "-DInType="+to_ctype_string(x->dtype())+" -DOutType="+to_ctype_string(toType)
 		+ (x->dtype() == DType::Half || toType == DType::Half ? " -DHalfSupport" : "");
 	auto hash = hash_string(args);
@@ -374,8 +363,7 @@ void OpenCLBackend::sync() const
 
 std::shared_ptr<TensorImpl> OpenCLBackend::copy(const TensorImpl* x)
 {
-	et_assert(x->backend() == this);
-	et_assert(x->iscontiguous());
+	requireProperties(x, this, IsContingous());
 	size_t buf_size = x->size()*dtypeToSize(x->dtype());
 	cl::Buffer buf = allocBuffer(buf_size);
 	const cl::Buffer& src = std::static_pointer_cast<const OpenCLBuffer>(x->buffer())->buffer();
@@ -389,17 +377,13 @@ std::shared_ptr<TensorImpl> OpenCLBackend::copy(const TensorImpl* x)
 void OpenCLBackend::learnCorrilation(const TensorImpl* x, const TensorImpl* learn, const TensorImpl* connections,
 	TensorImpl* permeances, float perm_inc, float perm_dec, bool has_unconnected_synapse)
 {
-	et_assert(x->backend() == this);
-	et_assert(connections->backend() == this);
-	et_assert(permeances->backend() == this);
-	et_assert(learn->backend() == this);
+	requireProperties(x, this, DType::Bool, IsContingous());
+	requireProperties(learn, this, DType::Bool, IsContingous());
+	requireProperties(connections, this, DType::Int32, IsContingous());
+	requireProperties(permeances, this, DType::Float, IsContingous());
 
 	et_assert(connections->shape() == permeances->shape());
 	et_assert(x->shape() == learn->shape());
-	et_assert(x->dtype() == DType::Bool);
-	et_assert(learn->dtype() == DType::Bool);
-	et_assert(connections->dtype() == DType::Int32);
-	et_assert(permeances->dtype() == DType::Float);
 
 	auto args = "-DINPUT_SIZE="+str(x->size())+" -DMAX_SYNAPSE_PER_CELL="+str(connections->shape().back())+" -DNO_UNUSED_SYNAPSE="+str(!has_unconnected_synapse)
 		+" -DOUTPUT_SIZE="+str(learn->size());
@@ -431,13 +415,9 @@ void OpenCLBackend::learnCorrilation(const TensorImpl* x, const TensorImpl* lear
 
 void OpenCLBackend::sortSynapse(TensorImpl* connections, TensorImpl* permeances)
 {
+	requireProperties(connections, this, DType::Int32, IsContingous());
+	requireProperties(permeances, this, DType::Float, IsContingous());
 	et_assert(connections->shape() == permeances->shape());
-	et_assert(connections->backend() == this);
-	et_assert(permeances->backend() == this);
-	et_assert(connections->dtype() == DType::Int32);
-	et_assert(permeances->dtype() == DType::Float);
-	et_assert(connections->iscontiguous());
-	et_assert(permeances->iscontiguous());
 
 	auto args = "-DMAX_SYNAPSE_PER_CELL="+str(connections->shape().back());
 	auto program_name = "sortSynapse"+hash_string(args);
@@ -463,12 +443,8 @@ void OpenCLBackend::sortSynapse(TensorImpl* connections, TensorImpl* permeances)
 
 std::shared_ptr<TensorImpl> OpenCLBackend::burst(const TensorImpl* x, const TensorImpl* s)
 {
-	et_assert(x->backend() == this);
-	et_assert(s->backend() == this);
-	et_assert(x->dtype() == DType::Bool);
-	et_assert(s->dtype() == DType::Bool);
-	et_assert(x->iscontiguous());
-	et_assert(s->iscontiguous());
+	requireProperties(x, this, DType::Bool, IsContingous());
+	requireProperties(s, this, DType::Bool, IsContingous());
 
 	Shape shape = s->shape();
 	shape.pop_back();
@@ -495,9 +471,7 @@ std::shared_ptr<TensorImpl> OpenCLBackend::burst(const TensorImpl* x, const Tens
 
 std::shared_ptr<TensorImpl> OpenCLBackend::reverseBurst(const TensorImpl* x)
 {
-	et_assert(x->backend() == this);
-	et_assert(x->dtype() == DType::Bool);
-	et_assert(x->iscontiguous());
+	requireProperties(x, this, DType::Bool, IsContingous());
 
 	size_t cells_per_column = x->shape().back();
 	size_t num_columns = x->size()/cells_per_column;
@@ -525,21 +499,11 @@ std::shared_ptr<TensorImpl> OpenCLBackend::reverseBurst(const TensorImpl* x)
 void OpenCLBackend::growSynapses(const TensorImpl* x, const TensorImpl* y, TensorImpl* connections
 		, TensorImpl* permeances, float initial_perm)
 {
-	et_assert(x->backend() == this);
-	et_assert(y->backend() == this);
-	et_assert(connections->backend() == this);
-	et_assert(permeances->backend() == this);
-	et_assert(x->iscontiguous());
-	et_assert(y->iscontiguous());
-	et_assert(connections->iscontiguous());
-	et_assert(permeances->iscontiguous());
-
-	et_assert(x->dtype() == DType::Bool);
-	et_assert(y->dtype() == DType::Bool);
-	et_assert(connections->dtype() == DType::Int32);
-	et_assert(permeances->dtype() == DType::Float);
-
-	et_assert(x->shape() == y->shape());
+	requireProperties(x, this, DType::Bool, IsContingous());
+	requireProperties(y, this, DType::Bool, IsContingous());
+	requireProperties(connections, this, DType::Int32, IsContingous());
+	requireProperties(permeances, this, DType::Float, IsContingous());
+
 	et_assert(connections->shape() == permeances->shape());
 	Shape s = connections->shape();
 	s.pop_back();
@@ -584,9 +548,7 @@ void OpenCLBackend::growSynapses(const TensorImpl* x, const TensorImpl* y, Tenso
 
 std::optional<cl::Buffer> OpenCLBackend::toSparse(const TensorImpl* x)
 {
-	et_assert(x->backend() == this);
-	et_assert(x->dtype() == DType::Bool);
-	et_assert(x->iscontiguous());
+	requireProperties(x, this, DType::Bool, IsContingous());
 
 	auto args = "-DINPUT_SIZE="+str(x->size());
 	auto program_name = "toSparse"+hash_string(args);
@@ -707,7 +669,7 @@ kernel void copy(global Type* restrict x, global Type* restrict y)
 
 std::shared_ptr<TensorImpl> OpenCLBackend::realize(const TensorImpl* x)
 {
-	et_assert(x->backend() == this);
+	requireProperties(x, this);
 	if(x->iscontiguous() == true)
 		return copy(x);
 
@@ -736,8 +698,8 @@ std::shared_ptr<TensorImpl> OpenCLBackend::realize(const TensorImpl* x)
 
 void OpenCLBackend::assign(TensorImpl* dest, const TensorImpl* src)
 {
-	et_assert(dest->backend() == this);
-	et_assert(src->backend() == this);
+	requireProperties(dest, this);
+	requireProperties(src, this);
 
 	if(dest->shape() != src->shape())
 	throw EtError("Shape mismatch in tensor assignment. Shape "
@@ -766,9 +728,8 @@ void OpenCLBackend::assign(TensorImpl* dest, const TensorImpl* src)
 
 std::shared_ptr<TensorImpl> OpenCLBackend::sum(const TensorImpl* x, size_t chunk_size, DType dtype)
 {
-	et_assert(x->backend() == this);
+	requireProperties(x, this, IsContingous());
 	et_assert(x->size() % chunk_size == 0);
-	et_assert(x->iscontiguous());
 
 	DType result_dtype = dtype;
 	if(dtype == DType::Unknown) {
@@ -815,13 +776,9 @@ std::shared_ptr<TensorImpl> OpenCLBackend::sum(const TensorImpl* x, size_t chunk
 
 void OpenCLBackend::decaySynapses(TensorImpl* connections, TensorImpl* permeances, float threshold)
 {
+	requireProperties(connections, this, DType::Int32, IsContingous());
+	requireProperties(permeances, this, DType::Float, IsContingous());
 	et_assert(connections->shape() == permeances->shape());
-	et_assert(connections->backend() == this);
-	et_assert(permeances->backend() == this);
-	et_assert(connections->dtype() == DType::Int32);
-	et_assert(permeances->dtype() == DType::Float);
-	et_assert(connections->iscontiguous());
-	et_assert(permeances->iscontiguous());
 
 	size_t max_synapses_per_cell = connections->shape().back();
 	size_t input_cell_count = connections->size()/max_synapses_per_cell;
@@ -895,7 +852,7 @@ kernel void op(global T0* restrict x1, global T1* restrict x2, global ResType* r
 
 std::shared_ptr<TensorImpl> OpenCLBackend::applyUnaryOp(const TensorImpl* x, std::string f, DType resType)
 {
-	et_assert(x->backend() == this);
+	requireProperties(x, this);
 
 	std::string args = "-DT0="+to_ctype_string(x->dtype())+" -DResType="+to_ctype_string(resType);
 	std::string program_name = f+hash_string(args)+std::to_string(x->offset())+to_string(x->shape())+to_string(x->stride());
@@ -921,8 +878,8 @@ std::shared_ptr<TensorImpl> OpenCLBackend::applyUnaryOp(const TensorImpl* x, std
 
 std::shared_ptr<TensorImpl> OpenCLBackend::applyBinaryOp(const TensorImpl* x1, const TensorImpl* x2, std::string f, DType resType)
 {
-	et_assert(x1->backend() == this);
-	et_assert(x2->backend() == this);
+	requireProperties(x1, this);
+	requireProperties(x2, this);
 	et_assert(x1->shape() == x2->shape());
 
 	auto to_str = [](auto x){
diff --git a/Etaler/Core/TensorImpl.hpp b/Etaler/Core/TensorImpl.hpp
index 5b34853..d49121b 100644
--- a/Etaler/Core/TensorImpl.hpp
+++ b/Etaler/Core/TensorImpl.hpp
@@ -1,4 +1,5 @@
 #pragma once
+#include <cassert>
 
 #include "Shape.hpp"
 #include "DType.hpp"
@@ -52,4 +53,51 @@ struct TensorImpl : public std::enable_shared_from_this<TensorImpl>
 	size_t offset_;
 };
 
+struct IsContingous {};
+
+template <typename T>
+bool checkProperty(const TensorImpl* x, const T& value)
+{
+	if constexpr(std::is_base_of_v<Backend, std::remove_pointer_t<std::decay_t<T>>>)
+		return x->backend() == value;
+	else if constexpr(std::is_same_v<T, DType>)
+		return x->dtype() == value;
+	else if constexpr(std::is_same_v<T, IsContingous>)
+		return x->iscontiguous();
+	else
+		et_assert(false, "a non-supported value is passed into checkProperty");
+	return false;
+}
+
+template <typename T>
+void requireProperty(const TensorImpl* x, const T value, const std::string& line, const std::string& v_name)
+{
+	if(checkProperty(x, value) == true)
+		return;
+	
+	//Otherwise assertion failed
+	const std::string msg = line + " Tensor property requirment not match. Expecting " + v_name;
+	if constexpr(std::is_same_v<std::decay_t<T>, Backend*>)
+		throw EtError(msg + ".backend() == " + value->name());
+	else if constexpr(std::is_same_v<T, DType>)
+		throw EtError(msg + ".dtype() == " + to_ctype_string(value));
+	else if constexpr(std::is_same_v<T, IsContingous>)
+		throw EtError(msg + ".iscontiguous() == true");
+}
+
+template <typename ... Args>
+bool checkProperties(const TensorImpl* x, Args... args)
+{
+	return (checkProperty(x, args) && ...);
+}
+
+template <typename ... Args>
+void requirePropertiesInternal(const TensorImpl* x, const std::string& line, const std::string& v_name, Args... args)
+{
+	(requireProperty(x, args, line, v_name), ...);
 }
+
+}
+
+#define requireProperties(x, ...) requirePropertiesInternal(x, std::string(__FILE__)+":"+std::to_string(__LINE__)\
+	+":"+std::string(__func__)+"():", #x, __VA_ARGS__)

From 241bfa7e1a57a2c445c94c4cfd468428863131ec Mon Sep 17 00:00:00 2001
From: marty1885 <marty188586@gmail.com>
Date: Mon, 22 Jul 2019 17:13:26 +0800
Subject: [PATCH 20/25] add support for some HtM algorithms in CPUBackend

---
 Etaler/3rdparty/half_precision      |  2 +-
 Etaler/Algorithms/SpatialPooler.hpp |  2 +-
 Etaler/Backends/CPUBackend.cpp      | 94 +++++++++++++++++++----------
 examples/spbench.cpp                |  1 +
 4 files changed, 65 insertions(+), 34 deletions(-)

diff --git a/Etaler/3rdparty/half_precision b/Etaler/3rdparty/half_precision
index b7ef0d8..82cdf3a 160000
--- a/Etaler/3rdparty/half_precision
+++ b/Etaler/3rdparty/half_precision
@@ -1 +1 @@
-Subproject commit b7ef0d8735888f2aa9bfa4448913c86a000114fd
+Subproject commit 82cdf3a4cf414f2c862232fa948e0208a760af6c
diff --git a/Etaler/Algorithms/SpatialPooler.hpp b/Etaler/Algorithms/SpatialPooler.hpp
index aee57f8..b96d0d8 100644
--- a/Etaler/Algorithms/SpatialPooler.hpp
+++ b/Etaler/Algorithms/SpatialPooler.hpp
@@ -71,7 +71,7 @@ struct ETALER_EXPORT SpatialPooler
 	{
 		return to(connections_.backend());
 	}
-protected:
+//protected:
 	float permanence_inc_ = 0.1;
 	float permanence_dec_ = 0.1;
 	float connected_permanence_ = 0.21;
diff --git a/Etaler/Backends/CPUBackend.cpp b/Etaler/Backends/CPUBackend.cpp
index 13adfff..d95677b 100644
--- a/Etaler/Backends/CPUBackend.cpp
+++ b/Etaler/Backends/CPUBackend.cpp
@@ -55,24 +55,43 @@ void* CPUBuffer::data() const
 	return std::visit([](const auto& v){return (void*)v;}, storage_);
 }
 
-std::shared_ptr<TensorImpl> CPUBackend::cellActivity(const TensorImpl* x, const TensorImpl* connections, const TensorImpl* permeances
-	, float connected_permeance, size_t active_threshold, bool has_unconnected_synapse)
+template <typename TypeList = type_list_t<int32_t, float, bool, half>, typename Func = void>
+inline void dispatch(DType dtype, Func f)
+{
+	static_assert(std::is_same_v<Func, void> == false); //void is just a dummy value
+	if constexpr(std::is_same_v<TypeList, null_t> == false) {
+		using T = typename TypeList::head;
+		if(typeToDType<T>() == dtype) {
+			f(T());
+			return;
+		}
+		dispatch<typename TypeList::tail, Func>(dtype, f);
+	}
+	else
+		throw EtError("Cannot dispatch such dtype: " + to_ctype_string(dtype));
+}
+
+namespace et::detail
+{
+template <typename PermType>
+static std::shared_ptr<TensorImpl> cellActivity(const TensorImpl* x, const TensorImpl* connections, const TensorImpl* permeances
+	, float connected_permeance, size_t active_threshold, bool has_unconnected_synapse, CPUBackend* backend)
 {
 	//Checks the input are sane
-	requireProperties(x, this, DType::Bool, IsContingous());
-	requireProperties(connections, this, DType::Int32, IsContingous());
-	requireProperties(permeances, this, DType::Float, IsContingous());
+	requireProperties(x, backend, DType::Bool, IsContingous());
+	requireProperties(connections, backend, DType::Int32, IsContingous());
+	requireProperties(permeances, backend, typeToDType<PermType>(), IsContingous());
 	et_assert(connections->shape() == permeances->shape());
 	et_assert(connections->dimentions() >= 2);
 
 	Shape s = connections->shape();
 	s.pop_back();
-	auto y = createTensor(s, DType::Int32);
+	auto y = backend->createTensor(s, DType::Int32);
 
 
 	const bool* input = (const bool*)x->data();
 	const int32_t* synapses = (const int32_t*)connections->data();
-	const float* synapse_strengths = (float*)permeances->data();
+	const PermType* synapse_strengths = (PermType*)permeances->data();
 	int32_t* result = (int32_t*)y->data();
 
 	size_t max_connections_per_cell = connections->shape().back();
@@ -95,7 +114,7 @@ std::shared_ptr<TensorImpl> CPUBackend::cellActivity(const TensorImpl* x, const
 				if(input[target] == false)
 					continue;
 
-				float strength = synapse_strengths[index];
+				PermType strength = synapse_strengths[index];
 				if(strength > connected_permeance)
 					sum += 1;
 			}
@@ -109,13 +128,14 @@ std::shared_ptr<TensorImpl> CPUBackend::cellActivity(const TensorImpl* x, const
 	return y;
 }
 
-void CPUBackend::learnCorrilation(const TensorImpl* x, const TensorImpl* learn, const TensorImpl* connections, TensorImpl* permeances
-	, float perm_inc, float perm_dec, bool has_unconnected_synapse)
+template <typename PermType>
+void learnCorrilation(const TensorImpl* x, const TensorImpl* learn, const TensorImpl* connections, TensorImpl* permeances
+	, float perm_inc, float perm_dec, bool has_unconnected_synapse, CPUBackend* backend)
 {
-	requireProperties(x, this, DType::Bool, IsContingous());
-	requireProperties(learn, this, DType::Bool, IsContingous());
-	requireProperties(connections, this, DType::Int32, IsContingous());
-	requireProperties(permeances, this, DType::Float, IsContingous());
+	requireProperties(x, backend, DType::Bool, IsContingous());
+	requireProperties(learn, backend, DType::Bool, IsContingous());
+	requireProperties(connections, backend, DType::Int32, IsContingous());
+	requireProperties(permeances, backend, typeToDType<PermType>(), IsContingous());
 
 	et_assert(connections->shape() == permeances->shape());
 	et_assert(x->shape() == learn->shape());
@@ -123,7 +143,7 @@ void CPUBackend::learnCorrilation(const TensorImpl* x, const TensorImpl* learn,
 	const bool* input = (const bool*)x->data();
 	const bool* learning = (const bool*)learn->data();
 	const int32_t* synapses = (const int32_t*)connections->data();
-	float* synapse_strengths = (float*)permeances->data();
+	PermType* synapse_strengths = (PermType*)permeances->data();
 
 	size_t max_connections_per_cell = connections->shape().back();
 	size_t num_cells = connections->size()/max_connections_per_cell;
@@ -139,16 +159,35 @@ void CPUBackend::learnCorrilation(const TensorImpl* x, const TensorImpl* learn,
 				break;
 			ASSERT((size_t)connection < num_cells);
 
-			float& perm = synapse_strengths[idx];
+			PermType& perm = synapse_strengths[idx];
 			if(input[connection] == true)
 				perm += perm_inc;
 			else
 				perm -= perm_dec;
 
-			perm = std::max(std::min(perm, 1.f), 0.f);
+			perm = std::max(std::min(perm, PermType(1)), PermType(0));
 		}
 	});
 }
+}
+
+std::shared_ptr<TensorImpl> CPUBackend::cellActivity(const TensorImpl* x, const TensorImpl* connections, const TensorImpl* permeances
+	, float connected_permeance, size_t active_threshold, bool has_unconnected_synapse)
+{
+	std::shared_ptr<TensorImpl> res;
+	dispatch<type_list_t<float, half>>(permeances->dtype(), [&](auto v){
+		res = detail::cellActivity<decltype(v)>(x, connections, permeances, connected_permeance, active_threshold, has_unconnected_synapse, this);
+	});
+	return res;
+}
+
+void CPUBackend::learnCorrilation(const TensorImpl* x, const TensorImpl* learn, const TensorImpl* connections, TensorImpl* permeances
+	, float perm_inc, float perm_dec, bool has_unconnected_synapse)
+{
+	dispatch<type_list_t<float, half>>(permeances->dtype(), [&](auto v){
+		detail::learnCorrilation<decltype(v)>(x, learn, connections, permeances, perm_inc, perm_dec, has_unconnected_synapse, this);
+	});
+}
 
 std::shared_ptr<TensorImpl> CPUBackend::globalInhibition(const TensorImpl* x, float fraction)
 {
@@ -187,6 +226,8 @@ static Ret run(const CPUBuffer& t, Op op)
 		return op((int32_t*)t.data());
 	else if(t.dtype() == DType::Float)
 		return op((float*)t.data());
+	else if(t.dtype() == DType::Half)
+		return op((half*)t.data());
 	else
 		throw EtError("Cannot cast");
 }
@@ -216,6 +257,10 @@ std::shared_ptr<TensorImpl> CPUBackend::cast(const TensorImpl* x, DType toType)
 			auto casted_data = castData<float>(ptr, x->size());
 			return createTensor(x->shape(), toType, casted_data.data());
 		}
+		else if(toType == DType::Half){
+			auto casted_data = castData<half>(ptr, x->size());
+			return createTensor(x->shape(), toType, casted_data.data());
+		}
 		else
 			throw EtError("Cannot cast");
 	});
@@ -401,21 +446,6 @@ const T* getPtrToValue(size_t parent_idx, const TensorImpl* t)
 	return ((const T*)t->data())+offset;
 }
 
-template <typename Func, typename TypeList = type_list_t<int32_t, float, bool, half>>
-inline void dispatch(DType dtype, Func f)
-{
-	if constexpr(std::is_same_v<TypeList, null_t> == false) {
-		using T = typename TypeList::head;
-		if(typeToDType<T>() == dtype) {
-			f(T());
-			return;
-		}
-		dispatch<Func, typename TypeList::tail>(dtype, f);
-	}
-	else
-		throw EtError("Cannot dispatch such dtype ID: " + (int)dtype + to_ctype_string(dtype));
-}
-
 template <typename T2, typename T1>
 void write(T2* ptr, T1 v)
 {
diff --git a/examples/spbench.cpp b/examples/spbench.cpp
index 61331c3..30b2a76 100644
--- a/examples/spbench.cpp
+++ b/examples/spbench.cpp
@@ -12,6 +12,7 @@ using namespace et;
 float benchmarkSpatialPooler(const Shape& out_shape, const std::vector<Tensor>& x, size_t num_epoch)
 {
 	SpatialPooler sp(x[0].shape(), out_shape);
+	sp.permanences_ = sp.permanences_.cast(DType::Half);
 
 	//To make the OpenCL backen ptr-compile the kernels
 	Tensor t = zeros(x[0].shape(), DType::Bool);

From 5eb8ee4e3407c49b9df03bcf1503f94bd0f42e9f Mon Sep 17 00:00:00 2001
From: marty1885 <marty188586@gmail.com>
Date: Sun, 28 Jul 2019 00:17:45 +0800
Subject: [PATCH 21/25] fix CMake warning message for newer versions

---
 CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3e2c247..4ab87da 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,6 @@
 cmake_minimum_required(VERSION 3.0)
+project(Etaler)
+
 
 SET(BUILD_SHARED_LIBS ON)
 

From a00fc145d768d3ef49eb1f78e59028ee534ade67 Mon Sep 17 00:00:00 2001
From: marty1885 <marty188586@gmail.com>
Date: Thu, 1 Aug 2019 10:59:52 +0800
Subject: [PATCH 22/25] support fp16 in HTM algorithms in CPUBackend

---
 Etaler/Backends/CPUBackend.cpp | 274 ++++++++++++++++++---------------
 1 file changed, 150 insertions(+), 124 deletions(-)

diff --git a/Etaler/Backends/CPUBackend.cpp b/Etaler/Backends/CPUBackend.cpp
index d95677b..bf954e4 100644
--- a/Etaler/Backends/CPUBackend.cpp
+++ b/Etaler/Backends/CPUBackend.cpp
@@ -169,6 +169,150 @@ void learnCorrilation(const TensorImpl* x, const TensorImpl* learn, const Tensor
 		}
 	});
 }
+
+template <typename PermType>
+void sortSynapse(TensorImpl* connections, TensorImpl* permeances, CPUBackend* backend)
+{
+	requireProperties(connections, backend, DType::Int32, IsContingous());
+	requireProperties(permeances, backend, typeToDType<PermType>(), IsContingous());
+	et_assert(connections->shape() == permeances->shape());
+
+	size_t max_synapse_per_cell = connections->shape().back();
+	size_t num_cells = connections->size()/max_synapse_per_cell;
+
+	uint32_t* conns = (uint32_t*)connections->data(); //HACK: -1s should be at the end of the arrays.
+	PermType* perms = (PermType*)permeances->data();
+
+	tbb::parallel_for(size_t(0), num_cells, [&](size_t i) {
+		size_t start_index = i*max_synapse_per_cell;
+		size_t end_index = (i+1)*max_synapse_per_cell;
+
+		std::vector<size_t> sort_indices(max_synapse_per_cell);
+		std::iota(sort_indices.begin(), sort_indices.end(), 0);
+		std::sort(sort_indices.begin(), sort_indices.end(),
+			[&](size_t i, size_t j)->bool {
+				return conns[i+start_index] < conns[j+start_index];
+			});
+		apply_permutation_in_place(conns+start_index, conns+end_index, sort_indices);
+		apply_permutation_in_place(perms+start_index, perms+end_index, sort_indices);
+	});
+}
+
+template <typename PermType>
+void growSynapses(const TensorImpl* x, const TensorImpl* y, TensorImpl* connections
+	, TensorImpl* permeances, float initial_perm, CPUBackend* backend)
+{
+	requireProperties(x, backend, DType::Bool, IsContingous());
+	requireProperties(y, backend, DType::Bool, IsContingous());
+	requireProperties(connections, backend, DType::Int32, IsContingous());
+	requireProperties(permeances, backend, typeToDType<PermType>(), IsContingous());
+
+	et_assert(connections->shape() == permeances->shape());
+	Shape s = connections->shape();
+	s.pop_back();
+	et_assert(s == y->shape());
+
+	size_t max_synapses_per_cell = connections->shape().back();
+	size_t input_cell_count = x->size();
+
+	const bool* in = (const bool*) x->data();
+	const bool* out = (const bool*) y->data();
+	int32_t* conns = (int32_t*)connections->data();
+	PermType* perms = (PermType*)permeances->data();
+
+	std::vector<uint32_t> on_bits;
+	on_bits.reserve(input_cell_count*0.1);
+	for(size_t i=0;i<input_cell_count;i++) {
+		if(in[i] == true)
+			on_bits.push_back(i);
+	}
+
+	size_t block_size = std::min(size_t(16), (size_t)y->shape().back());
+	tbb::parallel_for(tbb::blocked_range<size_t>(size_t(0), y->size(), block_size), [&](const auto& r) {
+		for(size_t i=r.begin();i!=r.end();i++) {
+			if(out[i] == 0)
+				continue;
+
+			uint32_t* synapses = (uint32_t*)conns+i*max_synapses_per_cell;
+			PermType* strengths = perms+i*max_synapses_per_cell;
+			uint32_t* end = synapses+max_synapses_per_cell;
+
+			if(synapses[max_synapses_per_cell-1] != uint32_t(-1)) //If there is no space for new synapse. Ignore
+				continue;
+
+			uint32_t* it = std::lower_bound(synapses, end, uint32_t(-1));
+			size_t used_space = it - synapses;
+
+			size_t write_idx = it - synapses;
+			size_t read_idx = 0;
+
+			for(size_t j=0;write_idx!=max_synapses_per_cell && j < on_bits.size();j++) {
+				bool connected = false;
+				for(;read_idx<used_space;read_idx++) {
+					if(synapses[read_idx] == on_bits[j]) {
+						connected = true;
+						break;
+					}
+					if(synapses[read_idx] > on_bits[j])
+						break;
+				}
+
+				if(connected == false) {
+					synapses[write_idx] = on_bits[j];
+					strengths[write_idx] = initial_perm;
+					write_idx++;
+				}
+			}
+
+			std::vector<size_t> sort_indices(write_idx);
+			std::iota(sort_indices.begin(), sort_indices.begin()+write_idx, 0);
+			std::sort(sort_indices.begin(), sort_indices.begin()+write_idx,
+				[&](size_t i, size_t j)->bool {
+					return ((uint32_t*)synapses)[i] < ((uint32_t*)synapses)[j];
+				});
+			apply_permutation_in_place(synapses, synapses+write_idx, sort_indices);
+			apply_permutation_in_place(strengths, strengths+write_idx, sort_indices);
+		}
+	});
+}
+
+template <typename PermType>
+void decaySynapses(TensorImpl* connections, TensorImpl* permeances, float threshold, CPUBackend* backend)
+{
+	requireProperties(connections, backend, DType::Int32, IsContingous());
+	requireProperties(permeances, backend, typeToDType<PermType>(), IsContingous());
+	et_assert(connections->shape() == permeances->shape());
+
+	PermType* perms = (PermType*)permeances->data();
+	uint32_t* conns = (uint32_t*)connections->data();
+
+	size_t max_synapses_per_cell = connections->shape().back();
+	size_t input_cell_count = connections->size()/max_synapses_per_cell;
+
+	tbb::parallel_for(size_t(0), input_cell_count, [&](size_t i) {
+		uint32_t* synapses = (uint32_t*)conns+i*max_synapses_per_cell;
+		PermType* strengths = perms+i*max_synapses_per_cell;
+		uint32_t* end = synapses+max_synapses_per_cell;
+
+		uint32_t* it = std::lower_bound(synapses, end, uint32_t(-1));
+		size_t used_space = it - synapses;
+
+		for(size_t j=0;j<used_space;j++) {
+			if(strengths[j] < threshold)
+				synapses[j] = uint32_t(-1);
+		}
+
+		std::vector<size_t> sort_indices(used_space);
+		std::iota(sort_indices.begin(), sort_indices.begin()+used_space, 0);
+		std::sort(sort_indices.begin(), sort_indices.begin()+used_space,
+			[&](size_t i, size_t j)->bool {
+				return ((uint32_t*)synapses)[i] < ((uint32_t*)synapses)[j];
+			});
+		apply_permutation_in_place(synapses, synapses+used_space, sort_indices);
+		apply_permutation_in_place(strengths, strengths+used_space, sort_indices);
+	});
+}
+
 }
 
 std::shared_ptr<TensorImpl> CPUBackend::cellActivity(const TensorImpl* x, const TensorImpl* connections, const TensorImpl* permeances
@@ -280,28 +424,8 @@ std::shared_ptr<TensorImpl> CPUBackend::copy(const TensorImpl* x)
 
 void CPUBackend::sortSynapse(TensorImpl* connections, TensorImpl* permeances)
 {
-	requireProperties(connections, this, DType::Int32, IsContingous());
-	requireProperties(permeances, this, DType::Float, IsContingous());
-	et_assert(connections->shape() == permeances->shape());
-
-	size_t max_synapse_per_cell = connections->shape().back();
-	size_t num_cells = connections->size()/max_synapse_per_cell;
-
-	uint32_t* conns = (uint32_t*)connections->data(); //HACK: -1s should be at the end of the arrays.
-	float* perms = (float*)permeances->data();
-
-	tbb::parallel_for(size_t(0), num_cells, [&](size_t i) {
-		size_t start_index = i*max_synapse_per_cell;
-		size_t end_index = (i+1)*max_synapse_per_cell;
-
-		std::vector<size_t> sort_indices(max_synapse_per_cell);
-		std::iota(sort_indices.begin(), sort_indices.end(), 0);
-		std::sort(sort_indices.begin(), sort_indices.end(),
-			[&](size_t i, size_t j)->bool {
-				return conns[i+start_index] < conns[j+start_index];
-			});
-		apply_permutation_in_place(conns+start_index, conns+end_index, sort_indices);
-		apply_permutation_in_place(perms+start_index, perms+end_index, sort_indices);
+	dispatch<type_list_t<float, half>>(permeances->dtype(), [&](auto v) {
+		detail::sortSynapse<decltype(v)>(connections, permeances, this);
 	});
 }
 
@@ -363,77 +487,8 @@ std::shared_ptr<TensorImpl> CPUBackend::reverseBurst(const TensorImpl* x)
 void CPUBackend::growSynapses(const TensorImpl* x, const TensorImpl* y, TensorImpl* connections
 	, TensorImpl* permeances, float initial_perm)
 {
-	requireProperties(x, this, DType::Bool, IsContingous());
-	requireProperties(y, this, DType::Bool, IsContingous());
-	requireProperties(connections, this, DType::Int32, IsContingous());
-	requireProperties(permeances, this, DType::Float, IsContingous());
-
-	et_assert(connections->shape() == permeances->shape());
-	Shape s = connections->shape();
-	s.pop_back();
-	et_assert(s == y->shape());
-
-	size_t max_synapses_per_cell = connections->shape().back();
-	size_t input_cell_count = x->size();
-
-	const bool* in = (const bool*) x->data();
-	const bool* out = (const bool*) y->data();
-	int32_t* conns = (int32_t*)connections->data();
-	float* perms = (float*)permeances->data();
-
-	std::vector<uint32_t> on_bits;
-	on_bits.reserve(input_cell_count*0.1);
-	for(size_t i=0;i<input_cell_count;i++) {
-		if(in[i] == true)
-			on_bits.push_back(i);
-	}
-
-	size_t block_size = std::min(size_t(16), (size_t)y->shape().back());
-	tbb::parallel_for(tbb::blocked_range<size_t>(size_t(0), y->size(), block_size), [&](const auto& r) {
-		for(size_t i=r.begin();i!=r.end();i++) {
-			if(out[i] == 0)
-				continue;
-
-			uint32_t* synapses = (uint32_t*)conns+i*max_synapses_per_cell;
-			float* strengths = perms+i*max_synapses_per_cell;
-			uint32_t* end = synapses+max_synapses_per_cell;
-
-			if(synapses[max_synapses_per_cell-1] != uint32_t(-1)) //If there is no space for new synapse. Ignore
-				continue;
-
-			uint32_t* it = std::lower_bound(synapses, end, uint32_t(-1));
-			size_t used_space = it - synapses;
-
-			size_t write_idx = it - synapses;
-			size_t read_idx = 0;
-
-			for(size_t j=0;write_idx!=max_synapses_per_cell && j < on_bits.size();j++) {
-				bool connected = false;
-				for(;read_idx<used_space;read_idx++) {
-					if(synapses[read_idx] == on_bits[j]) {
-						connected = true;
-						break;
-					}
-					if(synapses[read_idx] > on_bits[j])
-						break;
-				}
-
-				if(connected == false) {
-					synapses[write_idx] = on_bits[j];
-					strengths[write_idx] = initial_perm;
-					write_idx++;
-				}
-			}
-
-			std::vector<size_t> sort_indices(write_idx);
-			std::iota(sort_indices.begin(), sort_indices.begin()+write_idx, 0);
-			std::sort(sort_indices.begin(), sort_indices.begin()+write_idx,
-				[&](size_t i, size_t j)->bool {
-					return ((uint32_t*)synapses)[i] < ((uint32_t*)synapses)[j];
-				});
-			apply_permutation_in_place(synapses, synapses+write_idx, sort_indices);
-			apply_permutation_in_place(strengths, strengths+write_idx, sort_indices);
-		}
+	dispatch<type_list_t<float, half>>(permeances->dtype(), [&](auto v) {
+		detail::growSynapses<decltype(v)>(x, y, connections, permeances, initial_perm, this);
 	});
 }
 
@@ -589,37 +644,8 @@ std::shared_ptr<TensorImpl> CPUBackend::sum(const TensorImpl* x, size_t chunk_si
 
 void CPUBackend::decaySynapses(TensorImpl* connections, TensorImpl* permeances, float threshold)
 {
-	requireProperties(connections, this, DType::Int32, IsContingous());
-	requireProperties(permeances, this, DType::Float, IsContingous());
-	et_assert(connections->shape() == permeances->shape());
-
-	float* perms = (float*)permeances->data();
-	uint32_t* conns = (uint32_t*)connections->data();
-
-	size_t max_synapses_per_cell = connections->shape().back();
-	size_t input_cell_count = connections->size()/max_synapses_per_cell;
-
-	tbb::parallel_for(size_t(0), input_cell_count, [&](size_t i) {
-		uint32_t* synapses = (uint32_t*)conns+i*max_synapses_per_cell;
-		float* strengths = perms+i*max_synapses_per_cell;
-		uint32_t* end = synapses+max_synapses_per_cell;
-
-		uint32_t* it = std::lower_bound(synapses, end, uint32_t(-1));
-		size_t used_space = it - synapses;
-
-		for(size_t j=0;j<used_space;j++) {
-			if(strengths[j] < threshold)
-				synapses[j] = uint32_t(-1);
-		}
-
-		std::vector<size_t> sort_indices(used_space);
-		std::iota(sort_indices.begin(), sort_indices.begin()+used_space, 0);
-		std::sort(sort_indices.begin(), sort_indices.begin()+used_space,
-			[&](size_t i, size_t j)->bool {
-				return ((uint32_t*)synapses)[i] < ((uint32_t*)synapses)[j];
-			});
-		apply_permutation_in_place(synapses, synapses+used_space, sort_indices);
-		apply_permutation_in_place(strengths, strengths+used_space, sort_indices);
+	dispatch<type_list_t<float, half>>(permeances->dtype(), [&](auto v) {
+		detail::decaySynapses<decltype(v)>(connections, permeances, threshold, this);
 	});
 }
 

From 90a20192917226dbd2994d80d9f53fd0508856b5 Mon Sep 17 00:00:00 2001
From: marty1885 <marty188586@gmail.com>
Date: Thu, 1 Aug 2019 13:23:49 +0800
Subject: [PATCH 23/25] add property to test and implement check of tensor
 being one of the listed type

---
 Etaler/Core/TensorImpl.hpp  | 25 ++++++++++++++++++++++---
 Etaler/Core/TypeHelpers.hpp | 11 +++++++++++
 tests/common_tests.cpp      | 11 +++++++++++
 3 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/Etaler/Core/TensorImpl.hpp b/Etaler/Core/TensorImpl.hpp
index d49121b..62ff353 100644
--- a/Etaler/Core/TensorImpl.hpp
+++ b/Etaler/Core/TensorImpl.hpp
@@ -4,6 +4,7 @@
 #include "Shape.hpp"
 #include "DType.hpp"
 #include "Backend.hpp"
+#include "TypeHelpers.hpp"
 
 #include <memory>
 
@@ -55,6 +56,18 @@ struct TensorImpl : public std::enable_shared_from_this<TensorImpl>
 
 struct IsContingous {};
 
+template <typename Storage>
+struct IsDType
+{      
+	Storage types;
+};
+
+template<typename _Tp, typename... _Up>
+	IsDType(_Tp, _Up...)
+	-> IsDType<std::array<std::enable_if_t<(std::is_same_v<_Tp, _Up> && ...), _Tp>,
+		1 + sizeof...(_Up)>>;
+
+
 template <typename T>
 bool checkProperty(const TensorImpl* x, const T& value)
 {
@@ -64,6 +77,8 @@ bool checkProperty(const TensorImpl* x, const T& value)
 		return x->dtype() == value;
 	else if constexpr(std::is_same_v<T, IsContingous>)
 		return x->iscontiguous();
+	else if constexpr(is_specialization<std::remove_pointer_t<std::decay_t<T>>, IsDType>::value)
+		return (std::find(value.types.begin(), value.types.end(), x->dtype()) != value.types.end());
 	else
 		et_assert(false, "a non-supported value is passed into checkProperty");
 	return false;
@@ -77,12 +92,16 @@ void requireProperty(const TensorImpl* x, const T value, const std::string& line
 	
 	//Otherwise assertion failed
 	const std::string msg = line + " Tensor property requirment not match. Expecting " + v_name;
-	if constexpr(std::is_same_v<std::decay_t<T>, Backend*>)
+	if constexpr(std::is_base_of_v<Backend, std::remove_pointer_t<std::decay_t<T>>>)
 		throw EtError(msg + ".backend() == " + value->name());
 	else if constexpr(std::is_same_v<T, DType>)
 		throw EtError(msg + ".dtype() == " + to_ctype_string(value));
 	else if constexpr(std::is_same_v<T, IsContingous>)
 		throw EtError(msg + ".iscontiguous() == true");
+	else if constexpr(is_specialization<std::remove_pointer_t<std::decay_t<T>>, IsDType>::value) {
+		throw EtError(msg + ".dtype() is in {" + std::accumulate(value.types.begin(), value.types.end(), std::string()
+			, [](auto v, auto a){return v + to_ctype_string(a) + ", ";}));
+	}
 }
 
 template <typename ... Args>
@@ -99,5 +118,5 @@ void requirePropertiesInternal(const TensorImpl* x, const std::string& line, con
 
 }
 
-#define requireProperties(x, ...) requirePropertiesInternal(x, std::string(__FILE__)+":"+std::to_string(__LINE__)\
-	+":"+std::string(__func__)+"():", #x, __VA_ARGS__)
+#define requireProperties(x, ...) (requirePropertiesInternal(x, std::string(__FILE__)+":"+std::to_string(__LINE__)\
+	+":"+std::string(__func__)+"():", #x, __VA_ARGS__))
diff --git a/Etaler/Core/TypeHelpers.hpp b/Etaler/Core/TypeHelpers.hpp
index f74c7f2..1103448 100644
--- a/Etaler/Core/TypeHelpers.hpp
+++ b/Etaler/Core/TypeHelpers.hpp
@@ -15,9 +15,20 @@
 		return r;\
 }()
 
+
+
 namespace et
 {
 
 std::string ETALER_EXPORT demangle(const char* name);
 
+template<typename Test, template<typename...> class Ref>
+struct is_specialization : std::false_type {};
+
+template<template<typename...> class Ref, typename... Args>
+struct is_specialization<Ref<Args...>, Ref>: std::true_type {};
+
+template<template<typename...> class Ref, typename... Args>
+const bool is_specialization_v = is_specialization<Ref<Args...>, Ref>::value;
+
 }
\ No newline at end of file
diff --git a/tests/common_tests.cpp b/tests/common_tests.cpp
index b2eb1df..fd0ab17 100644
--- a/tests/common_tests.cpp
+++ b/tests/common_tests.cpp
@@ -140,6 +140,17 @@ TEST_CASE("Testing Tensor", "[Tensor]")
 		CHECK(q.isSame(r));
 	}
 
+	SECTION("Property Check") {
+		CHECK_NOTHROW(requireProperties(ones(Shape{1}, DType::Int32).pimpl(), DType::Int32));
+		CHECK_THROWS(requireProperties(ones(Shape{1}, DType::Float).pimpl(), DType::Int32));
+		CHECK_THROWS(requireProperties(ones(Shape{1}, DType::Float).pimpl(), IsDType{DType::Int32, DType::Bool}));
+
+		CHECK_NOTHROW(requireProperties(ones(Shape{1}, DType::Int32).pimpl(), defaultBackend()));
+
+		CHECK_NOTHROW(requireProperties(ones(Shape{1}, DType::Int32).pimpl(), IsContingous()));
+		CHECK_THROWS(requireProperties(ones(Shape{4,4}, DType::Int32).view({range(2), range(2)}).pimpl(), IsContingous()));
+	}
+
 	SECTION("Views") {
 		std::vector<int> data(16);
 		for(size_t i=0;i<data.size();i++)

From ecea0514c80294499c55f1faeaf37dc5e9007a37 Mon Sep 17 00:00:00 2001
From: marty1885 <marty188586@gmail.com>
Date: Thu, 1 Aug 2019 14:45:09 +0800
Subject: [PATCH 24/25] enable HTM algorithms to run on OpenCL with half
 percision

---
 Etaler/Backends/OpenCLBackend.cpp            | 63 +++++++++++---------
 Etaler/Backends/OpenCLBackend.hpp            |  4 +-
 kernels/cellActivity.cl                      |  6 +-
 kernels/cellActivity_compressed_local.cl     |  6 +-
 kernels/cellActivity_global.cl               |  6 +-
 kernels/decaySynapses.cl                     |  6 +-
 kernels/growSynapses.cl                      |  6 +-
 kernels/learnCorrilation.cl                  |  6 +-
 kernels/learnCorrilation_compressed_local.cl |  6 +-
 kernels/learnCorrilation_global.cl           |  6 +-
 kernels/sort.cl                              | 16 +++--
 11 files changed, 87 insertions(+), 44 deletions(-)

diff --git a/Etaler/Backends/OpenCLBackend.cpp b/Etaler/Backends/OpenCLBackend.cpp
index 8bdfe72..bebca2b 100644
--- a/Etaler/Backends/OpenCLBackend.cpp
+++ b/Etaler/Backends/OpenCLBackend.cpp
@@ -224,17 +224,17 @@ void KernelManager::compileKernel(const std::vector<std::string>& srcs, const st
 }
 
 void KernelManager::compileFromFile(const std::string& path, const std::string& program_name, const std::vector<std::string>& kernel_names
-	, bool force_override, const std::string& flags)
+	, bool force_override, const std::string& flags, const std::string& prepend)
 {
-	compileFromFile(std::vector<std::string>{path}, program_name, kernel_names, force_override, flags);
+	compileFromFile(std::vector<std::string>{path}, program_name, kernel_names, force_override, flags, prepend);
 }
 
 void KernelManager::compileFromFile(const std::vector<std::string>& paths, const std::string& program_name, const std::vector<std::string>& kernel_names
-	, bool force_override, const std::string& flags)
+	, bool force_override, const std::string& flags, const std::string& prepend)
 {
 	std::vector<std::string> sources;
 	for(const auto& path : paths)
-		sources.emplace_back(readKernel(path));
+		sources.emplace_back((prepend!=""?"\n":"") + readKernel(path));
 	compileKernel(sources, program_name, kernel_names, force_override, flags);
 }
 
@@ -266,7 +266,7 @@ std::shared_ptr<TensorImpl> OpenCLBackend::cellActivity(const TensorImpl* x, con
 {
 	requireProperties(x, this, DType::Bool, IsContingous());
 	requireProperties(connections, this, DType::Int32, IsContingous());
-	requireProperties(permeances, this, DType::Float, IsContingous());
+	requireProperties(permeances, this, IsDType{DType::Float, DType::Int32}, IsContingous());
 	et_assert(connections->shape() == permeances->shape());
 	et_assert(connections->dimentions() >= 2);
 
@@ -274,16 +274,18 @@ std::shared_ptr<TensorImpl> OpenCLBackend::cellActivity(const TensorImpl* x, con
 	s.pop_back();
 	auto y = createTensor(s, DType::Int32);
 
-	auto args = "-DINPUT_SIZE="+str(x->size())+" -DMAX_SYNAPSE_PER_CELL="+str(connections->shape().back())+" -DNO_UNUSED_SYNAPSE=" + str(!has_unconnected_synapse);
-	auto hash = hash_string(args);
+	auto args = "-DINPUT_SIZE="+str(x->size())+" -DMAX_SYNAPSE_PER_CELL="+str(connections->shape().back())+" -DNO_UNUSED_SYNAPSE=" + str(!has_unconnected_synapse)
+		+ " -DPERM_TYPE="+to_ctype_string(permeances->dtype());
+	auto prepend = (permeances->dtype()==DType::Half?"#pragma OPENCL EXTENSION cl_khr_fp16 : enable":"");
+	auto hash = hash_string(args + prepend);
 	auto program_name = "overlapScore"+hash;
 
 	if(x->size() < localMemorySize() && localMemoryType() == CL_LOCAL)
-		kernel_manager_.compileFromFile("cellActivity.cl", program_name, {"cellActivity"}, false, args);
+		kernel_manager_.compileFromFile("cellActivity.cl", program_name, {"cellActivity"}, false, args, prepend);
 	else if(x->size() < localMemorySize()*8-8 && localMemoryType() == CL_LOCAL)
-		kernel_manager_.compileFromFile("cellActivity_compressed_local.cl", program_name, {"cellActivity"}, false, args);
+		kernel_manager_.compileFromFile("cellActivity_compressed_local.cl", program_name, {"cellActivity"}, false, args, prepend);
 	else
-		kernel_manager_.compileFromFile("cellActivity_global.cl", program_name, {"cellActivity"}, false, args);
+		kernel_manager_.compileFromFile("cellActivity_global.cl", program_name, {"cellActivity"}, false, args, prepend);
 	cl::Kernel k = kernel_manager_.kernel(program_name, "cellActivity");
 
 	k.setArg(0, std::static_pointer_cast<const OpenCLBuffer>(x->buffer())->buffer());
@@ -380,22 +382,23 @@ void OpenCLBackend::learnCorrilation(const TensorImpl* x, const TensorImpl* lear
 	requireProperties(x, this, DType::Bool, IsContingous());
 	requireProperties(learn, this, DType::Bool, IsContingous());
 	requireProperties(connections, this, DType::Int32, IsContingous());
-	requireProperties(permeances, this, DType::Float, IsContingous());
+	requireProperties(permeances, this, IsDType{DType::Float, DType::Half}, IsContingous());
 
 	et_assert(connections->shape() == permeances->shape());
 	et_assert(x->shape() == learn->shape());
 
 	auto args = "-DINPUT_SIZE="+str(x->size())+" -DMAX_SYNAPSE_PER_CELL="+str(connections->shape().back())+" -DNO_UNUSED_SYNAPSE="+str(!has_unconnected_synapse)
-		+" -DOUTPUT_SIZE="+str(learn->size());
-	auto hash = hash_string(args);
+		+" -DOUTPUT_SIZE="+str(learn->size()) + " -DPERM_TYPE="+to_ctype_string(permeances->dtype());
+	auto prepend = (permeances->dtype()==DType::Half?"#pragma OPENCL EXTENSION cl_khr_fp16 : enable":"");
+	auto hash = hash_string(args+prepend);
 	auto program_name = "learnCorrilation"+hash;
 
 	if(x->size() < localMemorySize() && localMemoryType() == CL_LOCAL)
-		kernel_manager_.compileFromFile("learnCorrilation.cl", program_name, {"learnCorrilation"}, false, args);
+		kernel_manager_.compileFromFile("learnCorrilation.cl", program_name, {"learnCorrilation"}, false, args, prepend);
 	else if(x->size() < localMemorySize()*8-8 && localMemoryType() == CL_LOCAL)
-		kernel_manager_.compileFromFile("learnCorrilation_compressed_local.cl", program_name, {"learnCorrilation"}, false, args);
+		kernel_manager_.compileFromFile("learnCorrilation_compressed_local.cl", program_name, {"learnCorrilation"}, false, args, prepend);
 	else
-		kernel_manager_.compileFromFile("learnCorrilation_global.cl", program_name, {"learnCorrilation"}, false, args);
+		kernel_manager_.compileFromFile("learnCorrilation_global.cl", program_name, {"learnCorrilation"}, false, args, prepend);
 	cl::Kernel k = kernel_manager_.kernel(program_name, "learnCorrilation");
 
 	k.setArg(0, std::static_pointer_cast<const OpenCLBuffer>(x->buffer())->buffer());
@@ -416,12 +419,13 @@ void OpenCLBackend::learnCorrilation(const TensorImpl* x, const TensorImpl* lear
 void OpenCLBackend::sortSynapse(TensorImpl* connections, TensorImpl* permeances)
 {
 	requireProperties(connections, this, DType::Int32, IsContingous());
-	requireProperties(permeances, this, DType::Float, IsContingous());
+	requireProperties(permeances, this, IsDType{DType::Float, DType::Int32}, IsContingous());
 	et_assert(connections->shape() == permeances->shape());
 
-	auto args = "-DMAX_SYNAPSE_PER_CELL="+str(connections->shape().back());
-	auto program_name = "sortSynapse"+hash_string(args);
-	kernel_manager_.compileFromFile("sort.cl", program_name, {"sortSynapse"}, false, args);
+	auto args = "-DMAX_SYNAPSE_PER_CELL="+str(connections->shape().back()) + " -DPERM_TYPE="+to_ctype_string(permeances->dtype());
+	auto prepend = (permeances->dtype()==DType::Half?"#pragma OPENCL EXTENSION cl_khr_fp16 : enable":"");
+	auto program_name = "sortSynapse"+hash_string(args+prepend);
+	kernel_manager_.compileFromFile("sort.cl", program_name, {"sortSynapse"}, false, args, prepend);
 	cl::Kernel k = kernel_manager_.kernel(program_name, "sortSynapse");
 
 	int num_cells = connections->size()/connections->shape().back();
@@ -502,7 +506,7 @@ void OpenCLBackend::growSynapses(const TensorImpl* x, const TensorImpl* y, Tenso
 	requireProperties(x, this, DType::Bool, IsContingous());
 	requireProperties(y, this, DType::Bool, IsContingous());
 	requireProperties(connections, this, DType::Int32, IsContingous());
-	requireProperties(permeances, this, DType::Float, IsContingous());
+	requireProperties(permeances, this, IsDType{DType::Float, DType::Int32}, IsContingous());
 
 	et_assert(connections->shape() == permeances->shape());
 	Shape s = connections->shape();
@@ -512,9 +516,11 @@ void OpenCLBackend::growSynapses(const TensorImpl* x, const TensorImpl* y, Tenso
 	size_t max_synapses_per_cell = connections->shape().back();
 	size_t input_cell_count = x->size();
 
-	auto args = "-DNUM_CELLS="+str(y->size())+" -DNUM_INPUT_BITS="+str(x->size())+" -DMAX_SYNAPSE_PER_CELL="+str(max_synapses_per_cell);
-	auto program_name = "growSynapses"+hash_string(args);
-	kernel_manager_.compileFromFile("growSynapses.cl", program_name, {"growSynapses"}, false, args);
+	auto args = "-DNUM_CELLS="+str(y->size())+" -DNUM_INPUT_BITS="+str(x->size())+" -DMAX_SYNAPSE_PER_CELL="+str(max_synapses_per_cell)
+		+" -DPERM_TYPE="+to_ctype_string(permeances->dtype());
+	auto prepend = (permeances->dtype()==DType::Half?"#pragma OPENCL EXTENSION cl_khr_fp16 : enable":"");
+	auto program_name = "growSynapses"+hash_string(args+prepend);
+	kernel_manager_.compileFromFile("growSynapses.cl", program_name, {"growSynapses"}, false, args, prepend);
 	cl::Kernel k = kernel_manager_.kernel(program_name, "growSynapses");
 
 	size_t local_size = 32;
@@ -777,15 +783,16 @@ std::shared_ptr<TensorImpl> OpenCLBackend::sum(const TensorImpl* x, size_t chunk
 void OpenCLBackend::decaySynapses(TensorImpl* connections, TensorImpl* permeances, float threshold)
 {
 	requireProperties(connections, this, DType::Int32, IsContingous());
-	requireProperties(permeances, this, DType::Float, IsContingous());
+	requireProperties(permeances, this, IsDType{DType::Float, DType::Half}, IsContingous());
 	et_assert(connections->shape() == permeances->shape());
 
 	size_t max_synapses_per_cell = connections->shape().back();
 	size_t input_cell_count = connections->size()/max_synapses_per_cell;
 
-	auto args = "-DNUM_CELLS="+str(input_cell_count) + " -DMAX_SYNAPSE_PER_CELL="+str(max_synapses_per_cell);
-	std::string program_name = "sum" + hash_string(args);
-	kernel_manager_.compileFromFile("decaySynapses.cl", program_name, {"decaySynapses"}, false, args);
+	auto args = "-DNUM_CELLS="+str(input_cell_count) + " -DMAX_SYNAPSE_PER_CELL="+str(max_synapses_per_cell) + " -DPERM_TYPE="+to_ctype_string(permeances->dtype());
+	auto prepend = (permeances->dtype()==DType::Half?"#pragma OPENCL EXTENSION cl_khr_fp16 : enable":"");
+	std::string program_name = "sum" + hash_string(args+prepend);
+	kernel_manager_.compileFromFile("decaySynapses.cl", program_name, {"decaySynapses"}, false, args, prepend);
 
 	cl::Kernel k = kernel_manager_.kernel(program_name, "decaySynapses");
 
diff --git a/Etaler/Backends/OpenCLBackend.hpp b/Etaler/Backends/OpenCLBackend.hpp
index 03fe640..85f7c35 100644
--- a/Etaler/Backends/OpenCLBackend.hpp
+++ b/Etaler/Backends/OpenCLBackend.hpp
@@ -52,9 +52,9 @@ struct KernelManager
 	void compileKernel(const std::vector<std::string>& srcs, const std::string& program_name, const std::vector<std::string>& kernel_names
 		, bool force_override=false, const std::string& flags="");
 	void compileFromFile(const std::string& paths, const std::string& program_name, const std::vector<std::string>& kernel_names
-		, bool force_override=false, const std::string& flags="");
+		, bool force_override=false, const std::string& flags="", const std::string& prepend="");
 	void compileFromFile(const std::vector<std::string>& paths, const std::string& program_name, const std::vector<std::string>& kernel_names
-		, bool force_override=false, const std::string& flags="");
+		, bool force_override=false, const std::string& flags="", const std::string& prepend="");
 	inline bool exists(const std::string& program_name, const std::string& kernel_name)
 	{
 		auto it = apps_.find(program_name);
diff --git a/kernels/cellActivity.cl b/kernels/cellActivity.cl
index 3a865eb..e77ebe6 100644
--- a/kernels/cellActivity.cl
+++ b/kernels/cellActivity.cl
@@ -6,6 +6,10 @@
 	#error "MAX_SYNAPSE_PER_CELL not defined"
 #endif
 
+#ifndef PERM_TYPE
+	#error "PERM_TYPE not defined"
+#endif
+
 #ifndef NO_UNUSED_SYNAPSE
 	#define NO_UNUSED_SYNAPSE false
 #endif
@@ -15,7 +19,7 @@
 //INPUT_SIZE: The size of input SDR, must be smaller then CL_DEVICE_LOCAL_MEMORY_SIZE
 //NO_UNUSED_SYNAPSE: If there are unised synapses. Useful for sparial pooler, accelerates ~30%
 kernel void cellActivity(global bool* restrict x, global int* restrict synapses
-	, global float* restrict permeances, global int* restrict y
+	, global PERM_TYPE* restrict permeances, global int* restrict y
 	, float connected_perm, int active_threshold, int output_size)
 {
 	//Load input state into local memory for faster access
diff --git a/kernels/cellActivity_compressed_local.cl b/kernels/cellActivity_compressed_local.cl
index 7042a24..f869647 100644
--- a/kernels/cellActivity_compressed_local.cl
+++ b/kernels/cellActivity_compressed_local.cl
@@ -6,6 +6,10 @@
 	#error "MAX_SYNAPSE_PER_CELL not defined"
 #endif
 
+#ifndef PERM_TYPE
+	#error "PERM_TYPE not defined"
+#endif
+
 #ifndef NO_UNUSED_SYNAPSE
 	#define NO_UNUSED_SYNAPSE false
 #endif
@@ -20,7 +24,7 @@ int up_round(int v, int mul)
 //INPUT_SIZE: The size of input SDR, must be smaller then CL_DEVICE_LOCAL_MEMORY_SIZE*8-8
 //NO_UNUSED_SYNAPSE: If there are unised synapses. Useful for sparial pooler, accelerates ~30%
 kernel void cellActivity(global bool* restrict x, global int* restrict synapses
-	, global float* restrict permeances, global int* restrict y
+	, global PERM_TYPE* restrict permeances, global int* restrict y
 	, float connected_perm, int active_threshold, int output_size)
 {
 	//Load input state into local memory for faster access
diff --git a/kernels/cellActivity_global.cl b/kernels/cellActivity_global.cl
index d45cb4d..d7b11e6 100644
--- a/kernels/cellActivity_global.cl
+++ b/kernels/cellActivity_global.cl
@@ -6,6 +6,10 @@
 	#error "MAX_SYNAPSE_PER_CELL not defined"
 #endif
 
+#ifndef PERM_TYPE
+	#error "PERM_TYPE not defined"
+#endif
+
 #ifndef NO_UNUSED_SYNAPSE
 	#define NO_UNUSED_SYNAPSE false
 #endif
@@ -15,7 +19,7 @@
 //INPUT_SIZE: The size of input SDR, must be smaller then CL_DEVICE_LOCAL_MEMORY_SIZE
 //NO_UNUSED_SYNAPSE: If there are unised synapses. Useful for sparial pooler, accelerates ~30%
 kernel void cellActivity(global bool* restrict x, global int* restrict synapses
-	, global float* restrict permeances, global int* restrict y
+	, global PERM_TYPE* restrict permeances, global int* restrict y
 	, float connected_perm, int active_threshold, int output_size)
 {
 	int global_size = get_global_size(0);
diff --git a/kernels/decaySynapses.cl b/kernels/decaySynapses.cl
index 6304c92..be9240e 100644
--- a/kernels/decaySynapses.cl
+++ b/kernels/decaySynapses.cl
@@ -6,8 +6,12 @@
 	#error "MAX_SYNAPSE_PER_CELL is not defined"
 #endif
 
+#ifndef PERM_TYPE
+	#error "PERM_TYPE not defined"
+#endif
+
 
-kernel void decaySynapses(global int* restrict connections, global float* restrict permeances, float threshold)
+kernel void decaySynapses(global int* restrict connections, global PERM_TYPE* restrict permeances, float threshold)
 {
 	int global_size = get_global_size(0);
 	int global_id = get_global_id(0);
diff --git a/kernels/growSynapses.cl b/kernels/growSynapses.cl
index 113db48..27ef15d 100644
--- a/kernels/growSynapses.cl
+++ b/kernels/growSynapses.cl
@@ -10,6 +10,10 @@
 	#error "MAX_SYNAPSE_PER_CELL is not defined"
 #endif
 
+#ifndef PERM_TYPE
+	#error "PERM_TYPE not defined"
+#endif
+
 //NOTE: The old version of this kernel might perform better with more cells
 
 //global_size: Arbitrary
@@ -20,7 +24,7 @@
 //x: The input SDR **IN SPARSE FORMAT**
 //aux: temporary buffer for storage, must be size of NUM_INPUT_BITS*global_size[0]
 kernel void growSynapses(global int* restrict x, global bool* restrict y, global int* restrict connections
-	, global float* restrict permeances, float initial_perm, int num_input_on_bits, global bool* restrict aux)
+	, global PERM_TYPE* restrict permeances, float initial_perm, int num_input_on_bits, global bool* restrict aux)
 {
 	int global_size = get_global_size(0);
 	int global_id = get_global_id(0);
diff --git a/kernels/learnCorrilation.cl b/kernels/learnCorrilation.cl
index e7b3b13..4c01251 100644
--- a/kernels/learnCorrilation.cl
+++ b/kernels/learnCorrilation.cl
@@ -10,6 +10,10 @@
 	#error "OUTPUT_SIZE not defined"
 #endif
 
+#ifndef PERM_TYPE
+	#error "PERM_TYPE not defined"
+#endif
+
 #ifndef NO_UNUSED_SYNAPSE
 	#define NO_UNUSED_SYNAPSE false
 #endif
@@ -19,7 +23,7 @@
 //INPUT_SIZE: The size of input SDR, must be smaller then CL_DEVICE_LOCAL_MEMORY_SIZE
 //NO_UNUSED_SYNAPSE: If there are unised synapses. Useful for sparial pooler, accelerates ~30%
 kernel void learnCorrilation(global bool* restrict x, global bool* restrict y
-	, global int* restrict synapses, global float* restrict permeances
+	, global int* restrict synapses, global PERM_TYPE* restrict permeances
 	, float permeance_inc, float permeance_dec)
 {
 	local char xl[INPUT_SIZE];
diff --git a/kernels/learnCorrilation_compressed_local.cl b/kernels/learnCorrilation_compressed_local.cl
index b1b5943..a8cfb5e 100644
--- a/kernels/learnCorrilation_compressed_local.cl
+++ b/kernels/learnCorrilation_compressed_local.cl
@@ -10,6 +10,10 @@
 	#error "OUTPUT_SIZE not defined"
 #endif
 
+#ifndef PERM_TYPE
+	#error "PERM_TYPE not defined"
+#endif
+
 #ifndef NO_UNUSED_SYNAPSE
 	#define NO_UNUSED_SYNAPSE false
 #endif
@@ -19,7 +23,7 @@
 //INPUT_SIZE: The size of input SDR, must be smaller then CL_DEVICE_LOCAL_MEMORY_SIZE*8-8
 //NO_UNUSED_SYNAPSE: If there are unised synapses. Useful for sparial pooler, accelerates ~30%
 kernel void learnCorrilation(global bool* restrict x, global bool* restrict y
-	, global int* restrict synapses, global float* restrict permeances
+	, global int* restrict synapses, global PERM_TYPE* restrict permeances
 	, float permeance_inc, float permeance_dec)
 {
 	local char xl[INPUT_SIZE/8+1];
diff --git a/kernels/learnCorrilation_global.cl b/kernels/learnCorrilation_global.cl
index c08d300..dd2086c 100644
--- a/kernels/learnCorrilation_global.cl
+++ b/kernels/learnCorrilation_global.cl
@@ -10,6 +10,10 @@
 	#error "OUTPUT_SIZE not defined"
 #endif
 
+#ifndef PERM_TYPE
+	#error "PERM_TYPE not defined"
+#endif
+
 #ifndef NO_UNUSED_SYNAPSE
 	#define NO_UNUSED_SYNAPSE false
 #endif
@@ -19,7 +23,7 @@
 //INPUT_SIZE: The size of input SDR, must be smaller then CL_DEVICE_LOCAL_MEMORY_SIZE
 //NO_UNUSED_SYNAPSE: If there are unised synapses. Useful for sparial pooler, accelerates ~30%
 kernel void learnCorrilation(global bool* restrict x, global bool* restrict y
-	, global int* restrict synapses, global float* restrict permeances
+	, global int* restrict synapses, global PERM_TYPE* restrict permeances
 	, float permeance_inc, float permeance_dec)
 {
 	int global_size = get_global_size(0);
diff --git a/kernels/sort.cl b/kernels/sort.cl
index 03638de..7b23a7d 100644
--- a/kernels/sort.cl
+++ b/kernels/sort.cl
@@ -3,16 +3,20 @@
 	#error "MAX_SYNAPSE_PER_CELL not defined"
 #endif
 
-void merge(global unsigned int* restrict a1, global float* restrict a2, int l, int m, int r
-	, global unsigned int* restrict aux_buffer1, global float* restrict aux_buffer2)
+#ifndef PERM_TYPE
+	#error "PERM_TYPE not defined"
+#endif
+
+void merge(global unsigned int* restrict a1, global PERM_TYPE* restrict a2, int l, int m, int r
+	, global unsigned int* restrict aux_buffer1, global PERM_TYPE* restrict aux_buffer2)
 {
 	int n1 = m - l + 1;
 	int n2 =  r - m;
 
 	global unsigned int* restrict L = aux_buffer1;
 	global unsigned int* restrict R = aux_buffer1+n1;
-	global float* restrict L2 = aux_buffer2;
-	global float* restrict R2 = aux_buffer2+n1;
+	global PERM_TYPE* restrict L2 = aux_buffer2;
+	global PERM_TYPE* restrict R2 = aux_buffer2+n1;
 
 	for (int i=0;i<n1;i++) {
 		L[i] = a1[l+i];
@@ -56,8 +60,8 @@ void merge(global unsigned int* restrict a1, global float* restrict a2, int l, i
 	}
 }
 
-void mergeSort(global unsigned int* restrict connections, global float* restrict permeances, int n
-	,global unsigned int* restrict aux_buffer1, global float* restrict aux_buffer2)
+void mergeSort(global unsigned int* restrict connections, global PERM_TYPE* restrict permeances, int n
+	,global unsigned int* restrict aux_buffer1, global PERM_TYPE* restrict aux_buffer2)
 {
 	for (int curr_size=1; curr_size<=n-1; curr_size = 2*curr_size) {
 	   for (int left_start=0; left_start<n-1; left_start += 2*curr_size) {

From a50608aa8401e56704a0f074d9955f2bf5ea0852 Mon Sep 17 00:00:00 2001
From: marty1885 <marty188586@gmail.com>
Date: Thu, 1 Aug 2019 15:40:24 +0800
Subject: [PATCH 25/25] fix fp16 not working on OpenCL

---
 Etaler/Backends/OpenCLBackend.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Etaler/Backends/OpenCLBackend.cpp b/Etaler/Backends/OpenCLBackend.cpp
index bebca2b..54e2dbe 100644
--- a/Etaler/Backends/OpenCLBackend.cpp
+++ b/Etaler/Backends/OpenCLBackend.cpp
@@ -234,7 +234,7 @@ void KernelManager::compileFromFile(const std::vector<std::string>& paths, const
 {
 	std::vector<std::string> sources;
 	for(const auto& path : paths)
-		sources.emplace_back((prepend!=""?"\n":"") + readKernel(path));
+		sources.emplace_back(prepend + (prepend!=""?"\n":"") + readKernel(path));
 	compileKernel(sources, program_name, kernel_names, force_override, flags);
 }
 
@@ -266,7 +266,7 @@ std::shared_ptr<TensorImpl> OpenCLBackend::cellActivity(const TensorImpl* x, con
 {
 	requireProperties(x, this, DType::Bool, IsContingous());
 	requireProperties(connections, this, DType::Int32, IsContingous());
-	requireProperties(permeances, this, IsDType{DType::Float, DType::Int32}, IsContingous());
+	requireProperties(permeances, this, IsDType{DType::Float, DType::Half}, IsContingous());
 	et_assert(connections->shape() == permeances->shape());
 	et_assert(connections->dimentions() >= 2);