diff --git a/mshadow/extension/implicit_gemm.h b/mshadow/extension/implicit_gemm.h
index 8b32d93b..64a7b3e4 100644
--- a/mshadow/extension/implicit_gemm.h
+++ b/mshadow/extension/implicit_gemm.h
@@ -77,7 +77,7 @@ struct Plan, DType> {
       for (index_t j = 0; j < Packet::kSize; ++j) {
         rhs_temp[j] = rhs_.Eval(i + j, x);
       }
-      sum = sum + Packet::Load(lhs_temp) * Packet::Load(rhs_temp);
+      sum = sum + Packet::LoadUnAligned(lhs_temp) * Packet::LoadUnAligned(rhs_temp);
     }
     DType ret_result = sum.Sum();
 
diff --git a/mshadow/packet/plain-inl.h b/mshadow/packet/plain-inl.h
index b28671f5..ab2453f9 100644
--- a/mshadow/packet/plain-inl.h
+++ b/mshadow/packet/plain-inl.h
@@ -30,6 +30,10 @@ struct Packet {
   MSHADOW_CINLINE static Packet Load(const DType* src) {
     return Packet(*src);
   }
+  // load from address; identical to Load for this plain packet (both read *src)
+  MSHADOW_CINLINE static Packet LoadUnAligned(const DType* src) {
+    return Packet(*src);
+  }
   // fill it with value s
   MSHADOW_CINLINE Packet& operator=(DType s) {
     data_ = s;
diff --git a/mshadow/packet/sse-inl.h b/mshadow/packet/sse-inl.h
index cdf24c5e..295fb289 100644
--- a/mshadow/packet/sse-inl.h
+++ b/mshadow/packet/sse-inl.h
@@ -32,6 +32,10 @@ struct Packet {
   MSHADOW_CINLINE static Packet Load(const float* src) {
     return Packet(_mm_load_ps(src));
   }
+  // load from an address that need not be 16-byte aligned (_mm_loadu_ps)
+  MSHADOW_CINLINE static Packet LoadUnAligned(const float* src) {
+    return Packet(_mm_loadu_ps(src));
+  }
   // fill it with value s
   MSHADOW_CINLINE Packet& operator=(float s) {
     data_ = _mm_set1_ps(s);
@@ -73,6 +77,9 @@ struct Packet {
   MSHADOW_CINLINE static Packet Load(const double* src) {
     return Packet(_mm_load_pd(src));
   }
+  MSHADOW_CINLINE static Packet LoadUnAligned(const double* src) {
+    return Packet(_mm_loadu_pd(src));  // src need not be 16-byte aligned
+  }
   // fill it with value s
   MSHADOW_CINLINE Packet& operator=(double s) {
     data_ = _mm_set1_pd(s);