From 8fc9ed88b045a62f843790bda0e9513403031896 Mon Sep 17 00:00:00 2001 From: Tom Finley Date: Thu, 6 Dec 2018 00:32:37 -0800 Subject: [PATCH] Revert "Removed AlignedArray (#1657)" This reverts commit 72ec121afe1a889218c750f7bda7ee5093c140b7. A bug was detected that resulted in non-deterministic calculation, since the underlying C++ code was written in a way apparently that required alignment to produce consistent results, so of course just removing the alignment and calling an only slightly modified algorithm compromised determinism, resulting in test failure for RFF in particular. --- .../AlignedArray.cs | 93 ++- src/Microsoft.ML.CpuMath/AlignedMatrix.cs | 681 ++++++++++++++++++ src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 306 ++++---- .../CpuAligenedMathUtils.cs | 148 ++++ .../CpuMathUtils.netcoreapp.cs | 166 +++-- .../CpuMathUtils.netstandard.cs | 15 +- src/Microsoft.ML.CpuMath/Sse.cs | 78 +- src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 295 ++++---- src/Microsoft.ML.CpuMath/Thunk.cs | 6 + ...AdaptiveSingularSpectrumSequenceModeler.cs | 71 +- .../RandomFourierFeaturizing.cs | 67 +- src/Native/CpuMathNative/Sse.cpp | 353 ++++----- .../UnitTests.cs | 189 ++--- .../DataPipe/TestDataPipeBase.cs | 10 +- .../TimeSeries.cs | 2 +- .../TimeSeriesDirectApi.cs | 24 +- .../TimeSeriesEstimatorTests.cs | 4 +- 17 files changed, 1699 insertions(+), 809 deletions(-) rename src/{Microsoft.ML.StandardLearners/FactorizationMachine => Microsoft.ML.CpuMath}/AlignedArray.cs (53%) create mode 100644 src/Microsoft.ML.CpuMath/AlignedMatrix.cs create mode 100644 src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs diff --git a/src/Microsoft.ML.StandardLearners/FactorizationMachine/AlignedArray.cs b/src/Microsoft.ML.CpuMath/AlignedArray.cs similarity index 53% rename from src/Microsoft.ML.StandardLearners/FactorizationMachine/AlignedArray.cs rename to src/Microsoft.ML.CpuMath/AlignedArray.cs index f01da9fe28..0a631be0e9 100644 --- a/src/Microsoft.ML.StandardLearners/FactorizationMachine/AlignedArray.cs +++ b/src/Microsoft.ML.CpuMath/AlignedArray.cs @@ -5,7 +5,7 @@ using Microsoft.ML.Runtime.Internal.CpuMath.Core; using System; -namespace Microsoft.ML.Runtime.FactorizationMachine +namespace Microsoft.ML.Runtime.Internal.CpuMath { using Float = System.Single; @@ -17,6 +17,7 @@ namespace Microsoft.ML.Runtime.FactorizationMachine /// /// The ctor takes an alignment value, which must be a power of two at least sizeof(Float). 
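    /// Alignment is achieved by over-allocating the backing array and picking a base offset at which the
    /// address of the first logical item is a multiple of the alignment; all indexing is relative to that offset.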
/// + [BestFriend] internal sealed class AlignedArray { // Items includes "head" items filled with NaN, followed by _size entries, followed by "tail" @@ -109,6 +110,60 @@ public Float this[int index] } } + public void CopyTo(Span dst, int index, int count) + { + Contracts.Assert(0 <= count && count <= _size); + Contracts.Assert(dst != null); + Contracts.Assert(0 <= index && index <= dst.Length - count); + Items.AsSpan(_base, count).CopyTo(dst.Slice(index)); + } + + public void CopyTo(int start, Span dst, int index, int count) + { + Contracts.Assert(0 <= count); + Contracts.Assert(0 <= start && start <= _size - count); + Contracts.Assert(dst != null); + Contracts.Assert(0 <= index && index <= dst.Length - count); + Items.AsSpan(start + _base, count).CopyTo(dst.Slice(index)); + } + + public void CopyFrom(ReadOnlySpan src) + { + Contracts.Assert(src.Length <= _size); + src.CopyTo(Items.AsSpan(_base)); + } + + public void CopyFrom(int start, ReadOnlySpan src) + { + Contracts.Assert(0 <= start && start <= _size - src.Length); + src.CopyTo(Items.AsSpan(start + _base)); + } + + // Copies values from a sparse vector. + // valuesSrc contains only the non-zero entries. Those are copied into their logical positions in the dense array. + // rgposSrc contains the logical positions + offset of the non-zero entries in the dense array. + // rgposSrc runs parallel to the valuesSrc array. + public void CopyFrom(ReadOnlySpan rgposSrc, ReadOnlySpan valuesSrc, int posMin, int iposMin, int iposLim, bool zeroItems) + { + Contracts.Assert(rgposSrc != null); + Contracts.Assert(valuesSrc != null); + Contracts.Assert(rgposSrc.Length <= valuesSrc.Length); + Contracts.Assert(0 <= iposMin && iposMin <= iposLim && iposLim <= rgposSrc.Length); + + // Zeroing-out and setting the values in one-pass does not seem to give any perf benefit. + // So explicitly zeroing and then setting the values. + if (zeroItems) + ZeroItems(); + + for (int ipos = iposMin; ipos < iposLim; ++ipos) + { + Contracts.Assert(posMin <= rgposSrc[ipos]); + int iv = _base + rgposSrc[ipos] - posMin; + Contracts.Assert(iv < _size + _base); + Items[iv] = valuesSrc[ipos]; + } + } + public void CopyFrom(AlignedArray src) { Contracts.Assert(src != null); @@ -116,5 +171,41 @@ public void CopyFrom(AlignedArray src) Contracts.Assert(src._cbAlign == _cbAlign); Array.Copy(src.Items, src._base, Items, _base, _size); } + + public void ZeroItems() + { + Array.Clear(Items, _base, _size); + } + + public void ZeroItems(int[] rgposSrc, int posMin, int iposMin, int iposLim) + { + Contracts.Assert(rgposSrc != null); + Contracts.Assert(0 <= iposMin && iposMin <= iposLim && iposLim <= rgposSrc.Length); + Contracts.Assert(iposLim - iposMin <= _size); + + int ivCur = 0; + for (int ipos = iposMin; ipos < iposLim; ++ipos) + { + int ivNextNonZero = rgposSrc[ipos] - posMin; + Contracts.Assert(ivCur <= ivNextNonZero && ivNextNonZero < _size); + while (ivCur < ivNextNonZero) + Items[_base + ivCur++] = 0; + Contracts.Assert(ivCur == ivNextNonZero); + // Skip the non-zero element at ivNextNonZero. + ivCur++; + } + + while (ivCur < _size) + Items[_base + ivCur++] = 0; + } + + // REVIEW: This is hackish and slightly dangerous. Perhaps we should wrap this in an + // IDisposable that "locks" this, prohibiting GetBase from being called, while the buffer + // is "checked out". 
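        // Callers typically pin &items[offset] to get a pointer to the start of the aligned region (for
        // example when handing the buffer to native code), and must not touch anything outside
        // [offset, offset + Size) through the returned array.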
+ public void GetRawBuffer(out Float[] items, out int offset) + { + items = Items; + offset = _base; + } } } \ No newline at end of file diff --git a/src/Microsoft.ML.CpuMath/AlignedMatrix.cs b/src/Microsoft.ML.CpuMath/AlignedMatrix.cs new file mode 100644 index 0000000000..6d550fc3fc --- /dev/null +++ b/src/Microsoft.ML.CpuMath/AlignedMatrix.cs @@ -0,0 +1,681 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Float = System.Single; + +using Microsoft.ML.Runtime.Internal.CpuMath.Core; +using System; +using System.Collections; +using System.Collections.Generic; + +namespace Microsoft.ML.Runtime.Internal.CpuMath +{ + using Conditional = System.Diagnostics.ConditionalAttribute; + + /// + /// This implements a logical array of Floats that is automatically aligned for SSE/AVX operations. + /// This is a thin wrapper around the AlignedArray type implemented in C++. This simply couples + /// the AlignedArray with a logical size, which does not include padding, while the AlignedArray + /// size does include padding. + /// + [BestFriend] + internal sealed class CpuAlignedVector : ICpuVector + { + private readonly AlignedArray _items; + private readonly int _size; // The logical size. + + /// + /// The value count. + /// + public int ValueCount { get { return _size; } } + + /// + /// The logical size of the vector. + /// + public int VectorSize { get { return _size; } } + + // Round cflt up to a multiple of cfltAlign. + private static int RoundUp(int cflt, int cfltAlign) + { + Contracts.Assert(0 < cflt); + // cfltAlign should be a power of two. + Contracts.Assert(0 < cfltAlign && (cfltAlign & (cfltAlign - 1)) == 0); + + // Determine the number of "blobs" of size cfltAlign. + int cblob = (cflt + cfltAlign - 1) / cfltAlign; + return cblob * cfltAlign; + } + + /// + /// Allocate an aligned vector with the given alignment (in bytes). + /// The alignment must be a power of two and at least sizeof(Float). + /// + public CpuAlignedVector(int size, int cbAlign) + { + Contracts.Assert(0 < size); + // cbAlign should be a power of two. + Contracts.Assert(sizeof(Float) <= cbAlign); + Contracts.Assert((cbAlign & (cbAlign - 1)) == 0); + + int cfltAlign = cbAlign / sizeof(Float); + int cflt = RoundUp(size, cfltAlign); + _items = new AlignedArray(cflt, cbAlign); + _size = size; + AssertValid(); + } + + public void Dispose() + { + } + + [Conditional("DEBUG")] + private void AssertValid() + { +#if DEBUG + Contracts.Assert(0 < _size && _size <= _items.Size); + + // The padding, [_size, _items.Size), should contain zeros. + for (int i = _size; i < _items.Size; i++) + Contracts.Assert(_items[i] == 0); +#endif + } + + /// + /// The physical AligenedArray items. + /// + public AlignedArray Items { get { return _items; } } + + /// + /// The alignment. + /// + public int CbAlign + { + get { return _items.CbAlign; } + } + + /// + /// Set and get the value of the vector at the given index. + /// + /// The index + /// The value at the given index + public Float this[int index] + { + get + { + Contracts.Assert(0 <= index && index < _size); + return _items[index]; + } + set + { + Contracts.Assert(0 <= index && index < _size); + _items[index] = value; + } + } + + /// + /// Get the value of the vector at the given index. 
+ /// + /// The index + /// The value at the given index + public Float GetValue(int i) + { + Contracts.Assert(0 <= i && i < _size); + return _items[i]; + } + + /// + /// Assign randomized values to the vector elements via the input function. + /// + /// The input rand om function that takes no arguments and returns a float value + public void Randomize(Func rand) + { + Contracts.AssertValue(rand); + for (int i = 0; i < _size; i++) + _items[i] = rand(); + } + + /// + /// Assign zeros to the vector elements. + /// + public void Zero() + { + _items.ZeroItems(); + } + + /// + /// Copy the values into dst, starting at slot ivDst and advancing ivDst. + /// + /// The destination array + /// The starting index in the destination array + public void CopyTo(Float[] dst, ref int ivDst) + { + Contracts.AssertValue(dst); + Contracts.Assert(0 <= ivDst && ivDst <= dst.Length - _size); + _items.CopyTo(dst, ivDst, _size); + ivDst += _size; + } + + /// + /// Copy the values from this vector starting at slot ivSrc into dst, starting at slot ivDst. + /// The number of values that are copied is determined by count. + /// + /// The staring index in this vector + /// The destination array + /// The starting index in the destination array + /// The number of elements to be copied + public void CopyTo(int ivSrc, Float[] dst, int ivDst, int count) + { + Contracts.AssertValue(dst); + Contracts.Assert(0 <= count && count <= dst.Length); + Contracts.Assert(0 <= ivSrc && ivSrc <= _size - count); + Contracts.Assert(0 <= ivDst && ivDst <= dst.Length - count); + _items.CopyTo(ivSrc, dst, ivDst, count); + } + + /// + /// Copy the values from src, starting at slot index and advancing index, into this vector. + /// + /// The source array + /// The starting index in the source array + public void CopyFrom(Float[] src, ref int index) + { + Contracts.AssertValue(src); + Contracts.Assert(0 <= index && index <= src.Length - _size); + _items.CopyFrom(src.AsSpan(index, _size)); + index += _size; + } + + /// + /// Copy the values from src, starting at slot index and advancing index, into this vector, starting at slot ivDst. + /// The number of values that are copied is determined by count. + /// + /// The staring index in this vector + /// The source array + /// The starting index in the source array + /// The number of elements to be copied + public void CopyFrom(int ivDst, Float[] src, int ivSrc, int count) + { + Contracts.AssertValue(src); + Contracts.Assert(0 <= count && count <= src.Length); + Contracts.Assert(0 <= ivDst && ivDst <= _size - count); + Contracts.Assert(0 <= ivSrc && ivSrc <= src.Length - count); + _items.CopyFrom(ivDst, src.AsSpan(ivSrc, _size)); + } + + /// + /// Copy the values of src vector into this vector. The src vector must have the same size as this vector. + /// + /// The source vector + public void CopyFrom(CpuAlignedVector src) + { + Contracts.AssertValue(src); + Contracts.Assert(src._size == _size); + _items.CopyFrom(src._items); + } + + /// + /// Get the underlying AlignedArray as IEnumerator<Float>. + /// + public IEnumerator GetEnumerator() + { + for (int i = 0; i < _size; i++) + yield return _items[i]; + } + + IEnumerator IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } + } + + /// + /// This implements a logical matrix of Floats that is automatically aligned for SSE/AVX operations. + /// The ctor takes an alignment value, which must be a power of two at least sizeof(Float). 
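    /// For example, with RunLen = 5, RunCnt = 3 and an 8-float alignment, RunLenPhy and RunCntPhy both
    /// round up to 8, so Items holds 64 floats of which only the first 5 slots of the first 3 runs are logical values.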
+ /// + [BestFriend] + internal abstract class CpuAlignedMatrixBase + { + // _items includes "head" items filled with NaN, followed by RunLenPhy * RunCntPhy entries, followed by + // "tail" items, also filled with NaN. Note that RunLenPhy and RunCntPhy are divisible by the alignment + // specified in the ctor and are >= RunLen and RunCnt, respectively. It is illegal to access any slot + // outsize [_base, _base + RunLenPhy * RunCntPhy). The padding should all be zero (and maintained as such). + // The items are arranged in "runs" of length RunLen. There are RunCnt such runs. Each run ends with + // (RunLenPhy - RunLen) padding slots. There are an addition (RunCntPhy - RunCnt) padding runs of length + // RunLenPhy, which are entirely zero. Any native code should be able to assume and should maintain + // these invariants. + public AlignedArray Items { get; } + + protected readonly int FloatAlign; // The alignment. + + // Since FloatAlign is a power of two, shifting by Shift = log_2(FloatAlign) is the same as multiplying/dividing by FloatAlign. + protected readonly int Shift; + // Since FloatAlign is a power of two, bitwise and with Mask = FloatAlign - 1 will be the same as moding by FloatAlign. + protected readonly int Mask; + + // Logical length of runs (RunLen) and number of runs (RunCnt). + public readonly int RunLen; + public readonly int RunCnt; + + // Physical (padded) length and number of runs. + public readonly int RunLenPhy; + public readonly int RunCntPhy; + + /// + /// The logical number values in the matrix + /// + public int ValueCount => RunLen * RunCnt; + + /// + /// The logical number of rows + /// + public abstract int RowCount { get; } + + /// + /// The logical number of columns + /// + public abstract int ColCount { get; } + + /// + /// The physical number of rows + /// + public abstract int RowCountPhy { get; } + + /// + /// The pysical number of columns + /// + public abstract int ColCountPhy { get; } + + // Round cflt up to a multiple of cfltAlign. + protected static int RoundUp(int cflt, int cfltAlign) + { + Contracts.Assert(0 < cflt); + // cfltAlign should be a power of two. + Contracts.Assert(0 < cfltAlign && (cfltAlign & (cfltAlign - 1)) == 0); + + // Determine the number of "blobs" of size cfltAlign. + int cblob = (cflt + cfltAlign - 1) / cfltAlign; + return cblob * cfltAlign; + } + + /// + /// Allocate an aligned matrix with the given alignment (in bytes). + /// + protected CpuAlignedMatrixBase(int runLen, int runCnt, int cbAlign) + { + Contracts.Assert(0 < runLen); + Contracts.Assert(0 < runCnt); + // cbAlign should be a power of two. + Contracts.Assert(sizeof(Float) <= cbAlign); + Contracts.Assert((cbAlign & (cbAlign - 1)) == 0); + + RunLen = runLen; + RunCnt = runCnt; + + FloatAlign = cbAlign / sizeof(Float); + Shift = GeneralUtils.CbitLowZero((uint)FloatAlign); + Mask = FloatAlign - 1; + + RunLenPhy = RoundUp(runLen, FloatAlign); + RunCntPhy = RoundUp(runCnt, FloatAlign); + Items = new AlignedArray(RunLenPhy * RunCntPhy, cbAlign); + + AssertValid(); + } + + [Conditional("DEBUG")] + protected void AssertValid() + { +#if DEBUG + Contracts.Assert(0 < RunLen && RunLen <= RunLenPhy); + Contracts.Assert(0 < RunCnt && RunCnt <= RunCntPhy); + Contracts.Assert(RunLenPhy * RunCntPhy == Items.Size); + + // Assert that the padding at the end of each run contains zeros. + for (int i = 0; i < RunCnt; i++) + { + for (int j = RunLen; j < RunLenPhy; j++) + Contracts.Assert(Items[i * RunLenPhy + j] == 0); + } + + // Assert that the padding runs contain zeros. 
+ for (int i = RunCnt; i < RunCntPhy; i++) + { + for (int j = 0; j < RunLenPhy; j++) + Contracts.Assert(Items[i * RunLenPhy + j] == 0); + } +#endif + } + + public void Dispose() + { + } + + /// + /// Assign randomized values to the matrix elements via the input function. + /// + /// The input rand om function that takes no arguments and returns a float value + public void Randomize(Func rand) + { + Contracts.AssertValue(rand); + for (int i = 0, k = 0; i < RunCnt; i++) + { + Contracts.Assert(k == i * RunLenPhy); + for (int j = 0; j < RunLen; j++) + Items[k + j] = rand(); + k += RunLenPhy; + } + } + + /// + /// Assign zeros to the matrix elements. + /// + public void Zero() + { + Items.ZeroItems(); + } + + /// + /// Copy the values of src matrix into this matrix. The src matrix must have the same physical and logical size as this matrix. + /// + /// The source matrix + public void CopyFrom(CpuAlignedMatrixBase src) + { + AssertValid(); + Contracts.AssertValue(src); + src.AssertValid(); + Contracts.Assert(src.RunLen == RunLen); + Contracts.Assert(src.RunCnt == RunCnt); + Contracts.Assert(src.RunLenPhy == RunLenPhy); + Contracts.Assert(src.RunCntPhy == RunCntPhy); + Items.CopyFrom(src.Items); + } + } + + /// + /// This implements a logical row-major matrix of Floats that is automatically aligned for SSE/AVX operations. + /// The ctor takes an alignment value, which must be a power of two at least sizeof(Float). + /// + [BestFriend] + internal abstract class CpuAlignedMatrixRowBase : CpuAlignedMatrixBase, ICpuBuffer + { + protected CpuAlignedMatrixRowBase(int crow, int ccol, int cbAlign) + : base(ccol, crow, cbAlign) + { + } + + /// + /// The logical number of rows + /// + public override int RowCount => RunCnt; + + /// + /// The logical number of columns + /// + public override int ColCount { get { return RunLen; } } + + /// + /// The physical number of rows + /// + public override int RowCountPhy { get { return RunCntPhy; } } + + /// + /// The physical number of columns + /// + public override int ColCountPhy { get { return RunLenPhy; } } + + /// + /// Copy the values into dst, starting at slot ivDst and advancing ivDst. + /// + /// The destination array + /// The starting index in the destination array + public void CopyTo(Float[] dst, ref int ivDst) + { + Contracts.AssertValue(dst); + Contracts.Assert(0 <= ivDst && ivDst <= dst.Length - ValueCount); + + if (ColCount == ColCountPhy) + { + // Can copy all at once. + Items.CopyTo(0, dst, ivDst, ValueCount); + ivDst += ValueCount; + } + else + { + // Copy each row. + int ivSrc = 0; + for (int row = 0; row < RowCount; row++) + { + Items.CopyTo(ivSrc, dst, ivDst, ColCount); + ivSrc += ColCountPhy; + ivDst += ColCount; + } + } + } + + /// + /// Copy the values from src, starting at slot ivSrc and advancing ivSrc. + /// + /// The source array + /// The starting index in the source array + public void CopyFrom(Float[] src, ref int ivSrc) + { + Contracts.AssertValue(src); + Contracts.Assert(0 <= ivSrc && ivSrc <= src.Length - ValueCount); + + if (ColCount == ColCountPhy) + { + Items.CopyFrom(src.AsSpan(ivSrc, ValueCount)); + ivSrc += ValueCount; + } + else + { + for (int row = 0; row < RowCount; row++) + { + Items.CopyFrom(row * ColCountPhy, src.AsSpan(ivSrc, ColCount)); + ivSrc += ColCount; + } + } + } + + /// + /// Get the underlying AlignedArray as IEnumerator<Float>. 
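        /// Enumeration yields only the logical values, skipping the padding slots at the end of each run.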
+ /// + public IEnumerator GetEnumerator() + { + for (int row = 0; row < RowCount; row++) + { + int ivBase = row * ColCountPhy; + for (int col = 0; col < ColCount; col++) + yield return Items[ivBase + col]; + } + } + + IEnumerator IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } + } + + /// + /// This implements a row-major matrix of Floats that is automatically aligned for SSE/AVX operations. + /// The ctor takes an alignment value, which must be a power of two at least sizeof(Float). + /// + [BestFriend] + internal sealed class CpuAlignedMatrixRow : CpuAlignedMatrixRowBase, ICpuFullMatrix + { + public CpuAlignedMatrixRow(int crow, int ccol, int cbAlign) + : base(crow, ccol, cbAlign) + { + } + + /// + /// The logical number of rows + /// + public override int RowCount { get { return RunCnt; } } + + /// + /// The logical number of columns + /// + public override int ColCount { get { return RunLen; } } + + /// + /// The physical number of rows + /// + public override int RowCountPhy { get { return RunCntPhy; } } + + /// + /// The physical number of columns + /// + public override int ColCountPhy { get { return RunLenPhy; } } + + /// + /// Copy the values from this matrix, starting from the row into dst, starting at slot ivDst and advancing ivDst. + /// + /// The starting row in this matrix + /// The destination array + /// The starting index in the destination array + public void CopyTo(int row, Float[] dst, ref int ivDst) + { + Contracts.AssertValue(dst); + Contracts.Assert(0 <= row && row < RowCount); + Contracts.Assert(0 <= ivDst && ivDst <= dst.Length - ColCount); + + Items.CopyTo(row * ColCountPhy, dst, ivDst, ColCount); + ivDst += ColCount; + } + + /// + /// Assign zeros to the values at the indices + /// + /// The indices + public void ZeroItems(int[] indices) + { + Contracts.AssertValue(indices); + + // REVIEW: Ideally, we'd adjust the indices once so we wouldn't need to + // repeatedly deal with padding adjustments. + CpuMathUtils.ZeroMatrixItems(Items, ColCount, ColCountPhy, indices); + } + } + + /// + /// This implements a logical matrix of Floats that is automatically aligned for SSE/AVX operations. + /// The ctor takes an alignment value, which must be a power of two at least sizeof(Float). + /// + [BestFriend] + internal sealed class CpuAlignedMatrixCol : CpuAlignedMatrixBase, ICpuFullMatrix + { + /// + /// Allocate an aligned matrix with the given alignment (in bytes). + /// + public CpuAlignedMatrixCol(int crow, int ccol, int cbAlign) + : base(crow, ccol, cbAlign) + { + } + + /// + /// The logical number of rows + /// + public override int RowCount { get { return RunCnt; } } + + /// + /// The logical number of columns + /// + public override int ColCount { get { return RunLen; } } + + /// + /// The physical number of rows + /// + public override int RowCountPhy { get { return RunCntPhy; } } + + /// + /// The physical number of columns + /// + public override int ColCountPhy { get { return RunLenPhy; } } + + /// + /// Copy the values into dst, starting at slot ivDst and advancing ivDst. 
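        /// Values are emitted in row-major logical order; since storage is column-major, each row is gathered
        /// with a stride of RowCountPhy through the physical buffer.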
+ /// + /// The destination array + /// The starting index in the destination array + public void CopyTo(Float[] dst, ref int ivDst) + { + Contracts.AssertValue(dst); + Contracts.Assert(0 <= ivDst && ivDst <= dst.Length - ValueCount); + + for (int row = 0; row < RowCount; row++) + { + for (int col = 0; col < ColCount; col++) + dst[ivDst++] = Items[row + col * RowCountPhy]; + } + } + + /// + /// Copy the values from this matrix, starting from the row into dst, starting at slot ivDst and advancing ivDst. + /// + /// The starting row in this matrix + /// The destination array + /// The starting index in the destination array + public void CopyTo(int row, Float[] dst, ref int ivDst) + { + Contracts.AssertValue(dst); + Contracts.Assert(0 <= row && row < RowCount); + Contracts.Assert(0 <= ivDst && ivDst <= dst.Length - ColCount); + + for (int col = 0; col < ColCount; col++) + dst[ivDst++] = Items[row + col * RowCountPhy]; + } + + /// + /// Copy the values from src, starting at slot ivSrc and advancing ivSrc. + /// + /// The source array + /// The starting index in the source array + public void CopyFrom(Float[] src, ref int ivSrc) + { + Contracts.AssertValue(src); + Contracts.Assert(0 <= ivSrc && ivSrc <= src.Length - ValueCount); + for (int row = 0; row < RowCount; row++) + { + for (int col = 0; col < ColCount; col++) + Items[row + col * RowCountPhy] = src[ivSrc++]; + } + } + + /// + /// Assign zeros to the values at the indices + /// + /// The indices + public void ZeroItems(int[] indices) + { + Contracts.AssertValue(indices); + + // REVIEW: Ideally, we'd adjust the indices once so we wouldn't need to + // repeatedly deal with padding adjustments. + foreach (int iv in indices) + { + int row = iv / ColCount; + int col = iv % ColCount; + Items[row + col * ColCountPhy] = 0; + } + } + + /// + /// Get the underlying AlignedArray as IEnumerator<Float>. 
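        /// Values are produced in row-major logical order even though the underlying storage is column-major.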
+ /// + public IEnumerator GetEnumerator() + { + for (int row = 0; row < RowCount; row++) + { + for (int col = 0; col < ColCount; col++) + yield return Items[row + col * RowCountPhy]; + } + } + + IEnumerator IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } + } +} \ No newline at end of file diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs index d6fb169094..2156ddf5fa 100644 --- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -152,6 +152,11 @@ private static Vector256 MultiplyAdd(Vector256 src1, Vector256 mat, ReadOnlySpan src, Span dst, int crow, int ccol) { fixed (float* psrc = &MemoryMarshal.GetReference(src)) @@ -164,9 +169,8 @@ public static unsafe void MatMul(ReadOnlySpan mat, ReadOnlySpan sr float* pDstEnd = pdst + crow; float* pDstCurrent = pdst; float* pMatCurrent = pmat; - int numRows = crow; - while (pDstCurrent + 4 <= pDstEnd) + while (pDstCurrent < pDstEnd) { Vector256 res0 = Avx.SetZeroVector256(); Vector256 res1 = Avx.SetZeroVector256(); @@ -180,11 +184,10 @@ public static unsafe void MatMul(ReadOnlySpan mat, ReadOnlySpan sr int misalignment = (int)(address % 32); int remainder = 0; - if ((misalignment & 3) != 0 || (ccol % 8 != 0)) + if ((misalignment & 3) != 0) { // Handles cases where the data is not 32-bit aligned and we can't ever use aligned operations - remainder = length % 8; - while (pSrcCurrent + 8 <= pSrcEnd) + while (pSrcCurrent < pSrcEnd) { Vector256 vector = Avx.LoadVector256(pSrcCurrent); @@ -256,32 +259,32 @@ public static unsafe void MatMul(ReadOnlySpan mat, ReadOnlySpan sr // unaligned loads where we mask the input each time. remainder = length; } - } - if (remainder != 0) - { - // Handle any trailing elements that don't fit into a 256-bit block by moving back so that the next - // unaligned load will read to the end of the array and then mask out any elements already processed + if (remainder != 0) + { + // Handle any trailing elements that don't fit into a 256-bit block by moving back so that the next + // unaligned load will read to the end of the array and then mask out any elements already processed - pMatCurrent -= (8 - remainder); - pSrcCurrent -= (8 - remainder); + pMatCurrent -= (8 - remainder); + pSrcCurrent -= (8 - remainder); - Vector256 mask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + (remainder * 8)); + Vector256 mask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + (remainder * 8)); - float* pMatTemp = pMatCurrent; - Vector256 x01 = Avx.And(mask, Avx.LoadVector256(pMatTemp)); - Vector256 x11 = Avx.And(mask, Avx.LoadVector256(pMatTemp += ccol)); - Vector256 x21 = Avx.And(mask, Avx.LoadVector256(pMatTemp += ccol)); - Vector256 x31 = Avx.And(mask, Avx.LoadVector256(pMatTemp += ccol)); - Vector256 vector = Avx.And(mask, Avx.LoadVector256(pSrcCurrent)); + float* pMatTemp = pMatCurrent; + Vector256 x01 = Avx.And(mask, Avx.LoadVector256(pMatTemp)); + Vector256 x11 = Avx.And(mask, Avx.LoadVector256(pMatTemp += ccol)); + Vector256 x21 = Avx.And(mask, Avx.LoadVector256(pMatTemp += ccol)); + Vector256 x31 = Avx.And(mask, Avx.LoadVector256(pMatTemp += ccol)); + Vector256 vector = Avx.And(mask, Avx.LoadVector256(pSrcCurrent)); - res0 = MultiplyAdd(x01, vector, res0); - res1 = MultiplyAdd(x11, vector, res1); - res2 = MultiplyAdd(x21, vector, res2); - res3 = MultiplyAdd(x31, vector, res3); + res0 = MultiplyAdd(x01, vector, res0); + res1 = MultiplyAdd(x11, vector, res1); + res2 = MultiplyAdd(x21, vector, res2); + res3 = 
MultiplyAdd(x31, vector, res3); - pMatCurrent += 8; - pSrcCurrent += 8; + pMatCurrent += 8; + pSrcCurrent += 8; + } } // Add up the entries of each, with the 4 results in res0 @@ -294,58 +297,17 @@ public static unsafe void MatMul(ReadOnlySpan mat, ReadOnlySpan sr pDstCurrent += 4; pMatCurrent += 3 * ccol; - numRows -= 4; - } - - // falling through the case statements - switch (numRows) - { - case 3: - *(pDstCurrent + 2) = RowMultiply(pMatCurrent + 2 * ccol, psrc, pSrcEnd, ccol); - goto case 2; - case 2: - *(pDstCurrent + 1) = RowMultiply(pMatCurrent + ccol, psrc, pSrcEnd, ccol); - goto case 1; - case 1: - *pDstCurrent = RowMultiply(pMatCurrent, psrc, pSrcEnd, ccol); - break; } } } - private static unsafe float RowMultiply(float* pMatCurrent, float* pSrcCurrent, float* pSrcEnd, int ccol) + // Partial sparse source vector. + public static unsafe void MatMulP(AlignedArray mat, ReadOnlySpan rgposSrc, AlignedArray src, + int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow, int ccol) { - Vector256 res0 = Avx.SetZeroVector256(); - int remainder = ccol % 8; - while (pSrcCurrent + 8 <= pSrcEnd) - { - Vector256 vector = Avx.LoadVector256(pSrcCurrent); - - float* pMatTemp = pMatCurrent; - res0 = MultiplyAdd(pMatTemp, vector, res0); - - pSrcCurrent += 8; - pMatCurrent += 8; - } - - res0 = VectorSum256(in res0); - float sum = Sse.ConvertToSingle(Sse.AddScalar(Avx.GetLowerHalf(res0), GetHigh(res0))); - - // falling through the case statements - switch (remainder) - { - case 7: sum += *(pSrcCurrent + 6) * *(pMatCurrent + 6); goto case 6; - case 6: sum += *(pSrcCurrent + 5) * *(pMatCurrent + 5); goto case 5; - case 5: sum += *(pSrcCurrent + 4) * *(pMatCurrent + 4); goto case 4; - case 4: sum += *(pSrcCurrent + 3) * *(pMatCurrent + 3); goto case 3; - case 3: sum += *(pSrcCurrent + 2) * *(pMatCurrent + 2); goto case 2; - case 2: sum += *(pSrcCurrent + 1) * *(pMatCurrent + 1); goto case 1; - case 1: sum += *(pSrcCurrent) * *(pMatCurrent); break; - } - return sum; + MatMulP(mat.Items, rgposSrc, src.Items, posMin, iposMin, iposEnd, dst.Items, crow, ccol); } - // Partial sparse source vector. public static unsafe void MatMulP(ReadOnlySpan mat, ReadOnlySpan rgposSrc, ReadOnlySpan src, int posMin, int iposMin, int iposEnd, Span dst, int crow, int ccol) { @@ -499,6 +461,11 @@ Vector256 SparseMultiplicationAcrossRow() } } + public static unsafe void MatMulTran(AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) + { + MatMulTran(mat.Items, src.Items, dst.Items, crow, ccol); + } + public static unsafe void MatMulTran(ReadOnlySpan mat, ReadOnlySpan src, Span dst, int crow, int ccol) { fixed (float* psrc = &MemoryMarshal.GetReference(src)) @@ -511,31 +478,43 @@ public static unsafe void MatMulTran(ReadOnlySpan mat, ReadOnlySpan h01 = Sse.LoadVector128(pSrcCurrent); + // Replicate each slot of h01 (ABCD) into its own register. 
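                    // The Permute control byte picks the same source slot for every lane: 0x00 broadcasts A,
                    // 0x55 broadcasts B, 0xAA broadcasts C and 0xFF broadcasts D; SetHighLow then duplicates
                    // the 128-bit value into both halves of the 256-bit register.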
+ Vector128 h11 = Avx.Permute(h01, 0x55); // B + Vector128 h21 = Avx.Permute(h01, 0xAA); // C + Vector128 h31 = Avx.Permute(h01, 0xFF); // D h01 = Avx.Permute(h01, 0x00); // A Vector256 x01 = Avx.SetHighLow(h01, h01); + Vector256 x11 = Avx.SetHighLow(h11, h11); + Vector256 x21 = Avx.SetHighLow(h21, h21); + Vector256 x31 = Avx.SetHighLow(h31, h31); + int length = crow; float* pDstCurrent = pdst; nuint address = (nuint)(pMatCurrent); int misalignment = (int)(address % 32); - if ((misalignment & 3) != 0 || (crow % 8 != 0)) + if ((misalignment & 3) != 0) { // Handles cases where the data is not 32-bit aligned and we can't ever use aligned operations - remainder = crow % 8; - while (pDstCurrent + 8 <= pDstEnd) + while (pDstCurrent < pDstEnd) { float* pMatTemp = pMatCurrent; Vector256 x02 = Avx.Multiply(x01, Avx.LoadVector256(pMatTemp)); + Vector256 x12 = Avx.Multiply(x11, Avx.LoadVector256(pMatTemp += crow)); + Vector256 x22 = Avx.Multiply(x21, Avx.LoadVector256(pMatTemp += crow)); + Vector256 x32 = Avx.Multiply(x31, Avx.LoadVector256(pMatTemp += crow)); + + x02 = Avx.Add(x02, x12); + x22 = Avx.Add(x22, x32); + x02 = Avx.Add(x02, x22); Avx.Store(pDstCurrent, x02); pDstCurrent += 8; @@ -544,6 +523,7 @@ public static unsafe void MatMulTran(ReadOnlySpan mat, ReadOnlySpan mat, ReadOnlySpan x02 = Avx.And(leadingMask, Avx.LoadVector256(pMatTemp)); + Vector256 x12 = Avx.And(leadingMask, Avx.LoadVector256(pMatTemp += crow)); + Vector256 x22 = Avx.And(leadingMask, Avx.LoadVector256(pMatTemp += crow)); + Vector256 x32 = Avx.And(leadingMask, Avx.LoadVector256(pMatTemp += crow)); + x02 = Avx.Multiply(x01, x02); + x12 = Avx.Multiply(x11, x12); + x22 = Avx.Multiply(x21, x22); + x32 = Avx.Multiply(x31, x32); + + x02 = Avx.Add(x02, x12); + x22 = Avx.Add(x22, x32); + x02 = Avx.Add(x02, x22); Vector256 trailingMask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + ((8 - misalignment) * 8)); Vector256 x3 = Avx.LoadVector256(pDstCurrent); - x02 = Avx.Add(x02, Avx.And(x3, trailingMask)); + x02 = Avx.Or(x02, Avx.And(x3, trailingMask)); Avx.Store(pDstCurrent, x02); pMatCurrent += misalignment; @@ -578,7 +569,15 @@ public static unsafe void MatMulTran(ReadOnlySpan mat, ReadOnlySpan x02 = Avx.Multiply(x01, Avx.LoadVector256(pMatTemp)); + Vector256 x12 = Avx.Multiply(x11, Avx.LoadVector256(pMatTemp += crow)); + Vector256 x22 = Avx.Multiply(x21, Avx.LoadVector256(pMatTemp += crow)); + Vector256 x32 = Avx.Multiply(x31, Avx.LoadVector256(pMatTemp += crow)); + + x02 = Avx.Add(x02, x12); + x22 = Avx.Add(x22, x32); + x02 = Avx.Add(x02, x22); Avx.Store(pDstCurrent, x02); pDstCurrent += 8; @@ -592,36 +591,47 @@ public static unsafe void MatMulTran(ReadOnlySpan mat, ReadOnlySpan trailingMask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + (remainder * 8)); + pMatCurrent -= (8 - remainder); + pDstCurrent -= (8 - remainder); + Vector256 trailingMask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + (remainder * 8)); - float* pMatTemp = pMatCurrent; - Vector256 x02 = Avx.And(trailingMask, Avx.LoadVector256(pMatTemp)); - x02 = Avx.Multiply(x01, x02); + float* pMatTemp = pMatCurrent; + Vector256 x02 = Avx.And(trailingMask, Avx.LoadVector256(pMatTemp)); + Vector256 x12 = Avx.And(trailingMask, Avx.LoadVector256(pMatTemp += crow)); + Vector256 x22 = Avx.And(trailingMask, Avx.LoadVector256(pMatTemp += crow)); + Vector256 x32 = Avx.And(trailingMask, Avx.LoadVector256(pMatTemp += crow)); - Vector256 leadingMask = Avx.LoadVector256(((float*)(pLeadingAlignmentMask)) + ((8 - remainder) * 8)); - Vector256 x3 = 
Avx.LoadVector256(pDstCurrent); - x02 = Avx.Add(x02, Avx.And(x3, leadingMask)); + x02 = Avx.Multiply(x01, x02); + x12 = Avx.Multiply(x11, x12); + x22 = Avx.Multiply(x21, x22); + x32 = Avx.Multiply(x31, x32); - Avx.Store(pDstCurrent, x02); - pDstCurrent += 8; - pMatCurrent += 8; + x02 = Avx.Add(x02, x12); + x22 = Avx.Add(x22, x32); + x02 = Avx.Add(x02, x22); + + Vector256 leadingMask = Avx.LoadVector256(((float*)(pLeadingAlignmentMask)) + ((8 - remainder) * 8)); + Vector256 x3 = Avx.LoadVector256(pDstCurrent); + x02 = Avx.Or(x02, Avx.And(x3, leadingMask)); + + Avx.Store(pDstCurrent, x02); + pDstCurrent += 8; + pMatCurrent += 8; + } } - pSrcCurrent += 1; - numCol -= 1; + pMatCurrent += 3 * crow; + pSrcCurrent += 4; } // We do 4-way unrolling - while (pSrcCurrent + 4 <= pSrcEnd) + while (pSrcCurrent < pSrcEnd) { Vector128 h01 = Sse.LoadVector128(pSrcCurrent); // Replicate each slot of h01 (ABCD) into its own register. @@ -641,10 +651,9 @@ public static unsafe void MatMulTran(ReadOnlySpan mat, ReadOnlySpan x02 = Avx.Multiply(x01, Avx.LoadVector256(pMatTemp)); @@ -665,6 +674,7 @@ public static unsafe void MatMulTran(ReadOnlySpan mat, ReadOnlySpan mat, ReadOnlySpan trailingMask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + ((8 - misalignment) * 8)); Vector256 x3 = Avx.LoadVector256(pDstCurrent); - x02 = Avx.Add(x02, Avx.And(x3, trailingMask)); + x02 = Avx.Or(x02, Avx.And(x3, trailingMask)); x02 = Avx.Add(x02, Avx.And(x3, leadingMask)); @@ -728,92 +738,46 @@ public static unsafe void MatMulTran(ReadOnlySpan mat, ReadOnlySpan trailingMask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + (remainder * 8)); + if (remainder != 0) + { + pMatCurrent -= (8 - remainder); + pDstCurrent -= (8 - remainder); + Vector256 trailingMask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + (remainder * 8)); - float* pMatTemp = pMatCurrent; - Vector256 x02 = Avx.And(trailingMask, Avx.LoadVector256(pMatTemp)); - Vector256 x12 = Avx.And(trailingMask, Avx.LoadVector256(pMatTemp += crow)); - Vector256 x22 = Avx.And(trailingMask, Avx.LoadVector256(pMatTemp += crow)); - Vector256 x32 = Avx.And(trailingMask, Avx.LoadVector256(pMatTemp += crow)); + float* pMatTemp = pMatCurrent; + Vector256 x02 = Avx.And(trailingMask, Avx.LoadVector256(pMatTemp)); + Vector256 x12 = Avx.And(trailingMask, Avx.LoadVector256(pMatTemp += crow)); + Vector256 x22 = Avx.And(trailingMask, Avx.LoadVector256(pMatTemp += crow)); + Vector256 x32 = Avx.And(trailingMask, Avx.LoadVector256(pMatTemp += crow)); - x02 = Avx.Multiply(x01, x02); - x12 = Avx.Multiply(x11, x12); - x22 = Avx.Multiply(x21, x22); - x32 = Avx.Multiply(x31, x32); + x02 = Avx.Multiply(x01, x02); + x12 = Avx.Multiply(x11, x12); + x22 = Avx.Multiply(x21, x22); + x32 = Avx.Multiply(x31, x32); - x02 = Avx.Add(x02, x12); - x22 = Avx.Add(x22, x32); - x02 = Avx.Add(x02, x22); + x02 = Avx.Add(x02, x12); + x22 = Avx.Add(x22, x32); + x02 = Avx.Add(x02, x22); - Vector256 leadingMask = Avx.LoadVector256(((float*)(pLeadingAlignmentMask)) + ((8 - remainder) * 8)); - Vector256 x3 = Avx.LoadVector256(pDstCurrent); - x02 = Avx.Add(x02, Avx.And(x3, leadingMask)); + Vector256 leadingMask = Avx.LoadVector256(((float*)(pLeadingAlignmentMask)) + ((8 - remainder) * 8)); + Vector256 x3 = Avx.LoadVector256(pDstCurrent); + x02 = Avx.Or(x02, Avx.And(x3, leadingMask)); - x02 = Avx.Add(x02, Avx.And(x3, trailingMask)); + x02 = Avx.Add(x02, Avx.And(x3, trailingMask)); - Avx.Store(pDstCurrent, x02); - pDstCurrent += 8; - pMatCurrent += 8; + Avx.Store(pDstCurrent, x02); + pDstCurrent += 8; + 
pMatCurrent += 8; + } } pMatCurrent += 3 * crow; pSrcCurrent += 4; - numCol -= 4; - } - - // falling through the case statements - switch (numCol) - { - case 3: ColumnMultiply(pMatCurrent + 2 * crow, pSrcCurrent + 2, pdst, pDstEnd, crow); goto case 2; - case 2: ColumnMultiply(pMatCurrent + crow, pSrcCurrent + 1, pdst, pDstEnd, crow); goto case 1; - case 1: ColumnMultiply(pMatCurrent, pSrcCurrent, pdst, pDstEnd, crow); break; } } } - private static unsafe void ColumnMultiply(float* pMatCurrent, float* pSrcCurrent, float* pdst, float* pDstEnd, int crow) - { - Vector128 h01 = Sse.LoadVector128(pSrcCurrent); - // Replicate each slot of h01 (ABCD) into its own register. - h01 = Avx.Permute(h01, 0x00); // A - Vector256 x01 = Avx.SetHighLow(h01, h01); - int remainder = crow % 8; - float* pDstCurrent = pdst; - - while (pDstCurrent + 8 <= pDstEnd) - { - // If we aren't using the VEX-encoding, the JIT will only fold away aligned loads - // (due to semantics of the legacy encoding). - // We don't need an assert, since the instruction will throw for unaligned inputs. - float* pMatTemp = pMatCurrent; - Vector256 x02 = Avx.Multiply(x01, Avx.LoadVector256(pMatTemp)); - x02 = Avx.Add(x02, Avx.LoadVector256(pDstCurrent)); - - Avx.Store(pDstCurrent, x02); - pDstCurrent += 8; - pMatCurrent += 8; - } - - // falling through the case statements - switch (remainder) - { - case 7: *(pDstCurrent + 6) += *(pSrcCurrent) * *(pMatCurrent + 6); goto case 6; - case 6: *(pDstCurrent + 5) += *(pSrcCurrent) * *(pMatCurrent + 5); goto case 5; - case 5: *(pDstCurrent + 4) += *(pSrcCurrent) * *(pMatCurrent + 4); goto case 4; - case 4: *(pDstCurrent + 3) += *(pSrcCurrent) * *(pMatCurrent + 3); goto case 3; - case 3: *(pDstCurrent + 2) += *(pSrcCurrent) * *(pMatCurrent + 2); goto case 2; - case 2: *(pDstCurrent + 1) += *(pSrcCurrent) * *(pMatCurrent + 1); goto case 1; - case 1: *pDstCurrent += *(pSrcCurrent) * *(pMatCurrent); break; - } - return; - } - // dst[i] += scale public static unsafe void AddScalarU(float scalar, Span dst) { diff --git a/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs b/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs new file mode 100644 index 0000000000..33690055b2 --- /dev/null +++ b/src/Microsoft.ML.CpuMath/CpuAligenedMathUtils.cs @@ -0,0 +1,148 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Runtime.Internal.CpuMath.Core; + +namespace Microsoft.ML.Runtime.Internal.CpuMath +{ + [BestFriend] + internal static class CpuAligenedMathUtils + where TMatrix : CpuAlignedMatrixBase, ICpuFullMatrix + { + /// + /// Assert the compatibility of the underlying AlignedArray for the input matrix in terms of alignment amount. + /// + /// The input matrix + public static void AssertCompatible(ICpuFullMatrix values) + { +#if DEBUG + var mat = values as TMatrix; + Contracts.AssertValue(mat); + Contracts.Assert((mat.Items.CbAlign % CpuMathUtils.GetVectorAlignment()) == 0); +#endif + } + + /// + /// Assert the compatibility of the underlying AlignedArray for the input vector in terms of alignment amount. 
+ /// + /// The input vector + public static void AssertCompatible(ICpuVector values) + { +#if DEBUG + CpuAlignedVector vec = values as CpuAlignedVector; + Contracts.AssertValue(vec); + Contracts.Assert((vec.Items.CbAlign % CpuMathUtils.GetVectorAlignment()) == 0); +#endif + } + + private static TMatrix A(ICpuFullMatrix x) + { + AssertCompatible(x); + return (TMatrix)x; + } + + private static CpuAlignedVector A(ICpuVector x) + { + AssertCompatible(x); + return (CpuAlignedVector)x; + } + + private static void AssertCompatibleCore(ICpuMatrix mat, ICpuVector src, ICpuVector dst) + { + AssertCompatible(src); + AssertCompatible(dst); + Contracts.Assert(mat.ColCount == src.VectorSize); + Contracts.Assert(mat.RowCount == dst.VectorSize); + } + + /// + /// Asserts the following: + /// 1. The compatibility of the underlying AlignedArray for mat in terms of alignment amount. + /// 2. The compatibility of the underlying AlignedArray for src in terms of alignment amount. + /// 3. The compatibility of the underlying AlignedArray for dst in terms of alignment amount. + /// 4. The compatibility of the matrix-vector multiplication mat * src = dst. + /// + /// + /// + /// + public static void AssertCompatible(ICpuFullMatrix mat, ICpuVector src, ICpuVector dst) + { + // Also check the physical sizes. + AssertCompatible(mat); + AssertCompatibleCore(mat, src, dst); + var m = A(mat); + Contracts.Assert(m.ColCountPhy == A(src).Items.Size); + Contracts.Assert(m.RowCountPhy == A(dst).Items.Size); + } + + /// + /// Matrix multiplication: + /// dst = mat * src + /// + /// The multiplier matrix + /// The source vector + /// The destination vector + public static void MatTimesSrc(ICpuFullMatrix mat, ICpuVector src, ICpuVector dst) + { + bool colMajor = typeof(TMatrix) == typeof(CpuAlignedMatrixCol); + AssertCompatible(mat, src, dst); + var m = A(mat); + CpuMathUtils.MatrixTimesSource(colMajor, m.Items, A(src).Items, A(dst).Items, m.RunCnt); + } + + /// + /// Matrix transpose multiplication: + /// dst = mat' * src + /// + /// The multiplier matrix + /// The source vector + /// The destination vector + public static void MatTranTimesSrc(ICpuFullMatrix mat, ICpuVector src, ICpuVector dst) + { + bool colMajor = typeof(TMatrix) == typeof(CpuAlignedMatrixCol); + AssertCompatible(mat, dst, src); + var m = A(mat); + CpuMathUtils.MatrixTimesSource(!colMajor, m.Items, A(src).Items, A(dst).Items, m.RunCnt); + } + } + + public static class GeneralUtils + { + /// + /// Count the number of zero bits in the lonest string of zero's from the lowest significant bit of the input integer. 
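        /// For example, CbitLowZero(0b10100) returns 2, CbitLowZero(1) returns 0, and CbitLowZero(0) returns 32.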
+ /// + /// The input integer + /// + public static int CbitLowZero(uint u) + { + if (u == 0) + return 32; + + int cbit = 0; + if ((u & 0x0000FFFF) == 0) + { + cbit += 16; + u >>= 16; + } + if ((u & 0x000000FF) == 0) + { + cbit += 8; + u >>= 8; + } + if ((u & 0x0000000F) == 0) + { + cbit += 4; + u >>= 4; + } + if ((u & 0x00000003) == 0) + { + cbit += 2; + u >>= 2; + } + if ((u & 0x00000001) == 0) + cbit += 1; + return cbit; + } + } +} diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs index 6546eeef31..d895e590a9 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs @@ -11,62 +11,76 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath { internal static partial class CpuMathUtils { - public static void MatrixTimesSource(bool transpose, ReadOnlySpan matrix, ReadOnlySpan source, Span destination, int stride) + // The count of bytes in Vector128, corresponding to _cbAlign in AlignedArray + private const int Vector128Alignment = 16; + + // The count of bytes in Vector256, corresponding to _cbAlign in AlignedArray + private const int Vector256Alignment = 32; + + // The count of bytes in a 32-bit float, corresponding to _cbAlign in AlignedArray + private const int FloatAlignment = 4; + + // If neither AVX nor SSE is supported, return basic alignment for a 4-byte float. + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + public static int GetVectorAlignment() + => Avx.IsSupported ? Vector256Alignment : (Sse.IsSupported ? Vector128Alignment : FloatAlignment); + + public static void MatrixTimesSource(bool transpose, AlignedArray matrix, AlignedArray source, AlignedArray destination, int stride) { - Contracts.AssertNonEmpty(matrix); - Contracts.AssertNonEmpty(source); - Contracts.AssertNonEmpty(destination); - Contracts.Assert(matrix.Length == destination.Length * source.Length); + Contracts.Assert(matrix.Size == destination.Size * source.Size); Contracts.Assert(stride >= 0); - if (!transpose) + if (Avx.IsSupported) { - if (Avx.IsSupported && source.Length >= 8) + if (!transpose) { - Contracts.Assert(stride <= destination.Length); - AvxIntrinsics.MatMul(matrix, source, destination, stride, source.Length); + Contracts.Assert(stride <= destination.Size); + AvxIntrinsics.MatMul(matrix, source, destination, stride, source.Size); } - else if (Sse.IsSupported && source.Length >= 4) + else { - Contracts.Assert(stride <= destination.Length); - SseIntrinsics.MatMul(matrix, source, destination, stride, source.Length); + Contracts.Assert(stride <= source.Size); + AvxIntrinsics.MatMulTran(matrix, source, destination, destination.Size, stride); + } + } + else if (Sse.IsSupported) + { + if (!transpose) + { + Contracts.Assert(stride <= destination.Size); + SseIntrinsics.MatMul(matrix, source, destination, stride, source.Size); } else { - Contracts.Assert(stride <= destination.Length); + Contracts.Assert(stride <= source.Size); + SseIntrinsics.MatMulTran(matrix, source, destination, destination.Size, stride); + } + } + else + { + if (!transpose) + { + Contracts.Assert(stride <= destination.Size); for (int i = 0; i < stride; i++) { float dotProduct = 0; - for (int j = 0; j < source.Length; j++) + for (int j = 0; j < source.Size; j++) { - dotProduct += matrix[i * source.Length + j] * source[j]; + dotProduct += matrix[i * source.Size + j] * source[j]; } destination[i] = dotProduct; } } - } - else - { - if (Avx.IsSupported && destination.Length >= 8) - { - 
Contracts.Assert(stride <= source.Length); - AvxIntrinsics.MatMulTran(matrix, source, destination, destination.Length, stride); - } - else if (Sse.IsSupported && destination.Length >=4) - { - Contracts.Assert(stride <= source.Length); - SseIntrinsics.MatMulTran(matrix, source, destination, destination.Length, stride); - } else { - Contracts.Assert(stride <= source.Length); - for (int i = 0; i < destination.Length; i++) + Contracts.Assert(stride <= source.Size); + for (int i = 0; i < destination.Size; i++) { float dotProduct = 0; for (int j = 0; j < stride; j++) { - dotProduct += matrix[j * destination.Length + i] * source[j]; + dotProduct += matrix[j * source.Size + i] * source[j]; } destination[i] = dotProduct; @@ -75,22 +89,17 @@ public static void MatrixTimesSource(bool transpose, ReadOnlySpan matrix, } } - public static void MatrixTimesSource(ReadOnlySpan matrix, ReadOnlySpan rgposSrc, ReadOnlySpan sourceValues, - int posMin, int iposMin, int iposLimit, Span destination, int stride) + public static void MatrixTimesSource(AlignedArray matrix, ReadOnlySpan rgposSrc, AlignedArray sourceValues, + int posMin, int iposMin, int iposLimit, AlignedArray destination, int stride) { Contracts.Assert(iposMin >= 0); Contracts.Assert(iposMin <= iposLimit); Contracts.Assert(iposLimit <= rgposSrc.Length); - Contracts.AssertNonEmpty(matrix); - Contracts.AssertNonEmpty(sourceValues); - Contracts.AssertNonEmpty(destination); - Contracts.AssertNonEmpty(rgposSrc); - Contracts.Assert(stride > 0); - Contracts.Assert(matrix.Length == destination.Length * sourceValues.Length); + Contracts.Assert(matrix.Size == destination.Size * sourceValues.Size); if (iposMin >= iposLimit) { - destination.Clear(); + destination.ZeroItems(); return; } @@ -99,24 +108,24 @@ public static void MatrixTimesSource(ReadOnlySpan matrix, ReadOnlySpan left, ReadOnlySpan } } + public static void ZeroMatrixItems(AlignedArray destination, int ccol, int cfltRow, int[] indices) + { + Contracts.Assert(ccol > 0); + Contracts.Assert(ccol <= cfltRow); + + if (ccol == cfltRow) + { + ZeroItemsU(destination, destination.Size, indices, indices.Length); + } + else + { + ZeroMatrixItemsCore(destination, destination.Size, ccol, cfltRow, indices, indices.Length); + } + } + + private static unsafe void ZeroItemsU(AlignedArray destination, int c, int[] indices, int cindices) + { + fixed (float* pdst = &destination.Items[0]) + fixed (int* pidx = &indices[0]) + { + for (int i = 0; i < cindices; ++i) + { + int index = pidx[i]; + Contracts.Assert(index >= 0); + Contracts.Assert(index < c); + pdst[index] = 0; + } + } + } + + private static unsafe void ZeroMatrixItemsCore(AlignedArray destination, int c, int ccol, int cfltRow, int[] indices, int cindices) + { + fixed (float* pdst = &destination.Items[0]) + fixed (int* pidx = &indices[0]) + { + int ivLogMin = 0; + int ivLogLim = ccol; + int ivPhyMin = 0; + + for (int i = 0; i < cindices; ++i) + { + int index = pidx[i]; + Contracts.Assert(index >= 0); + Contracts.Assert(index < c); + + int col = index - ivLogMin; + if ((uint)col >= (uint)ccol) + { + Contracts.Assert(ivLogMin > index || index >= ivLogLim); + + int row = index / ccol; + ivLogMin = row * ccol; + ivLogLim = ivLogMin + ccol; + ivPhyMin = row * cfltRow; + + Contracts.Assert(index >= ivLogMin); + Contracts.Assert(index < ivLogLim); + col = index - ivLogMin; + } + + pdst[ivPhyMin + col] = 0; + } + } + } + public static void SdcaL1UpdateDense(float primalUpdate, int count, ReadOnlySpan source, float threshold, Span v, Span w) { 
Contracts.AssertNonEmpty(source); diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs index 400b70d651..5ecbc62be1 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs @@ -11,10 +11,17 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath [BestFriend] internal static partial class CpuMathUtils { - public static void MatrixTimesSource(bool transpose, ReadOnlySpan matrix, ReadOnlySpan source, Span destination, int stride) => SseUtils.MatTimesSrc(transpose, matrix, source, destination, stride); + // The count of bytes in Vector128, corresponding to _cbAlign in AlignedArray + private const int Vector128Alignment = 16; - public static void MatrixTimesSource(ReadOnlySpan matrix, ReadOnlySpan rgposSrc, ReadOnlySpan sourceValues, - int posMin, int iposMin, int iposLimit, Span destination, int stride) => SseUtils.MatTimesSrc(matrix, rgposSrc, sourceValues, posMin, iposMin, iposLimit, destination, stride); + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] + public static int GetVectorAlignment() + => Vector128Alignment; + + public static void MatrixTimesSource(bool transpose, AlignedArray matrix, AlignedArray source, AlignedArray destination, int stride) => SseUtils.MatTimesSrc(transpose, matrix, source, destination, stride); + + public static void MatrixTimesSource(AlignedArray matrix, ReadOnlySpan rgposSrc, AlignedArray sourceValues, + int posMin, int iposMin, int iposLimit, AlignedArray destination, int stride) => SseUtils.MatTimesSrc(matrix, rgposSrc, sourceValues, posMin, iposMin, iposLimit, destination, stride); public static void Add(float value, Span destination) => SseUtils.Add(value, destination); @@ -56,6 +63,8 @@ public static void MatrixTimesSource(ReadOnlySpan matrix, ReadOnlySpan left, ReadOnlySpan right, int count) => SseUtils.L2DistSquared(left, right, count); + public static void ZeroMatrixItems(AlignedArray destination, int ccol, int cfltRow, int[] indices) => SseUtils.ZeroMatrixItems(destination, ccol, cfltRow, indices); + public static void SdcaL1UpdateDense(float primalUpdate, int count, ReadOnlySpan source, float threshold, Span v, Span w) => SseUtils.SdcaL1UpdateDense(primalUpdate, count, source, threshold, v, w); diff --git a/src/Microsoft.ML.CpuMath/Sse.cs b/src/Microsoft.ML.CpuMath/Sse.cs index d57b400e9c..8b1c4da70f 100644 --- a/src/Microsoft.ML.CpuMath/Sse.cs +++ b/src/Microsoft.ML.CpuMath/Sse.cs @@ -15,53 +15,74 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath [BestFriend] internal static class SseUtils { - public static void MatTimesSrc(bool tran, ReadOnlySpan mat, ReadOnlySpan src, Span dst, int crun) + public const int CbAlign = 16; + + private static bool Compat(AlignedArray a) + { + Contracts.AssertValue(a); + Contracts.Assert(a.Size > 0); + return a.CbAlign == CbAlign; + } + + private static unsafe float* Ptr(AlignedArray a, float* p) + { + Contracts.AssertValue(a); + float* q = p + a.GetBase((long)p); + Contracts.Assert(((long)q & (CbAlign - 1)) == 0); + return q; + } + + public static void MatTimesSrc(bool tran, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun) { - Contracts.Assert(mat.Length == dst.Length * src.Length); + Contracts.Assert(Compat(mat)); + Contracts.Assert(Compat(src)); + Contracts.Assert(Compat(dst)); + Contracts.Assert(mat.Size == dst.Size * src.Size); unsafe { - fixed (float* pmat = &mat[0]) - fixed (float* psrc = &src[0]) - fixed (float* pdst = &dst[0]) + fixed (float* pmat = 
&mat.Items[0]) + fixed (float* psrc = &src.Items[0]) + fixed (float* pdst = &dst.Items[0]) { if (!tran) { - Contracts.Assert(0 <= crun && crun <= dst.Length); - Thunk.MatMul(pmat, psrc, pdst, crun, src.Length); + Contracts.Assert(0 <= crun && crun <= dst.Size); + Thunk.MatMul(Ptr(mat, pmat), Ptr(src, psrc), Ptr(dst, pdst), crun, src.Size); } else { - Contracts.Assert(0 <= crun && crun <= src.Length); - Thunk.MatMulTran(pmat, psrc, pdst, dst.Length, crun); + Contracts.Assert(0 <= crun && crun <= src.Size); + Thunk.MatMulTran(Ptr(mat, pmat), Ptr(src, psrc), Ptr(dst, pdst), dst.Size, crun); } } } } - public static void MatTimesSrc(ReadOnlySpan mat, ReadOnlySpan rgposSrc, ReadOnlySpan srcValues, - int posMin, int iposMin, int iposLim, Span dst, int crun) + public static void MatTimesSrc(AlignedArray mat, ReadOnlySpan rgposSrc, AlignedArray srcValues, + int posMin, int iposMin, int iposLim, AlignedArray dst, int crun) { + Contracts.Assert(Compat(mat)); + Contracts.Assert(Compat(srcValues)); + Contracts.Assert(Compat(dst)); Contracts.Assert(0 <= iposMin && iposMin <= iposLim && iposLim <= rgposSrc.Length); - Contracts.Assert(mat.Length == dst.Length * srcValues.Length); + Contracts.Assert(mat.Size == dst.Size * srcValues.Size); if (iposMin >= iposLim) { - dst.Clear(); + dst.ZeroItems(); return; } - Contracts.AssertNonEmpty(rgposSrc); - unsafe { - fixed (float* pdst = &dst[0]) - fixed (float* pmat = &mat[0]) - fixed (float* psrc = &srcValues[0]) + fixed (float* pdst = &dst.Items[0]) + fixed (float* pmat = &mat.Items[0]) + fixed (float* psrc = &srcValues.Items[0]) fixed (int* ppossrc = &rgposSrc[0]) { - Contracts.Assert(0 <= crun && crun <= dst.Length); - Thunk.MatMulP(pmat, ppossrc, psrc, posMin, iposMin, iposLim, pdst, crun, srcValues.Length); + Contracts.Assert(0 <= crun && crun <= dst.Size); + Thunk.MatMulP(Ptr(mat, pmat), ppossrc, Ptr(srcValues, psrc), posMin, iposMin, iposLim, Ptr(dst, pdst), crun, srcValues.Size); } } } @@ -345,6 +366,23 @@ public static float L2DistSquared(ReadOnlySpan a, ReadOnlySpan b, } } + public static void ZeroMatrixItems(AlignedArray dst, int ccol, int cfltRow, int[] indices) + { + Contracts.Assert(0 < ccol && ccol <= cfltRow); + + unsafe + { + fixed (float* pdst = &dst.Items[0]) + fixed (int* pi = &indices[0]) + { + if (ccol == cfltRow) + Thunk.ZeroItemsU(Ptr(dst, pdst), dst.Size, pi, indices.Length); + else + Thunk.ZeroMatrixItemsCore(Ptr(dst, pdst), dst.Size, ccol, cfltRow, pi, indices.Length); + } + } + } + public static void SdcaL1UpdateDense(float primalUpdate, int count, ReadOnlySpan src, float threshold, Span v, Span w) { Contracts.AssertNonEmpty(src); diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index 98394f5d7d..b83fd6bbc6 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -116,6 +116,11 @@ internal static Vector128 GetNewDst128(in Vector128 xDst1, in Vect } // Multiply matrix times vector into vector. 
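        // dst receives crow values, src supplies ccol values, and mat is laid out row-major with ccol floats
        // per row; the AlignedArray overload below forwards the raw backing arrays to the span-based implementation.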
+ public static unsafe void MatMul(AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) + { + MatMul(mat.Items, src.Items, dst.Items, crow, ccol); + } + public static unsafe void MatMul(ReadOnlySpan mat, ReadOnlySpan src, Span dst, int crow, int ccol) { fixed (float* psrc = &MemoryMarshal.GetReference(src)) @@ -128,9 +133,8 @@ public static unsafe void MatMul(ReadOnlySpan mat, ReadOnlySpan sr float* pDstEnd = pdst + crow; float* pDstCurrent = pdst; float* pMatCurrent = pmat; - int numRows = crow; - while (pDstCurrent + 4 <= pDstEnd) + while (pDstCurrent < pDstEnd) { Vector128 res0 = Sse.SetZeroVector128(); Vector128 res1 = Sse.SetZeroVector128(); @@ -144,11 +148,10 @@ public static unsafe void MatMul(ReadOnlySpan mat, ReadOnlySpan sr int misalignment = (int)(address % 16); int remainder = 0; - if ((misalignment & 3) != 0 || (ccol % 4 != 0)) + if ((misalignment & 3) != 0) { // Handles cases where the data is not 32-bit aligned and we can't ever use aligned operations - remainder = length % 4; - while (pSrcCurrent + 4 <= pSrcEnd) + while (pSrcCurrent < pSrcEnd) { Vector128 vector = Sse.LoadVector128(pSrcCurrent); @@ -230,32 +233,32 @@ public static unsafe void MatMul(ReadOnlySpan mat, ReadOnlySpan sr // unaligned loads where we mask the input each time. remainder = length; } - } - if (remainder != 0) - { - // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next - // unaligned load will read to the end of the array and then mask out any elements already processed + if (remainder != 0) + { + // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next + // unaligned load will read to the end of the array and then mask out any elements already processed - pMatCurrent -= (4 - remainder); - pSrcCurrent -= (4 - remainder); + pMatCurrent -= (4 - remainder); + pSrcCurrent -= (4 - remainder); - Vector128 mask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + (remainder * 4)); + Vector128 mask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + (remainder * 4)); - float* pMatTemp = pMatCurrent; - Vector128 x01 = Sse.And(mask, Sse.LoadVector128(pMatTemp)); - Vector128 x11 = Sse.And(mask, Sse.LoadVector128(pMatTemp += ccol)); - Vector128 x21 = Sse.And(mask, Sse.LoadVector128(pMatTemp += ccol)); - Vector128 x31 = Sse.And(mask, Sse.LoadVector128(pMatTemp += ccol)); - Vector128 vector = Sse.And(mask, Sse.LoadVector128(pSrcCurrent)); + float* pMatTemp = pMatCurrent; + Vector128 x01 = Sse.And(mask, Sse.LoadVector128(pMatTemp)); + Vector128 x11 = Sse.And(mask, Sse.LoadVector128(pMatTemp += ccol)); + Vector128 x21 = Sse.And(mask, Sse.LoadVector128(pMatTemp += ccol)); + Vector128 x31 = Sse.And(mask, Sse.LoadVector128(pMatTemp += ccol)); + Vector128 vector = Sse.And(mask, Sse.LoadVector128(pSrcCurrent)); - res0 = Sse.Add(res0, Sse.Multiply(x01, vector)); - res1 = Sse.Add(res1, Sse.Multiply(x11, vector)); - res2 = Sse.Add(res2, Sse.Multiply(x21, vector)); - res3 = Sse.Add(res3, Sse.Multiply(x31, vector)); + res0 = Sse.Add(res0, Sse.Multiply(x01, vector)); + res1 = Sse.Add(res1, Sse.Multiply(x11, vector)); + res2 = Sse.Add(res2, Sse.Multiply(x21, vector)); + res3 = Sse.Add(res3, Sse.Multiply(x31, vector)); - pMatCurrent += 4; - pSrcCurrent += 4; + pMatCurrent += 4; + pSrcCurrent += 4; + } } // Add up the entries of each, with the 4 results in res0 @@ -266,56 +269,17 @@ public static unsafe void MatMul(ReadOnlySpan mat, ReadOnlySpan sr Sse.Store(pDstCurrent, res0); pDstCurrent += 4; pMatCurrent += 3 * 
ccol; - numRows -= 4; - } - - // falling through the case statements - switch (numRows) - { - case 3: - *(pDstCurrent + 2) = RowMultiply(pMatCurrent + 2 * ccol, psrc, pSrcEnd, ccol); - goto case 2; - case 2: - *(pDstCurrent + 1) = RowMultiply(pMatCurrent + ccol, psrc, pSrcEnd, ccol); - goto case 1; - case 1: - *pDstCurrent = RowMultiply(pMatCurrent, psrc, pSrcEnd, ccol); - break; } } } - private static unsafe float RowMultiply(float* pMatCurrent, float* pSrcCurrent, float* pSrcEnd, int ccol) + // Partial sparse source vector. + public static unsafe void MatMulP(AlignedArray mat, ReadOnlySpan rgposSrc, AlignedArray src, + int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow, int ccol) { - Vector128 res0 = Sse.SetZeroVector128(); - int remainder = ccol % 4; - while (pSrcCurrent + 4 <= pSrcEnd) - { - Vector128 vector = Sse.LoadVector128(pSrcCurrent); - - float* pMatTemp = pMatCurrent; - Vector128 x01 = Sse.Multiply(vector, Sse.LoadVector128(pMatTemp)); - res0 = Sse.Add(res0, x01); - - pSrcCurrent += 4; - pMatCurrent += 4; - } - - // Add up the entries of each, with the 4 results in res0 - res0 = VectorSum128(in res0); - float sum = Sse.ConvertToSingle(res0); - - // falling through the case statements - switch (remainder) - { - case 3: sum += *(pSrcCurrent + 2) * *(pMatCurrent + 2); goto case 2; - case 2: sum += *(pSrcCurrent + 1) * *(pMatCurrent + 1); goto case 1; - case 1: sum += *(pSrcCurrent) * *(pMatCurrent); break; - } - return sum; + MatMulP(mat.Items, rgposSrc, src.Items, posMin, iposMin, iposEnd, dst.Items, crow, ccol); } - // Partial sparse source vector. public static unsafe void MatMulP(ReadOnlySpan mat, ReadOnlySpan rgposSrc, ReadOnlySpan src, int posMin, int iposMin, int iposEnd, Span dst, int crow, int ccol) { @@ -472,6 +436,11 @@ Vector128 SparseMultiplicationAcrossRow() } } + public static unsafe void MatMulTran(AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol) + { + MatMulTran(mat.Items, src.Items, dst.Items, crow, ccol); + } + public static unsafe void MatMulTran(ReadOnlySpan mat, ReadOnlySpan src, Span dst, int crow, int ccol) { fixed (float* psrc = &MemoryMarshal.GetReference(src)) @@ -484,14 +453,16 @@ public static unsafe void MatMulTran(ReadOnlySpan mat, ReadOnlySpan x01 = Sse.LoadVector128(pSrcCurrent); + // Replicate each 32-bit slot of x01 (ABCD) into its own register. 
+ Vector128 x11 = Sse.Shuffle(x01, x01, 0x55); // B + Vector128 x21 = Sse.Shuffle(x01, x01, 0xAA); // C + Vector128 x31 = Sse.Shuffle(x01, x01, 0xFF); // D x01 = Sse.Shuffle(x01, x01, 0x00); // A int length = crow; @@ -500,14 +471,20 @@ public static unsafe void MatMulTran(ReadOnlySpan mat, ReadOnlySpan x02 = Sse.Multiply(x01, Sse.LoadVector128(pMatTemp)); + Vector128 x12 = Sse.Multiply(x11, Sse.LoadVector128(pMatTemp += crow)); + Vector128 x22 = Sse.Multiply(x21, Sse.LoadVector128(pMatTemp += crow)); + Vector128 x32 = Sse.Multiply(x31, Sse.LoadVector128(pMatTemp += crow)); + + x02 = Sse.Add(x02, x12); + x22 = Sse.Add(x22, x32); + x02 = Sse.Add(x02, x22); Sse.Store(pDstCurrent, x02); pDstCurrent += 4; @@ -516,6 +493,7 @@ public static unsafe void MatMulTran(ReadOnlySpan mat, ReadOnlySpan mat, ReadOnlySpan x02 = Sse.And(leadingMask, Sse.LoadVector128(pMatTemp)); + Vector128 x12 = Sse.And(leadingMask, Sse.LoadVector128(pMatTemp += crow)); + Vector128 x22 = Sse.And(leadingMask, Sse.LoadVector128(pMatTemp += crow)); + Vector128 x32 = Sse.And(leadingMask, Sse.LoadVector128(pMatTemp += crow)); + x02 = Sse.Multiply(x01, x02); + x12 = Sse.Multiply(x11, x12); + x22 = Sse.Multiply(x21, x22); + x32 = Sse.Multiply(x31, x32); + + x02 = Sse.Add(x02, x12); + x22 = Sse.Add(x22, x32); + x02 = Sse.Add(x02, x22); Vector128 trailingMask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + ((4 - misalignment) * 4)); Vector128 x3 = Sse.LoadVector128(pDstCurrent); - x02 = Sse.Add(x02, Sse.And(x3, trailingMask)); + x02 = Sse.Or(x02, Sse.And(x3, trailingMask)); Sse.Store(pDstCurrent, x02); pMatCurrent += misalignment; @@ -549,7 +538,15 @@ public static unsafe void MatMulTran(ReadOnlySpan mat, ReadOnlySpan x02 = Sse.Multiply(x01, Sse.LoadAlignedVector128(pMatTemp)); + Vector128 x12 = Sse.Multiply(x11, Sse.LoadAlignedVector128(pMatTemp += crow)); + Vector128 x22 = Sse.Multiply(x21, Sse.LoadAlignedVector128(pMatTemp += crow)); + Vector128 x32 = Sse.Multiply(x31, Sse.LoadAlignedVector128(pMatTemp += crow)); + + x02 = Sse.Add(x02, x12); + x22 = Sse.Add(x22, x32); + x02 = Sse.Add(x02, x22); Sse.Store(pDstCurrent, x02); pDstCurrent += 4; @@ -563,36 +560,47 @@ public static unsafe void MatMulTran(ReadOnlySpan mat, ReadOnlySpan trailingMask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + (remainder * 4)); + Vector128 trailingMask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + (remainder * 4)); - float* pMatTemp = pMatCurrent; - Vector128 x02 = Sse.And(trailingMask, Sse.LoadVector128(pMatTemp)); - x02 = Sse.Multiply(x01, x02); + float* pMatTemp = pMatCurrent; + Vector128 x02 = Sse.And(trailingMask, Sse.LoadVector128(pMatTemp)); + Vector128 x12 = Sse.And(trailingMask, Sse.LoadVector128(pMatTemp += crow)); + Vector128 x22 = Sse.And(trailingMask, Sse.LoadVector128(pMatTemp += crow)); + Vector128 x32 = Sse.And(trailingMask, Sse.LoadVector128(pMatTemp += crow)); - Vector128 leadingMask = Sse.LoadVector128(((float*)(pLeadingAlignmentMask)) + ((4 - remainder) * 4)); - Vector128 x3 = Sse.LoadVector128(pDstCurrent); - x02 = Sse.Add(x02, Sse.And(x3, leadingMask)); + x02 = Sse.Multiply(x01, x02); + x12 = Sse.Multiply(x11, x12); + x22 = Sse.Multiply(x21, x22); + x32 = Sse.Multiply(x31, x32); - Sse.Store(pDstCurrent, x02); - pDstCurrent += 4; - pMatCurrent += 4; + x02 = Sse.Add(x02, x12); + x22 = Sse.Add(x22, x32); + x02 = Sse.Add(x02, x22); + + Vector128 leadingMask = Sse.LoadVector128(((float*)(pLeadingAlignmentMask)) + ((4 - remainder) * 4)); + Vector128 x3 = Sse.LoadVector128(pDstCurrent); + x02 = Sse.Or(x02, 
Sse.And(x3, leadingMask)); + + Sse.Store(pDstCurrent, x02); + pDstCurrent += 4; + pMatCurrent += 4; + } } - numCol -= 1; - pSrcCurrent += 1; + pMatCurrent += 3 * crow; + pSrcCurrent += 4; } // We do 4-way unrolling - while (pSrcCurrent + 4 <= pSrcEnd) + while (pSrcCurrent < pSrcEnd) { Vector128 x01 = Sse.LoadVector128(pSrcCurrent); // Replicate each 32-bit slot of x01 (ABCD) into its own register. @@ -607,10 +615,9 @@ public static unsafe void MatMulTran(ReadOnlySpan mat, ReadOnlySpan x02 = Sse.Multiply(x01, Sse.LoadVector128(pMatTemp)); @@ -631,6 +638,7 @@ public static unsafe void MatMulTran(ReadOnlySpan mat, ReadOnlySpan mat, ReadOnlySpan trailingMask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + ((4 - misalignment) * 4)); Vector128 x3 = Sse.LoadVector128(pDstCurrent); - x02 = Sse.Add(x02, Sse.And(x3, trailingMask)); + x02 = Sse.Or(x02, Sse.And(x3, trailingMask)); x02 = Sse.Add(x02, Sse.And(x3, leadingMask)); @@ -693,84 +701,43 @@ public static unsafe void MatMulTran(ReadOnlySpan mat, ReadOnlySpan trailingMask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + (remainder * 4)); + if (remainder != 0) + { + pMatCurrent -= (4 - remainder); + pDstCurrent -= (4 - remainder); + Vector128 trailingMask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + (remainder * 4)); - float* pMatTemp = pMatCurrent; - Vector128 x02 = Sse.And(trailingMask, Sse.LoadVector128(pMatTemp)); - Vector128 x12 = Sse.And(trailingMask, Sse.LoadVector128(pMatTemp += crow)); - Vector128 x22 = Sse.And(trailingMask, Sse.LoadVector128(pMatTemp += crow)); - Vector128 x32 = Sse.And(trailingMask, Sse.LoadVector128(pMatTemp += crow)); + float* pMatTemp = pMatCurrent; + Vector128 x02 = Sse.And(trailingMask, Sse.LoadVector128(pMatTemp)); + Vector128 x12 = Sse.And(trailingMask, Sse.LoadVector128(pMatTemp += crow)); + Vector128 x22 = Sse.And(trailingMask, Sse.LoadVector128(pMatTemp += crow)); + Vector128 x32 = Sse.And(trailingMask, Sse.LoadVector128(pMatTemp += crow)); - x02 = Sse.Multiply(x01, x02); - x12 = Sse.Multiply(x11, x12); - x22 = Sse.Multiply(x21, x22); - x32 = Sse.Multiply(x31, x32); + x02 = Sse.Multiply(x01, x02); + x12 = Sse.Multiply(x11, x12); + x22 = Sse.Multiply(x21, x22); + x32 = Sse.Multiply(x31, x32); - x02 = Sse.Add(x02, x12); - x22 = Sse.Add(x22, x32); - x02 = Sse.Add(x02, x22); + x02 = Sse.Add(x02, x12); + x22 = Sse.Add(x22, x32); + x02 = Sse.Add(x02, x22); - Vector128 leadingMask = Sse.LoadVector128(((float*)(pLeadingAlignmentMask)) + ((4 - remainder) * 4)); - Vector128 x3 = Sse.LoadVector128(pDstCurrent); - x02 = Sse.Add(x02, Sse.And(x3, leadingMask)); + Vector128 leadingMask = Sse.LoadVector128(((float*)(pLeadingAlignmentMask)) + ((4 - remainder) * 4)); + Vector128 x3 = Sse.LoadVector128(pDstCurrent); + x02 = Sse.Or(x02, Sse.And(x3, leadingMask)); - x02 = Sse.Add(x02, Sse.And(x3, trailingMask)); - Sse.Store(pDstCurrent, x02); - pDstCurrent += 4; - pMatCurrent += 4; + x02 = Sse.Add(x02, Sse.And(x3, trailingMask)); + Sse.Store(pDstCurrent, x02); + pDstCurrent += 4; + pMatCurrent += 4; + } } pMatCurrent += 3 * crow; pSrcCurrent += 4; - numCol -= 4; } - - // falling through the case statements - switch (numCol) - { - case 3: ColumnMultiply(pMatCurrent + 2 * crow, pSrcCurrent + 2, pdst, pDstEnd, crow); goto case 2; - case 2: ColumnMultiply(pMatCurrent + crow, pSrcCurrent + 1, pdst, pDstEnd, crow); goto case 1; - case 1: ColumnMultiply(pMatCurrent, pSrcCurrent, pdst, pDstEnd, crow); break; - } - } - } - - private static unsafe void ColumnMultiply(float* pMatCurrent, float* pSrcCurrent, 
float* pdst, float* pDstEnd, int crow) - { - Vector128 x01 = Sse.LoadVector128(pSrcCurrent); - // Replicate each slot of h01 (ABCD) into its own register. - x01 = Sse.Shuffle(x01, x01, 0x00); // A - int remainder = crow % 4; - float* pDstCurrent = pdst; - - while (pDstCurrent + 4 <= pDstEnd) - { - // If we aren't using the VEX-encoding, the JIT will only fold away aligned loads - // (due to semantics of the legacy encoding). - // We don't need an assert, since the instruction will throw for unaligned inputs. - float* pMatTemp = pMatCurrent; - Vector128 x02 = Sse.Multiply(x01, Sse.LoadVector128(pMatTemp)); - x02 = Sse.Add(x02, Sse.LoadVector128(pDstCurrent)); - - Sse.Store(pDstCurrent, x02); - pDstCurrent += 4; - pMatCurrent += 4; - } - - // falling through the case statements - switch (remainder) - { - case 3: *(pDstCurrent + 2) += *(pSrcCurrent) * *(pMatCurrent + 2); goto case 2; - case 2: *(pDstCurrent + 1) += *(pSrcCurrent) * *(pMatCurrent + 1); goto case 1; - case 1: *pDstCurrent += *(pSrcCurrent) * *(pMatCurrent); break; } - return; } // dst[i] += scale diff --git a/src/Microsoft.ML.CpuMath/Thunk.cs b/src/Microsoft.ML.CpuMath/Thunk.cs index 9505db8766..8ff725b54a 100644 --- a/src/Microsoft.ML.CpuMath/Thunk.cs +++ b/src/Microsoft.ML.CpuMath/Thunk.cs @@ -86,6 +86,12 @@ public static extern void MatMulP(/*const*/ float* pmat, /*const*/ int* pposSrc, [DllImport(NativePath), SuppressUnmanagedCodeSecurity] public static extern float Dist2(/*const*/ float* px, /*const*/ float* py, int c); + [DllImport(NativePath), SuppressUnmanagedCodeSecurity] + public static extern void ZeroItemsU(float* pd, int c, /*const*/ int* pindices, int cindices); + + [DllImport(NativePath), SuppressUnmanagedCodeSecurity] + public static extern void ZeroMatrixItemsCore(float* pd, int c, int ccol, int cfltRow, /*const*/ int* pindices, int cindices); + [DllImport(NativePath), SuppressUnmanagedCodeSecurity] public static extern void SdcaL1UpdateU(float primalUpdate, /*const*/ float* ps, float threshold, float* pd1, float* pd2, int c); diff --git a/src/Microsoft.ML.TimeSeries/AdaptiveSingularSpectrumSequenceModeler.cs b/src/Microsoft.ML.TimeSeries/AdaptiveSingularSpectrumSequenceModeler.cs index 88cbc536dd..a473ccec29 100644 --- a/src/Microsoft.ML.TimeSeries/AdaptiveSingularSpectrumSequenceModeler.cs +++ b/src/Microsoft.ML.TimeSeries/AdaptiveSingularSpectrumSequenceModeler.cs @@ -174,8 +174,8 @@ public ModelInfo Info private Single[] _alpha; private Single[] _state; private readonly FixedSizeQueue _buffer; - private float[] _x; - private float[] _xSmooth; + private CpuAlignedVector _x; + private CpuAlignedVector _xSmooth; private int _windowSize; private readonly int _seriesLength; private readonly RankSelectionMethod _rankSelectionMethod; @@ -188,14 +188,14 @@ public ModelInfo Info private readonly IHost _host; - private float[] _wTrans; + private CpuAlignedMatrixRow _wTrans; private Single _observationNoiseVariance; private Single _observationNoiseMean; private Single _autoregressionNoiseVariance; private Single _autoregressionNoiseMean; private int _rank; - private float[] _y; + private CpuAlignedVector _y; private Single _nextPrediction; /// @@ -290,8 +290,8 @@ public AdaptiveSingularSpectrumSequenceModeler(IHostEnvironment env, int trainSi _alpha = new Single[windowSize - 1]; _state = new Single[windowSize - 1]; - _x = new float[windowSize]; - _xSmooth = new float[windowSize]; + _x = new CpuAlignedVector(windowSize, SseUtils.CbAlign); + _xSmooth = new CpuAlignedVector(windowSize, SseUtils.CbAlign); 
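The fields above switch back from plain float[] to the aligned CpuMath types so the native kernels receive 16-byte aligned buffers. A minimal sketch of the allocate-and-copy pattern this file uses (sizes are illustrative; the constructors, CopyFrom overload, and indexer are the ones appearing in the surrounding hunks):

    var x = new CpuAlignedVector(8, SseUtils.CbAlign);             // 8 floats, 16-byte aligned
    var wTrans = new CpuAlignedMatrixRow(2, 8, SseUtils.CbAlign);  // 2 x 8 row-major aligned matrix
    int i = 0;
    wTrans.CopyFrom(new float[2 * 8], ref i);                      // dense copy into the aligned storage
    float first = x[0];                                            // element access still goes through the indexer
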
ShouldComputeForecastIntervals = shouldComputeForecastIntervals; _observationNoiseVariance = 0; @@ -345,14 +345,14 @@ private AdaptiveSingularSpectrumSequenceModeler(AdaptiveSingularSpectrumSequence _state = new Single[_windowSize - 1]; Array.Copy(model._state, _state, _windowSize - 1); - _x = new float[_windowSize]; - _xSmooth = new float[_windowSize]; + _x = new CpuAlignedVector(_windowSize, SseUtils.CbAlign); + _xSmooth = new CpuAlignedVector(_windowSize, SseUtils.CbAlign); if (model._wTrans != null) { - _y = new float[_rank]; - _wTrans = new float[_rank * _windowSize]; - Array.Copy(model._wTrans, _wTrans, _rank * _windowSize); + _y = new CpuAlignedVector(_rank, SseUtils.CbAlign); + _wTrans = new CpuAlignedMatrixRow(_rank, _windowSize, SseUtils.CbAlign); + _wTrans.CopyFrom(model._wTrans); } } @@ -452,16 +452,18 @@ public AdaptiveSingularSpectrumSequenceModeler(IHostEnvironment env, ModelLoadCo { var tempArray = ctx.Reader.ReadFloatArray(); _host.CheckDecode(Utils.Size(tempArray) == _rank * _windowSize); - _wTrans = new float[_rank * _windowSize]; - Array.Copy(tempArray, _wTrans, tempArray.Length); + _wTrans = new CpuAlignedMatrixRow(_rank, _windowSize, SseUtils.CbAlign); + int i = 0; + _wTrans.CopyFrom(tempArray, ref i); tempArray = ctx.Reader.ReadFloatArray(); - _y = new float[_rank]; - Array.Copy(tempArray, _y, tempArray.Length); + i = 0; + _y = new CpuAlignedVector(_rank, SseUtils.CbAlign); + _y.CopyFrom(tempArray, ref i); } _buffer = TimeSeriesUtils.DeserializeFixedSizeQueueSingle(ctx.Reader, _host); - _x = new float[_windowSize]; - _xSmooth = new float[_windowSize]; + _x = new CpuAlignedVector(_windowSize, SseUtils.CbAlign); + _xSmooth = new CpuAlignedVector(_windowSize, SseUtils.CbAlign); } public override void Save(ModelSaveContext ctx) @@ -525,11 +527,14 @@ public override void Save(ModelSaveContext ctx) if (_wTrans != null) { + // REVIEW: this may not be the most efficient way for serializing an aligned matrix. var tempArray = new Single[_rank * _windowSize]; - Array.Copy(_wTrans, tempArray, _wTrans.Length); + int iv = 0; + _wTrans.CopyTo(tempArray, ref iv); ctx.Writer.WriteSingleArray(tempArray); tempArray = new float[_rank]; - Array.Copy(_y, tempArray, tempArray.Length); + iv = 0; + _y.CopyTo(tempArray, ref iv); ctx.Writer.WriteSingleArray(tempArray); } @@ -1125,14 +1130,15 @@ internal override void Consume(ref Single input, bool updateModel = false) if (_wTrans == null) { - _y = new float[_rank]; - _wTrans = new float[_rank * _windowSize]; + _y = new CpuAlignedVector(_rank, SseUtils.CbAlign); + _wTrans = new CpuAlignedMatrixRow(_rank, _windowSize, SseUtils.CbAlign); Single[] vecs = new Single[_rank * _windowSize]; for (i = 0; i < _rank; ++i) vecs[(_windowSize + 1) * i] = 1; - Array.Copy(_wTrans, vecs, _rank * _windowSize); + i = 0; + _wTrans.CopyFrom(vecs, ref i); } // Forming vector x @@ -1151,10 +1157,10 @@ internal override void Consume(ref Single input, bool updateModel = false) _x[_windowSize - 1] = input; // Computing y: Eq. 
(11) in https://hal-institut-mines-telecom.archives-ouvertes.fr/hal-00479772/file/twocolumns.pdf - CpuMathUtils.MatrixTimesSource(transpose: false, _wTrans, _x, _y, _y.Length); + CpuAligenedMathUtils.MatTimesSrc(_wTrans, _x, _y); // Updating the state vector - CpuMathUtils.MatrixTimesSource(transpose: true, _wTrans, _y, _xSmooth, _y.Length); + CpuAligenedMathUtils.MatTranTimesSrc(_wTrans, _y, _xSmooth); _nextPrediction = _autoregressionNoiseMean + _observationNoiseMean; for (i = 0; i < _windowSize - 2; ++i) @@ -1305,8 +1311,8 @@ private void TrainCore(Single[] dataArray, int originalSeriesLength) _maxRank = _windowSize / 2; _alpha = new Single[_windowSize - 1]; _state = new Single[_windowSize - 1]; - _x = new float[_windowSize]; - _xSmooth = new float[_windowSize]; + _x = new CpuAlignedVector(_windowSize, SseUtils.CbAlign); + _xSmooth = new CpuAlignedVector(_windowSize, SseUtils.CbAlign); TrainCore(dataArray, originalSeriesLength); return; @@ -1343,11 +1349,12 @@ private void TrainCore(Single[] dataArray, int originalSeriesLength) } // Setting the the y vector - _y = new float[_rank]; + _y = new CpuAlignedVector(_rank, SseUtils.CbAlign); // Setting the weight matrix - _wTrans = new float[_rank * _windowSize]; - Array.Copy(leftSingularVecs, _wTrans, _wTrans.Length); + _wTrans = new CpuAlignedMatrixRow(_rank, _windowSize, SseUtils.CbAlign); + i = 0; + _wTrans.CopyFrom(leftSingularVecs, ref i); // Setting alpha Single nu = 0; @@ -1357,7 +1364,7 @@ private void TrainCore(Single[] dataArray, int originalSeriesLength) nu += _y[i] * _y[i]; } - CpuMathUtils.MatrixTimesSource(transpose: true, _wTrans, _y, _xSmooth, _y.Length); + CpuAligenedMathUtils.MatTranTimesSrc(_wTrans, _y, _xSmooth); for (i = 0; i < _windowSize - 1; ++i) _alpha[i] = _xSmooth[i] / (1 - nu); @@ -1402,8 +1409,8 @@ private void TrainCore(Single[] dataArray, int originalSeriesLength) _x[i - originalSeriesLength + _windowSize] = dataArray[i]; } - CpuMathUtils.MatrixTimesSource(transpose: false, _wTrans, _x, _y, _y.Length); - CpuMathUtils.MatrixTimesSource(transpose: true, _wTrans, _y, _xSmooth, _y.Length); + CpuAligenedMathUtils.MatTimesSrc(_wTrans, _x, _y); + CpuAligenedMathUtils.MatTranTimesSrc(_wTrans, _y, _xSmooth); for (i = 1; i < _windowSize; ++i) { diff --git a/src/Microsoft.ML.Transforms/RandomFourierFeaturizing.cs b/src/Microsoft.ML.Transforms/RandomFourierFeaturizing.cs index 8f4e2b6221..f1b7d49895 100644 --- a/src/Microsoft.ML.Transforms/RandomFourierFeaturizing.cs +++ b/src/Microsoft.ML.Transforms/RandomFourierFeaturizing.cs @@ -95,10 +95,10 @@ private sealed class TransformInfo public readonly int SrcDim; // the matrix containing the random fourier vectors - public readonly float[] RndFourierVectors; + public readonly AlignedArray RndFourierVectors; // the random rotations - public readonly float[] RotationTerms; + public readonly AlignedArray RotationTerms; private readonly IFourierDistributionSampler _matrixGenerator; private readonly bool _useSin; @@ -120,10 +120,10 @@ public TransformInfo(IHost host, ColumnInfo column, int d, float avgDist) var generator = column.Generator; _matrixGenerator = generator.CreateComponent(host, avgDist); - int roundedUpD = RoundToMultipleOf4(NewDim); - int roundedUpNumFeatures = RoundToMultipleOf4(SrcDim); - RndFourierVectors = new float[roundedUpD * roundedUpNumFeatures]; - RotationTerms = _useSin ? 
null : new float[roundedUpD]; + int roundedUpD = RoundUp(NewDim, _cfltAlign); + int roundedUpNumFeatures = RoundUp(SrcDim, _cfltAlign); + RndFourierVectors = new AlignedArray(roundedUpD * roundedUpNumFeatures, CpuMathUtils.GetVectorAlignment()); + RotationTerms = _useSin ? null : new AlignedArray(roundedUpD, CpuMathUtils.GetVectorAlignment()); InitializeFourierCoefficients(roundedUpNumFeatures, roundedUpD); } @@ -154,10 +154,10 @@ public TransformInfo(IHostEnvironment env, ModelLoadContext ctx, string director ctx.LoadModelOrNull(env, out _matrixGenerator, directoryName)); // initialize the transform matrix - int roundedUpD = RoundToMultipleOf4(NewDim); - int roundedUpNumFeatures = RoundToMultipleOf4(SrcDim); - RndFourierVectors = new float[roundedUpD * roundedUpNumFeatures]; - RotationTerms = _useSin ? null : new float[roundedUpD]; + int roundedUpD = RoundUp(NewDim, _cfltAlign); + int roundedUpNumFeatures = RoundUp(SrcDim, _cfltAlign); + RndFourierVectors = new AlignedArray(roundedUpD * roundedUpNumFeatures, CpuMathUtils.GetVectorAlignment()); + RotationTerms = _useSin ? null : new AlignedArray(roundedUpD, CpuMathUtils.GetVectorAlignment()); InitializeFourierCoefficients(roundedUpNumFeatures, roundedUpD); } @@ -225,6 +225,8 @@ private static VersionInfo GetVersionInfo() private readonly TransformInfo[] _transformInfos; + private static readonly int _cfltAlign = CpuMathUtils.GetVectorAlignment() / sizeof(float); + private static string TestColumnType(ColumnType type) { if (type.ItemType == NumberType.Float && type.IsKnownSizeVector) @@ -293,11 +295,16 @@ public RandomFourierFeaturizingTransformer(IHostEnvironment env, IDataView input } } - private static int RoundToMultipleOf4(int number) + // Round cflt up to a multiple of cfltAlign. + private static int RoundUp(int cflt, int cfltAlign) { - Contracts.Assert(0 < number); - int multipleOf4 = (number + 3) / 4; - return multipleOf4 * 4; + Contracts.Assert(0 < cflt); + // cfltAlign should be a power of two. + Contracts.Assert(0 < cfltAlign && (cfltAlign & (cfltAlign - 1)) == 0); + + // Determine the number of "blobs" of size cfltAlign. 
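    // For illustration, assuming the 16-byte Vector128 alignment introduced above:
    // _cfltAlign = 16 / sizeof(float) = 4, so RoundUp(5, 4) computes cblob = (5 + 3) / 4 = 2
    // and returns 2 * 4 = 8, while RoundUp(8, 4) returns 8 unchanged.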
+ int cblob = (cflt + cfltAlign - 1) / cfltAlign; + return cblob * cfltAlign; } private float[] GetAvgDistances(ColumnInfo[] columns, IDataView input) @@ -548,14 +555,14 @@ private ValueGetter> GetterFromVectorType(Row input, int iinfo) var getSrc = input.GetGetter>(_srcCols[iinfo]); var src = default(VBuffer); - var features = new float[RoundToMultipleOf4(_srcTypes[iinfo].ValueCount)]; - var product = new float[RoundToMultipleOf4(_parent._transformInfos[iinfo].NewDim)]; + var featuresAligned = new AlignedArray(RoundUp(_srcTypes[iinfo].ValueCount, _cfltAlign), CpuMathUtils.GetVectorAlignment()); + var productAligned = new AlignedArray(RoundUp(_parent._transformInfos[iinfo].NewDim, _cfltAlign), CpuMathUtils.GetVectorAlignment()); return (ref VBuffer dst) => { getSrc(ref src); - TransformFeatures(in src, ref dst, _parent._transformInfos[iinfo], features, product); + TransformFeatures(in src, ref dst, _parent._transformInfos[iinfo], featuresAligned, productAligned); }; } @@ -565,8 +572,8 @@ private ValueGetter> GetterFromFloatType(Row input, int iinfo) var getSrc = input.GetGetter(_srcCols[iinfo]); var src = default(float); - var featuresAligned = new float[4]; - var productAligned = new float[RoundToMultipleOf4(_parent._transformInfos[iinfo].NewDim)]; + var featuresAligned = new AlignedArray(RoundUp(1, _cfltAlign), CpuMathUtils.GetVectorAlignment()); + var productAligned = new AlignedArray(RoundUp(_parent._transformInfos[iinfo].NewDim, _cfltAlign), CpuMathUtils.GetVectorAlignment()); var oneDimensionalVector = new VBuffer(1, new float[] { 0 }); @@ -580,7 +587,7 @@ private ValueGetter> GetterFromFloatType(Row input, int iinfo) } private void TransformFeatures(in VBuffer src, ref VBuffer dst, TransformInfo transformInfo, - float[] features, float[] product) + AlignedArray featuresAligned, AlignedArray productAligned) { Host.Check(src.Length == transformInfo.SrcDim, "column does not have the expected dimensionality."); @@ -599,9 +606,9 @@ private void TransformFeatures(in VBuffer src, ref VBuffer dst, Tr if (src.IsDense) { - src.GetValues().CopyTo(features); - CpuMathUtils.MatrixTimesSource(transpose: false, transformInfo.RndFourierVectors, features, product, - RoundToMultipleOf4(transformInfo.NewDim)); + featuresAligned.CopyFrom(src.GetValues()); + CpuMathUtils.MatrixTimesSource(false, transformInfo.RndFourierVectors, featuresAligned, productAligned, + transformInfo.NewDim); } else { @@ -609,21 +616,15 @@ private void TransformFeatures(in VBuffer src, ref VBuffer dst, Tr // no need to zero them out. 
var srcValues = src.GetValues(); var srcIndices = src.GetIndices(); - - for (int i = 0; i < srcValues.Length; i++) - { - int iv = srcIndices[i]; - features[iv] = srcValues[i]; - } - - CpuMathUtils.MatrixTimesSource(transformInfo.RndFourierVectors, srcIndices, features, 0, 0, - srcValues.Length, product, RoundToMultipleOf4(transformInfo.NewDim)); + featuresAligned.CopyFrom(srcIndices, srcValues, 0, 0, srcValues.Length, zeroItems: false); + CpuMathUtils.MatrixTimesSource(transformInfo.RndFourierVectors, srcIndices, featuresAligned, 0, 0, + srcValues.Length, productAligned, transformInfo.NewDim); } var dstEditor = VBufferEditor.Create(ref dst, newDstLength); for (int i = 0; i < transformInfo.NewDim; i++) { - var dotProduct = product[i]; + var dotProduct = productAligned[i]; if (transformInfo.RotationTerms != null) dstEditor.Values[i] = (float)MathUtils.Cos(dotProduct + transformInfo.RotationTerms[i]) * scale; else diff --git a/src/Native/CpuMathNative/Sse.cpp b/src/Native/CpuMathNative/Sse.cpp index 2240d630eb..607af332d1 100644 --- a/src/Native/CpuMathNative/Sse.cpp +++ b/src/Native/CpuMathNative/Sse.cpp @@ -57,69 +57,15 @@ const unsigned int TrailingAlignmentMask[16] = 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, }; -float RowMultiply(const float* pMatCurrent, const float* pSrcCurrent, const float* pSrcEnd, int ccol) -{ - __m128 res0 = _mm_setzero_ps(); - int remainder = ccol % 4; - - while (pSrcCurrent + 4 <= pSrcEnd) - { - __m128 vector = _mm_loadu_ps(pSrcCurrent); - const float* pMatTemp = pMatCurrent; - __m128 x01 = _mm_mul_ps(vector, _mm_loadu_ps(pMatTemp)); - res0 = _mm_add_ps(res0, x01); - - pSrcCurrent += 4; - pMatCurrent += 4; - } - - res0 = _mm_hadd_ps(res0, res0); - res0 = _mm_hadd_ps(res0, res0); - - float sum = _mm_cvtss_f32(res0); - - // falling through the case statements - switch (remainder) - { - case 3: - sum += *(pSrcCurrent + 2) * *(pMatCurrent + 2); - case 2: - sum += *(pSrcCurrent + 1) * *(pMatCurrent + 1); - case 1: - sum += *(pSrcCurrent) * *(pMatCurrent); - } - return sum; -} - // Multiply matrix times vector into vector. EXPORT_API(void) MatMul(_In_ const float * pmat, _In_ const float * psrc, _Inout_ float * pdst, int crow, int ccol) { - if (ccol < 4) - { - for (int i = 0 ; i < crow; i++) - { - float dotProduct = 0; - switch (ccol) - { - case 3: - dotProduct += pmat[i * ccol + 2] * psrc[2]; - case 2: - dotProduct += pmat[i * ccol + 1] * psrc[1]; - case 1: - dotProduct += pmat[i * ccol + 0] * psrc[0]; - } - pdst[i] = dotProduct; - } - return; - } - const float * pSrcEnd = psrc + ccol; const float * pDstEnd = pdst + crow; float* pDstCurrent = pdst; const float* pMatCurrent = pmat; - int numRows = crow; - while (pDstCurrent + 4 <= pDstEnd) + while (pDstCurrent < pDstEnd) { __m128 res0 = _mm_setzero_ps(); __m128 res1 = res0; @@ -133,11 +79,10 @@ EXPORT_API(void) MatMul(_In_ const float * pmat, _In_ const float * psrc, _Inout uintptr_t misalignment = address % 16; int remainder = 0; - if ((misalignment & 3) != 0 || (ccol % 4 != 0)) + if ((misalignment & 3) != 0) { // Handles cases where the data is not 32-bit aligned and we can't ever use aligned operations - remainder = length % 4; - while (pSrcCurrent + 4 <= pSrcEnd) + while (pSrcCurrent < pSrcEnd) { __m128 vector = _mm_loadu_ps(pSrcCurrent); @@ -216,32 +161,32 @@ EXPORT_API(void) MatMul(_In_ const float * pmat, _In_ const float * psrc, _Inout // unaligned loads where we mask the input each time. 
remainder = length; } - } - if (remainder != 0) - { - // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next - // unaligned load will read to the end of the array and then mask out any elements already processed + if (remainder != 0) + { + // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next + // unaligned load will read to the end of the array and then mask out any elements already processed - pMatCurrent -= (4 - remainder); - pSrcCurrent -= (4 - remainder); + pMatCurrent -= (4 - remainder); + pSrcCurrent -= (4 - remainder); - __m128 mask = _mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + (remainder * 4)); + __m128 mask = _mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + (remainder * 4)); - const float* pMatTemp = pMatCurrent; - __m128 x01 = _mm_and_ps(mask, _mm_loadu_ps(pMatTemp)); - __m128 x11 = _mm_and_ps(mask, _mm_loadu_ps(pMatTemp += ccol)); - __m128 x21 = _mm_and_ps(mask, _mm_loadu_ps(pMatTemp += ccol)); - __m128 x31 = _mm_and_ps(mask, _mm_loadu_ps(pMatTemp += ccol)); - __m128 vector = _mm_and_ps(mask, _mm_loadu_ps(pSrcCurrent)); + const float* pMatTemp = pMatCurrent; + __m128 x01 = _mm_and_ps(mask, _mm_loadu_ps(pMatTemp)); + __m128 x11 = _mm_and_ps(mask, _mm_loadu_ps(pMatTemp += ccol)); + __m128 x21 = _mm_and_ps(mask, _mm_loadu_ps(pMatTemp += ccol)); + __m128 x31 = _mm_and_ps(mask, _mm_loadu_ps(pMatTemp += ccol)); + __m128 vector = _mm_and_ps(mask, _mm_loadu_ps(pSrcCurrent)); - res0 = _mm_add_ps(res0, _mm_mul_ps(x01, vector)); - res1 = _mm_add_ps(res1, _mm_mul_ps(x11, vector)); - res2 = _mm_add_ps(res2, _mm_mul_ps(x21, vector)); - res3 = _mm_add_ps(res3, _mm_mul_ps(x31, vector)); + res0 = _mm_add_ps(res0, _mm_mul_ps(x01, vector)); + res1 = _mm_add_ps(res1, _mm_mul_ps(x11, vector)); + res2 = _mm_add_ps(res2, _mm_mul_ps(x21, vector)); + res3 = _mm_add_ps(res3, _mm_mul_ps(x31, vector)); - pMatCurrent += 4; - pSrcCurrent += 4; + pMatCurrent += 4; + pSrcCurrent += 4; + } } // Add up the entries of each, with the 4 results in res0 @@ -253,19 +198,6 @@ EXPORT_API(void) MatMul(_In_ const float * pmat, _In_ const float * psrc, _Inout pDstCurrent += 4; pMatCurrent += 3 * ccol; - numRows -= 4; - } - - // falling through the case statements - switch(numRows) - { - case 3: - *(pDstCurrent + 2) = RowMultiply(pMatCurrent + 2 * ccol, psrc, pSrcEnd, ccol); - case 2: - *(pDstCurrent + 1) = RowMultiply(pMatCurrent + 1 * ccol, psrc, pSrcEnd, ccol); - case 1: - *pDstCurrent = RowMultiply(pMatCurrent, psrc, pSrcEnd, ccol); - break; } } @@ -424,62 +356,21 @@ EXPORT_API(void) MatMulP(_In_ const float * pmat, _In_ const int * pposSrc, _In_ } } -void ColumnMultiply(const float* pMatCurrent, const float* pSrcCurrent, float* pdst, const float* pDstEnd, int crow) -{ - __m128 x01 = _mm_loadu_ps(pSrcCurrent); - x01 = _mm_shuffle_ps(x01, x01, 0x00); - float* pDstCurrent = pdst; - int remainder = crow % 4; - - while (pDstCurrent + 4 <= pDstEnd) - { - const float* pMatTemp = pMatCurrent; - __m128 x02 = _mm_mul_ps(x01, _mm_loadu_ps(pMatTemp)); - x02 = _mm_add_ps(x02, _mm_loadu_ps(pDstCurrent)); - - _mm_storeu_ps(pDstCurrent, x02); - - pDstCurrent += 4; - pMatCurrent += 4; - } - - // falling through the case statements - switch (remainder) - { - case 3: *(pDstCurrent + 2) += *(pSrcCurrent) * *(pMatCurrent + 2); - case 2: *(pDstCurrent + 1) += *(pSrcCurrent) * *(pMatCurrent + 1); - case 1: *pDstCurrent += *(pSrcCurrent) * *(pMatCurrent); break; - } - return; -} - EXPORT_API(void) MatMulTran(_In_ const float * pmat, _In_ const 
float * psrc, _Inout_ float * pdst, int crow, int ccol) { - if (crow < 4) - { - for (int i = 0 ; i < crow; i++) - { - float dotProduct = 0; - for (int j = 0; j < ccol; j++) - { - dotProduct += pmat[j * crow + i] * psrc[j]; - } - pdst[i] = dotProduct; - } - return; - } - const float * pSrcEnd = psrc + ccol; const float * pDstEnd = pdst + crow; const float* pMatCurrent = pmat; const float* pSrcCurrent = psrc; - int remainder = 0; - int numCol = ccol; if (pSrcCurrent < pSrcEnd) { __m128 x01 = _mm_loadu_ps(pSrcCurrent); + // Replicate each slot of x01 into its own register. + __m128 x11 = _mm_shuffle_ps(x01, x01, 0x55); + __m128 x21 = _mm_shuffle_ps(x01, x01, 0xAA); + __m128 x31 = _mm_shuffle_ps(x01, x01, 0xFF); x01 = _mm_shuffle_ps(x01, x01, 0x00); int length = crow; @@ -489,14 +380,20 @@ EXPORT_API(void) MatMulTran(_In_ const float * pmat, _In_ const float * psrc, _I uintptr_t misalignment = address % 16; int remainder = 0; - if ((misalignment & 3) != 0 || (crow % 4 != 0)) + if ((misalignment & 3) != 0) { // Handles cases where the data is not 32-bit aligned and we can't ever use aligned operations - remainder = crow % 4; - while (pDstCurrent + 4 <= pDstEnd) + while (pDstCurrent < pDstEnd) { const float* pMatTemp = pMatCurrent; __m128 x02 = _mm_mul_ps(x01, _mm_loadu_ps(pMatTemp)); + __m128 x12 = _mm_mul_ps(x11, _mm_loadu_ps(pMatTemp += crow)); + __m128 x22 = _mm_mul_ps(x21, _mm_loadu_ps(pMatTemp += crow)); + __m128 x32 = _mm_mul_ps(x31, _mm_loadu_ps(pMatTemp += crow)); + + x02 = _mm_add_ps(x02, x12); + x22 = _mm_add_ps(x22, x32); + x02 = _mm_add_ps(x02, x22); _mm_storeu_ps(pDstCurrent, x02); pDstCurrent += 4; @@ -505,6 +402,7 @@ EXPORT_API(void) MatMulTran(_In_ const float * pmat, _In_ const float * psrc, _I } else { + int remainder = 0; if (misalignment != 0) { // Handle cases where the data is not 128-bit aligned by doing an unaligned read and then @@ -517,11 +415,22 @@ EXPORT_API(void) MatMulTran(_In_ const float * pmat, _In_ const float * psrc, _I // We only align pMat since it has significantly more reads. const float* pMatTemp = pMatCurrent; __m128 x02 = _mm_and_ps(leadingMask, _mm_loadu_ps(pMatTemp)); + __m128 x12 = _mm_and_ps(leadingMask, _mm_loadu_ps(pMatTemp += crow)); + __m128 x22 = _mm_and_ps(leadingMask, _mm_loadu_ps(pMatTemp += crow)); + __m128 x32 = _mm_and_ps(leadingMask, _mm_loadu_ps(pMatTemp += crow)); + x02 = _mm_mul_ps(x01, x02); + x12 = _mm_mul_ps(x11, x12); + x22 = _mm_mul_ps(x21, x22); + x32 = _mm_mul_ps(x31, x32); + + x02 = _mm_add_ps(x02, x12); + x22 = _mm_add_ps(x22, x32); + x02 = _mm_add_ps(x02, x22); __m128 trailingMask = _mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + ((4 - misalignment) * 4)); __m128 x3 = _mm_loadu_ps(pDstCurrent); - x02 = _mm_add_ps(x02, _mm_and_ps(x3, trailingMask)); + x02 = _mm_or_ps(x02, _mm_and_ps(x3, trailingMask)); _mm_storeu_ps(pDstCurrent, x02); pMatCurrent += misalignment; @@ -538,6 +447,13 @@ EXPORT_API(void) MatMulTran(_In_ const float * pmat, _In_ const float * psrc, _I { const float* pMatTemp = pMatCurrent; __m128 x02 = _mm_mul_ps(x01, _mm_load_ps(pMatTemp)); + __m128 x12 = _mm_mul_ps(x11, _mm_load_ps(pMatTemp += crow)); + __m128 x22 = _mm_mul_ps(x21, _mm_load_ps(pMatTemp += crow)); + __m128 x32 = _mm_mul_ps(x31, _mm_load_ps(pMatTemp += crow)); + + x02 = _mm_add_ps(x02, x12); + x22 = _mm_add_ps(x22, x32); + x02 = _mm_add_ps(x02, x22); _mm_storeu_ps(pDstCurrent, x02); @@ -552,36 +468,47 @@ EXPORT_API(void) MatMulTran(_In_ const float * pmat, _In_ const float * psrc, _I // unaligned loads where we mask the input each time. 
remainder = length; } - } - if (remainder != 0) - { - // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next - // unaligned load will read to the end of the array and then mask out any elements already processed + if (remainder != 0) + { + // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next + // unaligned load will read to the end of the array and then mask out any elements already processed - pMatCurrent -= (4 - remainder); - pDstCurrent -= (4 - remainder); + pMatCurrent -= (4 - remainder); + pDstCurrent -= (4 - remainder); - __m128 trailingMask = _mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + (remainder * 4)); + __m128 trailingMask = _mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + (remainder * 4)); + + const float* pMatTemp = pMatCurrent; + __m128 x02 = _mm_and_ps(trailingMask, _mm_loadu_ps(pMatTemp)); + __m128 x12 = _mm_and_ps(trailingMask, _mm_loadu_ps(pMatTemp += crow)); + __m128 x22 = _mm_and_ps(trailingMask, _mm_loadu_ps(pMatTemp += crow)); + __m128 x32 = _mm_and_ps(trailingMask, _mm_loadu_ps(pMatTemp += crow)); - const float* pMatTemp = pMatCurrent; - __m128 x02 = _mm_and_ps(trailingMask, _mm_loadu_ps(pMatTemp)); - x02 = _mm_mul_ps(x01, x02); + x02 = _mm_mul_ps(x01, x02); + x12 = _mm_mul_ps(x11, x12); + x22 = _mm_mul_ps(x21, x22); + x32 = _mm_mul_ps(x31, x32); - __m128 leadingMask = _mm_loadu_ps(((float*)(&LeadingAlignmentMask)) + ((4 - remainder) * 4)); - __m128 x3 = _mm_loadu_ps(pDstCurrent); - x02 = _mm_add_ps(x02, _mm_and_ps(x3, leadingMask)); + x02 = _mm_add_ps(x02, x12); + x22 = _mm_add_ps(x22, x32); + x02 = _mm_add_ps(x02, x22); - _mm_storeu_ps(pDstCurrent, x02); - pMatCurrent += 4; - pDstCurrent += 4; + __m128 leadingMask = _mm_loadu_ps(((float*)(&LeadingAlignmentMask)) + ((4 - remainder) * 4)); + __m128 x3 = _mm_loadu_ps(pDstCurrent); + x02 = _mm_or_ps(x02, _mm_and_ps(x3, leadingMask)); + + _mm_storeu_ps(pDstCurrent, x02); + pMatCurrent += 4; + pDstCurrent += 4; + } } - numCol -= 1; - pSrcCurrent += 1; + pMatCurrent += 3 * crow; + pSrcCurrent += 4; } - - while (pSrcCurrent + 4 <= pSrcEnd) + + while (pSrcCurrent < pSrcEnd) { __m128 x01 = _mm_loadu_ps(pSrcCurrent); // Replicate each slot of x01 into its own register. 
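Both the managed and native 4-way unrolled loops above broadcast the four source values with shuffles whose 2-bit immediate fields each select one source lane. A short C# illustration using the same intrinsics as the SseIntrinsics.cs hunks; v is a placeholder for the loaded source vector:

    Vector128<float> a = Sse.Shuffle(v, v, 0x00); // 0b00_00_00_00 -> A A A A
    Vector128<float> b = Sse.Shuffle(v, v, 0x55); // 0b01_01_01_01 -> B B B B
    Vector128<float> c = Sse.Shuffle(v, v, 0xAA); // 0b10_10_10_10 -> C C C C
    Vector128<float> d = Sse.Shuffle(v, v, 0xFF); // 0b11_11_11_11 -> D D D D

The same hunks also change the masked merge with the destination from an add to a bitwise or (Sse.Or / _mm_or_ps): the leading and trailing masks select complementary lanes, so the freshly computed lanes and the preserved destination lanes do not overlap and the or merges the two contributions bit for bit, without an extra floating-point add.
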
@@ -597,10 +524,9 @@ EXPORT_API(void) MatMulTran(_In_ const float * pmat, _In_ const float * psrc, _I uintptr_t misalignment = address % 16; int remainder = 0; - if ((misalignment & 3) != 0 || (crow % 4 != 0)) + if ((misalignment & 3) != 0) { - remainder = length % 4; - while (pDstCurrent + 4 <= pDstEnd) + while (pDstCurrent < pDstEnd) { const float* pMatTemp = pMatCurrent; __m128 x02 = _mm_mul_ps(x01, _mm_loadu_ps(pMatTemp)); @@ -621,6 +547,7 @@ EXPORT_API(void) MatMulTran(_In_ const float * pmat, _In_ const float * psrc, _I } else { + int remainder = 0; if (misalignment != 0) { misalignment >>= 2; @@ -646,7 +573,7 @@ EXPORT_API(void) MatMulTran(_In_ const float * pmat, _In_ const float * psrc, _I __m128 trailingMask = _mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + ((4 - misalignment) * 4)); __m128 x3 = _mm_loadu_ps(pDstCurrent); - x02 = _mm_add_ps(x02, _mm_and_ps(x3, trailingMask)); + x02 = _mm_or_ps(x02, _mm_and_ps(x3, trailingMask)); x02 = _mm_add_ps(x02, _mm_and_ps(x3, leadingMask)); _mm_storeu_ps(pDstCurrent, x02); @@ -682,52 +609,43 @@ EXPORT_API(void) MatMulTran(_In_ const float * pmat, _In_ const float * psrc, _I { remainder = length; } - } - if (remainder != 0) - { - pMatCurrent -= (4 - remainder); - pDstCurrent -= (4 - remainder); + if (remainder != 0) + { + pMatCurrent -= (4 - remainder); + pDstCurrent -= (4 - remainder); - __m128 trailingMask = _mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + (remainder * 4)); + __m128 trailingMask = _mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + (remainder * 4)); - const float* pMatTemp = pMatCurrent; - __m128 x02 = _mm_and_ps(trailingMask, _mm_loadu_ps(pMatTemp)); - __m128 x12 = _mm_and_ps(trailingMask, _mm_loadu_ps(pMatTemp += crow)); - __m128 x22 = _mm_and_ps(trailingMask, _mm_loadu_ps(pMatTemp += crow)); - __m128 x32 = _mm_and_ps(trailingMask, _mm_loadu_ps(pMatTemp += crow)); + const float* pMatTemp = pMatCurrent; + __m128 x02 = _mm_and_ps(trailingMask, _mm_loadu_ps(pMatTemp)); + __m128 x12 = _mm_and_ps(trailingMask, _mm_loadu_ps(pMatTemp += crow)); + __m128 x22 = _mm_and_ps(trailingMask, _mm_loadu_ps(pMatTemp += crow)); + __m128 x32 = _mm_and_ps(trailingMask, _mm_loadu_ps(pMatTemp += crow)); - x02 = _mm_mul_ps(x01, x02); - x12 = _mm_mul_ps(x11, x12); - x22 = _mm_mul_ps(x21, x22); - x32 = _mm_mul_ps(x31, x32); + x02 = _mm_mul_ps(x01, x02); + x12 = _mm_mul_ps(x11, x12); + x22 = _mm_mul_ps(x21, x22); + x32 = _mm_mul_ps(x31, x32); - x02 = _mm_add_ps(x02, x12); - x22 = _mm_add_ps(x22, x32); - x02 = _mm_add_ps(x02, x22); + x02 = _mm_add_ps(x02, x12); + x22 = _mm_add_ps(x22, x32); + x02 = _mm_add_ps(x02, x22); - __m128 leadingMask = _mm_loadu_ps(((float*)(&LeadingAlignmentMask)) + ((4 - remainder) * 4)); - __m128 x3 = _mm_loadu_ps(pDstCurrent); - x02 = _mm_add_ps(x02, _mm_and_ps(x3, leadingMask)); + __m128 leadingMask = _mm_loadu_ps(((float*)(&LeadingAlignmentMask)) + ((4 - remainder) * 4)); + __m128 x3 = _mm_loadu_ps(pDstCurrent); + x02 = _mm_or_ps(x02, _mm_and_ps(x3, leadingMask)); - x02 = _mm_add_ps(x02, _mm_and_ps(x3, trailingMask)); - _mm_storeu_ps(pDstCurrent, x02); - pMatCurrent += 4; - pDstCurrent += 4; + x02 = _mm_add_ps(x02, _mm_and_ps(x3, trailingMask)); + _mm_storeu_ps(pDstCurrent, x02); + pMatCurrent += 4; + pDstCurrent += 4; + } } - numCol -= 4; pMatCurrent += 3 * crow; pSrcCurrent += 4; } - - // falling through the case statements - switch (numCol) - { - case 3: ColumnMultiply(pMatCurrent + 2 * crow, pSrcCurrent + 2, pdst, pDstEnd, crow); - case 2: ColumnMultiply(pMatCurrent + crow, pSrcCurrent + 1, pdst, pDstEnd, crow); - 
case 1: ColumnMultiply(pMatCurrent, pSrcCurrent, pdst, pDstEnd, crow); break; - } } // pd[i] += a @@ -1320,6 +1238,43 @@ EXPORT_API(float) Dist2(const float * px, const float * py, int c) return norm2; } +EXPORT_API(void) ZeroItemsU(_Inout_ float * pd, int c, _In_ const int * pindices, int cindices) +{ + DEBUG_ONLY(c); + for (int i = 0; i < cindices; ++i) + { + int iv = pindices[i]; + assert(0 <= iv && iv < c); + pd[iv] = 0; + } +} + +EXPORT_API(void) ZeroMatrixItemsCore(_Inout_ float * pd, int c, int ccol, int cfltRow, _In_ const int * pindices, int cindices) +{ + DEBUG_ONLY(c); + int ivLogMin = 0; + int ivLogLim = ccol; + int ivPhyMin = 0; + for (int i = 0; i < cindices; ++i) + { + int iv = pindices[i]; + assert(0 <= iv && iv < c); + + int col = iv - ivLogMin; + if ((unsigned int)col >= (unsigned int)ccol) + { + assert(ivLogMin > iv || iv >= ivLogLim); + int row = iv / ccol; + ivLogMin = row * ccol; + ivLogLim = ivLogMin + ccol; + ivPhyMin = row * cfltRow; + assert(ivLogMin <= iv && iv < ivLogLim); + col = iv - ivLogMin; + } + pd[ivPhyMin + col] = 0; + } +} + EXPORT_API(void) SdcaL1UpdateU(float primalUpdate, _In_ const float * ps, float threshold, _Inout_ float *pd1, _Inout_ float * pd2, int c) { const float * psLim = ps + c; diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs index 5adb641b92..25996ec42c 100644 --- a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs +++ b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs @@ -14,14 +14,14 @@ public class CpuMathUtilsUnitTests { private readonly float[][] _testArrays; private readonly int[] _testIndexArray; - private readonly float[][] _testMatrices; - private readonly float[][] _testSrcVectors; - private readonly float[][] _testDstVectors; + private readonly AlignedArray[] _testMatrices; + private readonly AlignedArray[] _testSrcVectors; + private readonly AlignedArray[] _testDstVectors; + private readonly int _vectorAlignment = CpuMathUtils.GetVectorAlignment(); private readonly FloatEqualityComparer _comparer; private readonly FloatEqualityComparerForMatMul _matMulComparer; private const float DefaultScale = 1.7f; - private const int DefaultSeed = 253421; public CpuMathUtilsUnitTests() { @@ -50,19 +50,34 @@ public CpuMathUtilsUnitTests() testMatrix2[i] = i + 1; } - _testMatrices = new float[][] { testMatrix1, testMatrix2 }; + AlignedArray testMatrixAligned1 = new AlignedArray(8 * 8, _vectorAlignment); + AlignedArray testMatrixAligned2 = new AlignedArray(8 * 16, _vectorAlignment); + testMatrixAligned1.CopyFrom(testMatrix1); + testMatrixAligned2.CopyFrom(testMatrix2); + + _testMatrices = new AlignedArray[] { testMatrixAligned1, testMatrixAligned2 }; // Padded source vectors whose dimensions are multiples of 8 float[] testSrcVector1 = new float[8] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f }; float[] testSrcVector2 = new float[16] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, 10f, 11f, 12f, 13f, 14f, 15f, 16f }; - _testSrcVectors = new float[][] { testSrcVector1, testSrcVector2 }; + AlignedArray testSrcVectorAligned1 = new AlignedArray(8, _vectorAlignment); + AlignedArray testSrcVectorAligned2 = new AlignedArray(16, _vectorAlignment); + testSrcVectorAligned1.CopyFrom(testSrcVector1); + testSrcVectorAligned2.CopyFrom(testSrcVector2); + + _testSrcVectors = new AlignedArray[] { testSrcVectorAligned1, testSrcVectorAligned2 }; // Padded destination vectors whose dimensions are multiples of 8 float[] testDstVector1 = new float[8] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 
7f }; float[] testDstVector2 = new float[16] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, 10f, 11f, 12f, 13f, 14f, 15f }; - _testDstVectors = new float[][] { testDstVector1, testDstVector2 }; + AlignedArray testDstVectorAligned1 = new AlignedArray(8, _vectorAlignment); + AlignedArray testDstVectorAligned2 = new AlignedArray(16, _vectorAlignment); + testDstVectorAligned1.CopyFrom(testDstVector1); + testDstVectorAligned2.CopyFrom(testDstVector2); + + _testDstVectors = new AlignedArray[] { testDstVectorAligned1, testDstVectorAligned2 }; } [Theory] @@ -71,125 +86,29 @@ public CpuMathUtilsUnitTests() [InlineData(1, 0, 1, new float[] { 204f, 492f, 780f, 1068f, 1356f, 1644f, 1932f, 2220f, 2508f, 2796f, 3084f, 3372f, 3660f, 3948f, 4236f, 4524f })] public void MatMulTest(int matTest, int srcTest, int dstTest, float[] expected) { - float[] mat = _testMatrices[matTest]; - float[] src = _testSrcVectors[srcTest]; - float[] dst = _testDstVectors[dstTest]; + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; - CpuMathUtils.MatrixTimesSource(false, mat, src, dst, dst.Length); - float[] actual = new float[dst.Length]; - Array.Copy(dst, actual, dst.Length); + CpuMathUtils.MatrixTimesSource(false, mat, src, dst, dst.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); Assert.Equal(expected, actual, _matMulComparer); } - [Theory] - [InlineData(10, 5)] - [InlineData(10, 8)] - [InlineData(10, 11)] - [InlineData(11, 8)] - [InlineData(8, 23)] - [InlineData(2, 8)] - [InlineData(2, 9)] - [InlineData(2, 3)] - [InlineData(2, 5)] - [InlineData(4, 5)] - [InlineData(4, 7)] - [InlineData(4, 9)] - [InlineData(5, 7)] - [InlineData(5, 9)] - private void MatMulAnyDimensionTest(int col, int row) - { - Random rand = new Random(DefaultSeed); - float[] mat = new float[col * row]; - for (int i = 0; i < col * row; i++) - { - mat[i] = rand.Next(-10, 10); - } - - float[] src = new float[col]; - for (int i = 0; i < col; i++) - { - src[i] = rand.Next(-10, 10); - } - - float[] dst = new float[row]; - float[] expected = new float[row]; - - for (int i = 0; i < row; i++) - { - float dotProduct = 0; - for (int j = 0; j < src.Length; j++) - { - dotProduct += mat[i * src.Length + j] * src[j]; - } - - expected[i] = dotProduct; - } - - CpuMathUtils.MatrixTimesSource(false, mat, src, dst, dst.Length); - Assert.Equal(expected, dst, _matMulComparer); - } - - [Theory] - [InlineData(10, 5)] - [InlineData(10, 8)] - [InlineData(10, 11)] - [InlineData(11, 8)] - [InlineData(8, 23)] - [InlineData(2, 8)] - [InlineData(2, 9)] - [InlineData(2, 3)] - [InlineData(2, 5)] - [InlineData(4, 5)] - [InlineData(4, 7)] - [InlineData(4, 9)] - [InlineData(5, 7)] - [InlineData(5, 9)] - private void MatMulTranAnyDimensionTest(int col, int row) - { - float[] mat = new float[col * row]; - Random rand = new Random(DefaultSeed); - for (int i = 0; i < col * row; i++) - { - mat[i] = rand.Next(0, 10); - } - - float[] src = new float[row]; - for (int i = 0; i < row; i++) - { - src[i] = rand.Next(0, 10); - } - - float[] dst = new float[col]; - float[] expected = new float[col]; - - for (int i = 0; i < dst.Length; i++) - { - float dotProduct = 0; - for (int j = 0; j < row; j++) - { - dotProduct += mat[j * dst.Length + i] * src[j]; - } - - expected[i] = dotProduct; - } - - CpuMathUtils.MatrixTimesSource(true, mat, src, dst, row); - Assert.Equal(expected, dst, _matMulComparer); - } - [Theory] [InlineData(0, 0, 0, new float[] { 70.56001f, -85.68f, -351.36f, 498.24f, 
-3829.32f, -969.48f, 1168.2f, 118.44f })] [InlineData(1, 0, 1, new float[] { 2724f, 2760f, 2796f, 2832f, 2868f, 2904f, 2940f, 2976f, 3012f, 3048f, 3084f, 3120f, 3156f, 3192f, 3228f, 3264f })] [InlineData(1, 1, 0, new float[] { 11016f, 11152f, 11288f, 11424f, 11560f, 11696f, 11832f, 11968f })] public void MatMulTranTest(int matTest, int srcTest, int dstTest, float[] expected) { - float[] mat = _testMatrices[matTest]; - float[] src = _testSrcVectors[srcTest]; - float[] dst = _testDstVectors[dstTest]; + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; - CpuMathUtils.MatrixTimesSource(true, mat, src, dst, src.Length); - float[] actual = new float[dst.Length]; - Array.Copy(dst, actual, dst.Length); + CpuMathUtils.MatrixTimesSource(true, mat, src, dst, src.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); Assert.Equal(expected, actual, _matMulComparer); } @@ -199,14 +118,14 @@ public void MatMulTranTest(int matTest, int srcTest, int dstTest, float[] expect [InlineData(1, 0, 1, new float[] { 95f, 231f, 367f, 503f, 639f, 775f, 911f, 1047f, 1183f, 1319f, 1455f, 1591f, 1727f, 1863f, 1999f, 2135f })] public void MatTimesSrcSparseTest(int matTest, int srcTest, int dstTest, float[] expected) { - float[] mat = _testMatrices[matTest]; - float[] src = _testSrcVectors[srcTest]; - float[] dst = _testDstVectors[dstTest]; + AlignedArray mat = _testMatrices[matTest]; + AlignedArray src = _testSrcVectors[srcTest]; + AlignedArray dst = _testDstVectors[dstTest]; int[] idx = _testIndexArray; - CpuMathUtils.MatrixTimesSource(mat, idx, src, 0, 0, (srcTest == 0) ? 4 : 9, dst, dst.Length); - float[] actual = new float[dst.Length]; - Array.Copy(dst, actual, dst.Length); + CpuMathUtils.MatrixTimesSource(mat, idx, src, 0, 0, (srcTest == 0) ? 
4 : 9, dst, dst.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); Assert.Equal(expected, actual, _matMulComparer); } @@ -548,6 +467,34 @@ public void Dist2Test(int test, float expected) Assert.Equal(expected, actual, 0); } + [Theory] + [InlineData(0, new int[] { 0, 2, 5, 6 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f })] + [InlineData(1, new int[] { 0, 2, 5, 6, 8, 11, 12, 13, 14 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f, 0f, 10f, 11f, 0f, 0f, 0f, 0f, 16f })] + public void ZeroItemsUTest(int test, int[] idx, float[] expected) + { + AlignedArray src = new AlignedArray(8 + 8 * test, _vectorAlignment); + src.CopyFrom(_testSrcVectors[test]); + + CpuMathUtils.ZeroMatrixItems(src, src.Size, src.Size, idx); + float[] actual = new float[src.Size]; + src.CopyTo(actual, 0, src.Size); + Assert.Equal(expected, actual, _comparer); + } + + [Theory] + [InlineData(0, new int[] { 0, 2, 5 }, new float[] { 0f, 2f, 0f, 4f, 5f, 6f, 0f, 8f })] + [InlineData(1, new int[] { 0, 2, 5, 6, 8, 11, 12, 13 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f, 9f, 0f, 11f, 12f, 0f, 0f, 0f, 16f })] + public void ZeroMatrixItemsCoreTest(int test, int[] idx, float[] expected) + { + AlignedArray src = new AlignedArray(8 + 8 * test, _vectorAlignment); + src.CopyFrom(_testSrcVectors[test]); + + CpuMathUtils.ZeroMatrixItems(src, src.Size / 2 - 1, src.Size / 2, idx); + float[] actual = new float[src.Size]; + src.CopyTo(actual, 0, src.Size); + Assert.Equal(expected, actual, _comparer); + } + [Theory] [InlineData(0)] [InlineData(1)] diff --git a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs index d8141426ea..16ceae22d3 100644 --- a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs +++ b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs @@ -17,7 +17,6 @@ using Microsoft.ML.Runtime.Model; using Microsoft.ML.TestFramework; using Xunit; -using Xunit.Abstractions; namespace Microsoft.ML.Runtime.RunTests { @@ -775,6 +774,7 @@ protected bool SaveLoadTransposed(IDataView view, IHostEnvironment env, string s public abstract partial class TestDataViewBase : BaseTestBaseline { + public class SentimentData { [ColumnName("Label")] @@ -934,7 +934,7 @@ protected bool CheckSameValues(RowCursor curs1, RowCursor curs2, bool exactTypes var comp = comps[col]; if (comp != null && !comp()) { - Fail($"Different values in column {col} of row {curs1.Position}"); + Fail("Different values in column {0} of row {1}", col, curs1.Position); return Failed(); } if (idComp != null && !idComp()) @@ -1158,10 +1158,12 @@ protected Func GetColumnComparer(Row r1, Row r2, int col, ColumnType type, throw Contracts.Except("Unknown type in GetColumnComparer: '{0}'", type); } - private bool EqualWithEpsDouble(Double x, Double y) + private const Double DoubleEps = 1e-9; + + private static bool EqualWithEpsDouble(Double x, Double y) { // bitwise comparison is needed because Abs(Inf-Inf) and Abs(NaN-NaN) are not 0s. 
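    // For instance, double.PositiveInfinity - double.PositiveInfinity is NaN, so an
    // absolute-difference check alone would report two identical infinities as different;
    // the bit comparison below handles that case before the 1e-9 tolerance is applied.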
- return FloatUtils.GetBits(x) == FloatUtils.GetBits(y) || CompareNumbersWithTolerance(x, y, null, 3); + return FloatUtils.GetBits(x) == FloatUtils.GetBits(y) || Math.Abs(x - y) < DoubleEps; } private const float SingleEps = 1e-6f; diff --git a/test/Microsoft.ML.TimeSeries.Tests/TimeSeries.cs b/test/Microsoft.ML.TimeSeries.Tests/TimeSeries.cs index 90428934ba..39340d225b 100644 --- a/test/Microsoft.ML.TimeSeries.Tests/TimeSeries.cs +++ b/test/Microsoft.ML.TimeSeries.Tests/TimeSeries.cs @@ -72,7 +72,7 @@ public void SavePipeSsaSpike() Done(); } - [Fact] + [ConditionalFact(typeof(BaseTestBaseline), nameof(BaseTestBaseline.LessThanNetCore30OrNotNetCore))] // netcore3.0 output differs from Baseline public void SavePipeSsaSpikeNoData() { string pathData = DeleteOutputPath("SavePipe", "SsaSpikeNoData.txt"); diff --git a/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs b/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs index 274a229630..0bd4604daa 100644 --- a/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs +++ b/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs @@ -87,7 +87,7 @@ public void ChangeDetection() } } - [Fact] + [ConditionalFact(typeof(BaseTestBaseline), nameof(BaseTestBaseline.LessThanNetCore30OrNotNetCore))] // netcore3.0 output differs from Baseline public void ChangePointDetectionWithSeasonality() { var env = new MLContext(conc: 1); @@ -130,14 +130,14 @@ public void ChangePointDetectionWithSeasonality() while (enumerator.MoveNext() && index < expectedValues.Count) { row = enumerator.Current; - Assert.Equal(expectedValues[index++], row.Change[0], precision: 5); // Alert - Assert.Equal(expectedValues[index++], row.Change[1], precision: 5); // Raw score - Assert.Equal(expectedValues[index++], row.Change[2], precision: 5); // P-Value score - Assert.Equal(expectedValues[index++], row.Change[3], precision: 5); // Martingale score + Assert.Equal(expectedValues[index++], row.Change[0], precision: 7); // Alert + Assert.Equal(expectedValues[index++], row.Change[1], precision: 7); // Raw score + Assert.Equal(expectedValues[index++], row.Change[2], precision: 7); // P-Value score + Assert.Equal(expectedValues[index++], row.Change[3], precision: 7); // Martingale score } } - [Fact] + [ConditionalFact(typeof(BaseTestBaseline), nameof(BaseTestBaseline.LessThanNetCore30OrNotNetCore))] public void ChangePointDetectionWithSeasonalityPredictionEngineNoColumn() { const int ChangeHistorySize = 10; @@ -190,10 +190,10 @@ public void ChangePointDetectionWithSeasonalityPredictionEngineNoColumn() var engine2 = model2.CreateTimeSeriesPredictionFunction(ml); var prediction2 = engine2.Predict(new Data(1)); //Raw score after first input. - Assert.Equal(1.1661833524703979, prediction2.Change[1], precision: 4); // Raw score + Assert.Equal(1.1661833524703979, prediction2.Change[1], precision: 5); // Raw score prediction2 = engine2.Predict(new Data(1)); //Raw score after second input. - Assert.Equal(0.12216401100158691, prediction2.Change[1], precision: 4); // Raw score + Assert.Equal(0.12216401100158691, prediction2.Change[1], precision: 5); // Raw score //Even though time series column is not requested it will // pass the observation through time series transform and update the state with the first input. @@ -210,10 +210,10 @@ public void ChangePointDetectionWithSeasonalityPredictionEngineNoColumn() //and raw score should match the raw score obtained by passing the two input in the first model. 
var engine3 = model3.CreateTimeSeriesPredictionFunction(ml); var prediction3 = engine3.Predict(new Data(1)); - Assert.Equal(0.12216401100158691, prediction2.Change[1], precision: 4); // Raw score + Assert.Equal(0.12216401100158691, prediction2.Change[1], precision: 5); // Raw score } - [Fact] + [ConditionalFact(typeof(BaseTestBaseline), nameof(BaseTestBaseline.LessThanNetCore30OrNotNetCore))] public void ChangePointDetectionWithSeasonalityPredictionEngine() { const int ChangeHistorySize = 10; @@ -264,7 +264,7 @@ public void ChangePointDetectionWithSeasonalityPredictionEngine() //Model 1: Prediction #2 prediction = engine.Predict(new Data(1)); Assert.Equal(0, prediction.Change[0], precision: 7); // Alert - Assert.Equal(0.12216401100158691, prediction.Change[1], precision: 4); // Raw score + Assert.Equal(0.12216401100158691, prediction.Change[1], precision: 5); // Raw score Assert.Equal(0.14823824685192111, prediction.Change[2], precision: 5); // P-Value score Assert.Equal(1.5292508189989167E-07, prediction.Change[3], precision: 7); // Martingale score @@ -277,7 +277,7 @@ public void ChangePointDetectionWithSeasonalityPredictionEngine() engine = model2.CreateTimeSeriesPredictionFunction(ml); prediction = engine.Predict(new Data(1)); Assert.Equal(0, prediction.Change[0], precision: 7); // Alert - Assert.Equal(0.12216401100158691, prediction.Change[1], precision: 4); // Raw score + Assert.Equal(0.12216401100158691, prediction.Change[1], precision: 5); // Raw score Assert.Equal(0.14823824685192111, prediction.Change[2], precision: 5); // P-Value score Assert.Equal(1.5292508189989167E-07, prediction.Change[3], precision: 5); // Martingale score } diff --git a/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesEstimatorTests.cs b/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesEstimatorTests.cs index 0e6a5063bd..a7892701d7 100644 --- a/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesEstimatorTests.cs +++ b/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesEstimatorTests.cs @@ -41,7 +41,7 @@ public TimeSeriesEstimatorTests(ITestOutputHelper output) : base(output) { } - [ConditionalFact(typeof(BaseTestBaseline), nameof(BaseTestBaseline.LessThanNetCore30OrNotNetCoreAnd64BitProcess))] // 32bit and netcore3.0 output differs from Baseline + [ConditionalFact(typeof(BaseTestBaseline), nameof(BaseTestBaseline.LessThanNetCore30OrNotNetCore))] // netcore3.0 output differs from Baseline void TestSsaChangePointEstimator() { int Confidence = 95; @@ -75,7 +75,7 @@ void TestSsaChangePointEstimator() Done(); } - [Fact] + [ConditionalFact(typeof(BaseTestBaseline), nameof(BaseTestBaseline.LessThanNetCore30OrNotNetCore))] // netcore3.0 output differs from Baseline void TestSsaSpikeEstimator() { int Confidence = 95;