
Good Code vs. Bad Code Demo Using C#

The Idea

The Demo

The Demo With .NET 7

How To Unroll a Loop

// Sum over data; the unrolled path handles four elements per iteration,
// and the scalar remainder loop picks up whatever is left.
float[] data = new float[100];
float sum = 0;
bool unrolling = true;
int n = data.Length;
int i = 0;
if (unrolling)
    for (; i < n - 4; i += 4)
    {
        sum += data[i + 0];
        sum += data[i + 1];
        sum += data[i + 2];
        sum += data[i + 3];
    }
for (; i < n; i++) // remainder loop
    sum += data[i];
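
Whether unrolling actually helps depends on the JIT and the hardware, so it is worth measuring. A minimal stopwatch sketch, where SumSimple and SumUnrolled are illustrative names rather than part of the demo:

using System;
using System.Diagnostics;

var data = new float[1_000_000];
for (int i = 0; i < data.Length; i++) data[i] = i % 7;

float checksum = 0;
var sw = Stopwatch.StartNew();
for (int run = 0; run < 100; run++) checksum += SumSimple(data);
Console.WriteLine($"simple:   {sw.ElapsedMilliseconds} ms");

sw.Restart();
for (int run = 0; run < 100; run++) checksum += SumUnrolled(data);
Console.WriteLine($"unrolled: {sw.ElapsedMilliseconds} ms ({checksum})");

static float SumSimple(float[] data)
{
    float sum = 0;
    for (int i = 0; i < data.Length; i++) sum += data[i];
    return sum;
}

static float SumUnrolled(float[] data)
{
    float sum = 0;
    int i = 0;
    for (; i < data.Length - 4; i += 4) // four elements per iteration
        sum += data[i + 0] + data[i + 1] + data[i + 2] + data[i + 3];
    for (; i < data.Length; i++)        // remainder loop
        sum += data[i];
    return sum;
}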

Implementation Details That Can Count

// a first?
float[] a;
int b;
// or b first? 
int b;
float[] a;
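
One concrete way declaration order can count is struct padding: with sequential layout (the default for structs) the declared order determines alignment gaps, so the same fields can yield different sizes. Classes use auto layout, where the runtime may reorder fields itself. A minimal sketch, with made-up struct names:

using System;
using System.Runtime.CompilerServices;

Console.WriteLine(Unsafe.SizeOf<PaddedOrder>());  // typically 24
Console.WriteLine(Unsafe.SizeOf<CompactOrder>()); // typically 16

// Sequential layout aligns each field to its natural boundary,
// so declaration order changes how much padding is inserted.
struct PaddedOrder  { public byte A; public double B; public byte C; } // 1 + 7 pad + 8 + 1 + 7 pad = 24
struct CompactOrder { public double B; public byte A; public byte C; } // 8 + 1 + 1 + 6 pad = 16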

Visual Studio Profiler

https://www.youtube.com/watch?v=y4HV5m5GR7o

Feed Forward Cost Details

Array vs. Span Assembly On sharplab.io

// Identical loop bodies; only the parameter types differ, so any
// difference in the generated assembly comes from Span vs. array.
static void SumProductToSpan(Span<float> neurons, Span<float> weights, float n)
{
    for (int r = 0; r < neurons.Length; r++)
    {
        neurons[r] = neurons[r] + weights[r] * n;
    }
}

static void SumProductToArray(float[] neurons, float[] weights, float n)
{
    for (int r = 0; r < neurons.Length; r++)
    {
        neurons[r] = neurons[r] + weights[r] * n;
    }
}

https://sharplab.io/

Assembly Instructions Array

Assembly Instructions Span

Floating Point "Features"
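
The "features" here are presumably the usual float rounding quirks: float addition is not associative, so unrolled or vectorized sums that regroup terms can give slightly different results than a plain loop. A minimal illustration:

using System;

float a = 1e8f, b = -1e8f, c = 0.5f;
Console.WriteLine((a + b) + c); // 0.5
Console.WriteLine(a + (b + c)); // 0; c is absorbed, since one ULP at 1e8 is 8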

Forward Propagation Default Implementation With Arrays

static void FeedForwardDefaultArray(float[] neurons, float[] weights, int[] net)
{
    // j: first neuron of the input layer, k: first neuron of the output layer,
    // m: first weight of the current layer's weight block
    for (int i = 0, j = 0, k = net[0], m = 0; i < net.Length - 1; i++)
    {
        int left = net[i], right = net[i + 1];
        for (int l = 0, w = m; l < left; l++)
        {
            float n = neurons[j + l];
            if (n > 0) // ReLU: inactive inputs contribute nothing, so skip them
                for (int r = 0; r < right; r++)
                    neurons[k + r] += n * weights[w + r];
            w += right;
        }
        m += left * right; j += left; k += right;
    }
}
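
To make the index arithmetic concrete, a minimal driver sketch; the 784-16-16-10 shape (MNIST-like) and the weight initialization are assumptions for illustration, not taken from the demo:

using System;

// neurons stores every layer back to back; weights stores each layer's
// left*right block back to back, matching the index arithmetic above.
int[] net = { 784, 16, 16, 10 };
int neuronCount = 0, weightCount = 0;
for (int i = 0; i < net.Length; i++) neuronCount += net[i];
for (int i = 0; i < net.Length - 1; i++) weightCount += net[i] * net[i + 1];

var neurons = new float[neuronCount];
var weights = new float[weightCount];
var rng = new Random(1337);
for (int i = 0; i < net[0]; i++) neurons[i] = 1f; // input activations
for (int i = 0; i < weights.Length; i++) weights[i] = (float)rng.NextDouble() * 0.02f - 0.01f;

FeedForwardDefaultArray(neurons, weights, net);
Console.WriteLine(neurons[neuronCount - 1]); // last output neuron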

Default Arrays Unrolled

static void FeedForwardDefaultArrayUnrolled(float[] neurons, float[] weights, int[] net)
{
    for (int i = 0, j = 0, k = net[0], m = 0; i < net.Length - 1; i++)
    {
        int left = net[i], right = net[i + 1];
        for (int l = 0, w = m; l < left; l++)
        {
            float n = neurons[j + l];
            if (n > 0)
            {
                int r = 0;
                // eight updates per iteration; the loop below handles the remainder
                for (; r < right - 8; r += 8)
                {
                    neurons[k + r] = n * weights[w + r] + neurons[k + r];
                    neurons[k + r + 1] = n * weights[w + r + 1] + neurons[k + r + 1]; 
                    neurons[k + r + 2] = n * weights[w + r + 2] + neurons[k + r + 2];
                    neurons[k + r + 3] = n * weights[w + r + 3] + neurons[k + r + 3]; 
                    neurons[k + r + 4] = n * weights[w + r + 4] + neurons[k + r + 4];
                    neurons[k + r + 5] = n * weights[w + r + 5] + neurons[k + r + 5]; 
                    neurons[k + r + 6] = n * weights[w + r + 6] + neurons[k + r + 6];
                    neurons[k + r + 7] = n * weights[w + r + 7] + neurons[k + r + 7];
                }
                for (; r < right; r++)
                    neurons[k + r] += n * weights[w + r];
            }
            w += right;
        }
        m += left * right; j += left; k += right;
    }
}

Default Implementation With Spans

static void FeedForwardDefaultSpanEachInput(Span<float> neurons, ReadOnlySpan<float> weights, ReadOnlySpan<int> net)
{
    for (int i = 0, j = 0, k = net[0], m = 0; i < net.Length - 1; i++)
    {
        int left = net[i], right = net[i + 1];
        for (int l = 0, w = m; l < left; l++, w += right)
        {
            float n = neurons[j + l];
            if (n <= 0) continue;
            ReadOnlySpan<float> localWts = weights.Slice(w, right);
            Span<float> localOut = neurons.Slice(k, right);
            for (int r = 0; r < localOut.Length; r++)
                localOut[r] = localWts[r] * n + localOut[r];               
        }
        m += left * right; j += left; k += right;
    }
}

Advanced Spans Layer Wise

static void FeedForwardAdvancedSpanEachLayer(Span<float> neuron, ReadOnlySpan<float> weights, ReadOnlySpan<int> net)
{
    for (int k = net[0], w = 0, i = 0; i < net.Length - 1; i++)
    {
        Span<float> activations = neuron.Slice(k, net[i + 1]);
        ReadOnlySpan<float> localInp = neuron.Slice(k - net[i], net[i]);
        // zero-initialized scratch buffer (localsinit); note that stackalloc
        // memory allocated in a loop is only reclaimed when the method returns
        Span<float> localOut = stackalloc float[net[i + 1]];
        for (int l = 0; l < net[i]; w += localOut.Length, l++)
        {
            float n = localInp[l];
            if (n <= 0) continue; // ReLU: skip inactive inputs
            ReadOnlySpan<float> wts = weights.Slice(w, localOut.Length);
            for (int r = 0; r < localOut.Length; r++)
                localOut[r] = wts[r] * n + localOut[r];
        }
        k += localOut.Length;
        localOut.CopyTo(activations);
    }
}

Advanced Vector SIMD

// requires using System.Numerics;
static void FeedForwardVectorSIMD(Span<float> neurons, ReadOnlySpan<float> weights, ReadOnlySpan<int> net)
{
    for (int k = net[0], w = 0, i = 0; i < net.Length - 1; i++)
    {
        ReadOnlySpan<float> localInp = neurons.Slice(k - net[i], net[i]);
        Span<float> localOut = stackalloc float[net[i + 1]];
        for (int l = 0; l < net[i]; w += localOut.Length, l++)
        {
            float n = localInp[l];
            if (n <= 0) continue;
            ReadOnlySpan<float> wts = weights.Slice(w, localOut.Length);
            int r = 0;
            // main loop: Vector<float>.Count lanes per iteration
            for (; r < localOut.Length - Vector<float>.Count; r += Vector<float>.Count)
            {
                Vector<float> va = new Vector<float>(localOut.Slice(r, Vector<float>.Count));
                Vector<float> vb = new Vector<float>(wts.Slice(r, Vector<float>.Count));
                va += vb * n;
                va.CopyTo(localOut.Slice(r, Vector<float>.Count));
            }
            // scalar tail for the remaining elements
            for (; r < localOut.Length; ++r)
                localOut[r] = wts[r] * n + localOut[r];
        }
        localOut.CopyTo(neurons.Slice(k, net[i + 1]));
        k += localOut.Length;
    }
}

Advanced Vector SIMD No Copy

// requires using System.Numerics; and System.Runtime.InteropServices;
static void FeedForwardVectorSIMDNoCopy(Span<float> neurons, ReadOnlySpan<float> weights, ReadOnlySpan<int> net)
{
    for (int k = net[0], w = 0, i = 0; i < net.Length - 1; i++)
    {
        ReadOnlySpan<float> localInp = neurons.Slice(k - net[i], net[i]);
        Span<float> localOut = stackalloc float[net[i + 1]];
        for (int l = 0; l < localInp.Length; w += localOut.Length, l++)
        {
            float n = localInp[l];
            if (n <= 0) continue;
            ReadOnlySpan<float> wts = weights.Slice(w, localOut.Length);
            // reinterpret the float spans as vector spans: no data is copied
            ReadOnlySpan<Vector<float>> wtsVecArray = MemoryMarshal.Cast<float, Vector<float>>(wts);
            Span<Vector<float>> resultsVecArray = MemoryMarshal.Cast<float, Vector<float>>(localOut);
            for (int v = 0; v < resultsVecArray.Length; v++)
                resultsVecArray[v] = wtsVecArray[v] * n + resultsVecArray[v];
            // scalar tail: elements that did not fill a whole vector
            for (int r = wtsVecArray.Length * Vector<float>.Count; r < localOut.Length; r++)
                localOut[r] = wts[r] * n + localOut[r];
        }
        Span<float> activations = neurons.Slice(k, localOut.Length);
        localOut.CopyTo(activations);
        k += localOut.Length;
    }
}

Backpropagation Default

static void Backprop(int[] net, float[] weights, float[] neuron, float[] delta, int target)
{
    Span<float> gradient = stackalloc float[neuron.Length];

    // output error gradients, hard target as 1 for its class
    for (int r = neuron.Length - net[^1], p = 0; r < neuron.Length; r++, p++)
        gradient[r] = target == p ? 1 - neuron[r] : -neuron[r];

    // walk the layers backwards, accumulating weight deltas and input gradients
    for (int i = net.Length - 2, j = neuron.Length - net[^1], k = neuron.Length, m = weights.Length; i >= 0; i--)
    {
        int right = net[i + 1], left = net[i];
        k -= right; j -= left; m -= right * left;

        for (int l = j, w = m; l < left + j; l++, w += right)
        {
            var n = neuron[l];
            if (n > 0) // ReLU derivative: only active neurons propagate error
            {
                float sum = 0.0f;
                for (int r = 0; r < right; r++)
                {
                    int wr = r + w;
                    var g = gradient[k + r];
                    sum += weights[wr] * g; delta[wr] += n * g;
                }
                gradient[l] = sum;
            }
        }
    }
}

Backpropagation SIMD No Copy

static void BackpropSIMDNoCopy(int[] net, Span<float> weights, float[] neuron, Span<float> delta, int target)
{
    Span<float> gradient = stackalloc float[neuron.Length];

    // output error gradients, hard target as 1 for its class
    for (int r = neuron.Length - net[^1], p = 0; r < neuron.Length; r++, p++)
        gradient[r] = target == p ? 1 - neuron[r] : -neuron[r];

    for (int j = neuron.Length - net[^1], k = neuron.Length, m = weights.Length, i = net.Length - 2; i >= 0; i--)
    {
        int right = net[i + 1], left = net[i];
        k -= right; j -= left; m -= right * left;
        Span<float> gra = gradient.Slice(k, right);

        for (int l = 0, w = m; l < left; l++, w += right)
        {
            var n = neuron[l + j];
            if (n <= 0) continue;

            Span<float> wts = weights.Slice(w, right);
            Span<float> dts = delta.Slice(w, right);

            Span<Vector<float>> graVec = MemoryMarshal.Cast<float, Vector<float>>(gra);
            Span<Vector<float>> dtsVec = MemoryMarshal.Cast<float, Vector<float>>(dts);
            Span<Vector<float>> wtsVec = MemoryMarshal.Cast<float, Vector<float>>(wts);

            var sumVec = Vector<float>.Zero;  
            for (int v = 0; v < wtsVec.Length; v++)
            {
                var gVec = graVec[v];
                sumVec = wtsVec[v] * gVec + sumVec;
                dtsVec[v] = n * gVec + dtsVec[v];
            }

            // horizontal add: collapse the per-lane partial sums into one scalar (Vector.Sum needs .NET 6+)
            float sum = Vector.Sum(sumVec);

            for (int r = wtsVec.Length * Vector<float>.Count; r < wts.Length; r++)
            {
                var g = gra[r];
                sum = wts[r] * g + sum;
                dts[r] = n * g + dts[r];
            }
            gradient[l + j] = sum;
        }
    }
}

Update Weights Default
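
The delta buffer accumulates gradients during backpropagation; the update applies it scaled by the learning rate and then decays it by the momentum factor, which is plain SGD with momentum.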

static void UpdateDefault(float[] weight, float[] delta, float lr, float mom)
{
    for (int w = 0; w < weight.Length; w++)
    {
        var d = delta[w] * lr;
        weight[w] += d;
        delta[w] *= mom;
    }  
}

Update Weights SIMD

static void UpdateSIMDNoCopy(float[] weight, float[] delta, float lr, float mom)
{
    // reinterpret both arrays as vector spans: no data is copied
    Span<Vector<float>> weightVecArray = MemoryMarshal.Cast<float, Vector<float>>(weight);
    Span<Vector<float>> deltaVecArray = MemoryMarshal.Cast<float, Vector<float>>(delta);
    for (int v = 0; v < weightVecArray.Length; v++)
    {
        weightVecArray[v] += deltaVecArray[v] * lr;
        deltaVecArray[v] *= mom;
    }
    // scalar tail for elements that did not fill a whole vector
    for (int w = weightVecArray.Length * Vector<float>.Count; w < weight.Length; w++)
    {
        weight[w] += delta[w] * lr;
        delta[w] *= mom;
    }
}

Update ChatGPT Version

static void UpdateChatGPT(float[] weight, float[] delta, float lr, float mom)
{
    // Pre-compute the values of lr and mom,
    // and use local variables to store these values.
    float lr_value = lr;
    float mom_value = mom;

    // Use the "unsafe" keyword to enable pointer arithmetic.
    unsafe
    {
        // Use the "fixed" keyword to fix the arrays
        // in memory and get a pointer to their elements.
        fixed (float* w = weight, d = delta)
        {
            // Iterate over the elements in the arrays
            // using pointer arithmetic.
            for (int i = 0; i < weight.Length; i++)
            {
                // Use local variables to store the values
                // of w[i] and d[i], and update these values
                // using the += and *= operators.
                float w_value = w[i];
                float d_value = d[i];
                w_value += d_value * lr_value;
                d_value *= mom_value;

                // Update the values in the arrays using
                // the pointer and the dereference operator (*).
                *(w + i) = w_value;
                *(d + i) = d_value;
            }
        }
    }
}

ChatGPT With Hard Prompts

static void UpdateChatGPTUnrolledAVX2(float[] weight, float[] delta, float lr, float mom)
{
    unsafe
    {
        fixed (float* w = weight, d = delta)
        {
            int i = 0;
            for (; i < weight.Length - 7; i += 8)
            {
                // Load 8 floats from the weight and delta arrays.
                var wVector = Avx2.LoadVector256(w + i);
                var dVector = Avx2.LoadVector256(d + i);

                // Convert the learning rate and momentum factors to vectors.
                var lrVector = Avx2.BroadcastScalarToVector256(&lr);
                var momVector = Avx2.BroadcastScalarToVector256(&mom);

                // Update the weight and delta vectors using AVX2 instructions.
                wVector += Avx2.Multiply(dVector, lrVector);
                dVector *= momVector;

                // Store the updated weight and delta vectors back to memory.
                Avx2.Store(w + i, wVector);
                Avx2.Store(d + i, dVector);
            }
            // Update the remaining elements using a regular for loop.
            for (; i < weight.Length; i++)
            {
                w[i] += d[i] * lr;
                d[i] *= mom;
            }
        }
    }
}

SIMD

https://devblogs.microsoft.com/dotnet/hardware-intrinsics-in-net-core/
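
A quick way to check what the Vector<T> code above gets to work with on a given machine (a minimal sketch):

using System;
using System.Numerics;

// Vector<float>.Count is the SIMD width in floats:
// typically 8 with AVX2 (256-bit) and 4 with SSE (128-bit).
Console.WriteLine($"Hardware accelerated: {Vector.IsHardwareAccelerated}");
Console.WriteLine($"Floats per Vector<float>: {Vector<float>.Count}");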

Language Efficiency

https://greenlab.di.uminho.pt/wp-content/uploads/2017/09/paperSLE.pdf

Numeric performance in C, C# and Java

old paper: https://www.itu.dk/people/sestoft/papers/numericperformance.pdf

EFFICIENT MATRIX MULTIPLICATION USING HARDWARE INTRINSICS AND PARALLELISM WITH C#

https://docplayer.net/227695722-Efficient-matrix-multiplication-using-hardware-intrinsics-and-parallelism-with-c.html

Benchmark Code: 109 Times Faster!

50 Epochs With 1,794,000 Weights Take 23.19 Seconds

50 Epochs With 5,588,000 Weights Take 64.80 Seconds

50 Epochs With 11,382,000 Weights Take 156.89 Seconds

1 Million Frames Per Second
