Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Analyzer to recommend ThreadSafe collections in parallel context. #79419

Open
gfs opened this issue Mar 11, 2022 · 3 comments
Open

Analyzer to recommend ThreadSafe collections in parallel context. #79419

gfs opened this issue Mar 11, 2022 · 3 comments
Labels
api-suggestion Early API idea and discussion, it is NOT ready for implementation area-System.Collections code-analyzer Marks an issue that suggests a Roslyn analyzer
Milestone

Comments

@gfs
Copy link

gfs commented Mar 11, 2022

Describe the problem you are trying to solve

A common pattern is to run parallel processing and to produce a set of data. The default collection types like List and HashSet are not threadsafe and you may get unexpected behavior (like nulls being added to the collection) if you modify them from multiple threads at the same time.

Describe suggestions on how to achieve the rule

I think there should be an analyzer that detects when you are using the non thread safe collections in the context of the built in parallelization mechanisms (ForAll or Parallel) and can swap out the collection you are using for a more appropriate collection.

Good Examples:

  • Replace List with ConcurrentBag (add operations will not need to be changed, indexing operations would, which adds complexity to this)
  • Replace Dictionary with ConcurrentDictionary

Other examples:

  • Replace HashSet with ConcurrentDictionary<T,bool> (this is a workaround for there not being a threadsafe HashSet. The bool value is then ignored but you get threadsafety in a HashSet like structure.)

Additional context

The following set of tests demonstrates this issue in action along with potential rewrites.

namespace NonThreadSafeCollections
{
    using Microsoft.VisualStudio.TestPlatform.ObjectModel;
    using Microsoft.VisualStudio.TestTools.UnitTesting;
    using System;
    using System.Collections.Concurrent;
    using System.Collections.Generic;
    using System.Linq;
    using System.Security.Cryptography;

    [TestClass]
    public class NonThreadSafeCollections
    {
        static List<TestObject> generatedObjects = new List<TestObject>();
        
        [TestMethod]
        public void ParallelListNested()
        {
            Dictionary<(char, char), List<TestObject>> testObjects = new();
            generatedObjects.AsParallel().ForAll(x => {
                if (!testObjects.ContainsKey((x.FieldOne[0], x.FieldTwo[0])))
                {
                    testObjects[(x.FieldOne[0], x.FieldTwo[0])] = new List<TestObject>();
                }
                testObjects[(x.FieldOne[0], x.FieldTwo[0])].Add(x);
            });
            Assert.AreEqual(generatedObjects.Count, testObjects.Values.Select(x => x.Count).Sum());
            Assert.IsFalse(testObjects.Any(x => x.Value.Any(y => y is null)));
        }

        [TestMethod]
        public void ParallelListNested_Fixed()
        {
            ConcurrentDictionary<(char, char), ConcurrentBag<TestObject>> testObjects = new();
            foreach(var character1 in chars)
            {
                foreach(var character2 in chars)
                {
                    testObjects[(character1, character2)] = new ConcurrentBag<TestObject>();
                }
            }
            generatedObjects.AsParallel().ForAll(x => {
                testObjects[(x.FieldOne[0], x.FieldTwo[0])].Add(x);
            });
            Assert.AreEqual(generatedObjects.Count, testObjects.Values.Select(x => x.Count).Sum());
            Assert.IsFalse(testObjects.Any(x => x.Value.Any(y => y is null)));
        }

        [TestMethod]
        public void SingleThreadedListNested()
        {
            Dictionary<(char, char), List<TestObject>> testObjects = new();
            generatedObjects.ForEach(x => {
                if (!testObjects.ContainsKey((x.FieldOne[0], x.FieldTwo[0])))
                {
                    testObjects[(x.FieldOne[0], x.FieldTwo[0])] = new List<TestObject>();
                }
                testObjects[(x.FieldOne[0], x.FieldTwo[0])].Add(x);
            });
            Assert.AreEqual(generatedObjects.Count, testObjects.Values.Select(x => x.Count).Sum());
            Assert.IsFalse(testObjects.Any(x => x.Value.Any(y => y is null)));
        }

        [TestMethod]
        public void ParallelList()
        {
            List<TestObject> testObjects = new();
            generatedObjects.AsParallel().ForAll(x => {
                testObjects.Add(x);
            });
            Assert.AreEqual(generatedObjects.Count, testObjects.Count);
            Assert.IsFalse(testObjects.Any(x => x is null));
        }

        [TestMethod]
        public void ParallelList_Fixed()
        {
            ConcurrentBag<TestObject> testObjects = new();
            generatedObjects.AsParallel().ForAll(x => {
                testObjects.Add(x);
            });
            Assert.AreEqual(generatedObjects.Count, testObjects.Count);
            Assert.IsFalse(testObjects.Any(x => x is null));
        }

        [TestMethod]
        public void SingleThreadedList()
        {
            List<TestObject> testObjects = new();
            generatedObjects.ForEach(x => {
                testObjects.Add(x);
            });
            Assert.AreEqual(generatedObjects.Count, testObjects.Count);
            Assert.IsFalse(testObjects.Any(x => x is null));
        }

        [ClassInitialize]
        public static void ClassInit(TestContext context)
        {
            for (int i = 0; i < 10000; i++)
            {
                generatedObjects.Add(MakeNewTestObject());
            }
        }

        public static int GetRandomPositiveIndex(int max)
        {
            var randomInteger = uint.MaxValue;
            while (randomInteger == uint.MaxValue)
            {
                byte[] data = RandomNumberGenerator.GetBytes(4);
                randomInteger = BitConverter.ToUInt32(data, 0);
            }

            return (int)(max * (randomInteger / (double)uint.MaxValue));
        }

        public static string GetRandomString(int characters) => new(Enumerable.Range(1, characters).Select(_ => chars[GetRandomPositiveIndex(chars.Length)]).ToArray());

        private const string chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
        static TestObject MakeNewTestObject(int fieldLength = 1000)
        {
            return new TestObject
            {
                FieldOne = GetRandomString(fieldLength),
                FieldTwo = GetRandomString(fieldLength),
                FieldThree = GetRandomString(fieldLength),
                FieldFour = GetRandomString(fieldLength),
            };
        }

        class TestObject
        {
            public string FieldOne { get; set; } = string.Empty;
            public string FieldTwo { get; set; } = string.Empty;
            public string FieldThree { get; set; } = string.Empty;
            public string FieldFour { get; set; } = string.Empty;
        }
    }
}
@Youssef1313
Copy link
Member

@stephentoub Should this move to dotnet/runtime?

@stephentoub stephentoub transferred this issue from dotnet/roslyn-analyzers Dec 8, 2022
@dotnet-issue-labeler
Copy link

I couldn't figure out the best area label to add to this issue. If you have write-permissions please help me learn by adding exactly one area label.

@ghost ghost added the untriaged New issue has not been triaged by the area owner label Dec 8, 2022
@stephentoub stephentoub added api-suggestion Early API idea and discussion, it is NOT ready for implementation code-analyzer Marks an issue that suggests a Roslyn analyzer area-System.Collections and removed untriaged New issue has not been triaged by the area owner labels Dec 8, 2022
@ghost
Copy link

ghost commented Dec 8, 2022

Tagging subscribers to this area: @dotnet/area-system-collections
See info in area-owners.md if you want to be subscribed.

Issue Details

Describe the problem you are trying to solve

A common pattern is to run parallel processing and to produce a set of data. The default collection types like List and HashSet are not threadsafe and you may get unexpected behavior (like nulls being added to the collection) if you modify them from multiple threads at the same time.

Describe suggestions on how to achieve the rule

I think there should be an analyzer that detects when you are using the non thread safe collections in the context of the built in parallelization mechanisms (ForAll or Parallel) and can swap out the collection you are using for a more appropriate collection.

Good Examples:

  • Replace List with ConcurrentBag (add operations will not need to be changed, indexing operations would, which adds complexity to this)
  • Replace Dictionary with ConcurrentDictionary

Other examples:

  • Replace HashSet with ConcurrentDictionary<T,bool> (this is a workaround for there not being a threadsafe HashSet. The bool value is then ignored but you get threadsafety in a HashSet like structure.)

Additional context

The following set of tests demonstrates this issue in action along with potential rewrites.

namespace NonThreadSafeCollections
{
    using Microsoft.VisualStudio.TestPlatform.ObjectModel;
    using Microsoft.VisualStudio.TestTools.UnitTesting;
    using System;
    using System.Collections.Concurrent;
    using System.Collections.Generic;
    using System.Linq;
    using System.Security.Cryptography;

    [TestClass]
    public class NonThreadSafeCollections
    {
        static List<TestObject> generatedObjects = new List<TestObject>();
        
        [TestMethod]
        public void ParallelListNested()
        {
            Dictionary<(char, char), List<TestObject>> testObjects = new();
            generatedObjects.AsParallel().ForAll(x => {
                if (!testObjects.ContainsKey((x.FieldOne[0], x.FieldTwo[0])))
                {
                    testObjects[(x.FieldOne[0], x.FieldTwo[0])] = new List<TestObject>();
                }
                testObjects[(x.FieldOne[0], x.FieldTwo[0])].Add(x);
            });
            Assert.AreEqual(generatedObjects.Count, testObjects.Values.Select(x => x.Count).Sum());
            Assert.IsFalse(testObjects.Any(x => x.Value.Any(y => y is null)));
        }

        [TestMethod]
        public void ParallelListNested_Fixed()
        {
            ConcurrentDictionary<(char, char), ConcurrentBag<TestObject>> testObjects = new();
            foreach(var character1 in chars)
            {
                foreach(var character2 in chars)
                {
                    testObjects[(character1, character2)] = new ConcurrentBag<TestObject>();
                }
            }
            generatedObjects.AsParallel().ForAll(x => {
                testObjects[(x.FieldOne[0], x.FieldTwo[0])].Add(x);
            });
            Assert.AreEqual(generatedObjects.Count, testObjects.Values.Select(x => x.Count).Sum());
            Assert.IsFalse(testObjects.Any(x => x.Value.Any(y => y is null)));
        }

        [TestMethod]
        public void SingleThreadedListNested()
        {
            Dictionary<(char, char), List<TestObject>> testObjects = new();
            generatedObjects.ForEach(x => {
                if (!testObjects.ContainsKey((x.FieldOne[0], x.FieldTwo[0])))
                {
                    testObjects[(x.FieldOne[0], x.FieldTwo[0])] = new List<TestObject>();
                }
                testObjects[(x.FieldOne[0], x.FieldTwo[0])].Add(x);
            });
            Assert.AreEqual(generatedObjects.Count, testObjects.Values.Select(x => x.Count).Sum());
            Assert.IsFalse(testObjects.Any(x => x.Value.Any(y => y is null)));
        }

        [TestMethod]
        public void ParallelList()
        {
            List<TestObject> testObjects = new();
            generatedObjects.AsParallel().ForAll(x => {
                testObjects.Add(x);
            });
            Assert.AreEqual(generatedObjects.Count, testObjects.Count);
            Assert.IsFalse(testObjects.Any(x => x is null));
        }

        [TestMethod]
        public void ParallelList_Fixed()
        {
            ConcurrentBag<TestObject> testObjects = new();
            generatedObjects.AsParallel().ForAll(x => {
                testObjects.Add(x);
            });
            Assert.AreEqual(generatedObjects.Count, testObjects.Count);
            Assert.IsFalse(testObjects.Any(x => x is null));
        }

        [TestMethod]
        public void SingleThreadedList()
        {
            List<TestObject> testObjects = new();
            generatedObjects.ForEach(x => {
                testObjects.Add(x);
            });
            Assert.AreEqual(generatedObjects.Count, testObjects.Count);
            Assert.IsFalse(testObjects.Any(x => x is null));
        }

        [ClassInitialize]
        public static void ClassInit(TestContext context)
        {
            for (int i = 0; i < 10000; i++)
            {
                generatedObjects.Add(MakeNewTestObject());
            }
        }

        public static int GetRandomPositiveIndex(int max)
        {
            var randomInteger = uint.MaxValue;
            while (randomInteger == uint.MaxValue)
            {
                byte[] data = RandomNumberGenerator.GetBytes(4);
                randomInteger = BitConverter.ToUInt32(data, 0);
            }

            return (int)(max * (randomInteger / (double)uint.MaxValue));
        }

        public static string GetRandomString(int characters) => new(Enumerable.Range(1, characters).Select(_ => chars[GetRandomPositiveIndex(chars.Length)]).ToArray());

        private const string chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
        static TestObject MakeNewTestObject(int fieldLength = 1000)
        {
            return new TestObject
            {
                FieldOne = GetRandomString(fieldLength),
                FieldTwo = GetRandomString(fieldLength),
                FieldThree = GetRandomString(fieldLength),
                FieldFour = GetRandomString(fieldLength),
            };
        }

        class TestObject
        {
            public string FieldOne { get; set; } = string.Empty;
            public string FieldTwo { get; set; } = string.Empty;
            public string FieldThree { get; set; } = string.Empty;
            public string FieldFour { get; set; } = string.Empty;
        }
    }
}
Author: gfs
Assignees: -
Labels:

api-suggestion, area-System.Collections, code-analyzer

Milestone: -

@eiriktsarpalis eiriktsarpalis added this to the Future milestone Dec 9, 2022
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
api-suggestion Early API idea and discussion, it is NOT ready for implementation area-System.Collections code-analyzer Marks an issue that suggests a Roslyn analyzer
Projects
None yet
Development

No branches or pull requests

4 participants