From e26c9ba2ced88d5bcf1804e228918b63cae76b20 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ji=C5=99=C3=AD=20Techet?= <techet@gmail.com>
Date: Sun, 14 Jun 2015 17:52:24 +0200
Subject: [PATCH] Add linear tag remove path for cases where not many files are
 open

When tested with a 200000 LOC Python file (created by making many copies
of scripts/create_py_tags.py), the tm_tags_remove_file_tags() function
takes about 50% of the CPU time when only this file is open. After adding
the linear path to tm_tags_remove_file_tags() it takes just about 2%. See
the comment in the patch for more details.
---
 tagmanager/src/tm_tag.c | 74 ++++++++++++++++++++++++++++-------------
 1 file changed, 50 insertions(+), 24 deletions(-)

diff --git a/tagmanager/src/tm_tag.c b/tagmanager/src/tm_tag.c
index 955621f755..9afc845f3d 100644
--- a/tagmanager/src/tm_tag.c
+++ b/tagmanager/src/tm_tag.c
@@ -828,37 +828,63 @@ gboolean tm_tags_sort(GPtrArray *tags_array, TMTagAttrType *sort_attributes,
 void tm_tags_remove_file_tags(TMSourceFile *source_file, GPtrArray *tags_array)
 {
 	guint i;
-	GPtrArray *to_delete = g_ptr_array_sized_new(source_file->tags_array->len);
-	
-	for (i = 0; i < source_file->tags_array->len; i++)
+
+	/* Now we choose between an algorithm with complexity O(tags_array->len) and
+	 * O(source_file->tags_array->len * log(tags_array->len)). The latter algorithm
+	 * is better when tags_array contains many times more tags than
+	 * source_file->tags_array so instead of trying to find the removed tags
+	 * linearly, binary search is used. The constant 20 is more or less random
+	 * but seems to work well. Its exact value isn't so critical because it's
+	 * the extremes where the difference is the biggest: when
+	 * source_file->tags_array->len == tags_array->len (single file open) and
+	 * source_file->tags_array->len << tags_array->len (the number of tags
+	 * from the file is a small fraction of all tags).
+	 */
+	if (source_file->tags_array->len != 0 &&
+		tags_array->len / source_file->tags_array->len < 20)
 	{
-		guint j;
-		guint tag_count;
-		TMTag **found;
-		TMTag *tag = source_file->tags_array->pdata[i];
-		
-		found = tm_tags_find(tags_array, tag->name, FALSE, TRUE, &tag_count);
-		
-		for (j = 0; j < tag_count; j++)
+		for (i = 0; i < tags_array->len; i++)
 		{
-			if (*found != NULL && (*found)->file == source_file)
+			TMTag *tag = tags_array->pdata[i];
+
+			if (tag->file == source_file)
+				tags_array->pdata[i] = NULL;
+		}
+	}
+	else
+	{
+		GPtrArray *to_delete = g_ptr_array_sized_new(source_file->tags_array->len);
+
+		for (i = 0; i < source_file->tags_array->len; i++)
+		{
+			guint j;
+			guint tag_count;
+			TMTag **found;
+			TMTag *tag = source_file->tags_array->pdata[i];
+
+			found = tm_tags_find(tags_array, tag->name, FALSE, TRUE, &tag_count);
+
+			for (j = 0; j < tag_count; j++)
 			{
-				/* we cannot set the pointer to NULL now because the search wouldn't work */
-				g_ptr_array_add(to_delete, found);
-				/* no break - if there are multiple tags of the same name, we would 
-				 * always find the first instance and wouldn't remove others; duplicates
-				 * in the to_delete list aren't a problem */
+				if (*found != NULL && (*found)->file == source_file)
+				{
+					/* we cannot set the pointer to NULL now because the search wouldn't work */
+					g_ptr_array_add(to_delete, found);
+					/* no break - if there are multiple tags of the same name, we would
+					 * always find the first instance and wouldn't remove others; duplicates
+					 * in the to_delete list aren't a problem */
+				}
+				found++;
 			}
-			found++;
 		}
-	}
 
-	for (i = 0; i < to_delete->len; i++)
-	{
-		TMTag **tag = to_delete->pdata[i];
-		*tag = NULL;
+		for (i = 0; i < to_delete->len; i++)
+		{
+			TMTag **tag = to_delete->pdata[i];
+			*tag = NULL;
+		}
+		g_ptr_array_free(to_delete, TRUE);
 	}
-	g_ptr_array_free(to_delete, TRUE);
 
 	tm_tags_prune(tags_array);
 }