Skip to content

Commit

Permalink
Copilot chat: Avoid duplication in document memories (microsoft#773)
Browse files Browse the repository at this point in the history
### Motivation and Context
Importing documents in Copilot Chat does not check for duplicates. As a
result, the same document can be uploaded multiple times and stored in
memory more than once.

### Description
1. Enable the document import controller to check for duplication before
saving a new memory to the document memory.
2. Allow developers to configure the threshold of duplication
(similarity score) in the settings.
3. Move a couple of hard-coded similarity score values into the settings classes.
  • Loading branch information
TaoChenOSU committed May 8, 2023
1 parent 3888dc4 commit 4b38b66
Show file tree
Hide file tree
Showing 6 changed files with 36 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,11 @@ public class DocumentMemoryOptions
/// </summary>
[Range(0, int.MaxValue)]
public int FileSizeLimit { get; set; } = 1000000;

/// <summary>
/// Similarity threshold to avoid document memory duplication when importing documents to memory.
/// The higher the value, the more document memories can be similar without being eliminated as duplicates.
/// </summary>
[Range(0.0, 1.0)]
public double DeduplicationSimilarityThreshold { get; set; } = 0.8;
}
Original file line number Diff line number Diff line change
Expand Up @@ -181,11 +181,21 @@ private async Task ParseDocumentContentToMemoryAsync(IKernel kernel, string cont

foreach (var paragraph in paragraphs)
{
await kernel.Memory.SaveInformationAsync(
var memories = kernel.Memory.SearchAsync(
collection: targetCollectionName,
text: paragraph,
id: Guid.NewGuid().ToString(),
description: $"Document: {documentName}");
query: paragraph,
limit: 1,
minRelevanceScore: this._options.DeduplicationSimilarityThreshold
).ToEnumerable();

if (!memories.Any())
{
await kernel.Memory.SaveInformationAsync(
collection: targetCollectionName,
text: paragraph,
id: Guid.NewGuid().ToString(),
description: $"Document: {documentName}");
}
}

this._logger.LogInformation(
Expand Down
4 changes: 2 additions & 2 deletions samples/apps/copilot-chat-app/webapi/Skills/ChatSkill.cs
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ public async Task<string> ExtractUserMemoriesAsync(SKContext context)
SemanticMemoryExtractor.MemoryCollectionName(chatId, memoryName),
latestMessage.ToString(),
limit: 100,
minRelevanceScore: 0.8);
minRelevanceScore: this._promptSettings.SemanticMemoryMinRelevance);
await foreach (var memory in results)
{
relevantMemories.Add(memory);
Expand Down Expand Up @@ -563,7 +563,7 @@ private async Task CreateMemoryAsync(SemanticChatMemoryItem item, string chatId,
collection: memoryCollectionName,
query: item.ToFormattedString(),
limit: 1,
minRelevanceScore: 0.8,
minRelevanceScore: this._promptSettings.SemanticMemoryMinRelevance,
cancellationToken: context.CancellationToken
).ToEnumerable();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ public async Task<string> QueryDocumentsAsync(string query, SKContext context)
documentCollection,
query,
limit: 100,
minRelevanceScore: 0.8);
minRelevanceScore: this._promptSettings.DocumentMemoryMinRelevance);
await foreach (var memory in results)
{
relevantMemories.Add(memory);
Expand Down
19 changes: 7 additions & 12 deletions samples/apps/copilot-chat-app/webapi/Skills/PromptSettings.cs
Original file line number Diff line number Diff line change
Expand Up @@ -42,21 +42,16 @@ public PromptSettings(PromptsConfig promptsConfig)
internal double RelatedInformationContextWeight { get; } = 0.75;

/// <summary>
/// Maximum number of tokens per line that will be used to split a document into lines.
/// Setting this to a low value will result in higher context granularity, but
/// takes longer to process the entire document into embeddings.
/// Default to 30 tokens as suggested by OpenAI:
/// https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
/// Minimum relevance of a semantic memory to be included in the final prompt.
/// The higher the value, the more relevant the answer will be to the user intent.
/// </summary>
internal int DocumentLineSplitMaxTokens { get; } = 30;
internal double SemanticMemoryMinRelevance { get; } = 0.8;

/// <summary>
/// Maximum number of tokens per paragraph that will be used to combine lines into paragraphs.
/// Setting this to a low value will result in higher context granularity, but
/// takes longer to process the entire document into embeddings.
/// Default to 100 tokens as suggested by OpenAI:
/// https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them /// </summary>
internal int DocumentParagraphSplitMaxLines { get; } = 100;
/// Minimum relevance of a document memory to be included in the final prompt.
/// The higher the value, the more relevant the answer will be to the user intent.
/// </summary>
internal double DocumentMemoryMinRelevance { get; } = 0.8;

internal string KnowledgeCutoffDate => this._promptsConfig.KnowledgeCutoffDate;
internal string InitialBotMessage => this._promptsConfig.InitialBotMessage;
Expand Down
6 changes: 5 additions & 1 deletion samples/apps/copilot-chat-app/webapi/appsettings.json
Original file line number Diff line number Diff line change
Expand Up @@ -129,13 +129,17 @@
// https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
// - Prevent large uploads by setting a file size limit (in bytes) as suggested here:
// https://learn.microsoft.com/en-us/aspnet/core/mvc/models/file-uploads?view=aspnetcore-6.0
// - Deduplication similarity threshold is the minimum relevance score at which a paragraph being
// imported is treated as a duplicate of a memory already in the collection and is not saved again:
// - The higher the value, the more document memories can be similar without being eliminated as duplicates.
//
"DocumentMemory": {
"GlobalDocumentCollectionName": "global-documents",
"UserDocumentCollectionNamePrefix": "user-documents-",
"DocumentLineSplitMaxTokens": 30,
"DocumentParagraphSplitMaxLines": 100,
"FileSizeLimit": 1000000
"FileSizeLimit": 1000000,
"DeduplicationSimilarityThreshold": 0.8
},

//
Expand Down

0 comments on commit 4b38b66

Please sign in to comment.