In [4]:
from collections import defaultdict
import difflib

# Function to find the common prefix
def common_prefix(strings):
    return ''.join(c[0] for c in zip(*strings) if all(x == c[0] for x in c)) or '*'

# Function to cluster and shorten
def shorten_list(input_list, target_length):
    if len(input_list) <= target_length:
        return input_list
    
    # Use difflib to find close matches and group
    grouped = defaultdict(list)
    used = set()
    
    for item in input_list:
        if item in used:
            continue
        matches = difflib.get_close_matches(item, input_list, n=target_length)
        for m in matches:
            used.add(m)
            grouped[item].append(m)
    
    # Take common prefixes and add '*'
    result = []
    for group in grouped.values():
        prefix = common_prefix(group)
        result.append(prefix + '*' if prefix != '*' else '*')
    
    # If we still have too many, cut to target length
    return result[:target_length]



In [8]:
# Example usage
long_list = ['abcdefd', 'abcd454', 'acd4567', 'acd9876']
shortened = shorten_list(long_list, 2)
print(shortened)

['abcdefd*', 'a*']


In [9]:
from collections import defaultdict

# Function to find the longest common prefix in a list of strings
def longest_common_prefix(strings):
    if not strings:
        return ""
    prefix = strings[0]
    for s in strings[1:]:
        while not s.startswith(prefix):
            prefix = prefix[:-1]
            if not prefix:
                return ""
    return prefix

# Function to group strings by prefixes
def group_by_prefix(strings, target_groups):
    # Sort strings to help grouping
    strings.sort()
    groups = []
    
    # Start greedy grouping
    while strings and len(groups) < target_groups:
        prefix = longest_common_prefix(strings)
        if len(prefix) < 2:  # Prevent too short prefixes, adjust as needed
            prefix = strings[0][:3]  # Take at least 3 characters if no common prefix
        # Find all items with this prefix
        grouped_items = [s for s in strings if s.startswith(prefix)]
        # Append summarized group
        groups.append(prefix + '*')
        # Remove grouped items from list
        strings = [s for s in strings if not s.startswith(prefix)]
    
    # If remaining strings and not enough groups, add as '*' group
    if strings and len(groups) < target_groups:
        groups.append('*')
    
    return groups



In [14]:
# Example usage
long_list = ['abcdefd', 'abcd454', 'acd4567', 'acd9876']
shortened = group_by_prefix(long_list,3)
print(shortened)

['abc*', 'acd*']


In [None]:
from collections import defaultdict

# Function to find longest common prefix of a list of strings
def longest_common_prefix(strings):
    if not strings:
        return ""
    prefix = strings[0]
    for s in strings[1:]:
        while not s.startswith(prefix):
            prefix = prefix[:-1]
            if not prefix:
                return ""
    return prefix

# Function to group strings by meaningful prefixes
def group_by_prefix(strings, target_groups):
    if not strings:
        return []
    
    # If target group is 1, return common root with '*'
    if target_groups == 1:
        prefix = longest_common_prefix(strings)
        # If nothing in common, return wildcard
        return [prefix + '*' if prefix else '*']
    
    # Sort to make grouping easier
    strings.sort()
    groups = []
    remaining = strings[:]
    
    # Try to form groups based on common prefixes
    while remaining and len(groups) < target_groups:
        prefix = longest_common_prefix(remaining)
        if len(prefix) < 1:
            # If too short, use first 2 characters to avoid empty prefix (adjustable)
            prefix = remaining[0][:2]
        
        # Collect strings matching this prefix
        matched = [s for s in remaining if s.startswith(prefix)]
        
        # Add summarized prefix
        groups.append(prefix + '*')
        
        # Remove grouped strings
        remaining = [s for s in remaining if not s.startswith(prefix)]
    
    # If remaining items and still space, add them to separate groups
    while remaining and len(groups) < target_groups:
        item = remaining.pop(0)
        groups.append(item[:2] + '*')  # Use first 2 characters as fallback

    # If still leftover strings but already hit target_groups, compress them under '*'
    if remaining:
        groups.append('*')
    
    return groups[:target_groups]  # Ensure we don't exceed target group count


Target 1: ['a*']
Target 2: ['a*']
Target 3: ['a*']


In [16]:

# Example usage
long_list = ['abcdefd', 'abcd454', 'acd4567', 'acd9876']
print("Target 1:", group_by_prefix(long_list, 1))  # Expect ['a*']
print("Target 2:", group_by_prefix(long_list, 2))  # Expect ['abc*', 'acd*']
print("Target 3:", group_by_prefix(long_list, 3))  # Expect ['abc*', 'acd*']

Target 1: ['a*']
Target 2: ['a*']
Target 3: ['a*']


In [22]:
from collections import defaultdict

# Find the longest common prefix for a list of strings
def longest_common_prefix(strings):
    if not strings:
        return ""
    prefix = strings[0]
    for s in strings[1:]:
        while not s.startswith(prefix):
            prefix = prefix[:-1]
            if not prefix:
                return ""
    return prefix

# Function to group by first N letters to help splitting when needed
def group_by_n_letters(strings, n=3):
    groups = defaultdict(list)
    for s in strings:
        key = s[:n] if len(s) >= n else s
        groups[key].append(s)
    return list(groups.values())

# Main function to create target number of groups
def group_by_prefix(strings, target_groups):
    if not strings:
        return []
    
    # If only one group wanted, return common root
    if target_groups == 1:
        prefix = longest_common_prefix(strings)
        return [prefix + '*' if prefix else '*']

    # First attempt: group by first 3 letters
    grouped = group_by_n_letters(strings, 3)
    
    # If still too many groups, merge small ones together until target reached
    while len(grouped) > target_groups:
        # Merge two smallest groups
        grouped = sorted(grouped, key=len)
        merged = grouped[0] + grouped[1]
        grouped = [merged] + grouped[2:]
    
    # If too few groups, try splitting large groups
    while len(grouped) < target_groups:
        # Sort groups to try to split largest one
        grouped = sorted(grouped, key=len, reverse=True)
        largest = grouped.pop(0)
        if len(largest) <= 1:
            # Cannot split further
            grouped.append(largest)
            break
        # Split by first 4 letters now (incrementally finer grouping)
        split = group_by_n_letters(largest, 4)
        grouped.extend(split)

    # Now finalize each group as common prefix + '*'
    result = []
    for group in grouped:
        prefix = longest_common_prefix(group)
        result.append((prefix if prefix else group[0][:2]) + '*')

    # Limit to target_groups if over
    return result[:target_groups]



In [20]:

# Example usage
long_list = ['abcdefd', 'abcd454', 'acd4567', 'acd9876']
print("Target 1:", group_by_prefix(long_list, 1))  # Expected ['a*']
print("Target 2:", group_by_prefix(long_list, 2))  # Expected ['abc*', 'acd*']
print("Target 3:", group_by_prefix(long_list, 3))  # Expected ['abc*', 'acd*']
#print("Target 4:", group_by_prefix(long_list, 4))  # Possible individual: ['abcdefd*', 'abcd454*', 'acd*']

Target 1: ['a*']
Target 2: ['abcd*', 'acd*']
Target 3: ['abcd*', 'acd4567*', 'acd9876*']


In [24]:
from collections import defaultdict

# Function to find common prefix
def longest_common_prefix(strings):
    if not strings:
        return ''
    s1, s2 = min(strings), max(strings)
    for i, c in enumerate(s1):
        if c != s2[i]:
            return s1[:i]
    return s1

# Function to summarize group to prefix*
def summarize_group(group, max_len=8):
    prefix = longest_common_prefix(group)
    return (prefix[:max_len-1] + '*') if len(prefix) + 1 > max_len else (prefix + '*')

# Function to process each list for shortening/grouping
def shorten_list(lst, max_len=8):
    if not lst:
        return ''
    # If only one element, truncate if necessary
    if len(lst) == 1:
        return lst[0][:max_len]
    # If multiple, find common prefix
    return summarize_group(lst, max_len)

# Main function to process raw data
def format_raw_data(raw_data, max_len=8, delimiter=', '):
    # Sort keys by length of their value lists
    sorted_items = sorted(raw_data.items(), key=lambda item: len(item[1]))
    # Process and shorten each value list
    formatted_elements = [shorten_list(lst, max_len) for key, lst in sorted_items]
    # Join as final string
    return delimiter.join(formatted_elements)

# Example usage
raw_data = {
    'key1': ['abcd1234', 'abcd5678', 'abcd9999'],
    'key2': ['xyz12345', 'xyzt6789'],
    'key3': ['aaa0000'],
}

result = format_raw_data(raw_data)
print(result)  # Output: aaa0000, xyzt*, abcd*

aaa0000, xyz*, abcd*


In [25]:
from collections import defaultdict

# Find the longest common prefix for a list of strings
def longest_common_prefix(strings):
    if not strings:
        return ""
    prefix = strings[0]
    for s in strings[1:]:
        while not s.startswith(prefix):
            prefix = prefix[:-1]
            if not prefix:
                return ""
    return prefix

# Function to group by first N letters
def group_by_n_letters(strings, n=3):
    groups = defaultdict(list)
    for s in strings:
        key = s[:n] if len(s) >= n else s
        groups[key].append(s)
    return list(groups.values())

# Main function to create target number of groups safely
def group_by_prefix(strings, target_groups):
    if not strings:
        return []
    
    # If only one group wanted, return common root
    if target_groups == 1:
        prefix = longest_common_prefix(strings)
        return [prefix + '*' if prefix else '*']
    
    # First attempt: group by first 3 letters
    grouped = group_by_n_letters(strings, 3)
    
    # Merge groups if too many
    while len(grouped) > target_groups:
        # Merge two smallest groups
        grouped = sorted(grouped, key=len)
        merged = grouped[0] + grouped[1]
        grouped = [merged] + grouped[2:]
    
    # Attempt to split groups if too few
    tried_splits = set()  # To avoid endless splitting
    while len(grouped) < target_groups:
        # Sort to try splitting the largest
        grouped = sorted(grouped, key=len, reverse=True)
        largest = grouped.pop(0)
        
        if len(largest) <= 1 or tuple(largest) in tried_splits:
            # Cannot split further or already tried
            grouped.append(largest)
            break  # Prevent endless loop
        
        # Try splitting with deeper prefix
        split = group_by_n_letters(largest, 4)
        
        if len(split) <= 1:
            # Splitting ineffective, remember and stop trying
            tried_splits.add(tuple(largest))
            grouped.append(largest)
            break
        else:
            grouped.extend(split)
    
    # If still fewer groups than target, consider padding with single items
    while len(grouped) < target_groups:
        # Flatten and create single-item groups if needed
        remaining_items = [item for group in grouped for item in group]
        needed = target_groups - len(grouped)
        single_groups = [[item] for item in remaining_items[:needed]]
        grouped.extend(single_groups)
        break  # Avoid creating infinite single splits

    # Now finalize each group as common prefix + '*'
    result = []
    for group in grouped:
        prefix = longest_common_prefix(group)
        result.append((prefix if prefix else group[0][:2]) + '*')

    # Limit to target_groups if overshoot
    return result[:target_groups]

# Example usage
long_list = ['abcdefd', 'abcd454', 'acd4567', 'acd9876']
print("Target 1:", group_by_prefix(long_list, 1))  # Expected ['a*']
print("Target 2:", group_by_prefix(long_list, 2))  # Expected ['abc*', 'acd*']
print("Target 3:", group_by_prefix(long_list, 3))  # Expected ['abc*', 'acd*']
print("Target 4:", group_by_prefix(long_list, 4))  # Expected like ['abcdefd*', 'abcd454*', 'acd*', ...]

Target 1: ['a*']
Target 2: ['abcd*', 'acd*']
Target 3: ['acd*', 'abcd*', 'acd4567*']
Target 4: ['acd*', 'abcd*', 'acd4567*', 'acd9876*']


In [26]:
from collections import defaultdict

# Function to find common prefix
def longest_common_prefix(strings):
    if not strings:
        return ''
    s1, s2 = min(strings), max(strings)
    for i, c in enumerate(s1):
        if c != s2[i]:
            return s1[:i]
    return s1

# Function to summarize group to prefix*
def summarize_group(group, max_len=8):
    prefix = longest_common_prefix(group)
    return (prefix[:max_len-1] + '*') if len(prefix) + 1 > max_len else (prefix + '*')

# Function to process each list for shortening/grouping
def shorten_list(lst, max_len=8):
    if not lst:
        return ''
    # If only one element, truncate if necessary
    if len(lst) == 1:
        return lst[0][:max_len]
    # If multiple, find common prefix
    return summarize_group(lst, max_len)

# Main function to process raw data
def format_raw_data(raw_data, max_len=8, delimiter=', '):
    # Sort keys by length of their value lists
    sorted_items = sorted(raw_data.items(), key=lambda item: len(item[1]))
    # Process and shorten each value list
    formatted_elements = [shorten_list(lst, max_len) for key, lst in sorted_items]
    # Join as final string
    return delimiter.join(formatted_elements)

# Example usage
raw_data = {
    'key1': ['abcd1234', 'abcd5678', 'abcd9999'],
    'key2': ['xyz12345', 'xyzt6789'],
    'key3': ['aaa0000'],
}

result = format_raw_data(raw_data)
print(result)  # Output: aaa0000, xyzt*, abcd*

aaa0000, xyz*, abcd*


In [27]:
from collections import defaultdict

# Function to find the longest common prefix
def longest_common_prefix(strings):
    if not strings:
        return ''
    s1, s2 = min(strings), max(strings)
    for i, c in enumerate(s1):
        if c != s2[i]:
            return s1[:i]
    return s1

# Function to summarize group to prefix* within max length
def summarize_group(group, max_len=8):
    prefix = longest_common_prefix(group)
    # Truncate and append '*' if needed
    if len(prefix) + 1 > max_len:
        return prefix[:max_len - 1] + '*'
    else:
        return prefix + '*'

# Function to process and shorten each value list
def shorten_list(lst, max_len=8):
    if not lst:
        return ''
    if len(lst) == 1:
        # Single element case, truncate if necessary
        return lst[0][:max_len]
    else:
        # Multiple elements, summarize/group
        return summarize_group(lst, max_len)

# Main formatting function
def format_raw_data(raw_data, max_len=8, delimiter=', '):
    # Normalize values: convert single string to list
    normalized_data = {k: v if isinstance(v, list) else [v] for k, v in raw_data.items()}
    # Sort items based on length of list value
    sorted_items = sorted(normalized_data.items(), key=lambda item: len(item[1]))
    # Process each list and shorten
    formatted_elements = [shorten_list(lst, max_len) for _, lst in sorted_items]
    # Join as final string
    return delimiter.join(formatted_elements)

# -----------------------
# ✅ Example usage
raw_data = {
    'key1': ['abcd1234', 'abcd5678', 'abcd9999'],
    'key2': 'xyz12345',  # Single string case
    'key3': ['aaa0000'],
    'key4': ['longtext1', 'longtext2', 'longtext3', 'longtext4'],  # Longer list
}

result = format_raw_data(raw_data, max_len=8)
print(result)  # Expected: aaa0000, xyz12345, abcd*, longtext*

xyz12345, aaa0000, abcd*, longtex*


In [28]:
from collections import defaultdict

# Function to extract prefix of a given string (up to length - 1 to leave room for '*')
def get_prefix(s, max_prefix_len):
    return s[:max_prefix_len]

# Function to build grouped representation under length constraint
def build_grouped_representation(strings, max_len=8):
    if not strings:
        return ''
    if len(strings) == 1:
        # If only one element, truncate if necessary
        return strings[0][:max_len]
    
    # Extract unique short prefixes (max length minus 1 for '*')
    unique_prefixes = set()
    max_prefix_len = max_len - 1  # Reserve 1 char for '*' in each prefix
    
    for s in strings:
        prefix = get_prefix(s, max_prefix_len)
        unique_prefixes.add(prefix)
    
    # Now try to combine prefixes up to max_len
    result_parts = []
    total_length = 0
    for prefix in sorted(unique_prefixes):  # Sort to make it deterministic
        part = prefix + '*'
        if total_length + len(part) <= max_len:
            result_parts.append(part)
            total_length += len(part)
        else:
            break  # Stop if adding another prefix exceeds max_len
    
    # Join parts together, remove the trailing '*' for better appearance if needed
    final_string = ''.join(result_parts)
    return final_string

# Main function to process and format raw data
def format_raw_data_with_max_groups(raw_data, max_len=8, delimiter=', '):
    # Normalize: convert string to list if necessary
    normalized_data = {k: v if isinstance(v, list) else [v] for k, v in raw_data.items()}
    
    # Sort by increasing length of value lists
    sorted_items = sorted(normalized_data.items(), key=lambda item: len(item[1]))
    
    # Process each value list
    formatted_elements = [build_grouped_representation(lst, max_len) for _, lst in sorted_items]
    
    # Join as final output string
    return delimiter.join(formatted_elements)

# -----------------------
# ✅ Example usage
raw_data = {
    'key1': ['abcd1234', 'abcd5678', 'abcd9999'],
    'key2': 'xyz12345',  # Single string
    'key3': ['aaa0000'],
    'key4': ['longtext1', 'longtext2', 'longtext3', 'longtext4'],
    'key5': ['abc12', 'ab89', 'ac90']  # Complex grouping case
}

result = format_raw_data_with_max_groups(raw_data, max_len=8)
print(result)  # Expected: aaa0000, xyz12345, ab*ac*, abcd*, longte*

xyz12345, aaa0000, abcd123*, ab89*, longtex*


In [29]:
from typing import List

# Function to find longest common prefix in a list of strings
def longest_common_prefix(strings: List[str]) -> str:
    if not strings:
        return ''
    s1, s2 = min(strings), max(strings)  # min and max in lexicographical order
    for i, c in enumerate(s1):
        if c != s2[i]:
            return s1[:i]
    return s1  # all characters are common

# Function to handle grouping and length restriction
def process_value_list(strings: List[str], max_len: int = 8) -> str:
    if not strings:
        return ''
    
    if len(strings) == 1:
        # If single element, truncate if needed
        return strings[0][:max_len]
    
    # Try to find overall common prefix
    common_prefix = longest_common_prefix(strings)
    
    if common_prefix:
        result = common_prefix + '*'
        return result[:max_len] if len(result) > max_len else result
    
    # If no common prefix, attempt to select prefixes to maximize within max_len
    prefixes = set()
    max_prefix_len = max_len - 1  # for '*'
    
    for s in strings:
        prefix = s[:max_prefix_len]
        prefixes.add(prefix)
    
    # Now pick as many prefixes as possible within length limit
    result = ''
    total_length = 0
    for prefix in sorted(prefixes):  # sort for consistency
        part = prefix + '*'
        if total_length + len(part) <= max_len:
            result += part
            total_length += len(part)
        else:
            break  # stop if adding this exceeds limit

    return result

# Main function to process the raw data and format it
def format_raw_data(raw_data: dict, max_len: int = 8, delimiter: str = ', ') -> str:
    # Normalize strings to list
    normalized_data = {k: v if isinstance(v, list) else [v] for k, v in raw_data.items()}
    
    # Sort by increasing length of value lists
    sorted_items = sorted(normalized_data.items(), key=lambda item: len(item[1]))
    
    # Process each list
    formatted_elements = [process_value_list(lst, max_len) for _, lst in sorted_items]
    
    return delimiter.join(formatted_elements)

# -----------------------
# ✅ Example usage
raw_data = {
    'key1': ['abcd1234', 'abcd5678', 'abcd9999'],
    'key2': 'xyz12345',  # Single string
    'key3': ['aaa0000'],
    'key4': ['longtext1', 'longtext2', 'longtext3', 'longtext4'],
    'key5': ['abc12', 'ab89', 'ac90']  # Complex grouping case
}

result = format_raw_data(raw_data, max_len=8)
print(result)  # Expected something like: aaa0000, xyz12345, ab*ac*, abcd*, longte*

xyz12345, aaa0000, abcd*, a*, longtext


In [30]:
from typing import List

# Find longest common prefix
def longest_common_prefix(strings: List[str]) -> str:
    if not strings:
        return ''
    s1, s2 = min(strings), max(strings)
    for i, c in enumerate(s1):
        if c != s2[i]:
            return s1[:i]
    return s1

# Group by common sub-prefix to maximize representation within length
def group_prefixes(strings: List[str], max_len: int) -> str:
    if not strings:
        return ''
    if len(strings) == 1:
        # Only one element, return truncated if needed
        return strings[0][:max_len]

    # Step 1: Try to find overall common prefix
    common_prefix = longest_common_prefix(strings)
    if common_prefix:
        result = common_prefix + '*'
        return result[:max_len] if len(result) > max_len else result

    # Step 2: No common prefix, try partial grouping
    # Find first 2-3 chars as candidate prefixes
    prefix_len = 2  # adjustable, 2 is a good start for brevity
    candidate_prefixes = set(s[:prefix_len] for s in strings)

    # Step 3: Collect and combine as many prefixes as fit in max_len
    result_parts = []
    current_length = 0
    for prefix in sorted(candidate_prefixes):  # sort for stable output
        part = prefix + '*'
        if current_length + len(part) <= max_len:
            result_parts.append(part)
            current_length += len(part)
        else:
            break  # Stop adding if max length reached

    return ''.join(result_parts)

# Main processing function
def format_raw_data(raw_data: dict, max_len: int = 8, delimiter: str = ', ') -> str:
    # Normalize values to list if they are strings
    normalized_data = {k: v if isinstance(v, list) else [v] for k, v in raw_data.items()}

    # Sort items by the number of elements in the value list (ascending)
    sorted_items = sorted(normalized_data.items(), key=lambda item: len(item[1]))

    # Process each item using grouping logic
    formatted_elements = [group_prefixes(values, max_len) for _, values in sorted_items]

    return delimiter.join(formatted_elements)

# -----------------------
# ✅ Test Example
raw_data = {
    'key1': ['abcd1234', 'abcd5678', 'abcd9999'],  # should be 'abcd*'
    'key2': 'xyz12345',  # should be 'xyz12345'
    'key3': ['aaa0000'],  # should be 'aaa0000'
    'key4': ['longtext1', 'longtext2', 'longtext3', 'longtext4'],  # should be 'longte*'
    'key5': ['abc12', 'ab89', 'ac90']  # should be 'ab*ac*'
}

result = format_raw_data(raw_data, max_len=8)
print("Formatted Result:", result)

Formatted Result: xyz12345, aaa0000, abcd*, a*, longtext


In [31]:
from typing import List, Dict

# Function to find the longest common prefix
def longest_common_prefix(strings: List[str]) -> str:
    if not strings:
        return ''
    s1, s2 = min(strings), max(strings)
    for i, c in enumerate(s1):
        if c != s2[i]:
            return s1[:i]
    return s1

# Function to group prefixes under max length constraint
def group_prefixes(strings: List[str], max_len: int) -> str:
    if not strings:
        return ''
    
    # If only one string, return it truncated to max_len
    if len(strings) == 1:
        return strings[0][:max_len]

    # Step 1: Try the longest common prefix
    common_prefix = longest_common_prefix(strings)
    if common_prefix:
        candidate = common_prefix + '*'
        if len(candidate) <= max_len:
            return candidate

    # Step 2: If no good common prefix, attempt prefix grouping
    prefix_len = 2  # Starting with 2-character prefix (adjustable)
    candidate_prefixes = sorted(set(s[:prefix_len] for s in strings))

    # Step 3: Try to concatenate as many as possible within max length
    result_parts = []
    current_length = 0
    for prefix in candidate_prefixes:
        part = prefix + '*'
        if current_length + len(part) <= max_len:
            result_parts.append(part)
            current_length += len(part)
        else:
            break  # Stop if adding next exceeds max length

    # Step 4: Join all valid prefixes and return
    return ''.join(result_parts).rstrip('*')  # Remove trailing '*' if any

# Main function to process raw data and format the result
def format_raw_data(raw_data: Dict[str, List[str] or str], max_len: int = 8, delimiter: str = ', ') -> str:
    # Step 1: Normalize string values to list of one element
    normalized_data = {k: v if isinstance(v, list) else [v] for k, v in raw_data.items()}

    # Step 2: Sort items by length of their value list (ascending)
    sorted_items = sorted(normalized_data.items(), key=lambda item: len(item[1]))

    # Step 3: Process each value list into formatted string
    formatted_elements = []
    for _, values in sorted_items:
        formatted = group_prefixes(values, max_len)
        formatted_elements.append(formatted)

    # Step 4: Join all formatted parts
    return delimiter.join(formatted_elements)

# ---------------------
# ✅ Test Example
raw_data = {
    'key1': ['abcd1234', 'abcd5678', 'abcd9999'],  # ➡️ 'abcd*'
    'key2': 'xyz12345',  # ➡️ 'xyz12345'
    'key3': ['aaa0000'],  # ➡️ 'aaa0000'
    'key4': ['longtext1', 'longtext2', 'longtext3', 'longtext4'],  # ➡️ 'longte*'
    'key5': ['abc12', 'ab89', 'ac90']  # ➡️ 'ab*ac*'
}

result = format_raw_data(raw_data, max_len=8)
print("Formatted Result:", result)

Formatted Result: xyz12345, aaa0000, abcd*, a*, lo


In [32]:
from typing import List, Dict, Union

# Find longest common prefix of a list of strings
def longest_common_prefix(strs: List[str]) -> str:
    if not strs:
        return ''
    shortest = min(strs, key=len)
    for i, char in enumerate(shortest):
        for other in strs:
            if other[i] != char:
                return shortest[:i]
    return shortest

# Group and compact representation under max_len constraint
def compact_representation(values: List[str], max_len: int = 8) -> str:
    if not values:
        return ''
    
    # Single item case
    if len(values) == 1:
        return values[0][:max_len]

    # Try full common prefix
    common_prefix = longest_common_prefix(values)
    candidate = common_prefix + '*' if common_prefix else ''
    if candidate and len(candidate) <= max_len:
        return candidate
    
    # Group by smaller prefixes (2 or 3 chars), keep adding until max_len
    prefix_size = 2  # Can adjust to 3 if needed
    prefixes = sorted(set(v[:prefix_size] for v in values))
    
    result_parts = []
    total_len = 0
    for prefix in prefixes:
        part = prefix + '*'
        if total_len + len(part) <= max_len:
            result_parts.append(part)
            total_len += len(part)
        else:
            break  # stop adding if length exceeded

    # Join and strip last '*' if needed
    return ''.join(result_parts).rstrip('*')

# Main function to process raw data
def format_raw_data(
    raw_data: Dict[str, Union[str, List[str]]], 
    max_len: int = 8, 
    delimiter: str = ', '
) -> str:
    
    # Normalize to lists
    normalized_data = {k: (v if isinstance(v, list) else [v]) for k, v in raw_data.items()}
    
    # Sort by length of list (ascending)
    sorted_items = sorted(normalized_data.items(), key=lambda item: len(item[1]))
    
    # Process each value list to compact format
    formatted_elements = []
    for _, values in sorted_items:
        formatted = compact_representation(values, max_len)
        formatted_elements.append(formatted)
    
    # Concatenate all formatted parts
    return delimiter.join(formatted_elements)


# ----------------------
# ✅ Example Test
raw_data = {
    'key1': ['abcd1234', 'abcd5678', 'abcd9999'],  # ➡️ 'abcd*'
    'key2': 'xyz12345',  # ➡️ 'xyz12345'
    'key3': ['aaa0000'],  # ➡️ 'aaa0000'
    'key4': ['longtext1', 'longtext2', 'longtext3', 'longtext4'],  # ➡️ 'longte*'
    'key5': ['abc12', 'ab89', 'ac90']  # ➡️ 'ab*ac*'
}

result = format_raw_data(raw_data, max_len=8)
print("Formatted Result:", result)

Formatted Result: xyz12345, aaa0000, abcd*, a*, lo


In [35]:
from typing import List, Dict, Union

# Find longest common prefix of a list of strings
def longest_common_prefix(strs: List[str]) -> str:
    if not strs:
        return ''
    shortest = min(strs, key=len)
    for i in range(len(shortest)):
        for other in strs:
            if other[i] != shortest[i]:
                return shortest[:i]
    return shortest

# Compact representation under max_len constraint
def compact_representation(values: List[str], max_len: int = 8) -> str:
    if not values:
        return ''
    
    # Single item case
    if len(values) == 1:
        return values[0][:max_len]
    
    # Try full common prefix with '*'
    common_prefix = longest_common_prefix(values)
    candidate = common_prefix + '*' if common_prefix else ''
    if candidate and len(candidate) <= max_len and len(common_prefix) >= 2:  # Only meaningful prefix
        return candidate
    
    # If common prefix too short/long, group by small prefixes (e.g., first 2-3 chars)
    prefix_size = 2  # Can adjust to 3 for better accuracy
    prefixes = sorted(set(v[:prefix_size] for v in values))
    
    result_parts = []
    total_len = 0
    for prefix in prefixes:
        part = prefix + '*'
        if total_len + len(part) <= max_len:
            result_parts.append(part)
            total_len += len(part)
        else:
            break  # Don't exceed max length

    # Final concatenation
    return ''.join(result_parts).rstrip('*')

# Main function to process raw data and generate formatted string
def format_raw_data(
    raw_data: Dict[str, Union[str, List[str]]], 
    max_len: int = 8, 
    delimiter: str = ', '
) -> str:
    
    # Normalize all values to lists
    normalized_data = {k: (v if isinstance(v, list) else [v]) for k, v in raw_data.items()}
    
    # Sort keys by length of value list (ascending)
    sorted_items = sorted(normalized_data.items(), key=lambda item: len(item[1]))
    
    # Process each list to compact format
    formatted_elements = []
    for _, values in sorted_items:
        formatted = compact_representation(values, max_len)
        formatted_elements.append(formatted)
    
    # Join and return
    return delimiter.join(formatted_elements)

# ---------------------------
# ✅ Example test case
raw_data = {
    'key1': ['abcd1234', 'abcd5678', 'abcd9999'],  # expect 'abcd*'
    'key2': 'xyz12345',                           # expect 'xyz12345'
    'key3': ['aaa0000'],                          # expect 'aaa0000'
    'key4': ['longtext1', 'longtext2', 'longtext3', 'longtext4'],  # expect 'longte*'
    'key5': ['abc12', 'ab89', 'ac90']              # expect 'ab*ac*'
}

result = format_raw_data(raw_data, max_len=8)
print("Formatted Result:", result)

Formatted Result: xyz12345, aaa0000, abcd*, ab*ac, lo


In [None]:
from typing import List, Dict, Union

# Find longest common prefix of a list of strings
def longest_common_prefix(strs: List[str]) -> str:
    if not strs:
        return ''
    shortest = min(strs, key=len)
    for i in range(len(shortest)):
        for other in strs:
            if other[i] != shortest[i]:
                return shortest[:i]
    return shortest

# Compact representation under max_len constraint
def compact_representation(values: List[str], max_len: int = 8) -> str:
    if not values:
        return ''
    
    # Single item case
    if len(values) == 1:
        return values[0][:max_len]  # Truncate if needed
    
    # Try full common prefix first
    common_prefix = longest_common_prefix(values)
    
    # First, see if common prefix alone is a good representation
    if common_prefix and len(common_prefix) <= max_len and len(common_prefix) >= 2:
        return common_prefix  # Example: 'longtext'
    
    # Next, try common prefix with '*' if still fits
    candidate = common_prefix + '*' if common_prefix else ''
    if candidate and len(candidate) <= max_len and len(common_prefix) >= 2:
        return candidate  # Example: 'abc*'

    # If common prefix is empty or too short, group by smaller prefixes (e.g., first 2 chars)
    prefix_size = 2  # Can adjust to 3 if more appropriate
    prefixes = sorted(set(v[:prefix_size] for v in values))
    
    result_parts = []
    total_len = 0
    for prefix in prefixes:
        part = prefix + '*'
        if total_len + len(part) <= max_len:
            result_parts.append(part)
            total_len += len(part)
        else:
            break  # Stop if adding next would exceed max length

    return ''.join(result_parts).rstrip('*')

# Main function to process raw data and generate formatted string
def format_raw_data(
    raw_data: Dict[str, Union[str, List[str]]], 
    max_len: int = 8, 
    delimiter: str = ', '
) -> str:
    
    # Normalize all values to lists
    normalized_data = {k: (v if isinstance(v, list) else [v]) for k, v in raw_data.items()}
    
    # Sort keys by length of value list (ascending)
    sorted_items = sorted(normalized_data.items(), key=lambda item: len(item[1]))
    
    # Process each list to compact format
    formatted_elements = []
    for _, values in sorted_items:
        formatted = compact_representation(values, max_len)
        formatted_elements.append(formatted)
    
    # Join and return
    return delimiter.join(formatted_elements)

# ---------------------------
# ✅ Example test case


Formatted Result: xyz12345, aaa0000, abcd, ab*ac, longtext


In [43]:
raw_data = {
    'key1': ['abcd1234', 'abcd5678', 'abcd9999'],  # expect 'abcd*'
    'key2': 'xyz12345',                           # expect 'xyz12345'
    'key3': ['aaa0000'],                          # expect 'aaa0000'
    'key4': ['longtext1', 'longtext2', 'longtext3', 'longtext4'],  # expect 'longtext'
    'key5': ['abc12', 'ab89', 'ac90']              # expect 'ab*ac*'
}

result = format_raw_data(raw_data, max_len=8)
print("Formatted Result:", result)

Formatted Result: xyz12345, aaa0000, abcd, ab*ac, longtext


In [44]:
from typing import List, Dict, Union

# Find longest common prefix of a list of strings
def longest_common_prefix(strs: List[str]) -> str:
    if not strs:
        return ''
    shortest = min(strs, key=len)
    for i in range(len(shortest)):
        for other in strs:
            if other[i] != shortest[i]:
                return shortest[:i]
    return shortest

# Compact representation under max_len constraint
def compact_representation(values: List[str], max_len: int = 8) -> str:
    if not values:
        return ''
    
    # Single element case
    if len(values) == 1:
        return values[0][:max_len]  # Truncate if needed

    # Try full common prefix first
    common_prefix = longest_common_prefix(values)
    
    # Check if common prefix + '*' fits
    if common_prefix and len(common_prefix) + 1 <= max_len and len(common_prefix) >= 2:
        return common_prefix + '*'

    # If can't use common prefix, group by smaller prefixes (first 2 chars)
    prefix_size = 2  # Adjustable
    prefixes = sorted(set(v[:prefix_size] for v in values))
    
    result_parts = []
    total_len = 0
    for prefix in prefixes:
        part = prefix + '*'
        if total_len + len(part) <= max_len:
            result_parts.append(part)
            total_len += len(part)
        else:
            break  # Avoid overflow
    
    return ''.join(result_parts)

# Main function to process raw data and generate formatted string
def format_raw_data(
    raw_data: Dict[str, Union[str, List[str]]], 
    max_len: int = 8, 
    delimiter: str = ', '
) -> str:
    
    # Normalize all values to lists
    normalized_data = {k: (v if isinstance(v, list) else [v]) for k, v in raw_data.items()}
    
    # Sort keys by length of value list (ascending)
    sorted_items = sorted(normalized_data.items(), key=lambda item: len(item[1]))
    
    # Process each list to compact format
    formatted_elements = []
    for _, values in sorted_items:
        formatted = compact_representation(values, max_len)
        formatted_elements.append(formatted)
    
    # Join and return
    return delimiter.join(formatted_elements)

# ---------------------------
# ✅ Example test case
raw_data = {
    'key1': ['abcd1234', 'abcd5678', 'abcd9999'],  # expect 'abcd*'
    'key2': 'xyz12345',                           # expect 'xyz12345'
    'key3': ['aaa0000'],                          # expect 'aaa0000'
    'key4': ['longtext1', 'longtext2', 'longtext3', 'longtext4'],  # expect 'longtext*'
    'key5': ['abc12', 'ab89', 'ac90']              # expect 'ab*ac*'
}

result = format_raw_data(raw_data, max_len=8)
print("Formatted Result:", result)

Formatted Result: xyz12345, aaa0000, abcd*, ab*ac*, lo*


In [45]:
from typing import List, Dict, Union

# Function to find the longest common prefix in a list of strings
def longest_common_prefix(strs: List[str]) -> str:
    if not strs:
        return ''
    shortest = min(strs, key=len)
    for i in range(len(shortest)):
        for other in strs:
            if other[i] != shortest[i]:
                return shortest[:i]
    return shortest

# Function to compactly represent the list of strings
def compact_representation(values: List[str], max_len: int = 8) -> str:
    if not values:
        return ''
    
    # Handle single element list directly
    if len(values) == 1:
        return values[0][:max_len]  # Truncate if needed

    # Try to use full common prefix if possible
    common_prefix = longest_common_prefix(values)
    if common_prefix and len(common_prefix) + 1 <= max_len:  # +1 for '*'
        return common_prefix + '*'

    # If common prefix too short or too long, group by smaller prefixes (default 2 chars)
    prefix_size = 2
    prefixes = sorted(set(v[:prefix_size] for v in values))
    
    # Now, try to concatenate prefixes with '*' and ensure not exceed max_len
    result_parts = []
    total_len = 0
    for prefix in prefixes:
        part = prefix + '*'
        if total_len + len(part) <= max_len:
            result_parts.append(part)
            total_len += len(part)
        else:
            break  # Prevent overflow
    
    return ''.join(result_parts)

# Main processing function
def format_raw_data(
    raw_data: Dict[str, Union[str, List[str]]], 
    max_len: int = 8, 
    delimiter: str = ', '
) -> str:
    # Normalize values to lists
    normalized_data = {k: (v if isinstance(v, list) else [v]) for k, v in raw_data.items()}
    
    # Sort by increasing length of the value list
    sorted_items = sorted(normalized_data.items(), key=lambda item: len(item[1]))
    
    # Process each value set
    formatted_elements = []
    for _, values in sorted_items:
        formatted = compact_representation(values, max_len)
        formatted_elements.append(formatted)
    
    # Join and return
    return delimiter.join(formatted_elements)

# ---------------------------
# ✅ Example input test case
raw_data = {
    'key1': ['abcd1234', 'abcd5678', 'abcd9999'],  # expect 'abcd*'
    'key2': 'xyz12345',                           # expect 'xyz12345'
    'key3': ['aaa0000'],                          # expect 'aaa0000'
    'key4': ['longtext1', 'longtext2', 'longtext3', 'longtext4'],  # expect 'longtext*'
    'key5': ['abc12', 'ab89', 'ac90']              # expect 'ab*ac*'
}

result = format_raw_data(raw_data, max_len=8)
print("Formatted Result:", result)

Formatted Result: xyz12345, aaa0000, abcd*, a*, lo*
