# In [None]:
def find_distinctive_tokens(token_sets, max_occur):
    """Return, per cluster, the tokens that occur in at most ``max_occur`` clusters.

    Args:
        token_sets: Mapping from cluster ID to a collection of that cluster's
            tokens (typically a set; lists with repeated tokens are accepted).
        max_occur: Largest number of clusters a token may appear in and still
            be considered distinctive.

    Returns:
        A dict mapping cluster ID to the set of its distinctive tokens.
        Clusters without any distinctive token are omitted.
    """
    # 1. Count clusters, not occurrences: de-duplicate each cluster's tokens so
    #    a token repeated inside a single cluster is only counted once.
    cluster_counts = {}
    for tokens in token_sets.values():
        for token in set(tokens):
            cluster_counts[token] = cluster_counts.get(token, 0) + 1

    # 2. Keep, for each cluster, the tokens seen in max_occur or fewer clusters.
    distinctive = {}
    for cluster_id, tokens in token_sets.items():
        rare_tokens = {
            token for token in tokens if cluster_counts[token] <= max_occur
        }
        if rare_tokens:
            distinctive[cluster_id] = rare_tokens

    return distinctive

# Example usage:
# Assume token_sets is a dictionary where keys are cluster IDs and values are sets of top tokens for each cluster.
# Assume max_occur is the maximum number of clusters a token can appear in to be considered distinctive.
# distinctive_tokens = find_distinctive_tokens(token_sets, max_occur)
# print(distinctive_tokens)


from collections import Counter

def get_top_tokens(cid, labels, corpusdf, k):
    """Return the ``k`` most frequent tokens of one cluster's pseudo-documents.

    Ties in frequency are broken alphabetically, so the result is
    deterministic.

    Args:
        cid: Cluster ID to select.
        labels: Cluster labels; ``labels == cid`` is used to index
            ``corpusdf``. NOTE(review): presumably an array/Series aligned
            with the rows of ``corpusdf`` -- confirm against the caller.
        corpusdf: Table with a ``'pseudodoc'`` column of whitespace-separated
            token strings. NOTE(review): presumably a pandas DataFrame.
        k: Maximum number of tokens to return; ``None`` returns every token.

    Returns:
        A set with at most ``k`` tokens. It holds fewer when the cluster has
        fewer than ``k`` unique tokens, and is empty when ``k <= 0``.
    """
    # 1. Filter the documents for the given cluster ID
    cluster_docs = corpusdf[labels == cid]

    # 2. Concatenate all pseudo-documents and split them into tokens
    all_tokens = ' '.join(cluster_docs['pseudodoc']).split()

    # 3. Count each unique token's frequency
    token_counts = Counter(all_tokens)

    # A non-positive k selects nothing; guard explicitly because a negative
    # slice bound below would otherwise drop tokens from the end instead.
    if k is not None and k <= 0:
        return set()

    # 4. Rank the unique tokens by descending count, then alphabetically.
    #    Only the unique tokens are sorted, not every occurrence.
    ranked = sorted(token_counts.items(), key=lambda item: (-item[1], item[0]))

    # 5. Slicing naturally yields fewer than k tokens when not enough exist.
    return {token for token, _ in ranked[:k]}

# Example usage:
# Assuming the variables `labels` and `corpusdf` have been defined earlier and `cid` is the cluster ID you are interested in.
# top_tokens = get_top_tokens(cid, labels, corpusdf, k)
# print(top_tokens)
