In [None]:
# Plot the metrics for every (method, scores) pair in `data`.
for method, scores in data:
    visualise_metrics(scores)

# Scratch example of an index -> name mapping built with enumerate.
# `l` is bound once, outside the loop, so it exists even when `data` is
# empty and is not needlessly rebound on every iteration.
l = ['A', 'Z']
dict(enumerate(l))

def visualise_clusters(vectorised_input, label_names, dendrogram_path='ward_clusters.png'):
    """Plot a 2-D MDS scatter and a Ward dendrogram of the input vectors.

    The pairwise distance is ``1 - cosine_similarity(vectorised_input)``.
    That matrix is projected onto two dimensions with MDS and drawn as a
    scatter plot with one colour / legend entry per distinct label, then
    clustered with ``ward`` and drawn as a dendrogram.

    Parameters
    ----------
    vectorised_input : array-like or sparse matrix
        Feature matrix passed straight to ``cosine_similarity``; one row
        per sample.
    label_names : sequence
        Assumed to hold one (hashable) label per row of
        ``vectorised_input`` -- TODO confirm against the callers. Rows
        sharing a label are drawn as one group.
    dendrogram_path : str or None, optional
        Where the dendrogram figure is saved. Defaults to
        ``'ward_clusters.png'`` (the file the original code always wrote);
        pass ``None`` to skip saving.

    Raises
    ------
    ValueError
        If ``label_names`` does not have exactly one entry per input row.
    """
    labels = list(label_names)

    dist = 1 - cosine_similarity(vectorised_input)
    if len(labels) != dist.shape[0]:
        raise ValueError(
            "label_names must contain one label per row of vectorised_input "
            "(got %d labels for %d rows)" % (len(labels), dist.shape[0])
        )

    # Use multidimensional scaling to convert the dist matrix into a
    # 2-dimensional array.
    # n_components=2 to plot results in a two-dimensional plane
    # "precomputed" because the distance matrix dist is already computed
    # `random_state` set to 0 so that the plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=0)
    pos = mds.fit_transform(dist)  # shape (n_samples, n_components)
    xs, ys = pos[:, 0], pos[:, 1]

    # #1b9e77 (green) #d95f02 (orange) #7570b3 (purple) #e7298a (pink) #000000 (black)
    palette = ['#1b9e77', '#d95f02', '#7570b3', '#e7298a', '#000000']

    # Cluster id -> name, numbered in order of first appearance.
    cluster_names = dict(enumerate(dict.fromkeys(labels)))
    cluster_ids = {name: i for i, name in cluster_names.items()}
    # Cluster id -> colour; the palette is cycled so that more than five
    # clusters do not raise a KeyError.
    cluster_colors = {i: palette[i % len(palette)] for i in cluster_names}

    # Data frame holding the MDS coordinates plus each row's cluster id.
    df = pd.DataFrame(dict(x=xs, y=ys, label=[cluster_ids[name] for name in labels]))

    # group by cluster
    groups = df.groupby('label')

    # set up plot
    fig, ax = plt.subplots(figsize=(17, 9))  # set size
    ax.margins(0.05)  # Optional, just adds 5% padding to the autoscaling

    # Layer the plot one cluster at a time, looking up each cluster's
    # legend label and colour by its id.
    for name, group in groups:
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
                label=cluster_names[name],
                color=cluster_colors[name],
                mec='none')

    # Axis styling is independent of the groups, so it is applied once.
    ax.set_aspect('auto')
    ax.tick_params(
        axis='x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom=False,      # ticks along the bottom edge are off
        top=False,         # ticks along the top edge are off
        labelbottom=False)
    ax.tick_params(
        axis='y',          # changes apply to the y-axis
        which='both',      # both major and minor ticks are affected
        left=False,        # ticks along the left edge are off
        right=False,       # ticks along the right edge are off
        labelleft=False)

    ax.legend(numpoints=1)  # show legend with only 1 point

    plt.show()  # show the scatter plot

    # Ward clustering on the pre-computed distance matrix.
    # NOTE(review): scipy treats a square 2-D input as observation vectors,
    # not as a distance matrix -- kept as in the original; confirm intent.
    linkage_matrix = ward(dist)

    fig, ax = plt.subplots(figsize=(15, 20))  # set size
    # Draw on the axes just created instead of rebinding `ax` to the dict
    # that dendrogram() returns; leaves are annotated with their labels.
    dendrogram(linkage_matrix, orientation="right", labels=labels, ax=ax)
    ax.tick_params(
        axis='x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom=False,      # ticks along the bottom edge are off
        top=False,         # ticks along the top edge are off
        labelbottom=False)

    fig.tight_layout()  # show plot with tight layout

    # Save the dendrogram unless the caller opted out with None.
    if dendrogram_path is not None:
        fig.savefig(dendrogram_path, dpi=200)

In [None]:
# Token -> column-index dictionary already stored on the fitted tfidf model.
token2idx = tfidf_vectorizer.vocabulary_
# Inverse of that dictionary: column index -> token.
idx2token = {v: k for k, v in token2idx.items()}

# clf.coef_ is a matrix with classes as rows and tokens/features as columns.
# Its transpose therefore has one row per token, holding that token's weight
# for every class.
idx2weight = {i: weight for i, weight in enumerate(clf.coef_.T)}
# weight: vector with one entry per row of clf.coef_

top_n = 6

# Class label for each row of clf.coef_. The token dictionary must not be
# used for this: a class index is not a token index.
# Assumes clf is a fitted scikit-learn linear classifier exposing `classes_`
# -- TODO confirm. Binary scikit-learn models keep a single coef_ row that
# belongs to classes_[1], hence the slice when the lengths differ.
class_labels = clf.classes_ if len(clf.classes_) == clf.coef_.shape[0] else clf.classes_[1:]

# argsort on clf.coef_ sorts each row (axis=1) increasingly and yields
# indices instead of the actual values.
argsorted_cls = np.argsort(clf.coef_, axis=1)
# argsorted_cls: matrix of size C x D (C: rows of clf.coef_, D: number of features)

# Loop over the sorted indices, keeping the row number (the class index).
for class_index, sorted_tokens in enumerate(argsorted_cls):
    # class_labels gives the class's actual label
    print(f"Class {class_labels[class_index]} ({class_index}) and it's top {top_n} tokens:")

    # Reverse the argsorted indices to make them decreasing, then keep
    # only the top_n results.
    for token in sorted_tokens[::-1][:top_n]:
        # idx2weight gives back the token's weight for every class.
        # From this we can check and verify both:
        #   1) tokens are really ranked from top 1 to top_n
        #   2) among classes, the highest value is being assigned to the class
        #      under which the token is listed as a top token
        reformatted_weights = ', '.join([f"{x:.4f}" for x in idx2weight[token].tolist()])
        # idx2token allows us to obtain the token's actual name
        print(f"Token {idx2token[token]} ({token}) has a weight:\n\t[{reformatted_weights}]")
    print()