In [1]:
# Imports (if running in JupyterLite this might take a while)
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
from sklearn import manifold
from sklearn.metrics import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import pairwise_distances

In [None]:
# Collect ideas interactively. Each submitted answer may hold one idea or
# several ideas separated by newlines; a line consisting only of the exit
# command ends the collection.
EXIT_COMMAND = "exit"

print("Enter ideas. You can enter them one-by-one, or enter them all at once, separated with a newline.")
print(f'Type "{EXIT_COMMAND}" on its own line when you are done.')

ideas = []
finished = False
while not finished:
    answer = input()
    # splitlines() also copes with "\r\n" line endings from pasted text.
    for line in answer.splitlines():
        idea = line.strip()
        if idea == EXIT_COMMAND:
            # Stop even when the exit command arrives as the last line of a
            # multi-line paste, or with stray surrounding whitespace.
            finished = True
            break
        if idea:
            ideas.append(idea)

print(ideas)

In [None]:
# Bag-of-words model: CountVectorizer turns every idea into a vector of token counts.
count_vectorizer = CountVectorizer()
idea_matrix = count_vectorizer.fit_transform(ideas)

# Dense numpy copy of the count matrix, which is easier to do arithmetic on.
idea_array = idea_matrix.toarray()

# The centroid is the column-wise mean over all idea vectors.
centroid = idea_array.mean(axis=0)

# Cosine similarity of every idea against the centroid (the centroid is passed
# as a single-row 2-D array, so the result has one column).
cos_similarity = cosine_similarity(idea_array, centroid[np.newaxis, :])

# Flip the scale so that 0 means 'same' and 1 means very different.
distance_to_centroid = 1 - cos_similarity

# One record per idea. Note that the "similarity" entry stores the flipped
# value, i.e. the distance to the centroid.
ideas_and_similarities = [
    {"idea": idea_text, "similarity": centroid_distance}
    for idea_text, centroid_distance in zip(ideas, distance_to_centroid[:, 0])
]

# Pairwise cosine distance between all ideas.
distance_matrix = pairwise_distances(idea_array, metric='cosine')


### MDS (Multidimensional Scaling)
Not quite working yet, but it might be a good approach if we can transform the data properly.

In [None]:
# Embed the ideas in 2-D with MDS on a precomputed dissimilarity matrix.
# `distances` holds the pairwise Euclidean distances between the ideas'
# distance-to-centroid values.
# NOTE(review): distance_to_centroid has a single column, so these
# dissimilarities are differences of one scalar per idea and the embedding is
# essentially one-dimensional. `distance_matrix` (pairwise cosine distances)
# may be the intended input - confirm before switching.
mds = manifold.MDS(n_components=2, dissimilarity='precomputed', random_state=1)
distances = euclidean_distances(distance_to_centroid)
coords = mds.fit_transform(distances)

# Create the figure and axes first, then set the title on that axes. Calling
# plt.title() before any axes exists puts the title on an implicit axes that
# ends up hidden behind the full-figure axes created afterwards.
fig = plt.figure()
ax = fig.add_axes([0.0, 0.0, 1.0, 1.0])
ax.set_title('Idea Centroid Similarity')

ax.scatter(coords[:, 0], coords[:, 1], color="navy", label="Ideas")
ax.legend(scatterpoints=1, loc="best", shadow=False)

# Darker connection lines if points are close to each other: the weight is
# inversely proportional to the distance (EPSILON avoids division by zero),
# and self-connections get weight 0.
# NOTE(review): two distinct ideas at distance 0 get a very large weight that
# dominates the colour normalisation - consider capping the weights.
EPSILON = np.finfo(np.float32).eps
line_weights = distances.max() / (distances + EPSILON) * 100
np.fill_diagonal(line_weights, 0)

# Plot the edges: one segment per ordered pair (i, j), in the same row-major
# order as line_weights.flatten() so every segment gets its own weight.
num_points = len(coords)
segments = [
    [coords[i, :], coords[j, :]] for i in range(num_points) for j in range(num_points)
]
lc = LineCollection(
    segments, zorder=0, cmap=plt.cm.Blues, norm=plt.Normalize(0, line_weights.max())
)
lc.set_array(line_weights.flatten())
lc.set_linewidths(np.full(len(segments), 0.5))
ax.add_collection(lc)

# Label each point with its idea index (matches the listing printed below).
# The annotation handles are kept so that an overlap-avoidance step can be
# added later.
labels = [
    ax.annotate(f"{i}", (coords[i, 0], coords[i, 1]), xytext=(5, 3), textcoords='offset pixels')
    for i in range(num_points)
]

# Add x and y axes
ax.axhline(y=0, color='k', linestyle='--')
ax.axvline(x=0, color='k', linestyle='--')

plt.show()

# Index -> idea listing. Single quotes inside the replacement fields keep this
# f-string valid on Python versions before 3.12 as well.
for i, item in enumerate(ideas_and_similarities):
    print(f"{i} : {item['idea']} - (similarity|dist|coords): ({round(item['similarity'], 2)}|{round(distances[i][0], 2)}|{round(coords[i][0], 2)})")


### Heatmap
Difficult to read properly. There is surely a better visualization (see the other attempts below).

In [None]:
# Heatmap of the pairwise cosine distances between ideas.
# distance_matrix holds cosine *distances*, so low values mean similar ideas;
# the colorbar and title say so explicitly.
num_ideas = distance_matrix.shape[0]
# Ticks follow the actual number of ideas and use the same 0-based indices as
# the other plots and the printed idea listing.
idea_indices = np.arange(num_ideas)

fig, ax = plt.subplots(figsize=(8, 6))
image = ax.imshow(distance_matrix, cmap='viridis', interpolation='nearest')
fig.colorbar(image, ax=ax, label='Cosine Distance (0 = identical)')
ax.set_title('Heatmap of Pairwise Cosine Distances')
ax.set_xticks(idea_indices)
ax.set_xticklabels(idea_indices)
ax.set_yticks(idea_indices)
ax.set_yticklabels(idea_indices)
ax.set_xlabel('Idea Index')
ax.set_ylabel('Idea Index')
ax.grid(visible=True, linestyle='--', linewidth=0.5)
plt.show()

### Networkx Graph
Not really suitable, but we might be able to improve something?

In [None]:
import networkx as nx

# Build a complete graph with one node per idea. Every edge carries two
# attributes:
#   weight     - the cosine distance between the two ideas (shown as the label)
#   similarity - 1 - distance, used as the spring strength for the layout
num_nodes = distance_matrix.shape[0]
G = nx.Graph()
G.add_nodes_from(range(num_nodes))

for i in range(num_nodes):
    for j in range(i + 1, num_nodes):
        distance = float(distance_matrix[i][j])
        # Clamp at 0 so rounding noise can never produce a negative spring strength.
        G.add_edge(i, j, weight=distance, similarity=max(0.0, 1.0 - distance))

# Position nodes using spring layout. A larger layout weight means a stronger
# attraction, so the layout has to be driven by similarity: using the distance
# would pull the most dissimilar ideas together. A fixed seed makes the
# picture reproducible. A separate name is used so that the MDS `coords`
# array is not overwritten.
node_positions = nx.spring_layout(G, weight='similarity', seed=1)

fig, ax = plt.subplots()

# Draw nodes
nx.draw_networkx_nodes(G, node_positions, node_size=500, ax=ax)

# Draw edges, labelled with the cosine distance rounded for readability
nx.draw_networkx_edges(G, node_positions, ax=ax)
edge_labels = {(u, v): round(d['weight'], 2) for u, v, d in G.edges(data=True)}
nx.draw_networkx_edge_labels(G, node_positions, edge_labels=edge_labels, ax=ax)

# Display the graph
ax.set_title('Distance Graph')
ax.axis('off')
plt.show()