In [None]:
# Imports (if running in JupyterLite this might take a while)
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
from sklearn import manifold
from sklearn.metrics import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import pairwise_distances
from scipy.integrate import odeint
import io

In [None]:
# IDEA INPUT
# Decide how you want to enter ideas by setting any of the following variables to True or False.
# If none of them is True, then we assume the ideas are provided in the code (fill `ideas` below).

enter_ideas_manually = False
enter_ideas_from_csv = True

ideas = []
if enter_ideas_manually:
    print("Enter ideas. You can enter them one-by-one, or enter them all at once, separated with a newline. Stop by typing 'exit'")
    # Keep reading until a line consisting of "exit" is seen. The sentinel is
    # checked per line so that a multi-line paste ending in "exit" also stops.
    finished = False
    while not finished:
        answer = input()
        for line in answer.split('\n'):
            entry = line.strip()
            if entry.lower() == "exit":
                finished = True
                break
            if entry:  # skip blank lines
                ideas.append(entry)

if enter_ideas_from_csv:
    # Imported here because the widget is only needed for the upload path.
    import ipywidgets as widgets
    from IPython.display import display

    # The uploaded file is read in the next cell, once the user has picked one.
    file_upload = widgets.FileUpload()

    display(file_upload)


In [None]:
# Read the uploaded file (one idea per line) into `ideas`.
# Nothing happens if the CSV option is off or no file has been uploaded yet.
if enter_ideas_from_csv and len(file_upload.value) > 0:
    uploaded = file_upload.value[0]
    import codecs
    # bytes(...) accepts both bytes and memoryview payloads; "utf-8-sig" drops a
    # leading byte-order mark so it does not end up inside the first idea.
    content = codecs.decode(bytes(uploaded.content), encoding="utf-8-sig")
    # splitlines() handles \n and \r\n alike; blank lines are discarded because
    # an empty idea would become an all-zero vector in the analysis below.
    ideas = [line.strip() for line in content.splitlines() if line.strip()]

In [None]:
# Echo the collected ideas so the input can be checked before the analysis runs.
print('Ideas: ', ideas)

In [None]:
# Turn the ideas into bag-of-words vectors and measure how far each idea is
# from the centroid (mean vector) and from every other idea.

# Fail early with a clear message instead of CountVectorizer's vocabulary error.
if not ideas:
    raise ValueError("No ideas available: enter ideas manually, upload a file, or fill the `ideas` list.")

# Initialize CountVectorizer to convert text into numerical vectors
count_vectorizer = CountVectorizer()

# Fit and transform the text data to numerical vectors
idea_matrix = count_vectorizer.fit_transform(ideas)

# Convert the idea matrix to a numpy array for easier calculations
idea_array = idea_matrix.toarray()

# Calculate the centroid: the mean over all ideas (axis 0), one value per token
centroid = np.mean(idea_array, axis=0)

# Append the centroid as the LAST row, so index -1 always refers to the centroid
idea_array = np.vstack([idea_array, centroid])

# Calculate similarity & distances
# cos_similarity has shape (n_ideas + 1, 1): similarity of every row to the centroid
cos_similarity = cosine_similarity(idea_array, centroid.reshape(1, -1))
pairwise_similarity = cosine_similarity(idea_array, idea_array)
pairwise_distance = pairwise_distances(idea_array, metric='cosine')

# Distance of the centroid to each other idea
centroid_distance = pairwise_distance[-1, :-1]

# Make it so that 0 is 'same' and 1 is very different:
distance_to_centroid = 1 - cos_similarity


### Print info
# Both listings include the centroid itself as their final entry.
print('Cosine similarity: ')
for row in cos_similarity:
    print(f"{row[0]:.2f}")

print('Distance to centroid (1 - cosine similarity): ')
for row in distance_to_centroid:
    print(f"{row[0]:.2f}")

# Pair each idea with its distance to the centroid (the centroid row is left out)
ideas_and_similarities = [
    {"idea": idea, "dist": distance_to_centroid[i][0]}
    for i, idea in enumerate(ideas)
]


### MDS MultiDimensional Scaling
Not quite working yet, but this might be a good approach if we can transform the data properly.

In [None]:
# Project the pairwise cosine distances into 2D with MDS and plot every idea
# (plus the centroid, which is the last row) as a point.
r_state = np.random.RandomState()

# For reproducible results, set r_int to a fixed number.
r_int = r_state.randint(1, 1000000)
mds = manifold.MDS(n_components=2, dissimilarity='precomputed', random_state=r_int)
print(r_int)  # printed so an interesting layout can be reproduced later

coords = mds.fit_transform(pairwise_distance)
fig, ax = plt.subplots(figsize=(8, 6))

# Marker size grows with similarity to the centroid (cubed to emphasize differences)
marker_sizes = pow((1 - distance_to_centroid), 3) * 300
marker_sizes[-1] = 100   # Centroid marker size

# distance_to_centroid / marker_sizes are column vectors; flatten them so that
# scatter receives one value per point.
scatter = ax.scatter(
    coords[:, 0],
    coords[:, 1],
    c=distance_to_centroid.ravel(),
    cmap='viridis',
    s=marker_sizes.ravel(),
)

# Stronger connection lines for stronger pairwise similarities.
# Each unordered pair is added once (i < j): self-pairs have zero length and the
# mirrored pair would only draw the same line a second time.
segments = []
line_weights = []
for i in range(len(coords)):
    for j in range(i + 1, len(coords)):
        segments.append([coords[i], coords[j]])
        line_weights.append(pow(pairwise_similarity[i, j], 2))

lc = LineCollection(
    segments, zorder=0, cmap=plt.cm.Blues, norm=plt.Normalize(0, 1), linewidths=line_weights
)
# cmap/norm are only applied when the collection has a data array to map.
lc.set_array(np.asarray(line_weights))
ax.add_collection(lc)

# Add labels to each point: ideas are numbered from 1, the last point is the centroid
labels = []
centroid_index = len(coords) - 1
for i, (x, y) in enumerate(coords):
    text = "Centroid" if i == centroid_index else f"{i+1}"
    labels.append(ax.annotate(text, (x, y), xytext=(7, 3), textcoords='offset pixels'))

# Add a colorbar to show the mapping of colors to distances
cbar = fig.colorbar(scatter, ax=ax)
cbar.set_label('Distance to Centroid')

# Set the title (the MDS axes themselves carry no direct meaning)
ax.set_title('Distance to Centroid Visualization')

# Show the plot
plt.show()




In [None]:
# Tab-separated overview: idea number, idea text, distance to the centroid and
# the 2D coordinates currently stored in `coords`.
# The header names all four columns that each row prints.
print("# \t idea \t dist to center \t coords")
for i, item in enumerate(ideas_and_similarities):
    print(f"{i+1} \t {item['idea']} \t {round(item['dist'], 2)} \t ({round(coords[i][0], 2)}|{round(coords[i][1], 2)})")

### Heatmap
Difficult to read properly. There's surely something better. (See other attempts below)

In [None]:
# Heatmap of the pairwise cosine distances between all rows of the idea array
# (the last row/column is the centroid).
fig, ax = plt.subplots(figsize=(8, 6))
image = ax.imshow(pairwise_distance, cmap='viridis', interpolation='nearest')
# The matrix holds distances, not similarities, so label it accordingly.
fig.colorbar(image, ax=ax, label='Cosine Distance (0 = same)')
ax.set_title('Heatmap of Pairwise Cosine Distances')

n = 3  # Show every nth label
start = 1  # Start from this value
length = len(pairwise_distance)
end = length + start  # End at this value
labels = np.arange(start, end, n)  # Generate the labels

# One tick per row/column
len_aranged = np.arange(length)

# Create a list of empty strings for the unlabeled ticks
all_labels = ['' for _ in range(length)]

# Assign the generated labels to the appropriate positions
for i, label in enumerate(labels):
    all_labels[i * n] = str(label)

# Set the ticks and labels on both axes
ax.set_xticks(len_aranged)
ax.set_xticklabels(all_labels)

ax.set_yticks(len_aranged)
ax.set_yticklabels(all_labels)

ax.set_xlabel('Idea Index')
ax.set_ylabel('Idea Index')
ax.grid(visible=True, linestyle='--', linewidth=0.5)
plt.show()

### Networkx Graph
Not really suitable as it stands, but we might be able to improve it.

In [None]:
# Draw the ideas (and the centroid, the last node) as a fully connected graph
# whose edges are labeled with the pairwise cosine distance.
import networkx as nx

# Create a graph
G = nx.Graph()

# Add one node per row of the distance matrix
num_nodes = pairwise_distance.shape[0]
G.add_nodes_from(range(num_nodes))

# Add edges based on the distance matrix.
# 'weight' keeps the distance (used for the labels). 'similarity' is stored as
# well because the spring layout treats its weight as an attractive force:
# feeding it the distance would pull the most dissimilar ideas together.
for i in range(num_nodes):
    for j in range(i + 1, num_nodes):
        weight = pairwise_distance[i][j]
        G.add_edge(i, j, weight=weight, similarity=max(0.0, 1.0 - weight))

# Position nodes using spring layout, attracting similar ideas to each other.
# NOTE(review): this reuses the name `coords`, replacing the MDS coordinates
# from the earlier cell; re-run that cell before using its table again.
coords = nx.spring_layout(G, weight='similarity')

# Draw nodes
nx.draw_networkx_nodes(G, coords, node_size=500)

# Draw edges with their distances, rounded to two decimals to stay readable
nx.draw_networkx_edges(G, coords)
edge_labels = {(u, v): d['weight'] for u, v, d in G.edges(data=True)}
nx.draw_networkx_edge_labels(
    G,
    coords,
    edge_labels={edge: f"{value:.2f}" for edge, value in edge_labels.items()},
)

# Display the graph
plt.title('Distance Graph')
plt.axis('off')
plt.show()