In [None]:
# Install dependencies on Jupyter
%pip install nltk   # In Analyzer.py

In [None]:
# Imports (if running in JupyterLite this might take a while)
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
from sklearn import manifold
from sklearn.metrics import euclidean_distances
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from scipy.integrate import odeint
# Things for preprocessing the ideas (cleaning up the text)
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
# IDEA INPUT
# Choose how ideas are entered by setting the flags below to True or False.
# If no ideas end up being provided, the notebook falls back to the
# placeholder ideas defined in the code further down.

enter_ideas_manually = False
enter_ideas_from_csv = True

ideas = []
if enter_ideas_manually:
    print("Enter ideas. You can enter them one-by-one, or enter them all at once, separated with a newline. Stop by typing 'exit'")
    finished = False
    while not finished:
        answer = input()
        # Handle every entered line separately so that "exit" stops the input
        # even when it is the last line of a multi-line paste.
        for idea in answer.split('\n'):
            # .lower() must be *called*; comparing the bound method to a
            # string is never equal and would loop forever.
            if idea.strip().lower() == "exit":
                finished = True
                break
            if idea.strip():
                ideas.append(idea)

if enter_ideas_from_csv:
    # Imported lazily so ipywidgets is only needed when the upload path is used.
    import ipywidgets as widgets
    from IPython.display import display

    # The widget is read in the next cell, after the user has picked a file.
    file_upload = widgets.FileUpload()

    display(file_upload)

In [None]:
# Read the uploaded file (if any) and turn each non-blank line into one idea.
# The short-circuit `and` keeps `file_upload` from being touched when the
# upload path is disabled and the widget was never created.
if enter_ideas_from_csv and len(file_upload.value) > 0:
    import codecs

    # Only the first uploaded file is used.
    uploaded = file_upload.value[0]
    content = codecs.decode(uploaded.content, encoding="utf-8")
    # splitlines() copes with both "\n" and "\r\n" line endings (no stray "\r"
    # left on the ideas); blank lines and the trailing newline are skipped so
    # they do not become empty ideas.
    ideas = [line for line in content.splitlines() if line.strip()]

In [None]:
# Fall back to the hard-coded placeholder ideas when nothing was entered
# manually or loaded from an uploaded file.
if not ideas:
    ideas = [
        "Placeholder ideas in case you want to enter some manually here",
        "This will only be used if there's no file provided",
    ]

# Show the ideas that the rest of the notebook will work with.
print('Ideas: ', ideas)

### Preprocessing steps to clean the data

In [None]:
from Analyzer import Analyzer
from IPython.display import display, FileLink
import os
import csv

# Initialize CountVectorizer to convert text into numerical vectors
count_vectorizer = CountVectorizer()
count_analyzer = Analyzer(ideas, count_vectorizer)
count_analyzer.process_all()

# Write one row per idea to a CSV file in the current working directory and
# echo the same rows to the cell output.
file_path = os.path.join(os.getcwd(), 'CountVectorizer_ideas_similarity_distance.csv')
similarity = count_analyzer.get_similarity()
distance = count_analyzer.get_distance()
# newline="" lets the csv module control line endings itself; the explicit
# UTF-8 encoding keeps non-ASCII ideas writable regardless of the platform's
# default encoding.
with open(file_path, 'w', newline="", encoding="utf-8") as f:
    writer = csv.writer(f, delimiter="|", quoting=csv.QUOTE_ALL)
    header = ["#", "Idea", "Cos Similarity", "Dist to centroid"]
    writer.writerow(header)
    print(header)
    for i, idea in enumerate(count_analyzer.get_ideas()):
        # Only column 0 of each similarity/distance row is reported.
        # NOTE(review): presumably that column is the value relative to the
        # centroid -- confirm against Analyzer.
        line = [i + 1, idea, round(similarity[i][0], 2), round(distance[i][0], 2)]
        writer.writerow(line)
        print(line)

# Offer the generated file as a download link in the notebook output.
local_file = FileLink(file_path, result_html_prefix="Click here to download: ")
display(local_file)


# Alternative (disabled): vectorize the text using TF-IDF.
# On long ideas, TF-IDF yields much higher similarity scores than
# CountVectorizer, with only minimal differences between them, which makes
# the results harder to interpret.

# tfidf_vectorizer = TfidfVectorizer()
# tfidf_analyzer = Analyzer(ideas, tfidf_vectorizer)
# tfidf_analyzer.process_all()