In [37]:
import spacy
import pandas as pd

# Load spaCy's English tokenizer
nlp = spacy.load("en_core_web_sm")

# Get spaCy's list of stopwords
stop_words = nlp.Defaults.stop_words

def preprocess_text(text):
    # Convert to lowercase and remove punctuation
    doc = nlp(text.lower())
    cleaned_text = " ".join([token.text for token in doc if token.text not in stop_words and not token.is_punct])
    return cleaned_text

# Create a DataFrame
df = pd.read_csv('data/all-responses.csv')

# Drop NaN values first
df_cleaned = df.dropna(subset=['Q23 - Do you have any ideas for new technology or examples you\'ve seen from other...'])

# Apply preprocessing to the cleaned column #.apply(preprocess_text)
df_cleaned['cleaned_response'] = df_cleaned['Q23 - Do you have any ideas for new technology or examples you\'ve seen from other...'].apply(preprocess_text)

# Convert the column to a list and ensure all entries are strings
cleaned_responses_list = df_cleaned['cleaned_response'].tolist()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['cleaned_response'] = df_cleaned['Q23 - Do you have any ideas for new technology or examples you\'ve seen from other...'].apply(preprocess_text)


In [39]:
from top2vec import Top2Vec

model = Top2Vec(cleaned_responses_list, min_count=5)

2024-12-11 21:11:11,307 - top2vec - INFO - Pre-processing documents for training
2024-12-11 21:11:11,315 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model
2024-12-11 21:11:14,003 - top2vec - INFO - Creating joint document/word embedding
2024-12-11 21:11:14,969 - top2vec - INFO - Creating lower dimension embedding of documents
2024-12-11 21:11:15,075 - top2vec - INFO - Finding dense areas of documents
2024-12-11 21:11:15,078 - top2vec - INFO - Finding topics


In [40]:
topic_sizes, topic_nums = model.get_topic_sizes()
print(topic_sizes)

topic_words, word_scores, topic_nums = model.get_topics(2)

for words, scores, num in zip(topic_words, word_scores, topic_nums):
    print(num)
    print(words)

[74 19]
0
['buses' 'traffic' 'parking' 'bus' 'road' 'areas' 'street' 'lights'
 'transport' 'city' 'public' 'technology' 'crossing' 'council' 'data'
 'app' 'screens' 'light' 'stops' 'people' 'interactive' 'tcc' 'boards'
 'phone' 'need' 'bins' 'use' 'digital' 'smart' 'time' 'green' 'rubbish'
 'tauranga' 'etc' 'red' 'like' 'water' 'stop' 'maybe']
1
['rubbish' 'time' 'like' 'maybe' 'stop' 'red' 'use' 'street' 'road' 'city'
 'app' 'stops' 'etc' 'people' 'public' 'traffic' 'phone' 'lights' 'light'
 'crossing' 'boards' 'smart' 'need' 'bins' 'screens' 'council' 'areas'
 'interactive' 'green' 'parking' 'bus' 'tcc' 'technology' 'water'
 'tauranga' 'buses' 'digital' 'data' 'transport']


In [41]:
documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=0, num_docs=6)
for doc, score, doc_id in zip(documents, document_scores, document_ids):
    print(f"Document: {doc_id}, Score: {score}")
    print("-----")
    print(doc)
    print("-----")
    print()

Document: 58, Score: 0.7433348894119263
-----
minutes away buses stop 
 billboards displaying upcoming events 
 traffic congestion eased smart tech monitoring build adjusting traffic lights accordingly
-----

Document: 55, Score: 0.6855706572532654
-----
maybe safety based technology eg linking police certain areas nt phone emergency services possibly interactive maps
-----

Document: 14, Score: 0.6505788564682007
-----
smart pedestrian cycle crossings able detect approaching cyclists pedestrians change green prioritise encourage cyclings walking 

 traffic lights turn green buses approach
-----

Document: 13, Score: 0.6424726247787476
-----
bus depot central christchurch amazed saw covered safe location digital time boards pleased tauranga time boards tell bus coming 
 maybe security cameras know
-----

Document: 65, Score: 0.6379818916320801
-----
digital signage buses bus stops eta interactive screens    like new extra smart bins public use
-----

Document: 49, Score: 0.614995241165