# In [None]:
# ---------------------------------------------------------------
# Visualizing Qualitative VR Think-Aloud Data (HeritageRoots)
# ---------------------------------------------------------------
# Requirements:
# pip install pandas nltk matplotlib seaborn wordcloud scikit-learn networkx
# ---------------------------------------------------------------

import json
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
import numpy as np

# ---------- 1. Load dataset ----------
# Read the transcript export. The encoding is pinned to UTF-8 (the standard
# encoding for JSON text); without it open() falls back to the platform
# locale and can mis-decode any non-ASCII characters in the transcripts.
with open("heritageroots_ux_transcripts.json", "r", encoding="utf-8") as f:
    data = json.load(f)

participants = data["participants"]

# Flatten into DataFrame: one row per utterance spoken by the participant.
# Turns from any other speaker are dropped.
rows = []
for p in participants:
    for turn in p["transcript"]:
        if turn["speaker"] == "Participant":
            rows.append({
                "participant_id": p["id"],
                "bio": p["bio"],
                "time": turn["time"],
                "text": turn["text"],
            })

# Naming the columns explicitly keeps them present even when no participant
# turns were found, so later df["text"] lookups do not raise KeyError.
df = pd.DataFrame(rows, columns=["participant_id", "bio", "time", "text"])

# ---------- 2. Preprocess & sentiment scoring ----------
# Fetch the NLTK resources this script uses: the VADER lexicon for the
# sentiment analyzer and the stop-word corpus.
nltk.download("vader_lexicon")
nltk.download("stopwords")

sia = SentimentIntensityAnalyzer()
# Keep only VADER's "compound" score for each utterance.
df["compound"] = df["text"].apply(lambda x: sia.polarity_scores(x)["compound"])

# Label with the conventional VADER thresholds:
#   negative: compound <= -0.05, positive: compound >= 0.05, else neutral.
# Explicit comparisons are used instead of pd.cut because right-closed bins
# put exactly 0.05 in "neutral" and leave exactly -1.0 unlabelled (NaN).
is_negative = (df["compound"] <= -0.05).to_numpy()
is_positive = (df["compound"] >= 0.05).to_numpy()
df["label"] = pd.Categorical(
    np.select([is_negative, is_positive], ["negative", "positive"], default="neutral"),
    categories=["negative", "neutral", "positive"],
    ordered=True,
)

# ---------- 3. WORD CLOUDS ----------
stop_words = set(stopwords.words("english"))

def make_wordcloud(texts, title):
    """Draw a word cloud for a collection of utterances.

    Args:
        texts: Iterable of strings; they are joined with spaces into one
            document before the cloud is built.
        title: Title shown above the figure.

    Returns:
        None. The figure is shown with ``plt.show()``. If the cloud cannot
        be generated (for example, every word was filtered out as a stop
        word), a notice is printed and no figure is drawn.
    """
    text = " ".join(texts)
    wc = WordCloud(width=900, height=500,
                   background_color="white",
                   stopwords=stop_words,
                   colormap="viridis")
    try:
        wc.generate(text)
    except ValueError as err:
        # WordCloud raises ValueError when there is nothing to plot; skip
        # this cloud instead of aborting the rest of the analysis.
        print(f"Skipping word cloud '{title}': {err}")
        return
    plt.figure(figsize=(10,5))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(title, fontsize=16)
    plt.show()

# One word cloud per sentiment class; classes with no utterances are skipped.
for label in ["positive", "neutral", "negative"]:
    subset = df[df["label"] == label]
    if not subset.empty:
        # The separator is an en dash, written as an escape so that the
        # title cannot be garbled by re-saving the file in another encoding.
        make_wordcloud(subset["text"], f"Word Cloud \u2013 {label.capitalize()} Sentiment")

# ---------- 4. THEME MAP (PCA projection of frequent words) ----------
# Build a term-document matrix
# Count matrix (utterances x terms), capped at 50 terms with English stop
# words removed.
vectorizer = CountVectorizer(max_features=50, stop_words="english")
X = vectorizer.fit_transform(df["text"])
terms = vectorizer.get_feature_names_out()

# Transpose so each row is a term, then project the terms onto two
# principal components for plotting.
pca = PCA(n_components=2)
coords = pca.fit_transform(X.toarray().T)

plt.figure(figsize=(8, 6))
plt.scatter(coords[:, 0], coords[:, 1])
# Label every point with its term, nudged slightly off the marker.
for term, (pc1, pc2) in zip(terms, coords):
    plt.text(pc1 + 0.01, pc2 + 0.01, term, fontsize=9)
plt.title("Theme Map (PCA of Term Frequencies)")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.show()

# ---------- 5. SENTIMENT-OVER-TIME GRAPH ----------
# Convert timestamps like [00:01:23] -> seconds
def time_to_seconds(t):
    """Convert a transcript timestamp to a number of seconds.

    Accepts ``[HH:MM:SS]`` (brackets optional, surrounding whitespace
    ignored). Shorter stamps are read as ``[MM:SS]`` or ``[SS]``. Fields
    beyond the third are ignored.

    Args:
        t: Timestamp string.

    Returns:
        The total seconds as an int, or ``np.nan`` when ``t`` is not a
        string or a field is not an integer.
    """
    try:
        # Strip whitespace first so a stray space does not shield a bracket.
        parts = t.strip().strip("[]").split(":")
        if len(parts) < 3:
            # Left-pad missing hour/minute fields with zero.
            parts = ["0"] * (3 - len(parts)) + parts
        hours, minutes, seconds = (int(p) for p in parts[:3])
    except (AttributeError, TypeError, ValueError):
        # Non-string input or non-numeric field: mark as missing rather
        # than hiding unrelated errors behind a bare except.
        return np.nan
    return hours * 3600 + minutes * 60 + seconds

# Numeric time axis derived from the raw timestamp strings.
df["seconds"] = df["time"].map(time_to_seconds)

plt.figure(figsize=(10, 5))
# Faint per-participant traces (one hue per participant, legend suppressed).
sns.lineplot(
    data=df.sort_values("seconds"),
    x="seconds",
    y="compound",
    hue="participant_id",
    alpha=0.5,
    legend=False,
)
# Bold black line drawn from all utterances pooled together.
sns.lineplot(
    x=df["seconds"],
    y=df["compound"],
    label="Overall trend",
    color="black",
    linewidth=2,
)
plt.title("Sentiment Over Time Across All Participants")
plt.xlabel("Time (seconds)")
plt.ylabel("Sentiment (compound score)")
plt.legend()
plt.show()

# ---------- 6. CO-OCCURRENCE MATRIX ----------
# Count matrix (utterances x terms) limited to 20 terms, English stop words
# removed.
vectorizer = CountVectorizer(stop_words="english", max_features=20)
X = vectorizer.fit_transform(df["text"])

# Term-by-term co-occurrence: entry (i, j) sums, over utterances, the product
# of the counts of term i and term j. `@` is used because it is a matrix
# product for every scipy.sparse container, whereas `*` is element-wise on
# sparse arrays.
Xc = X.T @ X
Xc.setdiag(0)  # a term's co-occurrence with itself is not of interest

terms = vectorizer.get_feature_names_out()
co_occurrence = pd.DataFrame(Xc.toarray(), index=terms, columns=terms)

plt.figure(figsize=(8,6))
sns.heatmap(co_occurrence, cmap="YlGnBu")
plt.title("Word Co-Occurrence Matrix (Top 20 Terms)")
plt.show()

# Optional: visualize co-occurrence as network
# Build an undirected graph with one edge per pair of distinct terms that
# co-occur; the edge weight is the co-occurrence value.
G = nx.Graph()
n_terms = len(terms)
for i in range(n_terms):
    # Only the upper triangle (j > i) is visited, so each pair is added once.
    for j in range(i + 1, n_terms):
        strength = co_occurrence.iat[i, j]
        if strength > 0:
            G.add_edge(terms[i], terms[j], weight=strength)

plt.figure(figsize=(10, 8))
pos = nx.spring_layout(G, k=0.5)
edges = G.edges()
weights = [attrs["weight"] for _, _, attrs in G.edges(data=True)]
# Edge colour and thickness both scale with the co-occurrence weight.
nx.draw(
    G,
    pos,
    with_labels=True,
    node_color="lightblue",
    edge_color=weights,
    width=[w / 2 for w in weights],
    edge_cmap=plt.cm.Blues,
    font_size=10,
)
plt.title("Co-occurrence Network of Frequent Words")
plt.show()
