<a href="https://colab.research.google.com/github/graccelinn/Unstructured_Assignment_3/blob/main/Task_E.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [5]:
# Load in the data
url = "https://raw.githubusercontent.com/graccelinn/Unstructured_Assignment_3/main/campaigns_file.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,title,description,amount_usd,goal_usd,duration_days,cover_image,url,image_labels,binary
0,Help Baby Jenson Fight a Rare Brain Cancer,"Dear friends, family, and kind-hearted strange...",49046.0,55000.0,15.0,https://images.gofundme.com/vfQ_a6e1pE7GZrlsdM...,https://www.gofundme.com/f/help-baby-jenson-fi...,"Child, Hospital, Patient, Health Care, Medical...",1
1,Dennis’s Fight Against Stage 4 Esophageal Cancer,Dennis was diagnosed with esophageal cancer on...,40758.0,50000.0,22.0,https://images.gofundme.com/ybPT_3fBXkCoE-5h3P...,https://www.gofundme.com/f/denniss-fight-again...,"Mountainous landforms, Mountain, People in nat...",1
2,Support Daniel DeMeza's Fight Against Cancer,Daniel DeMeza is a 20 year old kindhearted and...,15755.0,20000.0,2.0,https://images.gofundme.com/AMU6oV2WJKrYi40Eft...,https://www.gofundme.com/f/support-daniel-deme...,"T-shirt, Arm, Wrist, Chair, Hearing, Active Sh...",1
3,Stand with Alexis Gleason in Her Fight Against...,From Lisa Pinkham:\n\nLife can change in an in...,5220.0,10000.0,2.0,https://images.gofundme.com/DLe9STiNA-dP-2_kuA...,https://www.gofundme.com/f/alexis-gleason-in-h...,"Smile, Cheek, Happiness, Eyebrow, Facial hair,...",0
4,Support Eric Yeakel through Cancer Treatment,"On Tuesday, September 2, our dear friend Eric ...",35089.0,50000.0,28.0,https://images.gofundme.com/j256Flpa_mxzBPBEDl...,https://www.gofundme.com/f/support-eric-yeakel...,"Hand, Happiness, Formal wear, Event, Entertain...",1


In [12]:
# Concatenate description and image_labels
df["text_combined"] = df["description"].fillna("") + " " + df["image_labels"].fillna("")

# Create Bag-of-Words
vectorizer = CountVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(df["image_labels"])

# Fit LDA with n topics
n = 3
lda = LatentDirichletAllocation(n_components=n, random_state=42)
lda.fit(X)

# Get feature names (words)
words = vectorizer.get_feature_names_out()

# 1️⃣ Topic-word probabilities (reshape so each row = word)
topic_word_probs = lda.components_.T  # transpose so words are rows
topic_word_probs = topic_word_probs / topic_word_probs.sum(axis=0)  # normalize to probabilities

lda_topic_word_probabilities = pd.DataFrame(topic_word_probs, index=words, columns=[f"Topic_{i}" for i in range(1, n+1)])
lda_topic_word_probabilities.index.name = "word"
print("Topic-word probabilities (one word per row):")
print(lda_topic_word_probabilities.head())

# 2️⃣ Document-topic percentages with index and title
doc_topic_df = pd.DataFrame(lda.transform(X), columns=[f"Topic_{i}" for i in range(1, n+1)])
doc_topic_df.insert(0, "title", df["title"])              # add title as a column
doc_topic_df.insert(0, "index", df.index)                 # add original index as a column
doc_topic_df.insert(0, "amount_usd", df["amount_usd"])    # add amount donated as a column

lda_document_topic_percentages = doc_topic_df
print("\nDocument-topic percentages with index and title:")
print(lda_document_topic_percentages.head())

# Optional: save to CSV
lda_topic_word_probabilities.to_csv("lda_topic_word_probabilities.csv")
lda_document_topic_percentages.to_csv("lda_document_topic_percentages.csv", index=False)


Topic-word probabilities (one word per row):
           Topic_1   Topic_2   Topic_3
word                                  
29        0.000393  0.000064  0.000112
33        0.000393  0.000064  0.000112
54        0.000393  0.000064  0.000112
abdomen   0.000099  0.000612  0.001847
academic  0.000098  0.001603  0.000116

Document-topic percentages with index and title:
   amount_usd  index                                              title  \
0     49046.0      0         Help Baby Jenson Fight a Rare Brain Cancer   
1     40758.0      1   Dennis’s Fight Against Stage 4 Esophageal Cancer   
2     15755.0      2       Support Daniel DeMeza's Fight Against Cancer   
3      5220.0      3  Stand with Alexis Gleason in Her Fight Against...   
4     35089.0      4       Support Eric Yeakel through Cancer Treatment   

    Topic_1   Topic_2   Topic_3  
0  0.022226  0.955500  0.022275  
1  0.526590  0.447622  0.025788  
2  0.149272  0.817031  0.033698  
3  0.931180  0.030226  0.038594  
4  0.028511