<a href="https://colab.research.google.com/github/graccelinn/Unstructured_Assignment_3/blob/main/TaskC_and_D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Task C: Create Binary Column

In [1]:
import pandas as pd
import numpy as np
import re
from datetime import datetime, timedelta
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Load in the data
url = "https://raw.githubusercontent.com/graccelinn/Unstructured_Assignment_3/main/campaigns_with_labels_animals.csv"
campaigns = pd.read_csv(url)
campaigns.head()

Unnamed: 0,title,description,amount,goal,created,cover_image,url,image_labels
0,2nd Annual Massage fundraiser for Homeward Ani...,Last years fundraiser for the Homeward Animal ...,"$1,000",1K,"August 20th, 2025",https://images.gofundme.com/3hguyHJimIvbn6Cr6m...,https://www.gofundme.com/f/2nd-annual-massage-...,"Dog, Vertebrate, Carnivores, Snout, Working an..."
1,GCFA: Fundraiser for High Shelter Census and H...,CALLING ALL ANIMAL LOVERS & SHELTER SUPPORTERS...,"$1,495",5K,"July 16th, 2025",https://images.gofundme.com/bSnOcdg33RMZfoYWlt...,https://www.gofundme.com/f/help-gcfa-ferrets-b...,"Snout, Advertising, Photo caption, Rodent, Fur..."
2,Help for Balkanabat Dog Haven,"My name is Dmitry, I live in Balkanabat, Turkm...",$174,13K,"September 25th, 2025",https://images.gofundme.com/ca0ib-jbVfhNWaslYL...,https://www.gofundme.com/f/help-for-balkanabat...,"Dog, Carnivores, Street dog, Snout, Working an..."
3,Support Daniel's Eagle Scout Project for the A...,"Hi, my name is Daniel Watrous. I am a Life Sco...","$1,500",1.5K,"April 19th, 2024",https://images.gofundme.com/H-EKZ8nalH3eeYBBBK...,https://www.gofundme.com/f/support-daniels-eag...,"Facial expression, Smile, Scout, Carnivores, F..."
4,Marathon Madness for a Animal Shelter & Dement...,"Hey, I’m Emma ☺️, and I’ve made the (probably ...",£825,500,"September 19th, 2024",https://images.gofundme.com/26ICpSFlt_932tMOW7...,https://www.gofundme.com/f/marathon-madness-fo...,"Body of water, Coast, Sea, Coastal and oceanic..."


In [3]:
# Clean the data frame

# --- 1. Define exchange rates (approx as of 2025-10-02) ---
rates = {
    "USD": 1.0,
    "GBP": 1.25,   # 1 GBP ≈ 1.25 USD
    "EUR": 1.05,   # 1 EUR ≈ 1.05 USD
    "NOK": 0.095   # 1 NOK ≈ 0.095 USD
}

# --- 2. Detect currency symbols robustly ---
def detect_currency(x):
    if pd.isna(x):
        return "USD"
    x = str(x).strip()
    if x.startswith("$"):
        return "USD"
    elif x.startswith("£") or x.startswith("¬£"):
        return "GBP"
    elif x.startswith("€") or x.startswith("‚Ç¨"):
        return "EUR"
    elif x.lower().startswith("kr"):
        return "NOK"
    return "USD"

campaigns["currency"] = campaigns["amount"].apply(detect_currency)

# --- 3. Clean 'amount' to numeric USD ---
def clean_amount(x, currency):
    if pd.isna(x):
        return np.nan
    # keep digits and decimal only
    val = re.sub(r"[^\d.]", "", str(x))
    if val == "":
        return np.nan
    try:
        val = float(val.replace(",", ""))
    except:
        return np.nan
    return val * rates[currency]

campaigns["amount_usd"] = campaigns.apply(lambda row: clean_amount(row["amount"], row["currency"]), axis=1)

# --- 4. Clean 'goal' to numeric USD ---
def clean_goal(x, currency):
    if pd.isna(x):
        return np.nan
    val = str(x).upper().replace(",", "").strip()
    multiplier = 1
    if val.endswith("K"):
        multiplier = 1000
        val = val[:-1]
    elif val.endswith("M"):
        multiplier = 1_000_000
        val = val[:-1]
    val = re.sub(r"[^\d.]", "", val)
    if val == "":
        return np.nan
    try:
        val = float(val)
    except:
        return np.nan
    return val * multiplier * rates[currency]

campaigns["goal_usd"] = campaigns.apply(lambda row: clean_goal(row["goal"], row["currency"]), axis=1)

# --- 5. Clean 'created' column ---
reference_date = datetime(2025, 10, 2)  # scrape date

def clean_created(x):
    x = str(x).strip()

    # Handle relative dates (e.g. "2 d ago")
    if "d ago" in x:
        days = int(re.search(r"(\d+)", x).group(1))
        return days

    # Remove ordinal suffixes (1st → 1, 2nd → 2, etc.)
    x = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', x)

    # Try parsing with pandas datetime
    dt = pd.to_datetime(x, errors="coerce")
    days = (reference_date - dt).days
    return days

campaigns["duration_days"] = campaigns["created"].apply(clean_created)

# --- Final cleaned dataframe ---
cleaned_df = campaigns[["title", "description", "amount_usd", "goal_usd", "duration_days", "cover_image", "url", "image_labels"]]
cleaned_df.head()

Unnamed: 0,title,description,amount_usd,goal_usd,duration_days,cover_image,url,image_labels
0,2nd Annual Massage fundraiser for Homeward Ani...,Last years fundraiser for the Homeward Animal ...,1000.0,1000.0,43.0,https://images.gofundme.com/3hguyHJimIvbn6Cr6m...,https://www.gofundme.com/f/2nd-annual-massage-...,"Dog, Vertebrate, Carnivores, Snout, Working an..."
1,GCFA: Fundraiser for High Shelter Census and H...,CALLING ALL ANIMAL LOVERS & SHELTER SUPPORTERS...,1495.0,5000.0,78.0,https://images.gofundme.com/bSnOcdg33RMZfoYWlt...,https://www.gofundme.com/f/help-gcfa-ferrets-b...,"Snout, Advertising, Photo caption, Rodent, Fur..."
2,Help for Balkanabat Dog Haven,"My name is Dmitry, I live in Balkanabat, Turkm...",174.0,13000.0,7.0,https://images.gofundme.com/ca0ib-jbVfhNWaslYL...,https://www.gofundme.com/f/help-for-balkanabat...,"Dog, Carnivores, Street dog, Snout, Working an..."
3,Support Daniel's Eagle Scout Project for the A...,"Hi, my name is Daniel Watrous. I am a Life Sco...",1500.0,1500.0,531.0,https://images.gofundme.com/H-EKZ8nalH3eeYBBBK...,https://www.gofundme.com/f/support-daniels-eag...,"Facial expression, Smile, Scout, Carnivores, F..."
4,Marathon Madness for a Animal Shelter & Dement...,"Hey, I’m Emma ☺️, and I’ve made the (probably ...",1031.25,625.0,378.0,https://images.gofundme.com/26ICpSFlt_932tMOW7...,https://www.gofundme.com/f/marathon-madness-fo...,"Body of water, Coast, Sea, Coastal and oceanic..."


In [4]:
# Create binary column

# Compute median
median_value = cleaned_df["amount_usd"].median()

# Create binary column (1 = high money, 0 = low money)
cleaned_df["binary"] = (cleaned_df["amount_usd"] >= median_value).astype(int)

cleaned_df[["amount_usd", "binary"]].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df["binary"] = (cleaned_df["amount_usd"] >= median_value).astype(int)


Unnamed: 0,amount_usd,binary
0,1000.0,0
1,1495.0,1
2,174.0,0
3,1500.0,1
4,1031.25,0


## Task D: Logistic Regression

In [5]:
campaigns_file = cleaned_df.copy()
campaigns_file = campaigns_file.dropna(subset=["duration_days"])
campaigns_file.head()

Unnamed: 0,title,description,amount_usd,goal_usd,duration_days,cover_image,url,image_labels,binary
0,2nd Annual Massage fundraiser for Homeward Ani...,Last years fundraiser for the Homeward Animal ...,1000.0,1000.0,43.0,https://images.gofundme.com/3hguyHJimIvbn6Cr6m...,https://www.gofundme.com/f/2nd-annual-massage-...,"Dog, Vertebrate, Carnivores, Snout, Working an...",0
1,GCFA: Fundraiser for High Shelter Census and H...,CALLING ALL ANIMAL LOVERS & SHELTER SUPPORTERS...,1495.0,5000.0,78.0,https://images.gofundme.com/bSnOcdg33RMZfoYWlt...,https://www.gofundme.com/f/help-gcfa-ferrets-b...,"Snout, Advertising, Photo caption, Rodent, Fur...",1
2,Help for Balkanabat Dog Haven,"My name is Dmitry, I live in Balkanabat, Turkm...",174.0,13000.0,7.0,https://images.gofundme.com/ca0ib-jbVfhNWaslYL...,https://www.gofundme.com/f/help-for-balkanabat...,"Dog, Carnivores, Street dog, Snout, Working an...",0
3,Support Daniel's Eagle Scout Project for the A...,"Hi, my name is Daniel Watrous. I am a Life Sco...",1500.0,1500.0,531.0,https://images.gofundme.com/H-EKZ8nalH3eeYBBBK...,https://www.gofundme.com/f/support-daniels-eag...,"Facial expression, Smile, Scout, Carnivores, F...",1
4,Marathon Madness for a Animal Shelter & Dement...,"Hey, I’m Emma ☺️, and I’ve made the (probably ...",1031.25,625.0,378.0,https://images.gofundme.com/26ICpSFlt_932tMOW7...,https://www.gofundme.com/f/marathon-madness-fo...,"Body of water, Coast, Sea, Coastal and oceanic...",0


In [6]:
campaigns_file.to_csv("campaigns_file.csv", index=False)

In [7]:
# Feature Engineering
def prepare_features(text_column):
    vectorizer = CountVectorizer(max_features=1000)
    X_text = vectorizer.fit_transform(campaigns_file[text_column].fillna("")).toarray()
    X_duration = campaigns_file["duration_days"].values.reshape(-1, 1)
    return np.hstack((X_text, X_duration))

# Prepare feature sets
X_labels = prepare_features("image_labels")
X_desc = prepare_features("description")

# Combine both BoW matrices + duration
X_combined = np.hstack((X_labels[:, :-1], X_desc[:, :-1], campaigns_file["duration_days"].values.reshape(-1, 1)))
y = campaigns_file["binary"]

In [8]:
# Train & Evaluate Models
def train_and_evaluate(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    return cm, acc

# Model 1: Image Labels + Duration
cm_labels, acc_labels = train_and_evaluate(X_labels, y)

# Model 2: Description Text + Duration
cm_desc, acc_desc = train_and_evaluate(X_desc, y)

# Model 3: Combined Features
cm_combined, acc_combined = train_and_evaluate(X_combined, y)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

### Display the results

In [9]:
print("Model 1: Image Labels + Duration")
print("Confusion Matrix:\n", cm_labels)
print("Accuracy:", round(acc_labels * 100, 2), "%\n")

print("Model 2: Description Text + Duration")
print("Confusion Matrix:\n", cm_desc)
print("Accuracy:", round(acc_desc * 100, 2), "%\n")

print("Model 3: Combined Image Labels and Description + Duration")
print("Confusion Matrix:\n", cm_combined)
print("Accuracy:", round(acc_combined * 100, 2), "%")

Model 1: Image Labels + Duration
Confusion Matrix:
 [[106  15]
 [ 32  45]]
Accuracy: 76.26 %

Model 2: Description Text + Duration
Confusion Matrix:
 [[105  16]
 [ 21  56]]
Accuracy: 81.31 %

Model 3: Combined Image Labels and Description + Duration
Confusion Matrix:
 [[106  15]
 [ 23  54]]
Accuracy: 80.81 %


### Trying Description without Duration

In [10]:
# Load dataset
df1 = campaigns_file.copy()

# Prepare BoW features for description
vectorizer = CountVectorizer(max_features=1000, stop_words='english')
X_desc = vectorizer.fit_transform(df1["description"].fillna("")).toarray()
y = df1["binary"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_desc, y, test_size=0.2, random_state=42)

# Train logistic regression
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Confusion Matrix:\n", cm)
print(f"Accuracy: {accuracy * 100:.2f}%")

Confusion Matrix:
 [[100  21]
 [ 38  39]]
Accuracy: 70.20%


### Conclusion
Combining Image + Text provides a richer feature set, allowing the model to leverage both visual appeal and narrative strength.
Adding duration accounts for time-based exposure, which influences fundraising success. After trying Description without the duration, we received 55.96% of accuracy.

Visual + Duration content is moderately predictive (63.21%).
Text + Duration performs slightly worse (60.1%).
Combining both improves accuracy to 67.88%, confirming that multimodal features (text + image) are effective for predicting campaign success.