<a href="https://colab.research.google.com/github/graccelinn/Unstructured_Assignment_3/blob/main/TaskC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Task C: Create Binary Column

In [45]:
import pandas as pd
import numpy as np
import re
from datetime import datetime, timedelta

In [46]:
# Load in the data
url = "https://raw.githubusercontent.com/graccelinn/Unstructured_Assignment_3/main/campaigns_with_labels.csv"
campaigns = pd.read_csv(url)
campaigns.head()

Unnamed: 0,title,description,amount,goal,created,cover_image,url,image_labels
0,Help Baby Jenson Fight a Rare Brain Cancer,"Dear friends, family, and kind-hearted strange...","$49,046",55K,"September 17th, 2025",https://images.gofundme.com/vfQ_a6e1pE7GZrlsdM...,https://www.gofundme.com/f/help-baby-jenson-fi...,"Child, Hospital, Patient, Health Care, Medical..."
1,Dennis’s Fight Against Stage 4 Esophageal Cancer,Dennis was diagnosed with esophageal cancer on...,"$40,758",50K,"September 10th, 2025",https://images.gofundme.com/ybPT_3fBXkCoE-5h3P...,https://www.gofundme.com/f/denniss-fight-again...,"Mountainous landforms, Mountain, People in nat..."
2,Support Daniel DeMeza's Fight Against Cancer,Daniel DeMeza is a 20 year old kindhearted and...,"$15,755",20K,2 d ago,https://images.gofundme.com/AMU6oV2WJKrYi40Eft...,https://www.gofundme.com/f/support-daniel-deme...,"T-shirt, Arm, Wrist, Chair, Hearing, Active Sh..."
3,Stand with Alexis Gleason in Her Fight Against...,From Lisa Pinkham:\n\nLife can change in an in...,"$5,220",10K,2 d ago,https://images.gofundme.com/DLe9STiNA-dP-2_kuA...,https://www.gofundme.com/f/alexis-gleason-in-h...,"Smile, Cheek, Happiness, Eyebrow, Facial hair,..."
4,Support Eric Yeakel through Cancer Treatment,"On Tuesday, September 2, our dear friend Eric ...","$35,089",50K,"September 4th, 2025",https://images.gofundme.com/j256Flpa_mxzBPBEDl...,https://www.gofundme.com/f/support-eric-yeakel...,"Hand, Happiness, Formal wear, Event, Entertain..."


In [47]:
# Clean the data frame

# --- 1. Define exchange rates (approx as of 2025-10-02) ---
rates = {
    "USD": 1.0,
    "GBP": 1.25,   # 1 GBP ≈ 1.25 USD
    "EUR": 1.05,   # 1 EUR ≈ 1.05 USD
    "NOK": 0.095   # 1 NOK ≈ 0.095 USD
}

# --- 2. Detect currency symbols robustly ---
def detect_currency(x):
    if pd.isna(x):
        return "USD"
    x = str(x).strip()
    if x.startswith("$"):
        return "USD"
    elif x.startswith("£") or x.startswith("¬£"):
        return "GBP"
    elif x.startswith("€") or x.startswith("‚Ç¨"):
        return "EUR"
    elif x.lower().startswith("kr"):
        return "NOK"
    return "USD"

campaigns["currency"] = campaigns["amount"].apply(detect_currency)

# --- 3. Clean 'amount' to numeric USD ---
def clean_amount(x, currency):
    if pd.isna(x):
        return np.nan
    # keep digits and decimal only
    val = re.sub(r"[^\d.]", "", str(x))
    if val == "":
        return np.nan
    try:
        val = float(val.replace(",", ""))
    except:
        return np.nan
    return val * rates[currency]

campaigns["amount_usd"] = campaigns.apply(lambda row: clean_amount(row["amount"], row["currency"]), axis=1)

# --- 4. Clean 'goal' to numeric USD ---
def clean_goal(x, currency):
    if pd.isna(x):
        return np.nan
    val = str(x).upper().replace(",", "").strip()
    multiplier = 1
    if val.endswith("K"):
        multiplier = 1000
        val = val[:-1]
    elif val.endswith("M"):
        multiplier = 1_000_000
        val = val[:-1]
    val = re.sub(r"[^\d.]", "", val)
    if val == "":
        return np.nan
    try:
        val = float(val)
    except:
        return np.nan
    return val * multiplier * rates[currency]

campaigns["goal_usd"] = campaigns.apply(lambda row: clean_goal(row["goal"], row["currency"]), axis=1)

# --- 5. Clean 'created' column ---
reference_date = datetime(2025, 10, 2)  # scrape date

def clean_created(x):
    x = str(x).strip()

    # Handle relative dates (e.g. "2 d ago")
    if "d ago" in x:
        days = int(re.search(r"(\d+)", x).group(1))
        dt = reference_date - timedelta(days=days)
        return dt

    # Remove ordinal suffixes (1st → 1, 2nd → 2, etc.)
    x = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', x)

    # Try parsing with pandas datetime
    dt = pd.to_datetime(x, errors="coerce")
    return dt

campaigns["date_created_clean"] = campaigns["created"].apply(clean_created)

# --- Final cleaned dataframe ---
cleaned_df = campaigns[["title", "description", "amount_usd", "goal_usd", "date_created_clean", "cover_image", "url", "image_labels"]]
cleaned_df.head()

Unnamed: 0,title,description,amount_usd,goal_usd,date_created_clean,cover_image,url,image_labels
0,Help Baby Jenson Fight a Rare Brain Cancer,"Dear friends, family, and kind-hearted strange...",49046.0,55000.0,2025-09-17,https://images.gofundme.com/vfQ_a6e1pE7GZrlsdM...,https://www.gofundme.com/f/help-baby-jenson-fi...,"Child, Hospital, Patient, Health Care, Medical..."
1,Dennis’s Fight Against Stage 4 Esophageal Cancer,Dennis was diagnosed with esophageal cancer on...,40758.0,50000.0,2025-09-10,https://images.gofundme.com/ybPT_3fBXkCoE-5h3P...,https://www.gofundme.com/f/denniss-fight-again...,"Mountainous landforms, Mountain, People in nat..."
2,Support Daniel DeMeza's Fight Against Cancer,Daniel DeMeza is a 20 year old kindhearted and...,15755.0,20000.0,2025-09-30,https://images.gofundme.com/AMU6oV2WJKrYi40Eft...,https://www.gofundme.com/f/support-daniel-deme...,"T-shirt, Arm, Wrist, Chair, Hearing, Active Sh..."
3,Stand with Alexis Gleason in Her Fight Against...,From Lisa Pinkham:\n\nLife can change in an in...,5220.0,10000.0,2025-09-30,https://images.gofundme.com/DLe9STiNA-dP-2_kuA...,https://www.gofundme.com/f/alexis-gleason-in-h...,"Smile, Cheek, Happiness, Eyebrow, Facial hair,..."
4,Support Eric Yeakel through Cancer Treatment,"On Tuesday, September 2, our dear friend Eric ...",35089.0,50000.0,2025-09-04,https://images.gofundme.com/j256Flpa_mxzBPBEDl...,https://www.gofundme.com/f/support-eric-yeakel...,"Hand, Happiness, Formal wear, Event, Entertain..."


In [50]:
# Create binary column

# Compute median
median_value = cleaned_df["amount_usd"].median()

# Create binary column (1 = high $$, 0 = low $$)
cleaned_df["binary"] = (cleaned_df["amount_usd"] >= median_value).astype(int)

cleaned_df[["amount_usd", "binary"]].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df["binary"] = (cleaned_df["amount_usd"] >= median_value).astype(int)


Unnamed: 0,amount_usd,binary
0,49046.0,1
1,40758.0,1
2,15755.0,1
3,5220.0,0
4,35089.0,1
