# Audience Segmentation

## Set-up: Import + Config

In [1]:
import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer, util

  from tqdm.autonotebook import tqdm, trange


## Load data

Set file path according to location of CSV. Tip: Find the file using the left sidebar, click the three dots for more options, and copy the path.

In [2]:
dataset_file_path = "./PBL_DA_FA24_CAPSTONE_DATA.csv"

Detect CSV's encoding format using `chardet` library ... because this CSV has weird encoding

In [3]:
import chardet

with open(dataset_file_path, 'rb') as f:
    chardet_result = chardet.detect(f.read())
print(chardet_result)

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}


Load the CSV.

In [4]:
df = pd.read_csv(dataset_file_path, encoding=chardet_result['encoding'])
df.head()

Unnamed: 0,Audience Segment,Creative Size,Creative Messaging,Location,App/URL,Exchange,Device Make,Impressions,Clicks,Viewable Impressions,Measurable Impressions,Total Conversions,Gross Cost
0,Online Behavior>United States>Validated Demogr...,300x250,Subscribe Today - 10% Discount,"Los Angeles, California",abcnews.go.com,Google Ad Manager,Apple PC,10524,55.0,7601,10242,11.0,6136.133902
1,Online Behavior>United States>Finance Â» Finan...,320x50,Sign up Today - 10% Off,"Orlando-Daytona Beach, Florida",accuweather.com,PubMatic,Smartphone,18324,60.0,3145,13703,8.0,6996.231838
2,Online Behavior>United States>Hobbies and Leis...,300x250,Subscribe Today - 10% Discount,"San Francisco, California",accuweather.com,Google Ad Manager,Smartphone,7862,20.0,2152,6039,2.0,6506.433687
3,Online Behavior>United States>Intent Â» Auto B...,300x250,Sign up Now - 10% Off,"Grand Rapids-Kalamazoo, Michigan",accuweather.com,Xandr - Monetize SSP (AppNexus),Smartphone,10501,90.0,4126,7654,8.0,6244.517921
4,Online Behavior>United States>Beauty and Fitne...,300x250,Sign up Now - 10% Off,"Chicago, Illinois",accuweather.com,Magnite DV+,Apple PC,23156,73.0,4468,18821,15.0,6173.41289


Inspect `Audience Segment` column more closely. Notice formatting issues (e.g., Â» instead of >).

In [5]:
df["Audience Segment"].head()

0    Online Behavior>United States>Validated Demogr...
1    Online Behavior>United States>Finance Â» Finan...
2    Online Behavior>United States>Hobbies and Leis...
3    Online Behavior>United States>Intent Â» Auto B...
4    Online Behavior>United States>Beauty and Fitne...
Name: Audience Segment, dtype: object

## Data cleansing

### Remove null rows

In [6]:
# Filter rows where "Audience Segment" is NaN
nan_rows = df[df['Audience Segment'].isna()]
nan_rows

Unnamed: 0,Audience Segment,Creative Size,Creative Messaging,Location,App/URL,Exchange,Device Make,Impressions,Clicks,Viewable Impressions,Measurable Impressions,Total Conversions,Gross Cost
727,,,,,,,,,,,,,
728,,,,,,,,,,,,,


In [7]:
# Remove rows with NaN values in the "Audience Segment" column
df = df.dropna(subset=['Audience Segment'])

### Clean up delimiter (>)

In [8]:
# Clean up the 'Â»' character (and standardize surrounding whitespace)
df['Audience Segment'] = df['Audience Segment'].str.replace('Â»', '>', regex=False)
df['Audience Segment'] = df['Audience Segment'].str.replace('»', '>', regex=False)
df['Audience Segment'] = df['Audience Segment'].str.replace(' > ', '>', regex=False)
df['Audience Segment'] = df['Audience Segment'].str.replace('>', ' > ', regex=False)
df['Audience Segment']

0      Online Behavior > United States > Validated De...
1      Online Behavior > United States > Finance > Fi...
2      Online Behavior > United States > Hobbies and ...
3      Online Behavior > United States > Intent > Aut...
4      Online Behavior > United States > Beauty and F...
                             ...                        
722    US > Brand Propensities > Automotive > Ford Cr...
723    US Technology > Social Media > Likely Behavior...
724    US > B2B > B2B Decision Maker Responsibilities...
725                  Transactional > Q3 Fashion Shoppers
726    Lotame > Style, Fashion & Clothing > Men's Clo...
Name: Audience Segment, Length: 727, dtype: object

### Create tokens

In [9]:
# Example code for preprocessing
df['tokens'] = df['Audience Segment'].apply(lambda x: [token.strip() for token in x.split('>')])
df['tokens']

0      [Online Behavior, United States, Validated Dem...
1      [Online Behavior, United States, Finance, Fina...
2      [Online Behavior, United States, Hobbies and L...
3      [Online Behavior, United States, Intent, Auto ...
4      [Online Behavior, United States, Beauty and Fi...
                             ...                        
722    [US, Brand Propensities, Automotive, Ford Cred...
723    [US Technology, Social Media, Likely Behavior,...
724    [US, B2B, B2B Decision Maker Responsibilities,...
725                 [Transactional, Q3 Fashion Shoppers]
726    [Lotame, Style, Fashion & Clothing, Men's Clot...
Name: tokens, Length: 727, dtype: object

#### Eliminate noisy tokens

In [10]:
noise_tokens = ['United States', 'US', 'Online Behavior', 'Interest',
                'Propensity Models', 'Intent', 'validated', 'Reach', 'OnAudience',
                'ConneXions', 'Brands', 'The Changing Consumer', 'Lotame',
                'Mobile - US', 'Brand Propensities']

In [11]:
# Remove noise tokens from each token list
df['tokens'] = df['tokens'].apply(lambda tokens: [token for token in tokens if token not in noise_tokens])

# Display filtered tokens to verify
df['tokens']

0      [Validated Demographic, Gender and Age Combine...
1      [Finance, Financial Planning and Management, R...
2            [Hobbies and Leisure, Outdoors, Equestrian]
3                        [Auto Buyers, Car Make, Audi Q]
4      [Beauty and Fitness, Fitness, Fitness Instruct...
                             ...                        
722           [Automotive, Ford Credit Buyer Propensity]
723    [US Technology, Social Media, Likely Behavior,...
724    [B2B, B2B Decision Maker Responsibilities, Rea...
725                 [Transactional, Q3 Fashion Shoppers]
726    [Style, Fashion & Clothing, Men's Clothing Sho...
Name: tokens, Length: 727, dtype: object

In [12]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Initialize Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

def get_weighted_embedding(text):
    # Tokenize by splitting on '>'
    tokens = [token.strip() for token in text.split('>') if token.strip()]
    n = len(tokens)  # Number of tokens

    # Generate embeddings and weights
    weighted_embeddings = []
    for i, token in enumerate(tokens):
        embedding = model.encode(token)

        # Calculate weight: proportional to position (closer to leaf = higher weight)
        weight = (i + 1) / n  # You can experiment with different weighting schemes here
        weighted_embeddings.append(embedding * weight)

    # Sum the weighted embeddings to get the final embedding
    final_embedding = np.sum(weighted_embeddings, axis=0)
    return final_embedding

# Apply the function to each category in your DataFrame
df['weighted_embeddings'] = df['Audience Segment'].apply(get_weighted_embedding)

# Convert list of embeddings to a 2D array
X = np.array(df['weighted_embeddings'].tolist())

# Perform clustering
from sklearn.cluster import KMeans
num_clusters = 10  # Adjust based on your data
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
df['cluster'] = kmeans.fit_predict(X)

# Display results by cluster
for cluster_num in sorted(df['cluster'].unique()):
    print(f"Cluster {cluster_num + 1}:")
    cluster_items = df[df['cluster'] == cluster_num]['Audience Segment']
    for item in cluster_items:
        print(f"- {item}")
    print()


Cluster 1:
- US > B2B > B2B Decision Maker Responsibilities > Purchasing Utilities
- Online Behavior > United States > B2B > Occupation > Musician
- US > B2B > B2B Decision Maker Responsibilities > Financial Services
- Online Behavior > United States > B2B > Occupation > MechanicalEngineer
- Online Behavior > United States > Law and Government > Public Safety
- Online Behavior > United States > Jobs and Education > Education
- US > B2B > Purchase DM > Technology Services, Hardware and > or Software > I have no input into the final decision
- Online Behavior > United States > The Changing Consumer > Internet Connection
- Online Behavior > United States > Intent > Shopping > Consumer Electronics > Computers, Laptops
- Online Behavior > United States > Law and Government > Military
- US > B2B > B2B Decision Maker Responsibilities > Company Liability or Insurance
- US > B2B > B2B Decision Maker Responsibilities > Employee Benefits
- Online Behavior > United States > The Changing Consumer >



In [13]:
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

# Initialize Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

def get_weighted_embedding(text):
    # Tokenize by splitting on '>'
    tokens = [token.strip() for token in text.split('>') if token.strip()]
    n = len(tokens)  # Number of tokens

    # Generate embeddings and weights
    weighted_embeddings = []
    for i, token in enumerate(tokens):
        embedding = model.encode(token)

        # Calculate weight: proportional to position (closer to leaf = higher weight)
        weight = (i + 1) / n  # You can experiment with different weighting schemes here
        weighted_embeddings.append(embedding * weight)

    # Sum the weighted embeddings to get the final embedding
    final_embedding = np.sum(weighted_embeddings, axis=0)
    return final_embedding

# Apply the function to each category in your DataFrame
df['weighted_embeddings'] = df['Audience Segment'].apply(get_weighted_embedding)

# Convert list of embeddings to a 2D array
X = np.array(df['weighted_embeddings'].tolist())

# Perform clustering
num_clusters = 10  # Adjust based on your data
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
df['cluster'] = kmeans.fit_predict(X)

# Find representative label for each cluster
def find_representative_label(cluster_df):
    # Calculate the centroid of the cluster
    cluster_center = np.mean(np.array(cluster_df['weighted_embeddings'].tolist()), axis=0)

    # Find the item closest to the cluster center
    similarities = cosine_similarity([cluster_center], np.array(cluster_df['weighted_embeddings'].tolist()))
    closest_idx = similarities.argmax()

    # Get the representative item as the label
    representative_label = cluster_df.iloc[closest_idx]['Audience Segment']
    return representative_label

# Apply the function to each cluster to get the representative label
cluster_labels = {}
for cluster_num in df['cluster'].unique():
    cluster_df = df[df['cluster'] == cluster_num]
    cluster_label = find_representative_label(cluster_df)
    cluster_labels[cluster_num] = cluster_label

# Add the cluster label to the DataFrame
df['cluster_label'] = df['cluster'].map(cluster_labels)

# Display results by cluster with labels
for cluster_num in sorted(df['cluster'].unique()):
    print(f"Cluster {cluster_num + 1}: ({df[df['cluster'] == cluster_num]['cluster_label'].iloc[0]})")
    cluster_items = df[df['cluster'] == cluster_num]['Audience Segment']
    for item in cluster_items:
        print(f"- {item}")
    print()

Cluster 1: (US > B2B > B2B Decision Maker Responsibilities > Television providers > services)
- US > B2B > B2B Decision Maker Responsibilities > Purchasing Utilities
- Online Behavior > United States > B2B > Occupation > Musician
- US > B2B > B2B Decision Maker Responsibilities > Financial Services
- Online Behavior > United States > B2B > Occupation > MechanicalEngineer
- Online Behavior > United States > Law and Government > Public Safety
- Online Behavior > United States > Jobs and Education > Education
- US > B2B > Purchase DM > Technology Services, Hardware and > or Software > I have no input into the final decision
- Online Behavior > United States > The Changing Consumer > Internet Connection
- Online Behavior > United States > Intent > Shopping > Consumer Electronics > Computers, Laptops
- Online Behavior > United States > Law and Government > Military
- US > B2B > B2B Decision Maker Responsibilities > Company Liability or Insurance
- US > B2B > B2B Decision Maker Responsibilit



In [14]:
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.cluster import KMeans
from collections import Counter
import pandas as pd
from transformers import pipeline

# Initialize Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define noise tokens
noise_tokens = [
    'United States', 'US', 'Online Behavior', 'Interest',
    'Propensity Models', 'Intent', 'validated', 'Reach', 'OnAudience',
    'ConneXions', 'Brands', 'The Changing Consumer', 'Lotame',
    'Mobile - US', 'Brand Propensities', 'Interest Propensities'
]

# Preprocess tokens by splitting and removing noise tokens
df['tokens'] = df['Audience Segment'].apply(lambda x: [token.strip() for token in x.split('>')])
df['tokens'] = df['tokens'].apply(lambda tokens: [token for token in tokens if token not in noise_tokens])

# Function to get weighted embedding
def get_weighted_embedding(tokens):
    n = len(tokens)
    weighted_embeddings = []

    # Check if there are tokens left after filtering
    if n == 0:
        return np.zeros(model.get_sentence_embedding_dimension())

    for i, token in enumerate(tokens):
        embedding = model.encode(token)
        weight = (i + 1) / n  # Weight inversely proportional to position (closer to leaf = higher weight)
        weighted_embeddings.append(embedding * weight)

    final_embedding = np.sum(weighted_embeddings, axis=0)
    return final_embedding

# Apply the function to the 'tokens' column
df['weighted_embeddings'] = df['tokens'].apply(get_weighted_embedding)

# Convert list of embeddings to a 2D array
X = np.array(df['weighted_embeddings'].tolist())

# Perform clustering
num_clusters = 8  # Adjust based on your data
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
df['cluster'] = kmeans.fit_predict(X)

# Function to generate a representative label from tokens
def generate_cluster_label(cluster_df):
    all_tokens = [token for tokens in cluster_df['tokens'] for token in tokens]
    token_counts = Counter(all_tokens)

    # Select the most common tokens as cluster labels
    common_tokens = token_counts.most_common(3)  # Top 3 tokens
    if common_tokens:
        # Join the top tokens to create a representative label
        return ", ".join(token for token, count in common_tokens)
    return "Other"

# Assign cluster labels using the derived tokens
df['cluster_label'] = df.groupby('cluster').apply(generate_cluster_label).reset_index(drop=True)

# Display results by cluster with labels
for cluster_num in sorted(df['cluster'].unique()):
    print(f"Cluster {cluster_num + 1}: ({df[df['cluster'] == cluster_num]['cluster_label'].iloc[0]})")
    cluster_items = df[df['cluster'] == cluster_num]['tokens']
    for item in cluster_items:
        print(f"- {item}")
    print()

Cluster 1: (Media and Entertainment, Arts and Entertainment, Hobbies and Interest)
- ['Finance', 'Financial Planning and Management', 'Retirement and Pension']
- ['B2B', 'B2B Decision Maker Responsibilities', 'Purchasing Utilities']
- ['US Financial', 'Likely Credit Card', 'Card in Own Name', 'Any major Credit/Debit Card (Financial)']
- ['B2B', 'B2B Decision Maker Responsibilities', 'Financial Services']
- ['Activities and Interests', 'Financial Aid']
- ['Finance', 'Insurance', 'Home Insurance']
- ['Finance', 'Investing', 'Derivatives']
- ['B2B', 'Purchase DM', 'Technology Services, Hardware and', 'or Software', 'I have no input into the final decision']
- ['US Financial', 'Estimated Discretionary Spending (Financial)', 'Greater than $2,499']
- ['Sociodemographic', 'Estimated Current Home Value', '$160,000-$199,999']
- ['Personal Finance', 'Estate Planning']
- ['US Financial', 'Likely Attitude and Behavior', 'Bank Selection', 'Customer Service Very Important (Financial)']
- ['Real Esta

  df['cluster_label'] = df.groupby('cluster').apply(generate_cluster_label).reset_index(drop=True)


## Label documents with pre-defined labels

In [15]:
# Initialize model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Categories with related words
categories = {
    "Sports": ["sports", "soccer", "tennis", "basketball", "fitness", "athlete", "game", "team", "player", "workout", "NFL", "UFC"],
    "Finance": ["finance", "investment", "retirement", "planning", "banking", "savings", "stocks", "trading", "budget", "wealth", "credit", "credit card" "mortgage", "insurance"],
    "Entertainment": ["entertainment", "movies", "tv", "music", "streaming", "shows", "film", "concert", "series", "media", "news", "local news", "politics"],
    "Automotive": ["automotive", "car", "vehicle", "buying", "dealer", "maintenance", "repair", "driving", "truck", "road", "motorcycle"],
    "Demographics": ["age", "age", "age", "gender", "demographics", "demo", "population", "segment", "group", "generation", "household", "income", "education", "sociodemographic"],
    "Outdoors": ["outdoors", "nature", "equestrian", "hiking", "camping", "adventure", "wildlife", "trail", "park", "wilderness"],
    "Travel": ["travel", "tourism", "vacation", "trip", "destination", "flight", "hotel", "resort", "cruise", "journey", "airlines"],
    "Food and Drink": ["food", "drink", "cuisine", "restaurant", "cooking", "recipe", "dining", "gourmet", "snack", "meal", "flavor", "nutrition"],
    "Style": ["fashion", "clothing", "accessories", "trends", "design", "outfit", "wardrobe", "couture", "runway", "beauty"],
    "Technology": ["technology", "gadgets", "software", "smartphone" "hardware", "innovation", "device", "digital", "electronics", "AI", "tech", "internet", "telecom", "mobile"],
    "Health": ["health", "wellness", "nutrition", "medical", "disease", "treatment", "exercise", "therapy", "prevention", "mental health", "fitness"],
    "Beauty": ["beauty", "cosmetics", "skincare", "makeup", "grooming", "aesthetics", "fragrance", "personal care", "salon"],
    "Home Goods": ["home", "furniture", "decor", "appliances", "bedding", "kitchenware", "interior", "garden", "accessories", "living space"],
    "Relationships": ["relationship", "dating", "marriage", "family", "friendship", "connection", "partnership", "community"],
    "Education": ["education", "learning", "school", "college", "university", "courses", "teaching", "training", "study", "knowledge"],
    "Pets": ["pets", "dogs", "cats", "animal", "care", "training", "grooming", "veterinary", "adoption", "pet supplies"],
    "Real Estate": ["real estate", "property", "housing", "homeownership", "rental", "mortgage", "realty", "land", "development"],
    "B2B": ["B2B", "business to business", "B2B decision maker", "enterprise", "commercial", "client relations"],
    "Life Events": ["life event", "new parent", "job search", "graduation", "movers", "retirement", "marriage", "divorce", "military"],
    "Public Services": ["law and government", "public safety", "military", "government", "legal", "insurance", "emergency services"]
}

def apply_position_weights(tokens, weight_factor):
    """Apply position-based weights to tokens, giving more weight to later tokens"""
    if isinstance(tokens, str):
        tokens = tokens.split()

    n = len(tokens)
    if n == 0:
        return ""

    # Calculate weights for each position
    # Later positions get higher weights
    weights = [1 + (i * weight_factor) for i in range(n)]

    # Repeat tokens based on their weights
    weighted_tokens = []
    for token, weight in zip(tokens, weights):
        # Round weight to nearest integer and repeat token that many times
        repeats = max(1, round(weight))
        weighted_tokens.extend([token] * repeats)

    return " ".join(weighted_tokens)

def calculate_category_embeddings(categories, weight_factor):
    """Calculate embeddings for each category by combining all keywords"""
    category_embeddings = {}
    for category, keywords in categories.items():
        # Apply position weights to keywords
        weighted_text = apply_position_weights(keywords, weight_factor)
        category_embeddings[category] = model.encode(weighted_text, convert_to_tensor=True)
    return category_embeddings

def assign_best_category(tokens, category_embeddings, threshold, weight_factor):
    """Assign category based on cosine similarity with threshold"""
    if not tokens:  # Handle empty tokens
        return "Other"

    # Apply position weights to input tokens
    if isinstance(tokens, list):
        weighted_text = apply_position_weights(tokens, weight_factor)
    else:
        weighted_text = apply_position_weights(str(tokens), weight_factor)

    # Generate document embedding
    doc_embedding = model.encode(weighted_text, convert_to_tensor=True)

    # Calculate similarities with all categories
    similarities = {}
    for category, cat_embedding in category_embeddings.items():
        similarity = util.pytorch_cos_sim(doc_embedding, cat_embedding).item()
        similarities[category] = similarity

    # Find best match
    best_category = max(similarities.items(), key=lambda x: x[1])

    # Return best category if above threshold, otherwise "Other"
    if best_category[1] >= threshold:
        return best_category[0]
    return "Other"

def classify_segments(df, min_threshold=0.3, weight_factor=0.1):
    """Main function to classify all segments in the dataframe"""
    # Calculate category embeddings once with weight factor
    category_embeddings = calculate_category_embeddings(categories, weight_factor)

    # Apply classification to each row with weight factor
    df['assigned_category'] = df['tokens'].apply(
        lambda x: assign_best_category(x, category_embeddings, min_threshold, weight_factor)
    )

    return df

def display_categories(df):
    # Calculate the percentage of labeled rows
    total_rows = len(df)
    labeled_rows = len(df[df['assigned_category'] != 'Other'])
    pct_labeled = (labeled_rows / total_rows) * 100 if total_rows > 0 else 0

    print(f"Labeled: {pct_labeled:.2f}%")

    # Group by assigned category and display segments
    grouped = df.groupby('assigned_category')
    for category, group in grouped:
        print(f"\n{category} ({len(group)} items):")
        for segment in group['Audience Segment']:
            print(f"- {segment}")

df = classify_segments(df, min_threshold=0.225, weight_factor=0.4)
# display_categories(df)

In [16]:
# Rename column 'assigned_category' to 'Audience Segment - Label'
df.rename(columns={'assigned_category': 'Audience Segment - Label'}, inplace=True)

# Drop 'tokens' column
df.drop(columns=['tokens', 'weighted_embeddings', 'cluster', 'cluster_label'], inplace=True)

# Move 'Audience Segment - Label' column to next to original 'Audience Segment' column
columns = df.columns.tolist()

audience_segment_index = columns.index('Audience Segment')
columns.insert(audience_segment_index + 1, columns.pop(columns.index('Audience Segment - Label')))

df = df[columns]

In [17]:
# Final check
df.head()

Unnamed: 0,Audience Segment,Audience Segment - Label,Creative Size,Creative Messaging,Location,App/URL,Exchange,Device Make,Impressions,Clicks,Viewable Impressions,Measurable Impressions,Total Conversions,Gross Cost
0,Online Behavior > United States > Validated De...,Demographics,300x250,Subscribe Today - 10% Discount,"Los Angeles, California",abcnews.go.com,Google Ad Manager,Apple PC,10524,55.0,7601,10242,11.0,6136.133902
1,Online Behavior > United States > Finance > Fi...,Finance,320x50,Sign up Today - 10% Off,"Orlando-Daytona Beach, Florida",accuweather.com,PubMatic,Smartphone,18324,60.0,3145,13703,8.0,6996.231838
2,Online Behavior > United States > Hobbies and ...,Outdoors,300x250,Subscribe Today - 10% Discount,"San Francisco, California",accuweather.com,Google Ad Manager,Smartphone,7862,20.0,2152,6039,2.0,6506.433687
3,Online Behavior > United States > Intent > Aut...,Automotive,300x250,Sign up Now - 10% Off,"Grand Rapids-Kalamazoo, Michigan",accuweather.com,Xandr - Monetize SSP (AppNexus),Smartphone,10501,90.0,4126,7654,8.0,6244.517921
4,Online Behavior > United States > Beauty and F...,Health,300x250,Sign up Now - 10% Off,"Chicago, Illinois",accuweather.com,Magnite DV+,Apple PC,23156,73.0,4468,18821,15.0,6173.41289


In [18]:
# Save labeled data to CSV
df.to_csv('data_cleaned.csv', index=False) 