# Weighted Score Notebook

This notebook computes similarities between brand names using the final weighted metric implemented in `similarities.py`. The process involves:

- **Computing Similarity:**  
  The final metric is applied to compare brand names and generate similarity scores.

- **Threshold Filtering:**  
  Only brand pairs with similarity scores above a specified threshold are considered.

- **Grouping:**  
  Brands with high similarity are grouped together, consolidating variations of the same brand into unified clusters.

- **Output:**  
  Creates an output in the same format as the reference one provided by Criteo

In [13]:
import pandas as pd
pd.set_option("display.max_rows", None)
import unicodedata
import gzip
import json
from similarities import filter_brands, weighted_score


In [14]:
df = pd.read_csv("df_fully_cleaned.csv")
df_software = df[df['category_level_1']=='Software']
df.head(5)

Unnamed: 0,row_id,brand_name,language_code,google_category_id,row_weight,clean_brand_name,category_level_1,category_level_2,category_level_3,category_level_4,category_level_5,category_level_6,category_level_7
0,0,,en,691,0.505044,-1,Home & Garden,Lawn & Garden,Gardening,Disease Control,,,
1,1,,ja,325,0.632272,-1,Electronics,Computers,Desktop Computers,,,,
2,2,,nl,176,0.492944,-1,Apparel & Accessories,Clothing Accessories,Neckties,,,,
3,3,,pt,1033,0.582456,-1,Sporting Goods,Outdoor Recreation,Hunting & Shooting,Archery,,,
4,4,100%,de,499845,0.355131,100,Sporting Goods,Outdoor Recreation,Winter Sports & Activities,Skiing & Snowboarding,,,


In [15]:
df = df.sample(frac=0.2, random_state=1) 

In [16]:
df.shape

(12582, 13)

In [17]:
df['brand_name'].nunique()

6186

In [18]:

filtered_brands_custom = filter_brands(
    df,
    'clean_brand_name',
    threshold=65,  
    score_func=weighted_score,  
    brand_weight=2.0,  
    common_weight=1.0
)

print("Brand pairs with Weighted score above threshold:")
for b1, b2, score in filtered_brands_custom:
    print(f"{b1} and {b2}: {score}%")


Brand pairs with Weighted score above threshold:
nike and nike swim: 66%
rick owens and rick owens kids: 75%
calvin klein and calvin klein jeans: 75%
calvin klein and calvin klein performance: 75%
gucci and gucci eyewear: 66%
gucci and gucci kids: 66%
offwhite and offwhite kids: 66%
goya and stine goya: 66%
disney and disney princess: 66%
versace and versace kids: 66%
versace and versace jeans: 66%
armani jeans and armani: 66%
balenciaga and balenciaga kids: 66%
dkny and dkny sport: 66%
ralph lauren collection and ralph lauren: 75%
ralph lauren collection and lauren ralph lauren: 75%
karl lagerfeld and karl lagerfeld kids: 75%
marc jacobs and marc jacobs kids: 75%
stella mccartney kids and stella mccartney: 75%
michael kors and michael kors collection: 75%
michael kors and michael michael kors: 100%
missoni and missoni home: 66%
missoni and m missoni: 66%
missoni and missoni mare: 66%
agua bendita and agua by agua bendita: 75%
joseph joseph and joseph: 100%
fendi and fendi kids: 66%
ca

In [19]:
def group_brands_by_common_words(filtered_brands):
    """
    Groups brands into clusters based on similarity and assigns a key based on common words,
    preserving their order from the first brand in the cluster.

    Parameters:
      filtered_brands: List of tuples (brand1, brand2, score) from filter_brands function.

    Returns:
      A dictionary where:
        - Keys are the common words among brands in a cluster, preserving their order.
        - Values are lists of brands that share those common words.
    """
    from collections import defaultdict

    brand_graph = defaultdict(set)

    # Build a similarity graph
    for brand1, brand2, _ in filtered_brands:
        brand_graph[brand1].add(brand2)
        brand_graph[brand2].add(brand1)

    # Perform connected components clustering
    visited = set()
    clusters = []

    def dfs(brand, cluster):
        """Depth-first search to find all connected brands."""
        if brand in visited:
            return
        visited.add(brand)
        cluster.append(brand)
        for neighbor in brand_graph[brand]:
            dfs(neighbor, cluster)

    for brand in brand_graph:
        if brand not in visited:
            cluster = []
            dfs(brand, cluster)
            clusters.append(cluster)

    # Convert clusters into a dictionary with common words as keys, preserving order
    clustered_dict = {}

    for cluster in clusters:
        if not cluster:
            continue

        # Tokenize the first brand while maintaining word order
        first_brand_words = cluster[0].split()

        # Find common words while keeping the original order
        for brand in cluster[1:]:
            brand_words = set(brand.split())  # Use a set for quick lookup
            first_brand_words = [word for word in first_brand_words if word in brand_words]

        # Create the key as a string of common words in order
        key = " ".join(first_brand_words) if first_brand_words else cluster[0]

        clustered_dict[key] = cluster

    return clustered_dict


brand_clusters_dict = group_brands_by_common_words(filtered_brands_custom)
print(brand_clusters_dict)


{'nike': ['nike', 'nike swim'], 'rick owens': ['rick owens', 'rick owens kids'], 'calvin klein': ['calvin klein', 'calvin klein jeans', 'calvin klein performance'], 'gucci': ['gucci', 'gucci kids', 'gucci eyewear'], 'offwhite': ['offwhite', 'offwhite kids'], 'goya': ['goya', 'stine goya'], 'disney': ['disney', 'disney princess'], 'versace': ['versace', 'versace jeans', 'versace kids'], 'armani': ['armani jeans', 'armani', 'armani exchange'], 'balenciaga': ['balenciaga', 'balenciaga kids'], 'dkny': ['dkny', 'dkny sport'], 'ralph lauren': ['ralph lauren collection', 'ralph lauren', 'polo ralph lauren', 'polo ralph lauren kids', 'lauren ralph lauren'], 'karl lagerfeld': ['karl lagerfeld', 'karl lagerfeld kids'], 'marc jacobs': ['marc jacobs', 'marc jacobs kids'], 'stella mccartney': ['stella mccartney kids', 'stella mccartney'], 'michael kors': ['michael kors', 'michael kors collection', 'michael michael kors'], 'missoni': ['missoni', 'missoni mare', 'missoni home', 'm missoni'], 'agua be

In [20]:
groups = pd.DataFrame(list(brand_clusters_dict.items()), columns=["Key", "Values"])

groups["Values"] = groups["Values"].apply(lambda x: ', '.join(x))
print(groups.to_string(index=False))

                Key                                                                                                Values
               nike                                                                                       nike, nike swim
         rick owens                                                                           rick owens, rick owens kids
       calvin klein                                            calvin klein, calvin klein jeans, calvin klein performance
              gucci                                                                      gucci, gucci kids, gucci eyewear
           offwhite                                                                               offwhite, offwhite kids
               goya                                                                                      goya, stine goya
             disney                                                                               disney, disney princess
            versace     

In [21]:
def map_brands_to_clusters(df, brand_column, cluster_column, brand_clusters_dict):
    """
    Adds a new column to the DataFrame that maps each brand name to its corresponding cluster key.

    Parameters:
      df: Pandas DataFrame containing the brand names.
      brand_column: Name of the column with original brand names.
      cluster_column: Name of the new column to store the mapped cluster keys.
      brand_clusters_dict: Dictionary where keys are common words and values are lists of similar brands.

    Returns:
      A modified DataFrame with an additional column containing the cluster key for each brand name.
    """
    brand_to_cluster = {}

    # Create a mapping of brand to its cluster key
    for key, brands in brand_clusters_dict.items():
        for brand in brands:
            brand_to_cluster[brand] = key

    # Create a new column with the cluster key
    df[cluster_column] = df[brand_column].apply(lambda x: brand_to_cluster.get(x, x))

    return df

df = map_brands_to_clusters(df, 'clean_brand_name', 'brand_cluster', brand_clusters_dict)
df.head(5)


Unnamed: 0,row_id,brand_name,language_code,google_category_id,row_weight,clean_brand_name,category_level_1,category_level_2,category_level_3,category_level_4,category_level_5,category_level_6,category_level_7,brand_cluster
54284,54284,inlzdz,en,5598,0.289238,inlzdz,Apparel & Accessories,Clothing,Outerwear,Coats & Jackets,,,,inlzdz
32632,32632,Gestuz,de,212,0.431939,gestuz,Apparel & Accessories,Clothing,Shirts & Tops,,,,,gestuz
36847,36847,Nike,en,3141,0.402295,nike,Sporting Goods,Athletics,Soccer,Soccer Gloves,,,,nike
57307,57307,,de,504637,0.549798,-1,Hardware,Plumbing,Plumbing Fixture Hardware & Parts,Drain Components,,,,-1
5732,5732,Kirin,en,204,0.306462,kirin,Apparel & Accessories,Clothing,Pants,,,,,kirin


In [22]:
#drop clean_brand_name column
df.head(10)

Unnamed: 0,row_id,brand_name,language_code,google_category_id,row_weight,clean_brand_name,category_level_1,category_level_2,category_level_3,category_level_4,category_level_5,category_level_6,category_level_7,brand_cluster
54284,54284,inlzdz,en,5598,0.289238,inlzdz,Apparel & Accessories,Clothing,Outerwear,Coats & Jackets,,,,inlzdz
32632,32632,Gestuz,de,212,0.431939,gestuz,Apparel & Accessories,Clothing,Shirts & Tops,,,,,gestuz
36847,36847,Nike,en,3141,0.402295,nike,Sporting Goods,Athletics,Soccer,Soccer Gloves,,,,nike
57307,57307,,de,504637,0.549798,-1,Hardware,Plumbing,Plumbing Fixture Hardware & Parts,Drain Components,,,,-1
5732,5732,Kirin,en,204,0.306462,kirin,Apparel & Accessories,Clothing,Pants,,,,,kirin
37507,37507,Honda,en,3020,0.613543,honda,Vehicles & Parts,Vehicle Parts & Accessories,Motor Vehicle Parts,Motor Vehicle Wheel Systems,,,,honda
59378,59378,,it,5866,0.551702,-1,Home & Garden,Lawn & Garden,Outdoor Power Equipment,Tractors,,,,-1
9558,9558,Andoer,en,505295,0.309084,andoer,Electronics,Electronics Accessories,Power,Power Adapters & Chargers,,,,andoer
12354,12354,XT-XINTE,en,2414,0.407141,xtxinte,Electronics,Electronics Accessories,Computer Components,Storage Devices,,,,xtxinte
26348,26348,Advantus,en,925,0.236345,advantus,Office Supplies,Filing & Organization,File Boxes,,,,,advantus


In [23]:
df[['clean_brand_name', 'brand_cluster']].head(500)
check = df[df['clean_brand_name'] != df['brand_cluster']][['clean_brand_name', 'brand_cluster']].head(500)

In [24]:
check.head(50)

Unnamed: 0,clean_brand_name,brand_cluster
9750,armani jeans,armani
8340,ralph lauren collection,ralph lauren
50229,stella mccartney kids,stella mccartney
56890,canada goose kids,canada goose
38868,filling pieces,pieces
24066,christian dior,dior
33030,nike swim,nike
51593,paco rabanne,rabanne
8684,christian dior,dior
16760,fanatics authentic,fanatics


In [25]:
check.shape

(172, 2)

In [26]:
df.shape

(12582, 14)

In [27]:
df['brand_cluster'].nunique()

5720

In [28]:
#save the dataframe to a csv file
df.to_csv("df_mapped.csv", index=False)

In [29]:
groups = (
    df.groupby("brand_cluster", as_index=False)
      .agg(
          row_ids=("row_id", list),
          brand_names=("brand_name", list),
          categories=("google_category_id", list),
          row_weights=("row_weight", list),
          languages=("language_code", list),
      )
      # Rename brand_cluster to slug for consistency
      .rename(columns={"brand_cluster": "slug"})
      .assign(
          # Create a dict mapping each language to the corresponding brand name
          localization=lambda x: x[["languages", "brand_names"]].apply(
              lambda row: dict(zip(*row)), axis=1
          ),
          n_brand_names=lambda x: x["brand_names"].apply(lambda l: len(set(l))),
          total_weight=lambda x: x["row_weights"].apply(sum),
          n_languages=lambda x: x["languages"].apply(lambda l: len(set(l))),
          n_categories=lambda x: x["categories"].apply(lambda l: len(set(l))),
      )
      .sort_values(by="n_categories", ascending=False)
      .reset_index(drop=True)
      .reset_index()                        # Make a fresh numeric index
      .rename(columns={"index": "group_id"}) # Call that index "group_id"
      .assign(group_id=lambda x: x["group_id"] - 1)
)

desired_cols = [
    "group_id", "slug", "row_ids", "brand_names", "categories", 
    "row_weights", "languages", "localization", 
    "n_brand_names", "total_weight", "n_languages", "n_categories"
]
groups = groups[desired_cols]

groups.head(10)


Unnamed: 0,group_id,slug,row_ids,brand_names,categories,row_weights,languages,localization,n_brand_names,total_weight,n_languages,n_categories
0,-1,-1,"[57307, 59378, 33579, 5946, 8662, 5943, 38581,...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[504637, 5866, 499898, 5423, 1505, 1463, 525, ...","[0.5497975721, 0.5517016236, 0.5084593272, 0.4...","[de, it, de, fr, en, en, ko, it, en, es, pt, k...","{'de': nan, 'it': nan, 'fr': nan, 'en': nan, '...",67,1043.086962,10,1155
1,0,disney,"[45686, 23448, 43300, 51506, 48882, 46640, 703...","[Disney, Disney, Disney, Disney, Disney, Disne...","[674, 1243, 6397, 211, 397, 3601, 5409, 499845...","[0.4732669698, 0.3633569839, 0.273265832, 0.41...","[es, en, en, en, en, en, en, en, en, en, en, e...","{'es': 'Disney', 'en': 'Disney'}",2,23.282313,2,58
2,1,vevor,"[8038, 52528, 29662, 39966, 52968, 17904, 1119...","[VEVOR, VEVOR, VEVOR, VEVOR, VEVOR, VEVOR, VEV...","[952, 586, 3436, 3684, 616, 1194, 1184, 8236, ...","[0.2630395468, 0.2980161196, 0.349981633, 0.38...","[en, en, en, en, en, en, en, en, en, en, en, e...",{'en': 'VEVOR'},1,14.008752,1,43
3,2,bosch,"[57553, 58572, 17145, 14797, 13931, 23363, 130...","[Bosch, Bosch, Bosch, Bosch, Bosch, Bosch, BOS...","[264, 503737, 2549, 618, 1203, 2727, 619, 8236...","[0.3479219774, 0.3704948612, 0.5129732222, 0.4...","[en, de, es, de, de, en, fr, en, es, en, en, p...","{'en': 'Bosch', 'de': 'Bosch', 'es': 'Bosch', ...",2,15.920517,6,31
4,3,calvin klein,"[871, 32193, 52314, 41944, 2802, 21630, 39362,...","[Calvin Klein, Calvin Klein, Calvin Klein, Cal...","[5183, 2668, 5424, 212, 178, 1831, 479, 2580, ...","[0.3805605416, 0.4281684108, 0.3221414653, 0.5...","[en, de, en, nl, de, de, en, de, en, en, de, d...","{'en': 'Calvin Klein', 'de': 'Calvin Klein', '...",4,13.444201,5,26
5,4,sanrio,"[11578, 31601, 54627, 17790, 19468, 10934, 157...","[Sanrio, Sanrio, Sanrio, Sanrio, Sanrio, Sanri...","[6065, 5622, 961, 3584, 478, 4546, 173, 2271, ...","[0.5170857679, 0.349981633, 0.4898816621, 0.42...","[en, en, en, en, en, en, en, en, en, en, en, e...",{'en': 'Sanrio'},2,10.722983,1,26
6,5,xiaomi,"[2648, 45421, 61693, 45144, 6412, 46810, 22359...","[Xiaomi, Xiaomi, Xiaomi, XIAOMI, Xiaomi, XIAOM...","[3809, 155, 2524, 4745, 608, 267, 4745, 249, 5...","[0.3416868727, 0.3088440546, 0.3006515236, 0.3...","[en, en, en, en, en, es, en, en, en, en, es, e...","{'en': 'XIAOMI', 'es': 'XIAOMI', 'fr': 'XIAOMI'}",2,13.090949,3,25
7,6,philips,"[13171, 24225, 41839, 41103, 22340, 49375, 646...","[Philips, Philips, Philips Hue, Philips, Phili...","[2524, 533, 3329, 505666, 2690, 533, 2690, 738...","[0.4662755742, 0.4987983083, 0.3980701599, 0.3...","[de, pt, en, es, en, fr, de, en, en, en, en, d...","{'de': 'Philips', 'pt': 'Philips', 'en': 'Phil...",3,12.324644,5,23
8,7,samsung,"[7308, 37106, 42779, 21799, 48775, 7900, 4873,...","[SAMSUNG, Samsung, Samsung, Samsung, Samsung, ...","[264, 328, 276, 680, 679, 505295, 276, 201, 50...","[0.4842467192, 0.4699362053, 0.4398828814, 0.4...","[es, en, de, en, en, en, en, es, fr, fr, es, e...","{'es': 'Samsung', 'en': 'Samsung', 'de': 'Sams...",2,12.186709,4,22
9,8,adidas,"[34508, 53595, 35983, 25847, 56543, 43039, 884...","[adidas, adidas, adidas, Adidas, Adidas, Adida...","[7313, 204, 5608, 100, 173, 173, 2271, 1112, 1...","[0.4298689674, 0.5017505179, 0.314223896, 0.40...","[de, fr, en, en, de, es, de, en, de, de, de, e...","{'de': 'Adidas', 'fr': 'adidas', 'en': 'adidas...",3,17.796716,7,22


In [30]:
filtered_brands_custom = filter_brands(
    df_software,
    'clean_brand_name',
    threshold=0,
    score_func=weighted_score,  
    brand_weight=2.0,  
    common_weight=1.0
)

In [31]:
print("Brand pairs with Weighted score above threshold:")
for b1, b2, score in filtered_brands_custom:
    print(f"{b1} and {b2}: {score}%")

Brand pairs with Weighted score above threshold:
university games and outright games: 50%
university games and rockstar games: 40%
university games and ci games: 40%
university games and maximum games: 50%
university games and 505 games: 40%
university games and telltale games: 50%
university games and skybound games: 40%
university games and nordic games: 50%
university games and 2k games: 40%
university games and merge games: 50%
university games and epic games: 50%
outright games and rockstar games: 40%
outright games and ci games: 40%
outright games and maximum games: 50%
outright games and 505 games: 40%
outright games and telltale games: 50%
outright games and skybound games: 40%
outright games and nordic games: 50%
outright games and 2k games: 40%
outright games and merge games: 50%
outright games and epic games: 50%
bethesda softworks and bethesda: 33%
nintendo and nintendo switch: 66%
rockstar games and ci games: 33%
rockstar games and maximum games: 40%
rockstar games and 505

In [42]:
import plotly.graph_objects as go
import networkx as nx
import matplotlib.pyplot as plt

# Create the network graph
G = nx.Graph()

# Assuming ⁠ filtered_for_graph ⁠ contains tuples (brand1, brand2, score)
# Add all nodes first
for brand1, brand2, score in filtered_brands_custom:
    G.add_node(brand1)
    G.add_node(brand2)

    # Only add edges with score > 50%
    if score > 50:
        G.add_edge(brand1, brand2, weight=score)

# Find connected components (each group of linked brands)
connected_components = list(nx.connected_components(G))

# Generate a color for each connected component
num_components = len(connected_components)
color_map = plt.cm.get_cmap('tab10', num_components)  # Use a color palette

# Assign colors to nodes based on their component
node_colors = {}
for i, component in enumerate(connected_components):
    color = color_map(i)[:3]  # Get RGB values
    color = f'rgb({int(color[0]*255)},{int(color[1]*255)},{int(color[2]*255)})'  # Convert to Plotly format
    for node in component:
        node_colors[node] = color

# Layout (positioning of nodes)
pos = nx.spring_layout(G, k=0.3, iterations=50, seed=42)

# Edge traces
edge_traces = []
for u, v, weight in G.edges(data='weight'):
    edge_width = weight / 30  # Adjust scaling as needed
    edge_traces.append(
        go.Scatter(
            x=[pos[u][0], pos[v][0], None],
            y=[pos[u][1], pos[v][1], None],
            mode='lines',
            line=dict(width=edge_width, color='red'),
            hoverinfo='none'
        )
    )

# Node trace (each connected group has a unique color)
node_trace = go.Scatter(
    x=[pos[node][0] for node in G.nodes()],
    y=[pos[node][1] for node in G.nodes()],
    mode='markers+text',
    marker=dict(
        size=15,
        color=[node_colors[node] for node in G.nodes()],  # Assign colors per component
        line_width=2
    ),
    text=[node for node in G.nodes()],
    textposition='top center',
    hoverinfo='text'
)

# Create the Plotly figure
fig = go.Figure(data=[node_trace] + edge_traces,
                layout=go.Layout(
                    title='Brand Similarity Network (Connected Brands)',
                    showlegend=False,
                    hovermode='closest',
                    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                    plot_bgcolor='white',
                    paper_bgcolor='white'
                ))

fig.show()

In [33]:
groups.shape

(5720, 12)

In [34]:
groups.to_csv("final_groups.csv")