# Softer Matching
- a matching strategy based on cosine similarity using BERT embeddings
- input: topics_filtered.csv (expanded), post_cleaned.csv
- output: post_filtered.csv

In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Tokenizer and encoder are both taken from the same pretrained checkpoint,
# 'bert-base-chinese'; the tokenizer is loaded first, then the model.
tokenizer, model = (
    BertTokenizer.from_pretrained('bert-base-chinese'),
    BertModel.from_pretrained('bert-base-chinese'),
)

def get_word_embedding(word):
    """Return a single embedding vector for *word* as a NumPy array.

    The text is tokenized with the module-level ``tokenizer`` (truncation and
    padding enabled, PyTorch tensors), run through the module-level ``model``
    without gradient tracking, and the last hidden state is averaged over
    dim 1 (presumably the token axis of a (batch, tokens, hidden) output).
    Size-1 dimensions are squeezed away before converting to NumPy.
    """
    encoded = tokenizer(word, return_tensors="pt", truncation=True, padding=True)
    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

# Read the fashion lexicon and collect the distinct, non-missing entries of
# its 'keyword group' column as strings.
fashion_lexicon_df = pd.read_csv('/Users/ddyilin/Documents/GitHub/fashion/topics_filtered.csv')
fashion_keywords = set(
    fashion_lexicon_df['keyword group']
    .dropna()
    .astype(str)
)

# Embed every lexicon keyword once up front so that matching never has to
# re-run the model on the lexicon side.
fashion_keyword_embeddings = dict(
    zip(fashion_keywords, map(get_word_embedding, fashion_keywords))
)

def calculate_cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity between two embedding arrays as a scalar.

    Each input is reshaped into a single-row 2-D array, which is the layout
    ``cosine_similarity`` is called with here; the only entry of the
    resulting 1x1 matrix is returned.
    """
    row_a = embedding1.reshape(1, -1)
    row_b = embedding2.reshape(1, -1)
    similarity_matrix = cosine_similarity(row_a, row_b)
    return similarity_matrix[0][0]

def soft_match_keywords(keywords, threshold=0.7):
    """Return the keywords that soft-match the fashion lexicon.

    A keyword is kept when the highest cosine similarity between its
    embedding and any precomputed embedding in ``fashion_keyword_embeddings``
    is at least ``threshold``.

    Args:
        keywords: Iterable of keyword strings. A plain ``str`` is iterated
            character by character, like any other iterable. ``None`` and
            float NaN (how pandas represents an empty CSV cell) are treated
            as "no keywords".
        threshold: Minimum best-match cosine similarity for a keyword to be
            kept.

    Returns:
        A list of the kept keywords, in input order, duplicates preserved.
        Empty when the input is missing/empty or the lexicon is empty.
    """
    # A missing cell is not iterable; treat it as an empty keyword list
    # instead of raising TypeError in the loop below.
    if keywords is None or (isinstance(keywords, float) and np.isnan(keywords)):
        return []

    fashion_related_keywords = []
    # Best similarity per distinct keyword, so a keyword that occurs several
    # times is embedded and compared only once per call.
    best_similarity = {}
    for keyword in keywords:
        if keyword not in best_similarity:
            keyword_embedding = get_word_embedding(keyword)
            # Start from -inf (not 0): cosine similarity can be negative, and
            # an empty lexicon must not count as a similarity of 0.
            best_similarity[keyword] = max(
                (
                    calculate_cosine_similarity(keyword_embedding, fashion_embedding)
                    for fashion_embedding in fashion_keyword_embeddings.values()
                ),
                default=float("-inf"),
            )
        if best_similarity[keyword] >= threshold:
            fashion_related_keywords.append(keyword)
    return fashion_related_keywords

# Read the cleaned posts.
df = pd.read_csv('/Users/ddyilin/Documents/GitHub/fashion/post_cleaned.csv')

# For every row, keep only the entries of 'cleaned_post_text' that soft-match
# the fashion lexicon (threshold 0.7) and store them in 'fashion_text'.
# NOTE(review): values read back from a CSV are presumably plain strings, which
# soft_match_keywords iterates character by character — confirm this is the
# intended granularity.
df['fashion_text'] = df['cleaned_post_text'].apply(soft_match_keywords, threshold=0.7)

# Write the posts, including the new 'fashion_text' column, without the index.
df.to_csv('/Users/ddyilin/Documents/GitHub/fashion/post_filtered.csv', index=False)


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Count how many posts ended up with no / at least one matched keyword.
# The matches are stored in the 'fashion_text' column created by the filtering
# step; there is no 'filtered_fashion_keywords' column in df, so reading it
# would raise KeyError.
keyword_counts = df['fashion_text'].apply(len)

# Count empty values
empty_values_count = (keyword_counts == 0).sum()

# Count non-empty values
non_empty_values_count = (keyword_counts > 0).sum()

# Print the results
print(f"Number of empty values in 'fashion_text': {empty_values_count}")
print(f"Number of non-empty values in 'fashion_text': {non_empty_values_count}")
