In [53]:
import pandas as pd
import json
from torch.utils.data import Dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

## Load the JSON file and convert it to a DataFrame

In [66]:
# Load the target audiences from the JSON file.
# NOTE(review): this is an absolute, machine-specific path; consider deriving it
# from a configurable data directory so the notebook runs on other machines.
# The encoding is given explicitly because, without it, open() decodes with the
# platform's locale encoding, which makes the result depend on the machine.
with open('C:/Users/danie/anaconda3/envs/daniel_dev/git/data/test_audiences.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)
# The audience records are stored under the top-level 'test_audiences' key.
test_audiences = data['test_audiences']
# One row per audience record.
target_df = pd.DataFrame(test_audiences)
# Show the resulting frame.
print(target_df)

   segment_id                                        description
0           1  PartnerSolutions > 208838 > Interest > Home & ...
1           2  PartnerSolutions > 208758 > Interest > Sports ...
2           3  Technology & Computing - MediaGroup DACH - Ind...
3           4  Media & Enertainment - MediaGroup DACH - Movie...
4           5                         letriq age: 18-19 @adality
5           6                       letriq age: females @adality
6           7  HDGQ Branche/Industrie: automobil/fahrzeugbau/...
7           8  HDGQ Branche/Industrie: Finanzservices/Banken/...


## Load the source CSV file

In [56]:
# Read the semicolon-separated, ISO-8859-1 encoded source segment list and
# discard the 'Unnamed: 5' column in the same expression.
df = pd.read_csv(
    "~/anaconda3/envs/daniel_dev/git/data/source_segments_angepasst.csv",
    sep=';',
    encoding="ISO-8859-1",
).drop(columns=['Unnamed: 5'])
# Independent copy that the following cells work with.
source_df = df.copy()
source_df

Unnamed: 0,label_id_long,label_id,parent_id,segment_description,label_name
0,10000000000,1,0,,Demographic
1,10100000000,101,1,age,Demographic Age Range
2,10101000000,10101,101,age,Demographic Age Range 18-20
3,10102000000,10102,101,age,Demographic Age Range 18-24
4,10103000000,10103,101,age,Demographic Age Range 21-24
...,...,...,...,...,...
1625,51100000000,510,5,Frequency of video gaming,Purchases & Consumption Video Gaming
1626,51101000000,51001,510,Frequency of video gaming,Purchases & Consumption Video Gaming Console...
1627,51102000000,51002,510,Frequency of video gaming,Purchases & Consumption Video Gaming eSports
1628,51103000000,51003,510,Frequency of video gaming,Purchases & Consumption Video Gaming Mobile ...


## Build a Semantic Text Similarity Class

In [67]:
class STS():
    """Semantic Text Similarity matcher built on a SentenceTransformer model.

    The default model is a Sentence-BERT style model, i.e. a modification of
    BERT developed for text-similarity tasks; it was chosen because it is much
    faster than plain BERT and therefore more scalable.

    Attributes:
        model: the loaded ``SentenceTransformer`` used to embed the texts.
    """

    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        """Load the sentence-embedding model.

        Args:
            model_name: name or path handed to ``SentenceTransformer``.
        """
        super().__init__()
        self.model = SentenceTransformer(model_name)

    @staticmethod
    def _as_text_list(column):
        """Return the values of ``column`` as a plain list of strings.

        ``SentenceTransformer.encode`` is documented to take a string or a list
        of strings. A pandas Series is indexed by label, not by position, so
        handing it over directly only works by accident when the frame has a
        default 0..n-1 index. Missing values (None/NaN) become empty strings so
        that a single gap does not abort the whole encoding.
        """
        return ["" if pd.isna(value) else str(value) for value in column]

    def generate_embeddings(self, source_df, target_df):
        """Embed the source labels and the target descriptions.

        NOTE(review): an earlier comment stated that the embedding comes from
        the CLS token. How token outputs are pooled is decided by the loaded
        model's configuration; the model card of the default model describes
        mean pooling - confirm before relying on either.

        Args:
            source_df: frame with a ``label_name`` text column.
            target_df: frame with a ``description`` text column.

        Returns:
            Tuple ``(source_embeddings, target_embeddings)`` exactly as returned
            by ``self.model.encode``; rows follow the row order of the frames.
        """
        source_embeddings = self.model.encode(self._as_text_list(source_df["label_name"]))
        target_embeddings = self.model.encode(self._as_text_list(target_df["description"]))
        return source_embeddings, target_embeddings

    def find_most_similar(self, source_df, target_df):
        """Find, for every target row, the source row with the highest cosine similarity.

        Cosine similarity is the cosine of the angle between two embedding
        vectors: close to 1 when they point in roughly the same direction
        (high similarity), around 0 when they are unrelated, and towards -1
        when they point in opposite directions.

        Args:
            source_df: frame with a ``label_name`` text column.
            target_df: frame with a ``description`` text column.

        Returns:
            Tuple ``(max_similarities_idx, max_similarities)`` with one entry
            per target row. The indices are *positional* row numbers into
            ``source_df`` (use ``.iloc``), not index labels.

        Raises:
            ValueError: if either frame has no rows, since no best match exists.
        """
        if len(source_df) == 0 or len(target_df) == 0:
            raise ValueError("source_df and target_df must each contain at least one row")
        source_embeddings, target_embeddings = self.generate_embeddings(source_df, target_df)
        # Rows correspond to targets, columns to sources.
        similarities = cosine_similarity(target_embeddings, source_embeddings)
        max_similarities_idx = np.argmax(similarities, axis=1)
        max_similarities = np.max(similarities, axis=1)
        return max_similarities_idx, max_similarities


In [68]:
# Build the matcher with its default model, then look up the best-matching
# source segment (row position and score) for every target segment.
sts_instance = STS()
max_similarities_idx, max_similarities = sts_instance.find_most_similar(
    source_df=source_df,
    target_df=target_df,
)

## Print the output in a readable manner

In [81]:
# Report, for every target segment, the best-matching source segment and its score.
# max_similarities_idx comes from np.argmax and therefore holds *positional* row
# numbers, so rows are fetched with .iloc (position-based) instead of .loc
# (label-based); the two only coincide while both frames keep a default index.
n_source_rows = len(source_df)
for i, (idx, score) in enumerate(zip(max_similarities_idx, max_similarities)):
    if 0 <= idx < n_source_rows:
        # Look up label_id_long, segment_description and label_name of the best match.
        # Selecting the column first keeps each value's own dtype.
        label_id = source_df['label_id_long'].iloc[idx]
        label_description = source_df['segment_description'].iloc[idx]
        label_name = source_df['label_name'].iloc[idx]
        target_id = target_df['segment_id'].iloc[i]
        target_description = target_df['description'].iloc[i]
        # The closing quote replaces a stray '_' that left the description unbalanced.
        print(f"Target Segment '{target_id}', Description: '{target_description}'")
        print("is most similar to")
        print(f"Source Segment '{label_id}', Description: '{label_description}', label_name: '{label_name}' of the source list")
        print(f"The cosine similarity score is: {score}")
        print("-----------------------------------------------------------------------------")
    else:
        # Only reachable if source_df changed after the similarities were computed.
        print(f"No data found for index: {idx}")

Target Segment '1', Description: 'PartnerSolutions > 208838 > Interest > Home & Garden > Home Appliances > Kitchenware_
is most similar to
Source Segment '21408020000', Description: 'interets related to home and garden', label_name: 'Interest  Home & Garden  Kitchen and Dining Products  Tableware' of the source list
The cosine similarity score is: 0.6466765403747559
-----------------------------------------------------------------------------
Target Segment '2', Description: 'PartnerSolutions > 208758 > Interest > Sports > Football > Bayern Munich_
is most similar to
Source Segment '22340000000', Description: 'interest in consuming sports or equipment', label_name: 'Interest  Sports  Soccer' of the source list
The cosine similarity score is: 0.5065775513648987
-----------------------------------------------------------------------------
Target Segment '3', Description: 'Technology & Computing - MediaGroup DACH - Industrie 4.0 / Industry 4.0_
is most similar to
Source Segment '225040100

In [70]:
# Show the highest cosine-similarity score found for each target segment.
max_similarities

array([0.64667654, 0.50657755, 0.45226443, 0.49069205, 0.4975306 ,
       0.51691425, 0.32770324, 0.4813367 ], dtype=float32)