## Keyword Similarity Demo

0. [BASELINE] Get data and get baseline using Fuzzy Match!
1. Load USE, BERT and PALM
2. Get embeddings from each LM for the taxonomy and store
3. [EMBEDDING MATCH] Input Keyword and output closest matches from the taxonomy
4. Compare the embedding based keyword category matching with fuzzy matching
5. Input Keyword List (file) and output closest matches for each

### Setup

In [None]:
!pip install fuzzywuzzy
!pip install tensorflow-text
!pip install -U google-generativeai

### Utility Functions

In [None]:
def get_overlap_score(df, compare_category_col):
  """Return the percentage of rows whose predicted category equals the label.

  Args:
    df: DataFrame with the ground-truth labels in a 'category' column.
    compare_category_col: Name of the column holding the predicted categories.

  Returns:
    Percentage (0-100) of rows where both columns are equal, as a float.
    Returns 0.0 when `df` has no rows.
  """
  total_rows = df.shape[0]
  if total_rows == 0:
    # An empty frame (e.g. a merge with no hits) would otherwise divide by zero.
    return 0.0
  matching_rows = int((df['category'] == df[compare_category_col]).sum())
  return (matching_rows * 100) / total_rows

### Prepare Data

In [None]:
# prompt: Read data from a csv file stored in google drive
# This cell only mounts Google Drive; the CSV itself is read in a later cell.
# NOTE(review): `pickle` is not used by any visible cell — confirm before removing.
import pickle
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
ls drive/MyDrive/1_gPS_Ads/Projects/S1_Classifier/

#### EDU data processing

In [None]:
# Read the CSV file
# Loads the keyword/category export from the mounted Drive folder into `data`.
# NOTE(review): this cell sits under the "EDU data processing" heading but the
# file name contains "tech" — confirm the intended input file.
import pandas as pd
data = pd.read_csv('drive/MyDrive/1_gPS_Ads/Projects/S1_Classifier/s1_tech_keyword_category.csv')

In [None]:
# Build readable labels: underscores in campaign names and hyphens in ad-group
# names become spaces; ad-group names are also trimmed and have repeated
# whitespace collapsed to a single space.
data['campaign_name_clean'] = data['campaign_name'].map(
    lambda name: name.replace('_', ' '))
data['ad_group_name_clean'] = data['ad_group_name'].map(
    lambda name: ' '.join(name.replace('-', ' ').split()))
# Expose the raw keyword text under a shorter column name.
data['keyword_text'] = data['AdGroupCriterion_keyword_text']

In [None]:
# Keep only the three columns used downstream and give them short names.
data_subset = data[['campaign_name_clean', 'ad_group_name_clean', 'keyword_text']].rename(
    columns={
        'campaign_name_clean': 'level1',
        'ad_group_name_clean': 'category',
        'keyword_text': 'keyword',
    })

In [None]:
# Only process rows outside the 'Online School' / 'Online Course' campaigns,
# since these are not well classified.
excluded_level1 = ['Online School', 'Online Course']
data_subset = data_subset[~data_subset.level1.isin(excluded_level1)]
# Cap the sample size at the rows available so a small frame does not raise
# ValueError; the fixed seed keeps the sample reproducible.
data_subset = data_subset.sample(min(600, len(data_subset)), random_state=1).copy()

#### TECH data processing

In [None]:
data.head()

In [None]:
# Start the TECH flow from a copy of the full raw frame; this replaces any
# `data_subset` built by earlier cells.
# NOTE(review): later cells read `category` and `keyword` columns from this
# frame — presumably the CSV already provides them; confirm.
data_subset = data.copy()

Running on the full dataset runs out of memory (OOM), so use a sample instead.

In [None]:
# Down-sample to at most 600 rows (fixed seed for reproducibility). The size is
# capped at the rows available so smaller frames do not raise ValueError.
data_subset = data_subset.sample(min(600, len(data_subset)), random_state=1).copy()

In [None]:
set(data.category) - set(data_subset.category)

### Keyword Overlap Baseline

In [None]:
from fuzzywuzzy import fuzz

In [None]:
# Distinct taxonomy categories and distinct keywords present in the sample.
category = data_subset['category'].unique()
keywords = data_subset['keyword'].unique()

In [None]:
fuzz.partial_token_sort_ratio(category[0], keywords[0])

In [None]:
data_subset.head()

In [None]:
# Plain Python lists of every row's category and keyword (duplicates kept).
category_list = data_subset['category'].tolist()
keyword_list = data_subset['keyword'].tolist()

In [None]:
import tqdm

# Fuzzy-matching baseline: for every keyword, pick the category with the highest
# token_sort_ratio (TSR) and the highest partial_token_sort_ratio (PTSR).
# Keywords and categories are de-duplicated first (first-seen order kept)
# because repeated values always produce identical scores.
distinct_categories = list(dict.fromkeys(category_list))
tsr_scores_dict = {}
ptsr_scores_dict = {}
fuzzy_score = {}

for keyword in dict.fromkeys(keyword_list):
  tsr_scores_dict = {
      candidate: fuzz.token_sort_ratio(keyword, candidate)
      for candidate in distinct_categories
  }
  ptsr_scores_dict = {
      candidate: fuzz.partial_token_sort_ratio(keyword, candidate)
      for candidate in distinct_categories
  }
  # max() returns the first category that reaches the top score, so ties are
  # resolved deterministically in first-seen order.
  fuzzy_score[keyword] = [
      max(tsr_scores_dict, key=tsr_scores_dict.get),
      max(ptsr_scores_dict, key=ptsr_scores_dict.get),
  ]

In [None]:
# One row per keyword; `category_list` holds its [TSR, PTSR] best-match pair.
fuzzy_score_df = pd.DataFrame(
    list(fuzzy_score.items()),
    columns=['keyword', 'category_list'],
)

In [None]:
# Split each [TSR, PTSR] pair into its own column, keyed by keyword.
fuzzy_score_final_df = pd.DataFrame(
    list(fuzzy_score_df['category_list']),
    columns=['fuzzy_tsr_category', 'fuzzy_ptsr_category'],
    index=fuzzy_score_df['keyword'],
).reset_index()
# Attach the fuzzy predictions to every labelled row that shares the keyword.
data_with_fuzzy = data_subset.merge(fuzzy_score_final_df, how='inner', on='keyword')

In [None]:
data_with_fuzzy

### Keyword Overlap Accuracy

In [None]:
# @title Fuzzy TSR Accuracy
get_overlap_score(data_with_fuzzy, compare_category_col = 'fuzzy_tsr_category')

In [None]:
# @title Fuzzy PTSR Accuracy
get_overlap_score(data_with_fuzzy, compare_category_col = 'fuzzy_ptsr_category')

### Load USE

In [None]:
from absl import logging

import tensorflow as tf

import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
import seaborn as sns
from sklearn.metrics import pairwise

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text  # Imports TF ops for preprocessing.

In [None]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5", "https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/4"]
# Load the selected TF Hub module once; use_embed() calls it for every batch.
USE_MODEL = hub.load(module_url)
print(f"module {module_url} loaded")

In [None]:
def use_embed(input):
  """Run the loaded USE module on a batch of texts.

  Args:
    input: Batch of strings handed straight to `USE_MODEL` (the notebook passes
      arrays of keywords or categories). The name shadows the `input` builtin;
      it is kept so existing callers keep working.

  Returns:
    Whatever `USE_MODEL` returns for the batch — presumably one embedding per
    input string; confirm against the module documentation.
  """
  embeddings = USE_MODEL(input)
  return embeddings

### Load BERT (+ preprocessor)

In [None]:
# load model for BERT
bert_model_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4" # @param {type: "string"} ["https://tfhub.dev/google/experts/bert/wiki_books/2", "https://tfhub.dev/google/experts/bert/wiki_books/mnli/2", "https://tfhub.dev/google/experts/bert/wiki_books/qnli/2", "https://tfhub.dev/google/experts/bert/wiki_books/qqp/2", "https://tfhub.dev/google/experts/bert/wiki_books/squad2/2", "https://tfhub.dev/google/experts/bert/wiki_books/sst2/2",  "https://tfhub.dev/google/experts/bert/pubmed/2", "https://tfhub.dev/google/experts/bert/pubmed/squad2/2", "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"]
# Preprocessing must match the model, but all the above use the same.
bert_preprocess_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"

In [None]:
# Load the BERT text preprocessor and the encoder from TF Hub. Both are kept as
# module-level objects because get_bert_tensor() calls them on every batch.
BERT_PREPROCESS_MODEL = hub.load(bert_preprocess_url)
BERT_MODEL = hub.load(bert_model_url)

In [None]:
def get_bert_tensor(text_list):
  """Return one mean-pooled BERT embedding per input string.

  Args:
    text_list: Batch of strings to embed.

  Returns:
    A float tensor with one row per input: the average of the encoder's
    `sequence_output` vectors over the real (non-padding) token positions.
  """
  inputs = BERT_PREPROCESS_MODEL(text_list)
  outputs = BERT_MODEL(inputs)
  token_vectors = outputs['sequence_output']
  # The preprocessor pads every input to a fixed sequence length and reports the
  # real tokens in `input_mask` (1 = token, 0 = padding). Averaging over all
  # positions would let padding dilute short inputs such as keywords, so weight
  # the mean by the mask instead.
  mask = tf.cast(inputs['input_mask'], token_vectors.dtype)[:, :, tf.newaxis]
  summed_vectors = tf.reduce_sum(token_vectors * mask, axis=1)
  # Guard against division by zero should a row ever have no real tokens.
  token_counts = tf.maximum(tf.reduce_sum(mask, axis=1), 1.0)
  return summed_vectors / token_counts

In [None]:
get_bert_tensor(['get embeddings for these words individually']).shape

### Load PALM

In [None]:
# load palm

import numpy as np
import google.generativeai as palm

In [None]:
# get palm api key
# Paste the key into the Colab form field below; avoid saving or sharing the
# notebook with a real key filled in.
API_KEY = "" # @param {type: "string"}
palm.configure(api_key=API_KEY)

In [None]:
# Print the name of every available model that supports text embeddings.
for model in palm.list_models():
  if 'embedText' not in model.supported_generation_methods:
    continue
  print(model.name)

In [None]:
# get palm embeddings
# Example texts. NOTE(review): x, close_to_x and different_from_x are not
# referenced by any visible cell — presumably left over from an API example;
# confirm before removing.
x = 'What do squirrels eat?'

close_to_x = 'nuts and acorns'

different_from_x = 'This morning I woke up in San Francisco, and took a walk to the Bay Bridge. It was a good, sunny morning with no fog.'

# NOTE: palm_embeddings() does not read this module-level name; it supplies its
# own model name.
model = "models/embedding-gecko-001"

# Create an embedding

def get_palm_tensor(text_list):
  """Embed every text with PaLM.

  Args:
    text_list: Iterable of strings to embed.

  Returns:
    A list with one `palm_embeddings` result per input, in input order.
  """
  return [palm_embeddings(text) for text in text_list]

def palm_embeddings(x, model="models/embedding-gecko-001"):
  """Return the PaLM embedding for a single text.

  Args:
    x: Text to embed.
    model: Name of the embedding model to call. Defaults to the model that was
      previously hard-coded, so existing single-argument calls are unchanged.

  Returns:
    The value stored under the 'embedding' key of the API response.
  """
  return palm.generate_embeddings(model=model, text=x)['embedding']

### Taxonomy embeddings

In [None]:
#data_subset.category.unique()

In [None]:
# Map each distinct category to its USE embedding; zip keeps the pairs aligned.
_use_categories = data_subset.category.unique()
tax_category_use_embeddings = dict(zip(_use_categories, use_embed(_use_categories)))

In [None]:
# Map each distinct category to its BERT embedding; zip keeps the pairs aligned.
_bert_categories = data_subset.category.unique()
tax_category_bert_embeddings = dict(zip(_bert_categories, get_bert_tensor(_bert_categories)))

In [None]:
# tax_category_palm_embeddings
# Map each distinct category to its PaLM embedding; zip keeps the pairs aligned.
_palm_categories = data_subset.category.unique()
tax_category_palm_embeddings = dict(zip(_palm_categories, get_palm_tensor(_palm_categories)))

### Keyword embeddings

In [None]:
# Embed every distinct keyword with each model, keyed by the keyword text.
_unique_keywords = data_subset.keyword.unique()
keyword_use_embeddings = dict(zip(_unique_keywords, use_embed(_unique_keywords)))
keyword_bert_embeddings = dict(zip(_unique_keywords, get_bert_tensor(_unique_keywords)))
keyword_palm_embeddings = dict(zip(_unique_keywords, get_palm_tensor(_unique_keywords)))

### Get Closest Match

In [None]:
# @title Calculations
def get_cosine_bw_embeddings(embeddings_1, embeddings_2):
  """Return the pairwise cosine similarity between two sets of embeddings.

  Args:
    embeddings_1: First collection of embedding vectors.
    embeddings_2: Second collection of embedding vectors.

  Returns:
    The result of sklearn's `pairwise.cosine_similarity` for the two inputs.
  """
  similarity_matrix = pairwise.cosine_similarity(embeddings_1, embeddings_2)
  return similarity_matrix

def get_best_match(keyword_embeddings, tax_category_embeddings):
  """Map every keyword to the category whose embedding is most similar.

  Args:
    keyword_embeddings: Dict of keyword -> embedding vector.
    tax_category_embeddings: Dict of category -> embedding vector.

  Returns:
    Dict of keyword -> best-matching category (highest cosine similarity).
    Ties go to the category that appears first in `tax_category_embeddings`.
    Empty when `keyword_embeddings` is empty.
  """
  if not keyword_embeddings:
    return {}

  # Loop-invariant work: build the category names and matrix once instead of
  # once per keyword.
  category_names = list(tax_category_embeddings.keys())
  category_matrix = np.array(
      [np.asarray(vector) for vector in tax_category_embeddings.values()])

  keyword_cosine_embedding_matches = {}
  for keyword, embedding in keyword_embeddings.items():
    # One similarity per category for this keyword: shape (n_categories, 1).
    cosine_similarity = get_cosine_bw_embeddings(
        category_matrix, [np.asarray(embedding)])
    best_index = int(np.argmax(cosine_similarity[:, 0]))
    keyword_cosine_embedding_matches[keyword] = category_names[best_index]
  return keyword_cosine_embedding_matches

# Best taxonomy match per keyword for each embedding model.
keyword_cosine_use_matches = get_best_match(keyword_use_embeddings, tax_category_use_embeddings)
keyword_cosine_bert_matches = get_best_match(keyword_bert_embeddings, tax_category_bert_embeddings)
keyword_cosine_palm_matches = get_best_match(keyword_palm_embeddings, tax_category_palm_embeddings)


def _matches_to_frame(matches, category_column):
  """Turn a {keyword: category} dict into a two-column DataFrame."""
  return pd.DataFrame(list(matches.items()), columns=['keyword', category_column])


bert_recommendations = _matches_to_frame(keyword_cosine_bert_matches, 'bert_category')
use_recommendations = _matches_to_frame(keyword_cosine_use_matches, 'use_category')
palm_recommendations = _matches_to_frame(keyword_cosine_palm_matches, 'palm_category')

# Attach every model's prediction, then the fuzzy baseline, to the labelled rows.
data_subset_recommendations = data_subset
for _predictions in (bert_recommendations, use_recommendations,
                     palm_recommendations, fuzzy_score_final_df):
  data_subset_recommendations = pd.merge(
      data_subset_recommendations, _predictions, on='keyword')

In [None]:
# @title Embedding Accuracy
# Share of rows where each model's predicted category equals the label.
for _label, _column in (("BERT", 'bert_category'),
                        ("USE", 'use_category'),
                        ("PaLM", 'palm_category')):
  print(f"{_label} Accuracy: ", get_overlap_score(data_subset_recommendations, _column))

Using a combination of fuzzy matching + embeddings

In [None]:
# @title Combined Accuracy
# A row counts as correct when the fuzzy TSR prediction matches the label, or
# when it does not but the embedding model's prediction does.
_truth = data_subset_recommendations.category
_fuzzy_hit = data_subset_recommendations.fuzzy_tsr_category == _truth
data_subset_recommendations['fuzzy_match'] = _fuzzy_hit.astype(int)
data_subset_recommendations['excl_use_match'] = (
    (data_subset_recommendations.use_category == _truth) & ~_fuzzy_hit).astype(int)
data_subset_recommendations['excl_palm_match'] = (
    (data_subset_recommendations.palm_category == _truth) & ~_fuzzy_hit).astype(int)

_row_count = data_subset_recommendations.shape[0]
_fuzzy_hits = sum(data_subset_recommendations.fuzzy_match)
print("Use + Fuzzy Combined Accuracy: ", ((_fuzzy_hits + sum(data_subset_recommendations.excl_use_match)) * 100)/_row_count)
print("PaLM + Fuzzy Combined Accuracy: ",((_fuzzy_hits + sum(data_subset_recommendations.excl_palm_match)) * 100)/_row_count)

In [None]:
# Distinct categories in first-seen order. Going through set() would give an
# order that can change between interpreter sessions (string hashing is
# randomised), which makes any order-dependent tie-breaking non-reproducible.
category_list = data_subset_recommendations.category.unique().tolist()

In [None]:
# @title F1 Scores
from sklearn import metrics
# Macro-averaged F1 of each predictor against the labelled category.
_true_labels = data_subset_recommendations.category.to_list()
for _label, _column in (
    ("Fuzzy TSR", 'fuzzy_tsr_category'),
    # ("BERT", 'bert_category'),  # BERT F1 is currently disabled
    ("USE", 'use_category'),
    ("PaLM", 'palm_category'),
):
  print( f"{_label} F1: ", metrics.f1_score(_true_labels, data_subset_recommendations[_column].to_list(), average='macro'))

In [None]:
# @title
# pd.merge(pd.DataFrame(zip(category_list, metrics.f1_score(data_subset_recommendations.category.to_list(), data_subset_recommendations.fuzzy_tsr_category.to_list(), average=None, labels=category_list), metrics.f1_score(data_subset_recommendations.category.to_list(), data_subset_recommendations.palm_category.to_list(), average=None, labels=category_list))),
# pd.DataFrame(data_subset_recommendations.groupby('category').count()['palm_category']).reset_index(), left_on=0, right_on='category')

Sample Misclassifications

In [None]:
# @title
def get_sample_misclassifications(df, column, sample=100, random_state=None):
  """Return a random sample of rows whose prediction differs from the label.

  Args:
    df: DataFrame with 'keyword', 'category' and the prediction column.
    column: Name of the column holding the predicted categories.
    sample: Maximum number of misclassified rows to return.
    random_state: Optional seed forwarded to `DataFrame.sample` so the same
      rows can be drawn again; None keeps the draw random.

  Returns:
    DataFrame with columns ['keyword', 'category', column] containing at most
    `sample` misclassified rows (all of them when there are fewer).
  """
  # Compute the mismatch filter once and keep only the columns of interest.
  mismatches = df.loc[df.category != df[column], ['keyword', 'category', column]]
  if mismatches.empty:
    return mismatches
  return mismatches.sample(min(len(mismatches), sample), random_state=random_state)

In [None]:
# @title Fuzzy Misclassifications
get_sample_misclassifications(data_subset_recommendations, 'fuzzy_tsr_category')

In [None]:
# @title BERT Misclassifications
get_sample_misclassifications(data_subset_recommendations, 'bert_category')

In [None]:
# @title USE Misclassifications
get_sample_misclassifications(data_subset_recommendations, 'use_category')

In [None]:
# @title PaLM Misclassifications
get_sample_misclassifications(data_subset_recommendations, 'palm_category')

## PCA & tSNE on Keywords

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

#### Using USE

In [None]:
# Convert every USE embedding to a NumPy array, then stack them into a
# DataFrame with one row per keyword (dict insertion order).
keyword_use_np_embeddings = {
    kw: np.array(embedding)
    for kw, embedding in keyword_use_embeddings.items()
}
keyword_use_embeddings_df = pd.DataFrame(list(keyword_use_np_embeddings.values()))
keyword_use_embeddings_df.head()

In [None]:
keyword_use_embeddings_df.shape

In [None]:
# Project the USE embeddings onto their first two principal components and
# index the result by keyword.
pca = PCA(n_components=2)
_use_components = pca.fit_transform(keyword_use_embeddings_df)
pca_df = pd.DataFrame(_use_components, index=keyword_use_embeddings.keys())

In [None]:
pca_df.head()

In [None]:
import plotly.express as px
# Cluster the 2-D PCA points. The seed makes cluster assignments reproducible
# between runs; n_init is set explicitly because scikit-learn's default for it
# differs between releases.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=42)
cluster_ids = kmeans.fit_predict(pca_df)
pca_df['cluster_ids'] = cluster_ids

In [None]:
# Move the keyword index into a column and give every column a descriptive name.
pca_df = pca_df.reset_index().set_axis(
    ['keyword', 'pca_0', 'pca_1', 'cluster_ids'], axis=1)

In [None]:
# Attach each keyword's labelled category so the plot can colour by it.
pca_df = pca_df.merge(data_subset[['keyword', 'category']], on='keyword')

In [None]:
# Scatter plot of the USE embeddings in PCA space, coloured by category.
pca_df['cluster_ids'] = pca_df.cluster_ids.astype(str)
fig = px.scatter(pca_df, x='pca_0', y='pca_1', color='category', hover_data=['category'])
# Fixed-size figure on a white background. Colour channels range over 0-255,
# so white is 255 (the previous value of 256 was out of range).
fig.update_layout(
    autosize=False,
    width=1600,
    height=1000,
    plot_bgcolor='rgba(255, 255, 255, 1)',
    paper_bgcolor='rgba(255, 255, 255, 1)',
)
fig.update_xaxes(color='black', linecolor='black')
fig.update_yaxes(color='black', linecolor='black')
# Kept as the last expression so the notebook renders the figure.
fig.update_traces(marker={'size': 15})

#### Using PaLM

In [None]:
# One row per keyword holding its PaLM embedding values.
keyword_palm_embeddings_df = pd.DataFrame(list(keyword_palm_embeddings.values()))
# Project the PaLM embeddings onto their first two principal components and
# index the result by keyword. (The earlier mid-cell `.head()` call produced no
# output because only a cell's last expression is displayed, so it was dropped.)
pca = PCA(n_components=2)
pca_df = pd.DataFrame(pca.fit_transform(keyword_palm_embeddings_df), index=keyword_palm_embeddings.keys())

In [None]:
import plotly.express as px
# Cluster the 2-D PCA points. The seed makes cluster assignments reproducible
# between runs; n_init is set explicitly because scikit-learn's default for it
# differs between releases.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=42)
cluster_ids = kmeans.fit_predict(pca_df)
pca_df['cluster_ids'] = cluster_ids
# Move the keyword index into a column and name every column.
pca_df = pca_df.reset_index()
pca_df.columns = ['keyword', 'pca_0', 'pca_1', 'cluster_ids']
# Attach each keyword's labelled category so the plot can colour by it.
pca_df = pd.merge(pca_df, data_subset[['keyword', 'category']], on='keyword')

In [None]:
# Scatter plot of the PaLM embeddings in PCA space, coloured by category.
pca_df['cluster_ids'] = pca_df.cluster_ids.astype(str)
fig = px.scatter(pca_df, x='pca_0', y='pca_1', color='category', hover_data=['category', 'keyword'])
# Fixed-size figure on a white background. Colour channels range over 0-255,
# so white is 255 (the previous value of 256 was out of range).
fig.update_layout(
    autosize=False,
    width=1600,
    height=1000,
    plot_bgcolor='rgba(255, 255, 255, 1)',
    paper_bgcolor='rgba(255, 255, 255, 1)',
)
fig.update_xaxes(color='black', linecolor='black')
fig.update_yaxes(color='black', linecolor='black')
# Kept as the last expression so the notebook renders the figure.
fig.update_traces(marker={'size': 15})

In [None]:
#tSNE plot

import pandas as pd
from sklearn.manifold import TSNE
import numpy as np
from ast import literal_eval

# Stack the PaLM embeddings into a NumPy array, one row per keyword.
matrix = np.array([vector for vector in keyword_palm_embeddings.values()])

# Fit t-SNE (fixed seed) and project every keyword to two dimensions.
tsne = TSNE(
    n_components=2,
    perplexity=15,
    random_state=42,
    init='random',
    learning_rate=200,
)
vis_dims = tsne.fit_transform(matrix)
vis_dims.shape

In [None]:
# Label the t-SNE coordinates with their keywords, then attach each keyword's
# category for colouring.
tsne_df = (
    pd.DataFrame(vis_dims, index=keyword_palm_embeddings.keys())
    .reset_index()
    .set_axis(['keyword', 'x', 'y'], axis=1)
    .merge(data_subset[['keyword', 'category']], on='keyword')
)

In [None]:
# Scatter plot of the PaLM embeddings in t-SNE space, coloured by category.
fig = px.scatter(tsne_df, x='x', y='y', color='category', hover_data=['keyword']) #level1
# Fixed-size figure on a white background. Colour channels range over 0-255,
# so white is 255 (the previous value of 256 was out of range).
fig.update_layout(
    autosize=False,
    width=1600,
    height=1000,
    plot_bgcolor='rgba(255, 255, 255, 1)',
    paper_bgcolor='rgba(255, 255, 255, 1)',
)
fig.update_xaxes(color='black', linecolor='black')
fig.update_yaxes(color='black', linecolor='black')
# Kept as the last expression so the notebook renders the figure.
fig.update_traces(marker={'size': 15})

In [None]:
# Scatter plot of the PaLM embeddings in t-SNE space, coloured by category.
# NOTE(review): this cell draws the same figure as the preceding plot cell —
# confirm whether both are needed.
fig = px.scatter(tsne_df, x='x', y='y', color='category', hover_data=['keyword'])
# Fixed-size figure on a white background. Colour channels range over 0-255,
# so white is 255 (the previous value of 256 was out of range).
fig.update_layout(
    autosize=False,
    width=1600,
    height=1000,
    plot_bgcolor='rgba(255, 255, 255, 1)',
    paper_bgcolor='rgba(255, 255, 255, 1)',
)
fig.update_xaxes(color='black', linecolor='black')
fig.update_yaxes(color='black', linecolor='black')
# Kept as the last expression so the notebook renders the figure.
fig.update_traces(marker={'size': 15})

## Keyword Matching Demo (User Input)

In [None]:
# @title Available Taxonomy Categories
data_subset.category.unique()

In [None]:
# @title ad-hoc classification
adhoc_keywords = 'ppt' #@param
# Comma-separated form input -> list of trimmed keywords.
adhoc_keywords = [keyword.strip() for keyword in adhoc_keywords.split(",")]
ex_keywords = pd.DataFrame(adhoc_keywords, columns=['keyword'])
# Compute the distinct keywords once; every model embeds the same array.
ex_unique_keywords = ex_keywords.keyword.unique()

# get embeddings for given keywords
ex_keyword_use_embeddings = dict(zip(ex_unique_keywords, use_embed(ex_unique_keywords)))
ex_keyword_bert_embeddings = dict(zip(ex_unique_keywords, get_bert_tensor(ex_unique_keywords)))
ex_keyword_palm_embeddings = dict(zip(ex_unique_keywords, get_palm_tensor(ex_unique_keywords)))

# get matches
ex_keyword_cosine_use_matches = get_best_match(ex_keyword_use_embeddings, tax_category_use_embeddings)
ex_keyword_cosine_bert_matches = get_best_match(ex_keyword_bert_embeddings, tax_category_bert_embeddings)
ex_keyword_cosine_palm_matches = get_best_match(ex_keyword_palm_embeddings, tax_category_palm_embeddings)


# get recommendations
def _ex_matches_to_frame(matches, category_column):
  """Turn a {keyword: category} dict into a two-column DataFrame."""
  return pd.DataFrame(list(matches.items()), columns=['keyword', category_column])


ex_bert_recommendations = _ex_matches_to_frame(ex_keyword_cosine_bert_matches, 'bert_category')
ex_use_recommendations = _ex_matches_to_frame(ex_keyword_cosine_use_matches, 'use_category')
ex_palm_recommendations = _ex_matches_to_frame(ex_keyword_cosine_palm_matches, 'palm_category')

# get keyword fuzzy recommendations
ex_tsr_scores_dict = {}
ex_ptsr_scores_dict = {}
ex_fuzzy_score = {}

for keyword in adhoc_keywords:
  ex_tsr_scores_dict = {
      candidate: fuzz.token_sort_ratio(keyword, candidate)
      for candidate in category_list
  }
  ex_ptsr_scores_dict = {
      candidate: fuzz.partial_token_sort_ratio(keyword, candidate)
      for candidate in category_list
  }
  # max() returns the first category that reaches the top score, so ties are
  # resolved in the order of `category_list`.
  ex_fuzzy_score[keyword] = [
      max(ex_tsr_scores_dict, key=ex_tsr_scores_dict.get),
      max(ex_ptsr_scores_dict, key=ex_ptsr_scores_dict.get),
  ]

ex_fuzzy_score_df = pd.DataFrame(list(ex_fuzzy_score.items()), columns=['keyword', 'category_list'])
# Split each [TSR, PTSR] pair into its own column, keyed by keyword.
ex_fuzzy_score_final_df = pd.DataFrame(
    ex_fuzzy_score_df.category_list.tolist(),
    columns=['fuzzy_tsr_category', 'fuzzy_ptsr_category'],
    index=ex_fuzzy_score_df['keyword'],
).reset_index()

# get outputs
print("Output from fuzzy matching algorithms:")
display(ex_fuzzy_score_final_df)
print("Outputs from different embedding models:")
print("BERT: ")
display(ex_bert_recommendations)
print("USE: ")
display(ex_use_recommendations)
print("PALM: ")
display(ex_palm_recommendations)