# Embedding Encoding

## Install and Import Python Packages

In [141]:
%%capture
!pip3 install seaborn
!pip3 install tensorflow
!pip3 install tensorflow_hub
!pip3 install tensorflow_datasets
!pip install google-cloud-aiplatform

In [153]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import os
import csv
from typing import Generator, List, Tuple, Optional
from concurrent.futures import ThreadPoolExecutor
from tqdm.auto import tqdm
import math
import functools
from typing import List, Optional
import time
import vertexai
from google.cloud import aiplatform
from vertexai.language_models import TextEmbeddingModel
from sklearn.metrics.pairwise import cosine_similarity


## Remove existing metadata and coeffecients

In [143]:
!rm -r tmp
!rm -r tmp2 # for gecko embeddings (not currently used)

## Create metadata path 

In [144]:
import os

log_dir='tmp'
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
    
log_dir='tmp2' # for gecko embedding (not currently used)
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

## Load Business Descriptions from BigQuery into Pandas Dataframe
Select ~10,000 samples from 100 million rows

### Test some filters

In [145]:
%%bigquery examples --project cf-data-analytics
SELECT
     *
 FROM
   `cf-data-analytics.us_businesses.description_v3`
WHERE name like "%Equifax%"


Query is running:   0%|          |

Downloading:   0%|          |

In [146]:
examples[examples['name'].str.contains('Equifax')]["summary"][0]

'Equifax Workforce Solutions is located in the US and best described as a Equifax Workforce Solutions, formerly known as TALX, is a wholly owned subsidiary of Equifax. It is based in St. Louis, Missouri. The company was originally founded in 1972 under the name Interface Technology Inc. The company maintains a database named The Work Number that holds and maintains employment and payroll information on 54 million American people. As of 2015, the company was the largest source of employment information in the United States, and collects information from over 7,000 employers.. They are based in St. Louis MO 63146 and their address is 11432 Lackland Rd. They operate in the Credit reporting services industry.'

### Download Businesses

In [147]:
%%bigquery df
SELECT
     name,
     description,
     country,
     LANGUAGE as language,
     postal,
     city,
     state,
    address_l1 as address,
    summary,
    CONCAT(name," ",country," ",state," ",city," ",postal," ",address_l1) AS location
    
FROM `cf-data-analytics.us_businesses.description_v3`
   #WHERE RAND() < 100000/(SELECT COUNT(*) FROM `cf-data-analytics.us_businesses.description_v2`) # selectm random rows
   WHERE summary like "%Credit%" or summary like '%credit%'
   and description is not null
   and country is not null
   and language = "EN"
   and state is not null
   and industry is not null
   and address_l1 is not null
   and summary is not null
   and summary != 'None'


Query is running:   0%|          |

Downloading:   0%|          |

In [148]:
df = df.replace('\n','', regex=True)

In [149]:
len(df)

19101

In [150]:
df.head(3)

Unnamed: 0,name,description,country,language,postal,city,state,address,summary,location
0,Yamamoto Credit Jewelers,Ret jewelry,US,EN,96720,Hilo,HI,168 Kamehameha Ave,Yamamoto Credit Jewelers is located in the US ...,Yamamoto Credit Jewelers US HI Hilo 96720 168 ...
1,Catholic Legal Immigration Network,"Catholic Legal Immigration Network, Inc. is th...",US,EN,20017,Washington,DC,415 Michigan Ave NE,Catholic Legal Immigration Network is located ...,Catholic Legal Immigration Network US DC Washi...
2,花旗銀行,The company operates as a bank. Its products a...,US,EN,57108,Sioux Falls,SD,5800 S Corporate Pl,花旗銀行 is located in the US and best described a...,花旗銀行 US SD Sioux Falls 57108 5800 S Corporate Pl


## Convert Description to Embeddings

### Semantic Embeddings

In [123]:
summary_lst = df['summary'].values.tolist() # convert to list; required for tensor conversion
location_lst = df['name'].values.tolist() 

In [124]:
aiplatform.init(project='cf-data-analytics')
model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

def encode_texts_to_embeddings(sentences: List[str]) -> List[Optional[List[float]]]:
    try:
        embeddings = model.get_embeddings(sentences)
        return [embedding.values for embedding in embeddings]
    except Exception:
        return [None for _ in range(len(sentences))]
def generate_batches(
    sentences: List[str], batch_size: int
) -> Generator[List[str], None, None]:
    for i in range(0, len(sentences), batch_size):
        yield sentences[i : i + batch_size]

def encode_text_to_embedding_batched(
    sentences: List[str], api_calls_per_second: int = 10, batch_size: int = 5
) -> Tuple[List[bool], np.ndarray]:

    embeddings_list: List[List[float]] = []

    # Prepare the batches using a generator
    batches = generate_batches(sentences, batch_size)

    seconds_per_job = 1 / api_calls_per_second

    with ThreadPoolExecutor() as executor:
        futures = []
        for batch in tqdm(
            batches, total=math.ceil(len(sentences) / batch_size), position=0
        ):
            futures.append(
                executor.submit(functools.partial(encode_texts_to_embeddings), batch)
            )
            time.sleep(seconds_per_job)

        for future in futures:
            embeddings_list.extend(future.result())

    is_successful = [
        embedding is not None for sentence, embedding in zip(sentences, embeddings_list)
    ]
    embeddings_list_successful = np.squeeze(
        np.stack([embedding for embedding in embeddings_list if embedding is not None])
    )
    return is_successful, embeddings_list_successful

In [125]:
is_successful, semantic_emb = encode_text_to_embedding_batched(df["summary"])

  0%|          | 0/3821 [00:00<?, ?it/s]

In [151]:
# get tf-idf embeddings for names

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=10000,analyzer='char', ngram_range=(2,5))
tfidf_matrix = vectorizer.fit_transform(location_lst)

# Get feature names (vocabulary)
feature_names = vectorizer.get_feature_names_out()

# Convert the sparse matrix to a dense array 
tfidf_emb = tfidf_matrix.toarray()

# Create a DataFrame (optional)
df_tfidf = pd.DataFrame(tfidf_emb, columns=feature_names)

In [166]:
def get_tf_idf_query_similarity(vectorizer, docs_tfidf, query):
    """
    vectorizer: TfIdfVectorizer model
    docs_tfidf: tfidf vectors for all docs
    query: query doc

    return: cosine similarity between query and all docs
    """
    query_tfidf = vectorizer.transform([query])
    cosineSimilarities = cosine_similarity(query_tfidf, docs_tfidf).flatten()
    return cosineSimilarities

similarities = get_tf_idf_query_similarity(vectorizer, tfidf_matrix, "United Parcel Service")

In [177]:
top_indices = np.argpartition(similarities, -5)[-5:][::-1]
for ind in top_indices:
    print(location_lst[ind])
    print(similarities[ind])
    

United Parcel Service Credit Union
0.8916386187438192
United Services Credit Union
0.4742094204093619
Third Party Mvs Services Inc.
0.38841085279662396
K 3 Services
0.38389258320394
Ip Services
0.3775555020110155


In [174]:
np.argpartition(similarities, -5)

array([ 9054,     0,     2, ...,  1145, 11549, 14002])

In [127]:
print(len(tfidf_emb[0]))
#print(len(semantic_emb[0]))

10000


In [128]:
# Concatenate based on index
combined_emb = np.concatenate((tfidf_emb, semantic_emb), axis=1)

In [129]:
len(combined_emb)

19101

In [130]:
len(combined_emb[0])

10768

### Dimensionality Reduction

In [131]:
pca = PCA(n_components=500) 
emb_reduced = pca.fit_transform(combined_emb)

In [132]:
len(emb_reduced[0])

500

## Export to TSV

In [133]:
df["Index"] = df.index+1

In [134]:
len(df)

19101

In [135]:
col_lst = [] # create a list of column names
for col in df.columns:
  col_lst.append(col)

col_array = np.array([col_lst])
labels = np.concatenate((col_array, df), axis=0)


In [136]:
def save_tsv(labels, filepath, log_dir):
    with open(os.path.join(log_dir, filepath), 'w') as f:
      writer = csv.writer(f, delimiter='\t')
      writer.writerows(labels) # write labels

In [137]:
print(len(emb_reduced))
print(len(labels))

19101
19102


In [138]:
LOG_DIR = "tmp"
save_tsv(emb_reduced, "embeddings.tsv", LOG_DIR) # save .tsv metadata
save_tsv(labels, "labels.tsv", LOG_DIR) # save .tsv metadata

In [139]:
len(emb_reduced)

19101

In [140]:
len(labels)

19102

### go to go/embedding-projector