# Objectives

This notebook will show a pipeline to do the following:
1. Cluster articles by similarity, focusing mainly on exact matches in order to group syndicated content
2. Semantic search for a given set of keywords
3. Ad prediction
4. Augmentations: 
    * Article summarization
    * Article sentiment (overall)

The result will be a sorted list of articles, grouped by keyword search.  This will serve as an example of what could replace the current search results.  

In [1]:
import re
import pickle
import pandas as pd
import numpy as np
import tensorflow as tf
from simhash import Simhash, SimhashIndex
from itertools import combinations_with_replacement
from collections import defaultdict
from scipy.sparse import coo_matrix
from scipy.sparse.csgraph import connected_components
from haystack import Pipeline
from haystack.utils import print_documents
from haystack.nodes import (
    PreProcessor,
    DensePassageRetriever,
    SentenceTransformersRanker,
)
from haystack.document_stores import FAISSDocumentStore
from tensorflow import keras
from keras.models import model_from_json
from tensorflow.keras.layers import TextVectorization, Embedding, Dense, Lambda
from xgboost import XGBClassifier

INFO - haystack.modeling.model.optimization -  apex not found, won't use it. See https://nvidia.github.io/apex/


In [76]:
def get_features(s: str, width: int=3) -> list:
    """
    Splits a string into overlapping character windows (shingles).  The string is lower-cased and 
    every non-word character is removed first, so 'how are' -> ['how', 'owa', 'war', 'are'].
    
    :param s: String to parse
    :param width: Number of characters per window.  With the default, 3, each window holds a 
        character plus one neighbour on each side.
    :return: List of windows in left-to-right order.  A string shorter than `width` after 
        normalisation yields a single window holding whatever is left (possibly the empty string).
    """
    normalized = re.sub(r'[^\w]+', '', s.lower())
    # always emit at least one window, even when the text is shorter than the window
    window_count = max(len(normalized) - width + 1, 1)
    shingles = []
    for start in range(window_count):
        shingles.append(normalized[start:start + width])
    return shingles


def hash_and_index_documents(docs: pd.DataFrame, text_col: str, distance_threshold: int=1):
    """
    Computes a SimHash for every document and loads the hashes into a SimhashIndex.
    
    :param docs: Dataframe of documents.  Each row's index label, converted to a string, becomes 
        the id of that document inside the index.
    :param text_col: Name of the dataframe column with the document text
    :param distance_threshold: Integer forwarded to SimhashIndex as its `k` argument.  
        NOTE(review): the original documentation states that hash distances greater than this value 
        are not considered similar and that the default, 1, limits similarity to exact matches - 
        confirm against the simhash library.
    :return: Tuple of (list of (doc id string, Simhash) pairs, SimhashIndex built from that list)
    """
    text_by_row_label = docs[text_col].to_dict()
    hashes = []
    for row_label, text in text_by_row_label.items():
        # features are the character shingles produced by get_features()
        hashes.append((str(row_label), Simhash(get_features(text))))
    return (hashes, SimhashIndex(hashes, k=distance_threshold))


def determine_clusters(index) -> dict:
    """
    Derives candidate clusters from a SimHash index, since the SimHash library offers no 
    clustering call of its own.  Every value of `index.bucket` is treated as one cluster: each 
    entry in a bucket is split on commas and its second field is read as an integer document id.
    
    :param index: SimHash index object (anything exposing a `bucket` mapping works)
    :return: Dict mapping a running cluster number (0, 1, 2, ... in bucket iteration order) to the 
        set of integer document ids found in that bucket.  The same document id can appear in more 
        than one cluster.
    """
    return {
        cluster_id: {int(entry.split(",")[1]) for entry in bucket_entries}
        for cluster_id, bucket_entries in enumerate(index.bucket.values())
    }


def build_adjacency_matrix_from_clusters(clusters: dict, nbr_documents: int):
    """
    Given the clusters, build a square, sparse adjacency matrix that shows which documents 
    are connected (documents are connected if they are in the same cluster).  Every document 
    that appears in a cluster is also connected to itself.
    
    :param clusters: The cluster dictionary from determine_clusters(): cluster id -> collection of 
        integer document ids.  The ids are used directly as row/column positions, so they must be 
        smaller than nbr_documents.
    :param nbr_documents: The total number of documents.  The matrix will have this many rows and columns.
    :return: scipy coo_matrix of shape (nbr_documents, nbr_documents) holding 1.0 for each connected 
        pair.  Each unordered pair within a cluster is stored once, so the matrix is not 
        guaranteed to be symmetric.  An all-zero matrix is returned when there are no clusters.
    """
    matrix_shape = (nbr_documents, nbr_documents)
    # unique (row, col) pairs representing the edges of the graph; the set removes pairs 
    # that are produced by more than one cluster
    edges = {
        (x, y)
        for members in clusters.values()
        for x, y in combinations_with_replacement(members, 2)
    }
    if not edges:
        # zip(*edges) yields nothing to unpack into rows/cols when there are no edges, 
        # so build the empty matrix directly instead of raising ValueError
        return coo_matrix(matrix_shape)
    # create a square adjacency matrix from the edge list
    rows, cols = zip(*edges)
    return coo_matrix((np.ones(len(edges)), (rows, cols)), shape=matrix_shape)


def get_one_sample_text_per_cluster(df: pd.DataFrame, text_col) -> list:
    """
    Clusters have similar texts, so take one document per cluster to reduce the number of documents 
    that the similarity search has to run through.  Haystack expects the input to be a list of dictionaries 
    in the following format, which is what this function creates:
        [{'doc_id': 0, 'content': 'this is a document'}, {'doc_id': 1, 'content': 'this is another document'}]
    
    The sample for a cluster is its first row that has text; a cluster in which no row has text 
    falls back to its first row.  `doc_id` and `content` always come from the same row.  
    (groupby().first() was not used because it picks the first non-null value of each column 
    independently, which can pair the text of one document with the doc_id of another.)
    
    :param df: Dataframe with the texts.  Must contain the columns 'cluster' and 'doc_id'.
    :param text_col: Name of the column with the document text
    :return: List with one {'doc_id': ..., 'content': ...} dict per cluster, ordered by cluster id
    """
    # groupby() ignores rows whose key is missing, so keep doing that here
    clustered = df[df['cluster'].notna().to_numpy()]
    has_text = clustered[text_col].notna().to_numpy()
    # rows with text go first (original order is kept inside each part) so that 
    # drop_duplicates(keep='first') prefers them when choosing a cluster's sample
    candidates = pd.concat([clustered[has_text], clustered[~has_text]])
    samples = (
        candidates
        .drop_duplicates(subset='cluster', keep='first')
        .sort_values('cluster')
        .rename(columns={text_col: 'content'})
    )
    return samples[['doc_id', 'content']].to_dict(orient='records')


def build_faiss_document_store(doc_list, haystack_processor, haystack_doc_store, haystack_retriever, save=True, save_path="haystack_doc_store.faiss"):
    """
    Fills a document store from raw documents in three steps: run doc_list through the processor, 
    write the processed documents to the store (overwriting duplicates), then have the store 
    compute embeddings with the given retriever.  Optionally saves the store afterwards.
    
    :param doc_list: Documents to index; handed to haystack_processor.process() unchanged
    :param haystack_processor: Object whose process() turns doc_list into documents to store
    :param haystack_doc_store: Store to fill; it is modified in place
    :param haystack_retriever: Retriever passed to the store's update_embeddings()
    :param save: When truthy, the store is saved to save_path after the embeddings are updated
    :param save_path: Path handed to the store's save()
    :return: The same haystack_doc_store object that was passed in
    """
    processed_docs = haystack_processor.process(doc_list)
    haystack_doc_store.write_documents(processed_docs, duplicate_documents='overwrite')
    # embeddings can only be computed once the documents are in the store
    haystack_doc_store.update_embeddings(haystack_retriever)
    if not save:
        return haystack_doc_store
    haystack_doc_store.save(save_path)
    return haystack_doc_store


def load_faiss_document_store(file_path):
    """
    Loads a previously saved FAISS document store by delegating to FAISSDocumentStore.load().
    
    :param file_path: Path passed straight to FAISSDocumentStore.load().  Presumably the save_path 
        that build_faiss_document_store() wrote to - confirm.
    :return: Whatever FAISSDocumentStore.load() returns for that path
    """
    return FAISSDocumentStore.load(file_path)


def build_haystack_pipeline_for_semantic_search(retriever, ranker=None):
    """
    Assembles the Haystack pipeline used for semantic search: Query -> Retriever [-> Ranker].  
    The ranker stage is only added when one is supplied; it can help when the retriever's own 
    ordering is inadequate, which is more of a problem with retrievers that do not use 
    embeddings, like TF-IDF and BM25/Elastic Search.
    
    :param retriever: Component registered as the "Retriever" node, fed by "Query"
    :param ranker: Optional component registered as the "Ranker" node, fed by "Retriever"
    :return: The assembled Pipeline
    """
    stages = [(retriever, "Retriever", ["Query"])]
    if ranker:
        stages.append((ranker, "Ranker", ["Retriever"]))
    search_pipeline = Pipeline()
    for component, node_name, node_inputs in stages:
        search_pipeline.add_node(component=component, name=node_name, inputs=node_inputs)
    return search_pipeline


def clean_search_term(s: str, lowercase=False) -> str:
    """
    Cleans the search term text: the term is split on whitespace, every non-alphanumeric 
    character is removed from each word, and the words are re-joined with single spaces.  A word 
    made up only of punctuation becomes an empty string, so 'Johnson & Johnson' turns into 
    'Johnson  Johnson' (two spaces).
    
    :param s: Search term to clean
    :param lowercase: When truthy, each kept character is also lower-cased
    :return: The cleaned search term
    """
    def _strip_word(word):
        kept = (ch for ch in word if ch.isalnum())
        if lowercase:
            # lower-case character by character, not the joined word
            kept = (ch.lower() for ch in kept)
        return ''.join(kept)

    return ' '.join(map(_strip_word, s.split()))


def sum_layer(x):
    """
    Sum layer for the embedding model - allows aggregation from word to document level.
    
    Sums `x` along axis 0 with tf.reduce_sum.  The summed axis is dropped (keepdims=False), so the 
    result has one dimension fewer than the input.
    
    :param x: Tensor to reduce.  NOTE(review): presumably axis 0 runs over the words of a single 
        document - confirm against the embedding model that uses this layer.
    :return: Tensor holding the sums over axis 0
    """
    return tf.reduce_sum(input_tensor=x, axis=0, keepdims=False)


def map_cluster_sample_doc_ids_to_ad_probability(cluster_samples: dict, pred_probs: np.ndarray) -> dict:
    """
    Pairs every cluster sample with its predicted ad probability.
    
    :param cluster_samples: Iterated as a sequence of dicts that each carry a 'doc_id' key 
        (the records returned by get_one_sample_text_per_cluster()), despite the dict annotation
    :param pred_probs: 2-D array of predicted probabilities; row i belongs to the i-th cluster 
        sample and column 1 is read as the ad probability
    :return: Dict mapping each sample's doc_id to pred_probs[i, 1]
    """
    return {
        sample['doc_id']: pred_probs[position, 1]
        for position, sample in enumerate(cluster_samples)
    }


def map_ad_probs_to_clusters(ad_probs: dict, doc_id_to_cluster_map: dict) -> dict:
    """
    Re-keys ad probabilities from document ids to cluster ids.  cluster_samples holds one 
    document per cluster, so that document's ad probability stands in for the whole cluster.
    
    :param ad_probs: Dict of doc_id -> ad probability
    :param doc_id_to_cluster_map: Dict of doc_id -> cluster id; must contain every doc_id in ad_probs
    :return: Dict of cluster id -> ad probability.  If several doc_ids share a cluster, the one 
        that comes last in ad_probs wins.
    """
    return {
        doc_id_to_cluster_map[doc_id]: ad_prob
        for doc_id, ad_prob in ad_probs.items()
    }


## Data Ingestion and Hashing

In [3]:
# ingest and hash documents
# hash_and_index_documents() ids every document by its dataframe index label, and the clustering 
# step later uses those ids as matrix positions, so the frame is assumed to keep the default 
# 0..n-1 index that read_csv assigns here.
blackwing_data = pd.read_csv('blackwing_3m_9k.csv')
blackwing_objs, index = hash_and_index_documents(
    docs=blackwing_data,
    text_col='text',
    distance_threshold=1
)
print(f"{len(blackwing_data)} documents hashed")

8952 documents hashed


## Consolidating Similar Docs

In [4]:
# get the cluster for each document
clusters = determine_clusters(index)
sparse_mat = build_adjacency_matrix_from_clusters(clusters=clusters, nbr_documents=len(blackwing_data))
# a document can sit in several SimHash buckets; connected_components merges buckets that share 
# a document and returns the number of components plus one component label per document
nbr_clusters, cluster = connected_components(sparse_mat)
blackwing_data['cluster'] = cluster
# only derive doc_id from the index once: re-running this cell would otherwise add a second 
# 'doc_id' column and break the doc_id -> cluster mapping below
if 'doc_id' not in blackwing_data.columns:
    blackwing_data = blackwing_data.reset_index(drop=False).rename(columns={'index':'doc_id'})
doc_id_to_cluster_map = dict(zip(blackwing_data['doc_id'], blackwing_data['cluster']))
print(f"{nbr_clusters} clusters found")

420 clusters found


In [31]:
# all subsequent steps should operate on cluster_samples instead of entire dataset
# cluster_samples is a list of {'doc_id': ..., 'content': ...} records, one per cluster
cluster_samples = get_one_sample_text_per_cluster(df=blackwing_data, text_col="text")

## Semantic Search

In [6]:
# create the components of the Haystack search pipeline

# splits each document into passages of up to 200 words without overlap, respecting sentence 
# boundaries, so one document can produce several passages
processor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=200,
    split_respect_sentence_boundary=True,
    split_overlap=0
)

# FAISS-backed store for the passages and their embeddings
document_store = FAISSDocumentStore(
    similarity='dot_product',
    return_embedding=True,
    faiss_index_factory_str="Flat"
)

# dense retriever with separate query and passage encoders
# NOTE(review): use_gpu=True is requested, but the captured log of this cell reports 
# "Using devices: CPU" / "Number of GPUs: 0"
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    max_seq_len_query=64,
    max_seq_len_passage=256,
    batch_size=16,
    use_gpu=True,
    embed_title=True,
    use_fast_tokenizers=True,
)

# NOTE(review): the ranker is created here, but the pipeline cell further down is built with 
# ranker=None, so it is not part of the search
ranker = SentenceTransformersRanker(
    model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2"
)


INFO - haystack.modeling.utils -  Using devices: CPU
INFO - haystack.modeling.utils -  Number of GPUs: 0
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find facebook/dpr-question_encoder-single-nq-base locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...
INFO - haystack.modeling.model.language_model -  Loaded facebook/dpr-question_encoder-single-nq-base
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find facebook/dpr-ctx_encoder-single-nq-base locally.
INFO 

In [7]:
# build or load document store
# NOTE(review): this cell always rebuilds the store and saves it; load_faiss_document_store() 
# is defined above but not called anywhere in the visible notebook
document_store = build_faiss_document_store(
    doc_list=cluster_samples, 
    haystack_processor=processor, 
    haystack_doc_store=document_store, 
    haystack_retriever=retriever, 
    save=True, 
    save_path="haystack_doc_store.faiss"
)

100%|██████████| 420/420 [00:00<00:00, 675.37docs/s]


Writing Documents:   0%|          | 0/1946 [00:00<?, ?it/s]

INFO - haystack.document_stores.faiss -  Updating embeddings for 1922 docs...


Updating Embedding:   0%|          | 0/1922 [00:00<?, ? docs/s]

Create embeddings:   0%|          | 0/1936 [00:00<?, ? Docs/s]

In [77]:
# retriever-only pipeline: ranker=None skips the optional Ranker node
pipe = build_haystack_pipeline_for_semantic_search(retriever=retriever, ranker=None)

In [9]:
# one cleaned search term per row of blackwing_data, in row order (repeated terms are kept)
searches = [clean_search_term(term) for term in blackwing_data['term']]
searches[:20]

['Long term care',
 'General Electric',
 'honeywell',
 'Siemens AND AI',
 'AI AND Healthcare',
 'AI AND hospitals',
 'Oracle AND AI',
 'healthcare AND innovat',
 'healthcare AND AI',
 'pharma AND AI',
 'Oracle AND data management NOT PRNewswire',
 'healthcare AND trend',
 'healthcare AND artificial intelligence',
 'artificial intelligence AND healthcare',
 'robots AND healthcare',
 'innovation AND healthcare',
 'telemedicine',
 'EHR',
 'electronic health record',
 'longterm care']

In [87]:
# run a search
# searches[4] is 'AI AND Healthcare' in the listing above
nbr_results_to_return = int(25) # need to limit results, otherwise all docs in corpus will be returned
result = pipe.run(
    query=searches[4],
    params={"top_k": nbr_results_to_return}
)
print_documents(result)


Query: AI AND Healthcare

{   'content': 'The healthcare IT market is estimated to be worth $40 billion '
               'with a 7-14% growth by the end of fiscal year 2024.MMM shares '
               'were down more than 1% at $157.83 apiece near the close of '
               'trading today similar to the performance of MassDevice and '
               'MDO?s MedTech 100 Index....3M said it plans to prioritize '
               'large fast-growing commercial markets in its 2022 growth '
               'strategy including innovations in ......3M?s healthcare '
               'segment expects organic sales growth to be in the ?mid-single '
               'digits? for the year.',
    'name': None}

{   'content': 'The key industry participants include:Abbott Laboratories '
               'Dr?gerwerk AG General Electric Company Honeywell International '
               'Inc. Medtronic plc Oracle Corporation Siemens AG Allscripts '
               'Healthcare Solutions Inc. AMD Global Telemed

## Predicting Ads

In [15]:
# load a pre-trained vectorizer
# SECURITY: pickle.load can execute arbitrary code, so only load pickle files from a trusted source
# the with-block closes the file handle once the pickle has been read
with open("ad_filter_vectorizer.pkl", "rb") as vectorizer_file:
    pickled_vectorizer = pickle.load(vectorizer_file)
# the pickle holds the layer's config and weights rather than the layer itself
vectorizer = TextVectorization.from_config(pickled_vectorizer['config'])
vectorizer.set_weights(pickled_vectorizer['weights'])
# vocab = vectorizer.get_vocabulary()

In [70]:
# load the saved embedding model that turns a document's text into the feature vector 
# used by the ad classifier below
embedding_model = keras.models.load_model("ad_filter_embedding_model")
print("Loaded embedding model from disk")





Loaded embedding model from disk


In [71]:
model_file_path = "xgb_ad_classifier.pkl"
# SECURITY: pickle.load can execute arbitrary code, so only load pickle files from a trusted source
# the with-block closes the file handle once the pickle has been read
with open(model_file_path, "rb") as model_file:
    xgb_model = pickle.load(model_file)

In [78]:
# note that ignore DOES NOT mean it is an ad, but it is an indication that it might be
# NOTE(review): blackwing_y has one entry per document (not per cluster sample) and is not 
# referenced again in the visible notebook
blackwing_y = blackwing_data.disposition.map({"SELECT": 0, "IGNORE": 1})

# wrap the raw text of each cluster sample in a numpy array
# NOTE(review): the `vectorizer` loaded earlier is not applied here; presumably the embedding 
# model does its own text vectorization - confirm
blackwing_test = [np.array(doc['content']) for doc in cluster_samples]

# embed the documents
# one embedding per cluster sample, stacked into a single array for the classifier
blackwing_test = np.stack([embedding_model(doc).numpy() for doc in blackwing_test])

# make predictions
# column 1 of blackwing_pred_prob is the value used downstream as the ad probability
blackwing_pred = xgb_model.predict(blackwing_test)
blackwing_pred_prob = xgb_model.predict_proba(blackwing_test)

In [79]:
# get ad probabilities for clusters
# step 1: doc_id of each cluster sample -> predicted ad probability
ad_probs_of_doc_ids_in_cluster_samples = map_cluster_sample_doc_ids_to_ad_probability(
    cluster_samples=cluster_samples, 
    pred_probs=blackwing_pred_prob
)

# step 2: re-key by cluster id so the probability can be applied to every document in the cluster
cluster_ad_probs = map_ad_probs_to_clusters(
    ad_probs=ad_probs_of_doc_ids_in_cluster_samples, 
    doc_id_to_cluster_map=doc_id_to_cluster_map
)

cluster_ad_probs

{0: 0.9682984,
 1: 0.28476533,
 2: 0.36191425,
 3: 0.7084052,
 4: 0.8456159,
 5: 0.8591308,
 6: 0.75022084,
 7: 0.08375392,
 8: 0.009791594,
 9: 0.118267186,
 10: 0.31038052,
 11: 0.0007375895,
 12: 0.22928743,
 13: 0.36704558,
 14: 0.8738608,
 15: 0.6519277,
 16: 0.7881865,
 17: 0.37217233,
 18: 0.3882757,
 19: 0.7747711,
 20: 0.035497766,
 21: 0.9571656,
 22: 0.0853042,
 23: 0.16937944,
 24: 0.22529134,
 25: 0.22861862,
 26: 0.20061988,
 27: 0.94811416,
 28: 0.9461136,
 29: 0.07775918,
 30: 0.01388536,
 31: 0.45798704,
 32: 0.0071714646,
 33: 0.6899225,
 34: 0.64147544,
 35: 0.18925129,
 36: 0.328015,
 37: 0.8245093,
 38: 0.75209326,
 39: 0.766727,
 40: 0.8653293,
 41: 0.82954305,
 42: 0.53005713,
 43: 0.4358212,
 44: 0.4077822,
 45: 0.5282003,
 46: 0.65352255,
 47: 0.99403304,
 48: 0.14576975,
 49: 0.8367934,
 50: 0.95285016,
 51: 0.8253634,
 52: 0.7282593,
 53: 0.55754507,
 54: 0.530323,
 55: 0.80354166,
 56: 0.7567996,
 57: 0.5516414,
 58: 0.05337317,
 59: 0.036768276,
 60: 0.8814

#### Output Option 1 (Ideal)

Semantic search results with syndicated content de-duplicated, sorted by relevance and by probability of being an ad.

In [117]:
# list every search term with its position; the position is what term_idx refers to in the next cell
# NOTE(review): searches holds one entry per document row, so this prints the whole list 
# including repeated terms
for si, s in enumerate(searches):
    print(si, s)

0 Long term care
1 General Electric
2 honeywell
3 Siemens AND AI
4 AI AND Healthcare
5 AI AND hospitals
6 Oracle AND AI
7 healthcare AND innovat
8 healthcare AND AI
9 pharma AND AI
10 Oracle AND data management NOT PRNewswire
11 healthcare AND trend
12 healthcare AND artificial intelligence
13 artificial intelligence AND healthcare
14 robots AND healthcare
15 innovation AND healthcare
16 telemedicine
17 EHR
18 electronic health record
19 longterm care
20 Honeywell
21 AI AND oncology
22 artificial intelligence AND oncology
23 oncology
24 pharmacy
25 Telemedicine
26 Mobile AND health
27 Artificial intelligence AND health
28 healthcare AND trend
29 healthcare AND artificial intelligence
30 artificial intelligence AND healthcare
31 robots AND healthcare
32 innovation AND healthcare
33 telemedicine
34 EHR
35 electronic health record
36 longterm care
37 Siemens
38 pharmacy AND technology
39 longterm care
40 Medtronic
41 Medtronic
42 Medtronic AND innovation
43 Medtronic AND innovation
44 Abb

957 Boeing AND Helicopter
958 Raytheon
959 RollsRoyce AND aviation
960 RollsRoyce AND aerospace
961 RollsRoyce AND aircraft
962 Rolls AND Royce AND aviation
963 Rolls AND Royce AND aerospace
964 Rolls AND Royce AND aircraft
965 Dassault Aviation
966 Lockheed Martin
967 Aerojet
968 Sikorsky
969 US AND Department of Transportation
970 FAA
971 Tim Hortons
972 ExxonMobil
973 BASF
974 agronomy AND agriculture
975 Moon AND 2024
976 Registered Nurses
977 employment AND trends
978 Bayer NOT Leverkusen NOT aflak
979 ExxonMobil
980 Exxon
981 First Nations AND Energy
982 First Nations AND Resources
983 indigenous AND Energy
984 indigenous AND Energy
985 indigenous AND Resources
986 Canada Pension Plan
987 Canada Pension Plan Investment Board
988 Cargill
989 statistics canada
990 canada AND statistics
991 Canada AND employment
992 Canada AND employment
993 Canada AND pension plan
994 Canadian pension plan
995 women AND advisors
996 computer engineering
997 AI AND conference
998 Brexit AND hospital

1706 United Airlines
1707 Hawaiian Airlines
1708 Airbus
1709 A220
1710 Boeing
1711 Embraer
1712 Airbus AND helicopter
1713 C295
1714 AirAsia
1715 Air France
1716 British Airways
1717 IATA
1718 Boeing AND Helicopter
1719 Hyatt
1720 pertamina
1721 A320neo AND engine
1722 American Express AND business AND credit
1723 Dassault
1724 Sarah Derry AND Accor
1725 American Airlines AND cargo
1726 Air France AND cargo
1727 air cargo
1728 pilot training
1729 Marriott
1730 accor
1731 ihg
1732 hyatt
1733 travel AND executive
1734 hotel AND executive
1735 American AND aircraft
1736 United AND aircraft
1737 Qatar AND aircraft
1738 Emirates AND aircraft
1739 aircraft AND environment
1740 aircraft AND safet
1741 Wyndham NOT Theatre NOT Theater NOT Herald NOT Vale NOT Star Weekly NOT Wyndham Clark NOT Wyndham Street NOT Wyndham Drive NOT Lucy Wyndham NOT Wyndham Manor NOT Wyndham Council NOT Wyndham Lakes
1742 Airbus
1743 Embraer
1744 Boeing
1745 GM AND autonomous
1746 Honeywell AND aerospace
1747 AI AND

2456 Johnson  Johnson AND coronavirus AND vaccines
2457 Johnson AND Johnson AND coronavirus AND vaccine
2458 Johnson AND Johnson AND coronavirus AND vaccines
2459 Johnson  Johnson
2460 Johnson  Johnson NOT Captioning made possible by Johnson  Johnson
2461 UnitedHealth
2462 Unitedhealth
2463 Staples
2464 unitedhealth
2465 UnitedHealth
2466 Johnson  Johnson
2467 Johnson  Johnson
2468 UnitedHealth
2469 JPMorgan
2470 UnitedHealth
2471 Johnson  Johnson
2472 Staples NOT Staples Center NOT household NOT pantry NOT stocked NOT closet NOT grocery NOT cooking
2473 Johnson  Johnson
2474 Johnson  Johnson
2475 Johnson  Johnson
2476 Johnson  Johnson
2477 JPMorgan AND covid
2478 Johnson  Johnson
2479 JohnsonJohnson
2480 Fairness AND data
2481 Fairness AND data
2482 JPMorgan Chase
2483 Johnson  Johnson
2484 Anthony Fauci
2485 Anthony Fauci
2486 Anthony Fauci
2487 CDC
2488 CDC
2489 Johnson  Johnson
2490 urgent care
2491 Realworld data
2492 urgent care
2493 urgent care
2494 urgent care
2495 Pfizer AND r

3206 businesses AND airport
3207 business AND airport
3208 FBO
3209 FBO
3210 fixedbase operator
3211 Honeywell
3212 Honeywell
3213 Honeywell
3214 Embraer AND Honeywell
3215 Embraer AND Honeywell
3216 Embraer
3217 Embraer
3218 Honeywell
3219 Embraer
3220 Embraer
3221 Bombardier
3222 Bombardier
3223 Embraer
3224 Embraer
3225 Bombardier
3226 Bombardier
3227 honeywell
3228 Honeywell
3229 Honeywell
3230 Honeywell
3231 honeywell
3232 Fujitsu
3233 Honeywell
3234 Fujitsu
3235 Honeywell
3236 Motorola
3237 Honeywell
3238 inclusion AND research
3239 Kongsberg
3240 GSE NOT PRNewswire NOT Business Wire NOT GLOBE NEWSWIRE NOT Marketwired NOT Ghana
3241 honeywell
3242 Honeywell AND aerospace
3243 Honeywell
3244 Schlumberger
3245 Honeywell
3246 Honeywell
3247 honeywell
3248 Honeywell
3249 Siemens
3250 Jacobs engineering
3251 Jacobs AND engineering
3252 AECOM
3253 Honeywell
3254 ABB
3255 Honeywell
3256 business intelligence
3257 business intelligence
3258 honeywell
3259 Fujitsu
3260 Honeywell
3261 Fuji

3956 Johnson AND Johnson AND coronavirus AND booster
3957 Moderna AND Covid AND vaccine
3958 Moderna AND Covid AND vaccines
3959 Moderna AND Covid19 AND vaccine
3960 Moderna AND Covid19 AND vaccines
3961 Moderna AND coronavirus AND vaccine
3962 Moderna AND coronavirus AND vaccines
3963 Moderna AND Covid AND booster
3964 Moderna AND Covid19 AND booster
3965 Moderna AND coronavirus AND booster
3966 Pfizer AND covid AND vaccines
3967 Pfizer AND coronavirus AND vaccines
3968 Pfizer AND BioNTech AND covid AND vaccines
3969 Pfizer AND BioNTech AND coronavirus AND vaccines
3970 Pfizer AND BioNTech AND covid19 AND vaccines
3971 JJ
3972 Johnson  Johnson
3973 Moderna
3974 Pfizer
3975 pfizer AND covid19 AND vaccine
3976 pfizer AND covid19 AND booster
3977 Pfizer
3978 Johnson  Johnson NOT Captioning made possible by Johnson  Johnson
3979 astrazeneca NOT astrazeneca may be able to help
3980 Pfizer AND vaccin
3981 Johnson  Johnson
3982 Pfizer NOT Forecast Report
3983 Johnson  Johnson
3984 ventilator

4705 PFAS NOT Pension Fund Administrator NOT Pension Fund Administrators
4706 PFAS
4707 PFAS
4708 rehab AND regulat
4709 rehab AND legislat
4710 rehab AND polic
4711 hospital AND regulat
4712 hospital AND legislat
4713 hospital AND rehab
4714 PFAS NOT Pension Fund Administrator NOT Pension Fund Administrators
4715 Lawsuit AND PFAS
4716 3M AND PFAS
4717 Clift NOT Theresa Clift
4718 contaminat AND groundwater
4719 PFAS
4720 Clift NOT Theresa Clift
4721 PFAS NOT Pension Fund Administrator NOT Pension Fund Administrators
4722 PFAS NOT Pension Fund Administrator NOT Pension Fund Administrators
4723 PFAS NOT Pension Fund Administrator NOT Pension Fund Administrators
4724 PFAS
4725 PFAS
4726 plastics AND manufactur
4727 PFAS NOT Pension Fund Administrator NOT Pension Fund Administrators
4728 Per and Polyfluoroalkyl Substances
4729 PFAS
4730 aircraft AND regulat
4731 aircraft AND law
4732 aircraft AND environment
4733 aircraft AND safet
4734 pipeline AND safety
4735 pipeline AND explosion
4736

5455 Mike Roman AND 3M
5456 3M AND earplugs
5457 PFAS NOT Pension Fund Administrator NOT Pension Fund Administrators
5458 3M AND PFAS
5459 3M NOT 3M ago
5460 Mike Roman AND 3M
5461 Mike Roman AND 3M
5462 Mike Roman AND 3M
5463 3M AND pandemic
5464 3M AND pandemic
5465 3M AND pandemic
5466 3M NOT 3M ago
5467 3M NOT 3M ago
5468 3M NOT 3M ago
5469 3M NOT 3M ago
5470 3M NOT 3m
5471 Carbon neutrality
5472 3M AND COVID
5473 3M AND COVID19
5474 3M NOT 3M ago
5475 PFAS NOT Pension Fund Administrator NOT Pension Fund Administrators
5476 DuPont AND PFAS
5477 DuPont AND PFAS
5478 Chemours AND PFAS
5479 Chemours AND PFAS
5480 Congress AND PFAS
5481 echocardiography
5482 food AND regulation
5483 PFAS NOT Pension Fund Administrator NOT Pension Fund Administrators
5484 PFAS NOT Pension Fund Administrator NOT Pension Fund Administrators
5485 Per and Polyfluoroalkyl Substances
5486 Per and Polyfluoroalkyl Substances
5487 regulation AND PFAS
5488 regulation AND PFAS
5489 regulation AND Per and Polyfluor

6205 Phillips AND 66
6206 Chevron
6207 Chevron AND refiner
6208 automo AND policy
6209 retail AND tax
6210 electric AND auto
6211 PFAS NOT Pension Fund Administrator NOT Pension Fund Administrators
6212 United AND Auto NOT police NOT highway NOT patrol NOT crime NOT stolen NOT drunk
6213 chevron
6214 PFAS
6215 PFAS
6216 Chevron AND pipeline
6217 Phillips 66 AND pipeline
6218 Kinder Morgan AND pipeline
6219 Kinder Morgan
6220 nurse AND regulat
6221 nurse AND regulat
6222 nurse AND polic
6223 nurse practitioner
6224 PFAS NOT Pension Fund Administrator NOT Pension Fund Administrators
6225 PFAS NOT Pension Fund Administrator NOT Pension Fund Administrators
6226 PFAS NOT Pension Fund Administrator NOT Pension Fund Administrators
6227 regulation AND PFAS
6228 regulation AND PFAS
6229 EPA AND PFAS
6230 EPA AND PFAS
6231 PFAS
6232 PFAS
6233 city AND superintendent
6234 PFAS NOT Pension Fund Administrator NOT Pension Fund Administrators
6235 PFAS Action Plan
6236 PFAS
6237 bottled water
6238 ci

6955 PFAS
6956 PFAS
6957 PFAS NOT Pension Fund Administrator NOT Pension Fund Administrators
6958 PFAS NOT Pension Fund Administrator NOT Pension Fund Administrators
6959 contaminat AND groundwater
6960 PFAS
6961 PFAS
6962 PFAS NOT Pension Fund Administrator NOT Pension Fund Administrators
6963 PFAS NOT Pension Fund Administrator NOT Pension Fund Administrators
6964 Sustainability AND initiative
6965 PFAS
6966 Airbnb
6967 Honeywell
6968 Visa AND Olympics
6969 processor AND industrial
6970 3M AND COVID
6971 3M AND COVID19
6972 3M AND coronavirus
6973 3M AND pandemic
6974 Nikki Haley
6975 Procter  Gamble AND Olympics
6976 Procter  Gamble AND Olympics
6977 Procter  Gamble AND Olympic
6978 Procter  Gamble AND Olympic
6979 ProcterGamble AND Olympics
6980 ProcterGamble AND Olympics
6981 ProcterGamble AND Olympic
6982 ProcterGamble AND Olympic
6983 Airbnb AND Olympic
6984 Airbnb AND Olympic
6985 Airbnb AND Olympics
6986 Airbnb AND Olympics
6987 Coca Cola AND Olympic
6988 Coca Cola AND Olympic

7704 congress AND small business
7705 congress AND small business
7706 congress AND small businesses
7707 congress AND small businesses
7708 white house AND small businesses
7709 white house AND small businesses
7710 white house AND small business
7711 white house AND small business
7712 hospital AND legislat
7713 nursing AND legislat
7714 nursing AND legislat
7715 nursing AND legislative
7716 PFAS NOT Pension Fund Administrator NOT Pension Fund Administrators
7717 PFAS NOT Pension Fund Administrator NOT Pension Fund Administrators
7718 PFAS
7719 PFAS
7720 First State Investments
7721 FDA AND PFAS
7722 FDA AND Per and Polyfluoroalkyl Substances
7723 3M AND PFAS
7724 3M AND Per and Polyfluoroalkyl Substances
7725 PFAS NOT Pension Fund Administrator NOT Pension Fund Administrators
7726 PFAS NOT Pension Fund Administrator NOT Pension Fund Administrators
7727 3M AND PFAS
7728 PFAS
7729 PFAS
7730 legislation AND small business
7731 legislation AND small business
7732 3M NOT 3M ago
7733 3M N

8454 Bristol Myers
8455 3M AND Per and Polyfluoroalkyl Substances
8456 3M AND Per and Polyfluoroalkyl Substances
8457 3M AND Perfluorooctanoic Acid
8458 3M AND Perfluorooctanoic Acid
8459 Apple AND health
8460 Apple AND medicine
8461 Apple AND medicine
8462 Apple AND drug
8463 Apple AND drug
8464 Apple AND hospital
8465 Apple AND hospital
8466 Bristol Myers Squibb
8467 Bristol Myers Squibb
8468 SaintGobain
8469 SaintGobain
8470 EPA NOT fish oil NOT DHA
8471 JPMorgan AND covid
8472 JPMorgan AND covid
8473 JPMorgan AND covid
8474 Chase AND business AND credit
8475 Chase AND business AND credit
8476 Chase AND business AND credit
8477 JPMorgan AND business AND credit
8478 JPMorgan AND business AND credit
8479 JPMorgan AND business AND credit
8480 Bristol Myers Squibb
8481 Bristol Myers Squibb
8482 JPMorgan AND credit card
8483 JPMorgan AND credit card
8484 Apple AND drugs
8485 Apple AND drugs
8486 3M AND PFAS
8487 3M AND PFAS
8488 PFAS NOT Pension Fund Administrator NOT Pension Fund Admini

In [122]:
# run a new search
term_idx = 1
nbr_results_to_return = 25 # need to limit results, otherwise all docs in corpus will be returned
result = pipe.run(
    query=searches[term_idx],
    params={"top_k": nbr_results_to_return}
)

# compile search results with ad predictions
# each result is a passage; several passages can come from the same doc_id
result_docs = result['documents']
search_results = pd.DataFrame({
    "search_term": searches[term_idx],
    "doc_id": [doc.meta['doc_id'] for doc in result_docs],
    "text": [doc.content for doc in result_docs],
    "score_search": [doc.score for doc in result_docs],
    "score_ad_prob": [ad_probs_of_doc_ids_in_cluster_samples[int(doc.meta['doc_id'])] for doc in result_docs],
    "original_disposition": [
        blackwing_data.loc[blackwing_data['doc_id'] == int(doc.meta['doc_id']), 'disposition'].iloc[0]
        for doc in result_docs
    ],
})
# discount the search relevance by the probability that the document is an ad
search_results['score_final'] = search_results['score_search'] * (1 - search_results['score_ad_prob'])

# export ranked by the combined score so that both relevance and ad probability decide the order 
# (sorting by score_search alone would ignore the ad prediction)
search_results.sort_values(['score_final'], ascending=False).to_csv(f'test_results_option1_search{term_idx}.csv', index=False)
search_results.head()

Unnamed: 0,search_term,doc_id,text,score_search,score_ad_prob,original_disposition,score_final
0,General Electric,5432,?While we do not always begin our journey with perfect information we strive...,0.649902,0.462885,SELECT,0.349072
1,General Electric,8283,Feb 16 (Reuters) - CelLink a California startup with a pathbreaking product ...,0.649149,0.27513,SELECT,0.470548
2,General Electric,4444,It hasn?t followed many industrial peers most recently General Electric Co. ...,0.648335,0.878326,SELECT,0.078886
3,General Electric,3314,He found many supporters for his project such as ?MIT? ?Honeywell? and ?Gene...,0.648176,0.884746,IGNORE,0.074705
4,General Electric,5432,said Chad McAllister executive vice president Milliken & Company and preside...,0.648093,0.462885,SELECT,0.3481


#### Output Option 2

Original search from Blackwing, just de-duplicated syndicated content and sorted by ad probability.

In [86]:
# give every document the ad probability predicted for its cluster's sample document, then 
# export ordered from least to most ad-like (ties broken by doc_id)
# NOTE(review): every row of blackwing_data is written, so documents sharing a cluster are all 
# kept - confirm whether one row per cluster was intended for the "de-duplicated" output
blackwing_data['ad_probability'] = blackwing_data['cluster'].map(cluster_ad_probs)
blackwing_data.sort_values(['ad_probability', 'doc_id']).to_csv('test_results_option2.csv', index=False)