In [None]:
!pip install faiss-cpu numpy scikit-learn matplotlib seaborn ipywidgets pandas -q
!pip install "tensorflow>=2.0.0" -q
!pip install --upgrade tensorflow-hub -q

### Importing Required Libraries


In [None]:
# Core libraries
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import faiss
import re
import pandas as pd 

# Scikit-learn for dataset
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics.pairwise import cosine_similarity # For embedding example

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Interactive Widgets (Optional)
import ipywidgets as widgets
from IPython.display import display, clear_output

# Silence warnings so the notebook output stays readable.
import warnings


def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accepts anything, does nothing."""
    return None


# Ignore warnings routed through the standard filter machinery ...
warnings.filterwarnings('ignore')
# ... and also swap out warnings.warn itself, so direct calls are dropped too.
warnings.warn = warn

# Report the versions of the key libraries to make runs easier to reproduce.
version_report = [
    f"TensorFlow Version: {tf.__version__}",
    f"TensorFlow Hub Version: {hub.__version__}",
    f"Faiss Version: {faiss.__version__}",
    f"NumPy Version: {np.__version__}",
    f"Pandas Version: {pd.__version__}",
]
print("\n".join(version_report))

## <a id='The-20-Newsgroups-Dataset'></a>The 20 Newsgroups Dataset

In this project, we'll be using the 20 Newsgroups dataset, a classic collection in NLP. It comprises approximately 18,000 newsgroup posts across 20 different topics; the 'train' subset used here contains about 11,000 of them, and we keep the first 8,000 for quicker processing. It's ideal for semantic search because:

- **Diverse Topics**: It covers a wide range of subjects like computers, religion, politics, sports, and science.
- **Real-world Text**: The language is natural, including informal discussions, technical jargon, and varying writing styles.
- **Context is Key**: Understanding the underlying meaning is crucial for effective search and classification.

### <a id='Loading-and-Exploring'></a>Loading and Exploring the Dataset

Let's load the dataset and take a quick look.

In [None]:
# Fetch only the 'train' subset for quicker processing, and strip headers,
# footers, and quoted replies so that only the main content of each post remains.
newsgroups_data = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

# Keep only the first `sample_size` posts together with their integer labels.
sample_size = 8000
data_samples = newsgroups_data.data[:sample_size]
target_labels_samples = newsgroups_data.target[:sample_size]
# target_names is the label -> category-name lookup (indexed by label value,
# not by document), so it must not be truncated by the per-document slice:
# a sample_size below the number of categories would break every lookup.
target_names = newsgroups_data.target_names

print(f"Number of documents loaded: {len(data_samples)}")
print(f"Number of unique categories: {len(target_names)}")

print("\nFirst few category names:")
print(target_names[:5])

Let's see the distribution of documents across categories.

In [None]:
# Count how many sampled posts fall into each category (most frequent first).
category_counts = pd.Series(
    [target_names[label] for label in target_labels_samples]
).value_counts()

# Horizontal bar chart drawn on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=category_counts.values, y=category_counts.index, palette="viridis", ax=ax)
ax.set_title('Distribution of Documents per Category (Train set)')
ax.set_xlabel('Number of Documents')
ax.set_ylabel('Category')
fig.tight_layout()
plt.show()

In [None]:
# Preview the first three posts (first 300 characters each) with their category.
print("\nSample posts (snippets):")
for doc_number in range(1, 4):
    position = doc_number - 1
    category_name = target_names[target_labels_samples[position]]
    snippet = data_samples[position][:300].strip()
    print(f"--- Document {doc_number} (Category: {category_name}) ---")
    print(snippet + "...")
    print("-"*80 + "\n")

## <a id='Preprocessing'></a>Preprocessing Text Data

Preprocessing is a crucial step to clean and standardize our text data. This helps the Universal Sentence Encoder focus on the meaningful content and generate better embeddings.

### <a id='Why-Preprocessing-Matters'></a>Why Preprocessing Matters

- **Noise Reduction**: Removing irrelevant characters (like special symbols, excessive whitespace) or metadata (like email headers, if not removed during fetch) helps the model focus on the actual content.
- **Standardization**: Converting text to lowercase ensures that words like "Search" and "search" are treated the same.
- **Improved Embeddings**: Cleaner text generally leads to more accurate and representative vector embeddings.

Our preprocessing function will:
1.  Remove email addresses (often noisy and not semantically rich for general topics).
2.  Remove characters that are not alphanumeric or whitespace.
3.  Convert text to lowercase.
4.  Normalize whitespace (reduce multiple spaces/newlines to a single space).

**Note:** We already removed headers, footers, and quotes when loading the data using `fetch_20newsgroups(remove=...)`. If we hadn't, we would add steps to remove those here.

In [None]:
def preprocess_text(text):
    """Clean a raw document so it is ready for embedding.

    Steps, in order: drop e-mail-like tokens (anything containing ``@``),
    delete every character that is not an ASCII letter, digit, or whitespace,
    lowercase the result, and collapse whitespace runs into single spaces.

    Args:
        text: The raw document. Non-string values are treated as empty.

    Returns:
        The cleaned string; ``""`` if ``text`` is not a string or nothing
        survives the cleaning.
    """
    if isinstance(text, str):
        # Remove e-mail addresses together with one trailing whitespace char.
        without_emails = re.sub(r'\S*@\S*\s?', '', text)
        # Characters are deleted (not replaced by a space), so "don't" -> "dont".
        alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', without_emails)
        collapsed = re.sub(r'\s+', ' ', alnum_only.lower())
        return collapsed.strip()
    return ""

# Clean every sampled post, remembering the raw text and label it came from.
cleaned_records = [
    (preprocess_text(raw_doc), raw_doc, label)
    for raw_doc, label in zip(data_samples, target_labels_samples)
]
# Posts that are empty after cleaning carry no content to embed, so drop them.
retained_records = [record for record in cleaned_records if record[0]]

# Three parallel lists: position j in each refers to the same document.
processed_documents = [record[0] for record in retained_records]
original_documents_for_processed = [record[1] for record in retained_records]
target_labels_for_processed = [record[2] for record in retained_records] # Corresponding labels

print(f"Number of documents after preprocessing and filtering: {len(processed_documents)}")

### <a id='Preprocessing-Steps'></a>Preprocessing Steps in Action

Let's look at an example of a document before and after preprocessing.

In [None]:
# Show one retained document before and after preprocessing.
sample_idx = 0
if sample_idx >= len(original_documents_for_processed):
    print("Not enough documents to display sample.")
else:
    raw_snippet = original_documents_for_processed[sample_idx][:500].strip()
    category_name = target_names[target_labels_for_processed[sample_idx]]
    cleaned_snippet = processed_documents[sample_idx][:500]

    print("Original Document (from filtered list, Snippet):")
    print(raw_snippet + "...")
    print(f"Original Category: {category_name}")
    print("\n" + "-"*80 + "\n")
    print("Preprocessed Document (Snippet):")
    print(cleaned_snippet + "...")

## <a id='Universal-Sentence-Encoder'></a>Universal Sentence Encoder (USE)

The Universal Sentence Encoder (USE) is a model from Google that encodes text into high-dimensional vectors (embeddings). These embeddings capture the semantic meaning of sentences, paragraphs, or short documents. Sentences with similar meanings will have embeddings that are close together in the vector space.

- We load the pre-trained USE model from TensorFlow Hub.
- We define a function to take text and return its USE embedding as a NumPy array.

### <a id='Generating-Embeddings'></a>Generating Embeddings

In [None]:
# Load the pre-trained Universal Sentence Encoder (version 4) from the
# TensorFlow Hub URL below. `embed_model` is the callable that
# `generate_embeddings` invokes to turn a list of strings into vectors.
print("Loading Universal Sentence Encoder model from TensorFlow Hub...")
use_model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
embed_model = hub.load(use_model_url)
print("USE model loaded successfully.")

def generate_embeddings(texts_list, batch_size=256):
    """Encode texts with the Universal Sentence Encoder, one batch at a time.

    The texts are fed to ``embed_model`` in chunks of ``batch_size`` instead of
    in a single call, so the encoder never has to process the whole corpus at
    once; the per-batch results are stacked back in input order.

    Args:
        texts_list: Iterable of strings to embed. A single bare string is
            treated as a one-element list; ``None`` is treated as empty.
        batch_size: Maximum number of texts sent to the model per call.
            Must be at least 1.

    Returns:
        A 2-D NumPy array with one row per input text, or an empty 1-D array
        (``np.array([])``) when there is nothing to embed.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if texts_list is None:
        texts = []
    elif isinstance(texts_list, str):
        # A bare string would otherwise be iterated character by character.
        texts = [texts_list]
    else:
        # Materialise once so arrays, tuples, Series and generators all work
        # (a plain `not texts_list` raises for multi-element NumPy arrays).
        texts = list(texts_list)

    if not texts:
        return np.array([])
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    print(f"Generating embeddings for {len(texts)} texts...")
    # USE model expects a list of strings.
    batches = [
        embed_model(texts[start:start + batch_size]).numpy()
        for start in range(0, len(texts), batch_size)
    ]
    return np.vstack(batches)

# Embed the cleaned corpus; fall back to an empty array when nothing survived
# preprocessing so later cells can simply check `.size`.
if not processed_documents:
    document_embeddings = np.array([])
    print("No processed documents to embed.")
else:
    document_embeddings = generate_embeddings(processed_documents)
    print(f"Shape of document embeddings: {document_embeddings.shape}")

### <a id='Understanding-Embeddings'></a>Understanding Embeddings (Briefly)

Each document is now transformed into a dense vector of 512 numbers. These numbers represent the document's meaning in a high-dimensional space. We can use mathematical operations, like cosine similarity, to measure how 'close' or 'similar' two embeddings (and thus their original texts) are.

Let's quickly check the similarity between a few example sentences:

In [None]:
# Two paraphrases about the weather plus two unrelated sentences: the
# paraphrases should score noticeably higher against each other.
example_sentences = [
    "How is the weather today?",
    "What is the current weather like?",
    "This is a completely unrelated sentence about computers.",
    "Let's talk about artificial intelligence."
]

if example_sentences:
    example_embeddings = generate_embeddings(example_sentences)
    # A similarity matrix needs a 2-D array with at least two rows.
    has_pairs = example_embeddings.ndim == 2 and example_embeddings.shape[0] > 1
    if not has_pairs:
        print("Could not generate example embeddings to show similarity.")
    else:
        similarity_matrix = cosine_similarity(example_embeddings)
        similarity_table = pd.DataFrame(
            similarity_matrix, index=example_sentences, columns=example_sentences
        )
        print("\nCosine Similarity Matrix for example sentences:")
        print(similarity_table.round(4))

        # Row 0 holds the first sentence's similarity to every sentence.
        print(f"\nSimilarity of '{example_sentences[0]}' with:")
        for column, sentence in enumerate(example_sentences[1:], start=1):
            print(f" - '{sentence}': {similarity_matrix[0, column]:.4f}")

## <a id='Faiss'></a>Indexing and Searching with Faiss

FAISS (Facebook AI Similarity Search) is a library for efficient similarity search and clustering of dense vectors. It's incredibly fast, especially for large datasets.

### <a id='Creating-a-FAISS-Index'></a>Creating a FAISS Index

1.  **Determine Dimension**: Get the dimension of our embeddings (512 for USE).
2.  **Choose an Index Type**: We'll use `faiss.IndexFlatL2`. This index performs an exact search using L2 (Euclidean) distance. It's simple and effective for datasets of moderate size. For very large datasets, FAISS offers more complex approximate nearest neighbor (ANN) indexes (e.g., `IndexIVFFlat`, `IndexHNSWFlat`) that trade a bit of accuracy for significant speed gains.
3.  **Add Vectors**: Add our document embeddings to the index.

In [None]:
# Build an exact (brute-force) L2 index over the document embeddings.
# faiss_index stays None when there is nothing to index; search code checks it.
faiss_index = None
if document_embeddings.size == 0:
    print("No document embeddings available to create FAISS index.")
else:
    dimension = document_embeddings.shape[1]
    print(f"Dimension of embeddings: {dimension}")

    faiss_index = faiss.IndexFlatL2(dimension)
    print(f"Is FAISS index trained? {faiss_index.is_trained}")

    # The vectors are cast to float32 before being added to the index.
    index_vectors = document_embeddings.astype('float32')
    faiss_index.add(index_vectors)
    print(f"Number of vectors in the FAISS index: {faiss_index.ntotal}")

### <a id='Querying-the-Index'></a>Querying the Index

Now we can define a search function that:
1.  Takes a query text.
2.  Preprocesses the query.
3.  Generates an embedding for the query using USE.
4.  Searches the FAISS index for the `k` most similar document embeddings.
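5.  Returns the distances and indices of these similar documents. With `IndexFlatL2` the distances are squared Euclidean distances, so smaller values mean closer matches.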

In [None]:
def search_documents(query_text, k=5):
    """Find the indexed documents closest to a free-text query.

    The query is cleaned with ``preprocess_text``, embedded with
    ``generate_embeddings`` and looked up in the module-level ``faiss_index``.

    Args:
        query_text: The raw query string.
        k: Maximum number of neighbours to return. Values larger than the
            number of indexed vectors are clamped to that number.

    Returns:
        A ``(distances, indices)`` pair of 1-D NumPy arrays for the single
        query, nearest first. Both arrays are empty when the index is missing
        or empty, the query is empty after preprocessing, or ``k`` is below 1.
        Returned indices are always valid (non-negative) positions.
    """
    if faiss_index is None or faiss_index.ntotal == 0:
        print("FAISS index is not available or is empty.")
        return np.array([]), np.array([])

    preprocessed_query = preprocess_text(query_text)
    if not preprocessed_query:
        print("Query became empty after preprocessing.")
        return np.array([]), np.array([])

    # Asking FAISS for more neighbours than it holds pads the result with
    # index -1, which callers would misread as "the last document".
    k = min(int(k), faiss_index.ntotal)
    if k < 1:
        print("k must be at least 1.")
        return np.array([]), np.array([])

    query_embedding = generate_embeddings([preprocessed_query])

    # FAISS search returns distances (D) and indices (I), one row per query.
    distances, indices = faiss_index.search(query_embedding.astype('float32'), k)
    distances, indices = distances[0], indices[0]

    # Defensive: drop any padded (-1) slots the index may still report.
    found = indices >= 0
    return distances[found], indices[found]

# Run a sample everyday-language query and print the best matches.
example_query = "reliable cars for families"
num_results = 5

print(f"Searching for: '{example_query}' (top {num_results} results)\n")
distances, result_indices = search_documents(example_query, k=num_results)

if result_indices.size > 0:
    for i, idx in enumerate(result_indices):
        # The lower bound matters: a negative index (e.g. a -1 "no result"
        # placeholder) would otherwise wrap around to the end of the list.
        if 0 <= idx < len(original_documents_for_processed):
            print(f"--- Rank {i+1} --- (Distance: {distances[i]:.4f})")
            print(f"Category: {target_names[target_labels_for_processed[idx]]}")
            print("Original Snippet:")
            print(original_documents_for_processed[idx][:300].strip() + "...")
            print("\nProcessed Snippet:")
            print(processed_documents[idx][:300] + "...")
            print("-"*50 + "\n")
        else:
            print(f"Rank {i+1}: Index {idx} out of bounds.")
else:
    print("No results found or error during search.")

Let's try another query, perhaps something more technical.

In [None]:
# Run a more technical query and print the best matches.
technical_query = "encryption algorithms and data security"
num_results = 3

print(f"Searching for: '{technical_query}' (top {num_results} results)\n")
distances, result_indices = search_documents(technical_query, k=num_results)

if result_indices.size > 0:
    for i, idx in enumerate(result_indices):
        # The lower bound matters: a negative index (e.g. a -1 "no result"
        # placeholder) would otherwise wrap around to the end of the list.
        if 0 <= idx < len(original_documents_for_processed):
            print(f"--- Rank {i+1} --- (Distance: {distances[i]:.4f})")
            print(f"Category: {target_names[target_labels_for_processed[idx]]}")
            print("Original Snippet:")
            print(original_documents_for_processed[idx][:300].strip() + "...")
            print("-"*50 + "\n")
else:
    print("No results found.")

## <a id='Interactive-Querying'></a>Bonus: Interactive Querying

Using `ipywidgets`, we can create a simple interactive search box directly within the notebook.

In [None]:
# Free-text box holding the search query (pre-filled with a sample query).
query_input = widgets.Text(
    description='Search:',
    value='space exploration technology',
    placeholder='Enter your search query',
    layout=widgets.Layout(width='60%'),
    disabled=False,
)

# Slider choosing how many results to show (1-10, default 3).
num_results_slider = widgets.IntSlider(
    description='Results:',
    value=3,
    min=1,
    max=10,
    step=1,
    orientation='horizontal',
    readout=True,
    readout_format='d',
    continuous_update=False,
    disabled=False,
)

# Output area that the search handler prints its results into.
results_output_widget = widgets.Output()

def handle_search_interaction(b):
    """Button callback: run the current query and show the results.

    Reads the query from ``query_input`` and the result count from
    ``num_results_slider``, calls ``search_documents`` and prints the ranked
    matches inside ``results_output_widget`` (previous output is cleared).

    Args:
        b: The object passed by the click event (unused); ``None`` is accepted
            so the handler can also be invoked manually.
    """
    with results_output_widget:
        clear_output(wait=True) # Clear previous results
        query = query_input.value
        k_val = num_results_slider.value
        if not query.strip():
            print("Please enter a query.")
            return

        print(f"Searching for: '{query}' (top {k_val} results)...\n")
        distances, result_indices = search_documents(query, k=k_val)
        if result_indices.size > 0:
            for i, idx in enumerate(result_indices):
                # The lower bound matters: a negative index (e.g. a -1
                # "no result" placeholder) would otherwise wrap around and
                # display the last document as a match.
                if 0 <= idx < len(original_documents_for_processed):
                    print(f"--- Rank {i+1} --- (Distance: {distances[i]:.4f})")
                    print(f"Category: {target_names[target_labels_for_processed[idx]]}")
                    print(original_documents_for_processed[idx][:250].strip() + "...")
                    print("-"*40 + "\n")
        else:
            print("No results found for this query.")

# Button that triggers a search when clicked.
search_button_widget = widgets.Button(description="Search Now")
search_button_widget.on_click(handle_search_interaction)

# Layout: query box and slider side by side, then the button, then the results.
input_box = widgets.HBox([query_input, num_results_slider])
search_panel = widgets.VBox([input_box, search_button_widget, results_output_widget])
display(search_panel)

# Optional: Perform an initial search with default values
# handle_search_interaction(None)