In [None]:
!pip install --upgrade sentence-transformers transformers

In [63]:
import warnings
import pandas as pd
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import shutil
import os
import pickle

In [13]:
# Function to clear Hugging Face cache
def clear_huggingface_cache(cache_dir=None):
    """Delete a Hugging Face cache directory and recreate it empty.

    This is destructive: everything below the directory is removed.

    Args:
        cache_dir: Directory to wipe. ``~`` is expanded to the user's home.
            When omitted, ``~/.cache/huggingface`` is used, so existing
            zero-argument calls behave exactly as before. Pass an explicit
            path when the cache lives somewhere else.

    Raises:
        OSError: Propagated from ``shutil.rmtree`` / ``os.makedirs`` if the
            directory cannot be removed or recreated.
    """
    if cache_dir is None:
        cache_dir = '~/.cache/huggingface'
    cache_dir = os.path.expanduser(cache_dir)

    if os.path.exists(cache_dir):
        shutil.rmtree(cache_dir)
    # Recreate the (now empty) directory so later code can rely on it existing
    os.makedirs(cache_dir, exist_ok=True)

In [16]:
# Clear the Hugging Face cache to ensure a clean environment.
# NOTE(review): clear_huggingface_cache() removes the entire cache directory,
# so everything previously stored there is deleted each time this cell runs.
clear_huggingface_cache()

In [64]:
# Suppress specific FutureWarning from huggingface_hub
warnings.filterwarnings('ignore', category=FutureWarning, module='huggingface_hub.file_download')

In [72]:
# Build a brand-new, empty in-memory FAISS index
def clear_faiss_index(dimension):
    """Return a new ``faiss.IndexFlatL2`` constructed with ``dimension``.

    Nothing existing is modified; "clearing" is achieved by the caller
    rebinding its index variable to the object returned here.
    """
    fresh_index = faiss.IndexFlatL2(dimension)
    return fresh_index

# Remove the persisted index and metadata files, if they are present
def delete_index_files(index_path, metadata_path):
    """Delete the index file and the metadata file from disk.

    Each path is checked independently: a file that exists is removed and a
    confirmation line is printed; a path that does not exist is skipped
    silently. The index file is handled before the metadata file.
    """
    targets = (
        (index_path, f"Deleted index file: {index_path}"),
        (metadata_path, f"Deleted metadata file: {metadata_path}"),
    )
    for file_path, confirmation in targets:
        if not os.path.exists(file_path):
            continue
        os.remove(file_path)
        print(confirmation)

# Settings for the index: vector size and the on-disk file locations
dimension = 384  # Use the appropriate dimension for your vectors
index_path, metadata_path = 'faiss_index.index', 'metadata.pkl'

# Start over with an empty in-memory FAISS index
index = clear_faiss_index(dimension)

# Remove index/metadata files that may already exist on disk
delete_index_files(index_path, metadata_path)

Deleted index file: faiss_index.index


In [70]:
# Step 1: Locate the CSV input
csv_file_path = 'sample_data.csv'  # Replace with the path to your CSV file

# Relative paths are resolved against this directory, so show it first
print("Current working directory:", os.getcwd())

# Report whether the CSV can be found at the configured path
if not os.path.exists(csv_file_path):
    print(f"File {csv_file_path} does not exist. Please check the file path.")
else:
    print(f"File {csv_file_path} exists.")


Current working directory: /Users/satyaanumolu/POCs/faissproj
File sample_data.csv exists.


In [79]:
# Read the CSV and extract the columns that will be indexed.
# Any failure is reported and then re-raised: swallowing it would leave
# `texts` / `sources` undefined (or stale from an earlier run) and the
# following cells would fail later or silently embed the wrong data.
try:
    df = pd.read_csv(csv_file_path)
    print("CSV file loaded successfully.")

    # Display the DataFrame
    print("DataFrame content:")
    print(df)

    # Display the DataFrame columns
    print("DataFrame columns:")
    print(df.columns)

    # The rest of the notebook needs both a 'text' and a 'source' column
    missing_columns = [col for col in ('text', 'source') if col not in df.columns]
    if missing_columns:
        raise ValueError(f"CSV is missing required column(s): {missing_columns}")

    texts = df['text'].tolist()
    sources = df['source'].tolist()
    print("Data extracted from CSV:")
    print(texts)
    print("Sources extracted from CSV:")
    print(sources)
except Exception as e:
    print(f"Error reading CSV file: {e}")
    # Stop execution here instead of continuing with missing or stale data
    raise

CSV file loaded successfully.
DataFrame content:
   id                                               text   source
0   1         This is a document about machine learning.     Book
1   2  Here we discuss the basics of artificial intel...  Article
2   3  This text is all about natural language proces...     Book
3   4    Deep learning techniques are advancing rapidly.  Article
4   5             AI is transforming various industries.   Report
5   6  Machine learning provides systems the ability ...  Article
6   7  Understanding AI and ML is crucial for modern ...     Book
7   8  Data science integrates domain knowledge progr...   Report
8   9  Big data and analytics are essential for extra...  Article
9  10  The future of AI includes ethical consideratio...     Book
DataFrame columns:
Index(['id', 'text', 'source'], dtype='object')
Data extracted from CSV:
['This is a document about machine learning.', 'Here we discuss the basics of artificial intelligence.', 'This text is all about natu

In [80]:
# Step 2: Embed every text with a Sentence Transformer model
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)
vectors = model.encode(texts)

In [81]:
# Step 3: Create a FAISS Index and Add Vectors
# FAISS works on C-contiguous float32 matrices, so coerce the embeddings
# explicitly instead of relying on the encoder's output dtype/layout.
embedding_matrix = np.ascontiguousarray(vectors, dtype=np.float32)

# An empty or non-2D result has no usable second axis to read the dimension from
if embedding_matrix.ndim != 2 or embedding_matrix.shape[0] == 0:
    raise ValueError("No vectors to index; check that the CSV produced at least one text.")

dimension = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)  # Using L2 (Euclidean) distance
index.add(embedding_matrix)

In [82]:
# Persist the index and its metadata so they can be loaded again later
index_path = 'faiss_index.index'
metadata_path = 'metadata.pkl'

# Write the FAISS index to disk
faiss.write_index(index, index_path)

# Pickle the texts and sources alongside the index
metadata = dict(texts=texts, sources=sources)
with open(metadata_path, 'wb') as metadata_file:
    pickle.dump(metadata, metadata_file)

In [83]:
# Step 4: Query the Index
def query_faiss(query_text, top_k=5):
    """Return the stored documents closest to ``query_text``.

    The query is encoded with the notebook-level ``model``, searched in the
    notebook-level ``index``, and hits are resolved through ``metadata``.

    Args:
        query_text: Text to search for.
        top_k: Maximum number of neighbours to request from the index.

    Returns:
        A list of ``(text, source, distance)`` tuples in the order the index
        returned them. The list holds fewer than ``top_k`` entries when the
        index cannot supply that many neighbours.
    """
    query_vector = np.asarray(model.encode([query_text]), dtype=np.float32)
    distances, indices = index.search(query_vector, top_k)
    # FAISS fills missing neighbours with label -1 (e.g. top_k larger than the
    # number of indexed vectors). Without this filter Python's negative
    # indexing would silently return the last stored document instead.
    return [
        (metadata['texts'][idx], metadata['sources'][idx], distances[0][i])
        for i, idx in enumerate(indices[0])
        if idx >= 0
    ]

In [86]:
# Example query: ask for the two closest documents
query = "Big data and analytics essental for ? "
results = query_faiss(query_text=query, top_k=2)

In [87]:
# Show every hit together with its source and distance
for hit in results:
    text, source, distance = hit
    print(f"Document: {text}, Source: {source}, Distance: {distance}")

Document: Big data and analytics are essential for extracting insights., Source: Article, Distance: 0.6674928069114685
Document: Data science integrates domain knowledge programming skills and math., Source: Report, Distance: 1.2074646949768066


In [None]:
# Reload a previously saved index together with its metadata (if needed)
def load_index_and_metadata(index_path, metadata_path):
    """Read a FAISS index and its pickled metadata back from disk.

    Args:
        index_path: File passed to ``faiss.read_index``.
        metadata_path: Pickle file holding the metadata object.

    Returns:
        A ``(index, metadata)`` tuple.

    Note:
        Unpickling can execute arbitrary code, so only load metadata files
        that come from a trusted source.
    """
    loaded_index = faiss.read_index(index_path)
    with open(metadata_path, 'rb') as metadata_file:
        loaded_metadata = pickle.load(metadata_file)
    return loaded_index, loaded_metadata

# Reload the saved index and metadata from their configured paths
index, metadata = load_index_and_metadata(
    index_path=index_path,
    metadata_path=metadata_path,
)