In [2]:
import os
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from langchain_core.runnables.history import RunnableWithMessageHistory
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import START, MessagesState, StateGraph
from task_utils import get_secret
from openai import OpenAI


In [2]:
import boto3
import os

os.environ["http_proxy"] = "http://127.0.0.1:7897"
os.environ["https_proxy"] = "http://127.0.0.1:7897"

client = boto3.client("secretsmanager", region_name="us-east-1")
response = client.list_secrets()
print(response)

{'SecretList': [{'ARN': 'arn:aws:secretsmanager:us-east-1:730335414940:secret:doai/openai/1015-tZkxgY', 'Name': 'doai/openai/1015', 'LastChangedDate': datetime.datetime(2025, 4, 30, 18, 53, 27, 622000, tzinfo=tzlocal()), 'LastAccessedDate': datetime.datetime(2025, 6, 28, 8, 0, tzinfo=tzlocal()), 'Tags': [], 'SecretVersionsToStages': {'4341c63f-4884-4c31-acce-c8c4ca95f5f8': ['AWSPREVIOUS'], 'e03510ea-71ac-44bf-b5d0-d9f81e55d534': ['AWSCURRENT']}, 'CreatedDate': datetime.datetime(2024, 10, 16, 14, 48, 50, 150000, tzinfo=tzlocal())}], 'ResponseMetadata': {'RequestId': '36ed15f2-43fb-4863-bd66-e8c755580db7', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '36ed15f2-43fb-4863-bd66-e8c755580db7', 'content-type': 'application/x-amz-json-1.1', 'content-length': '371', 'date': 'Sat, 28 Jun 2025 07:57:49 GMT'}, 'RetryAttempts': 0}}


In [29]:
import asyncio
import aiohttp
from typing import TypedDict
from langgraph.graph import StateGraph, START, END
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import os
from task_utils import get_secret

model = SentenceTransformer("all-MiniLM-L6-v2")  # Efficient embedding model
api_key = get_secret()
os.environ["OPENAI_API_KEY"] = api_key

def get_subpages_from_homepage(url, max_links=10):
    """
    Scrape subpage URLs from homepage by checking navbars and sitemap links.
    """
    try:
        homepage = requests.get(url, timeout=5)
        homepage.raise_for_status()
        soup = BeautifulSoup(homepage.text, "html.parser")

        links = set()

        # 1. Try finding sitemap
        sitemap_url = urljoin(url, "/sitemap.xml")
        try:
            sitemap = requests.get(sitemap_url, timeout=3)
            if sitemap.status_code == 200:
                sitemap_soup = BeautifulSoup(sitemap.text, "xml")
                for loc in sitemap_soup.find_all("loc"):
                    links.add(loc.text)
        except:
            pass  # Fall back to navbar scraping

        # 2. If no sitemap or to enrich, parse <a> tags in nav/menu
        for tag in soup.find_all("a", href=True):
            href = tag["href"]
            full_url = urljoin(url, href)
            if urlparse(full_url).netloc == urlparse(url).netloc:
                links.add(full_url)

        return list(links)[:max_links]

    except Exception as e:
        print(f"Subpage scraping failed for {url}: {e}")
        return []

def get_favicon(url):
    """Fetch the favicon URL from a website."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            favicon_url = None
            for link in soup.find_all('link', rel=['icon', 'shortcut icon', 'apple-touch-icon']):
                favicon_url = link.get('href')
                if favicon_url:
                    break
            
            if favicon_url:
                return urljoin(url, favicon_url)  # Ensure the favicon URL is absolute
            else:
                return None
        else:
            return None
    except Exception as e:
        return None

def similarity_score(query, search):
    """
    Performs similarity search between a prompt and text partitions.

    Args:
        prompt (str): The search query.
        partitions (list of str): List of partitioned text.
        threshold (float): Minimum similarity score for filtering.

    Returns:
        list: Relevant partitions with similarity scores.
    """
    # Generate embeddings
    prompt_embedding = [model.encode(query)]
    partition_embeddings = [model.encode(search)]
    # print(prompt_embedding.shape, partition_embeddings.shape)


    # Compute cosine similarity
    similarity = cosine_similarity(prompt_embedding, partition_embeddings)

    # Sort by relevance (descending similarity)
    # filtered_partitions.sort(key=lambda x: x["score"], reverse=True)

    return similarity

class SearchState(TypedDict):
    step_name: str
    workflow_structure: str
    query: str


# --- Query Generation Chain ---
llm = ChatOpenAI(model="gpt-4o", temperature=0.4)

query_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are an expert researcher who crafts precise Google search queries."),
    ("user", 
     """Given the current workflow structure:
{workflow_structure}

And the current step:
{step_name}

Craft the best possible Google search query that would help someone find relevant information to complete the step in this workflow. Be specific, and assume the searcher is looking for practical resources or documentation. Reply with ONLY the query string.""")
])

query_chain = query_prompt | llm | StrOutputParser()

def call_google_search_api(step, context):
    """
    Calls the Google Search API to retrieve search results and also includes the website favicon.
    Args:
        query (str): The search query.
    Returns:
        dict: A dictionary containing evidence or an error message, including favicons.
    """
    query = get_query(step, context)[1:-1]
    print(query)
    api_key = "AIzaSyDYO5BSod8opzI20moUfGLfcYO1ez1vMQU"
    search_engine_id = "c5297ee11db07449c"  # Replace with your custom search engine ID
    base_url = "https://www.googleapis.com/customsearch/v1"

    params = {"key": api_key, "cx": search_engine_id, "q": query}

    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()
        data = response.json()

        if "items" not in data:
            return {"success": False, "error": "No results found."}

        results = []
        all_items = []

        # Step 1: Process top search results
        for item in data["items"]:
            print(1)
            title = item.get("title")
            link = item.get("link")
            snippet = item.get("snippet")
            # favicon = get_favicon(link)
            score = similarity_score(query, title)
            all_items.append({
                "title": title, "link": link, "snippet": snippet,
                "favicon": 0, "score": float(score[0][0])
            })
            print(2)
            if len(all_items)>8:
                break

        # Step 2: Scrape subpages of top 3 items
        # for top_result in all_items[:3]:
        #     subpages = get_subpages_from_homepage(top_result["link"], max_links=10)
        #     print(subpages)
        #     for subpage in subpages:
        #         sub_favicon = get_favicon(subpage)
        #         sub_score = similarity_score(query, subpage)
        #         all_items.append({
        #             "title": subpage,  # No title, so using URL
        #             "link": subpage,
        #             "snippet": "Subpage of " + top_result["link"],
        #             "favicon": sub_favicon,
        #             "score": float(sub_score[0][0])
        #         })

        # Step 3: Sort and return top 5
        top_results = sorted(all_items, key=lambda x: x["score"], reverse=True)[:8]
        return {"success": True, "evidence": top_results}

    except requests.exceptions.RequestException as e:
        return {"success": False, "error": str(e)}

def get_query(step, context):

    def generate_query_node(state: SearchState) -> SearchState:
        query = query_chain.invoke({
            "workflow_structure": state["workflow_structure"],
            "step_name": state["step_name"]
        })
        return {
            "step_name": state["step_name"],
            "workflow_structure": state["workflow_structure"],
            "query": query
        }

    # Build the LangGraph
    graph = StateGraph(SearchState)
    graph.add_node("generate_query", RunnableLambda(generate_query_node))
    graph.set_entry_point("generate_query")
    graph.add_edge("generate_query", END)

    app = graph.compile()

    # --- Example Run ---
    input_data = {
        "step_name": step,
        "workflow_structure": context,
    }
    result = app.invoke(input_data)
    return result['query']

A = call_google_search_api("asd", "asd")

asd workflow step completion guide
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2


In [53]:
index_name = "test0713"
if not pc.has_index(index_name):
    pc.create_index_for_model(
        name=index_name,
        cloud="aws",
        region="us-east-1",
        embed={
            "model":"llama-text-embed-v2",
            "field_map":{"text": "chunk_text"}
        }
    )

In [51]:
import hashlib

namespace = f"{userid}-{workflowid}"
evidence = "AAA"
hashid = hashlib.sha256(evidence.encode()).hexdigest()
index = pc.Index("test0711")  # Locate index hax

index.upsert_records(
    namespace,
    [
        {
            "_id": hashid,
            "chunk_text": evidence,
        }
    ]
)


PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Sun, 13 Jul 2025 05:45:54 GMT', 'Content-Type': 'text/plain; charset=utf-8', 'Content-Length': '116', 'Connection': 'keep-alive', 'x-envoy-upstream-service-time': '32', 'server': 'envoy'})
HTTP response body: {"error":{"code":"INVALID_ARGUMENT","message":"Integrated inference is not configured for this index"},"status":400}


In [59]:
pinecone_api_key = "pcsk_37QAFP_MpCRebPKKbDM3D3wUbqFFK78jtP25cDEYexXxoEcw8U8PyMy5YEGfU8p13Y7Yqp"
pc = Pinecone(api_key=pinecone_api_key)
index_description = pc.describe_index("test0713")
index = pc.Index("test0713")  # Locate index hax
userid = "114214507913166948561"
workflowid = "a884d54b-685f-46d7-b630-4437261e9d48"
namespace = f"{userid}-{workflowid}"
stats = index.describe_index_stats()
print("📊 Index stats:", stats)

📊 Index stats: {'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'114214507913166948561-a884d54b-685f-46d7-b630-4437261e9d48': {'vector_count': 5}},
 'total_vector_count': 5,
 'vector_type': 'dense'}


In [65]:
query = "Verify collected recipes against dietary guidelines"
dense_index = pc.Index("test0713")  # Locate index hax

# Search the dense index
results = dense_index.search(
    namespace=f"{userid}-{workflowid}",
    query={
        "top_k": 10,
        "inputs": {
            'text': query
        }
    }
)
[hit['fields']['chunk_text'] for hit in results['result']['hits']]
# Print the results
# for hit in results['result']['hits']:
#         print(f"id: {hit['_id']:<5} | score: {round(hit['_score'], 2):<5} | text: {hit['fields']['chunk_text']:<50}")

['Several templates—adaptable guides for healthy eat- ing—have been developed that show how Americans can put these principles into action: the USDA Food Patterns, lacto-ovo vegetarian or vegan adaptations of the USDA Food Patterns, and the DASH72 Eating Plan. These templates translate and integrate dietary recommendations into an overall healthy way to eat. They identify average daily amounts of foods, in nutrient-dense forms, to eat from all food groups and include limits for some dietary components. Consumers, professionals, and organizations can make use of these templates to plan healthy eating patterns or assess food and beverage choices. Key Recommendations 6HOHFW\x03DQ\x03HDWLQJ\x03SDWWHUQ\x03WKDW\x03PHHWV\x03QXWULHQW\x03 QHHGV\x03RYHU\x03WLPH\x03DW\x03DQ\x03DSSURSULDWH\x03FDORULH\x03OHYHO\x11\x03 $FFRXQW\x03IRU\x03DOO\x03IRRGV\x03DQG\x03EHYHUDJHV\x03 FRQVXPHG\x03DQG\x03DVVHVV\x03KRZ\x03WKH\\\x03ILW\x03ZLWKLQ\x03D\x03 WRWDO\x03KHDOWK\\\x03HDWLQJ\x03SDWWHUQ\x11\x03 )ROORZ\x03IRR

In [48]:
from pinecone import Pinecone, ServerlessSpec

pinecone_api_key = "pcsk_37QAFP_MpCRebPKKbDM3D3wUbqFFK78jtP25cDEYexXxoEcw8U8PyMy5YEGfU8p13Y7Yqp"
pc = Pinecone(api_key=pinecone_api_key)
index_description = pc.describe_index("test0711")
index = pc.Index("quickstart")  # Locate index hax
userid = "114214507913166948561"
workflowid = "72dd6f31-a97e-4605-835f-22947f44f853"
namespace = f"{userid}-{workflowid}"
stats = index.describe_index_stats()
print("📊 Index stats:", stats)

res = index.query(
    namespace=namespace,
    vector=[0.0] * 1536,  # dummy vector to trigger return
    top_k=10,
    include_metadata=True
)
print(f"🔎 Documents in namespace {namespace}:")
for match in res["matches"]:
    print(match["metadata"].get("chunk_text", "NO TEXT FOUND"))

📊 Index stats: {'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'114214507913166948561-72dd6f31-a97e-4605-835f-22947f44f853': {'vector_count': 1}},
 'total_vector_count': 1,
 'vector_type': 'dense'}


PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Sat, 12 Jul 2025 09:30:44 GMT', 'Content-Type': 'application/json', 'Content-Length': '104', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '15', 'x-pinecone-request-id': '4034052298567691213', 'x-envoy-upstream-service-time': '1', 'server': 'envoy'})
HTTP response body: {"code":3,"message":"Vector dimension 1536 does not match the dimension of the index 1024","details":[]}


In [27]:
import httpx

with httpx.Client(http2=False) as client:
    r = client.get(
        "https://www.googleapis.com/customsearch/v1",
        params={
            "key": "AIzaSyDYO5BSod8opzI20moUfGLfcYO1ez1vMQU",
            "cx": "c5297ee11db07449c",
            "q": "OpenAI"
        }
    )
    print(r.status_code)
    print(r.text)


200
{
  "kind": "customsearch#search",
  "url": {
    "type": "application/json",
    "template": "https://www.googleapis.com/customsearch/v1?q={searchTerms}&num={count?}&start={startIndex?}&lr={language?}&safe={safe?}&cx={cx?}&sort={sort?}&filter={filter?}&gl={gl?}&cr={cr?}&googlehost={googleHost?}&c2coff={disableCnTwTranslation?}&hq={hq?}&hl={hl?}&siteSearch={siteSearch?}&siteSearchFilter={siteSearchFilter?}&exactTerms={exactTerms?}&excludeTerms={excludeTerms?}&linkSite={linkSite?}&orTerms={orTerms?}&dateRestrict={dateRestrict?}&lowRange={lowRange?}&highRange={highRange?}&searchType={searchType}&fileType={fileType?}&rights={rights?}&imgSize={imgSize?}&imgType={imgType?}&imgColorType={imgColorType?}&imgDominantColor={imgDominantColor?}&alt=json"
  },
  "queries": {
    "request": [
      {
        "title": "Google Custom Search - OpenAI",
        "totalResults": "223000000",
        "searchTerms": "OpenAI",
        "count": 10,
        "startIndex": 1,
        "inputEncoding": "utf8",

In [None]:
import requests

api_key = "AIzaSyDYO5BSod8opzI20moUfGLfcYO1ez1vMQU"
search_engine_id = "c5297ee11db07449c"
query = "OpenAI"

response = requests.get(
    "https://www.googleapis.com/customsearch/v1",
    params={
        "key": api_key,
        "cx": search_engine_id,
        "q": query
    }
)

print(response.status_code)
print(response.json())

In [27]:
Q2

array([[ 0.      ,  0.      ,  0.      ],
       [ 0.4224  ,  0.48716 , -0.08208 ],
       [ 0.48716 ,  0.589119,  0.069228],
       [-0.08208 ,  0.069228,  0.988336]])

In [32]:
import numpy as np

A = np.array([[2, 4, 1, -1], [1, 2, 3, -1], [0, 0, 0, 0]]).T
B = np.array([[6.6904, 4, 1, -1]])
u = B / np.linalg.norm(B)
print(u)
# u = np.array([[0.836, 0.5, 0.125, -0.125]])
Q1 = np.identity(4) - 2* np.dot(u.T, u)
print(Q1)
print(np.dot(Q1, A))
u2 = np.array([[-0.76, 0.641, -0.108]])
Q2 = np.identity(3) - 2 * np.dot(u2.T, u2)
Q2 = np.vstack([np.array([[0, 0, 0]]), Q2])
Q2 = np.hstack([np.array([[1, 0, 0, 0]]).T, Q2])
R = np.dot(Q2, np.dot(Q1, A))
print(np.dot(Q1, Q2))
print(np.dot(np.dot(Q1, Q2), R))

[[ 0.84451155  0.50490945  0.12622736 -0.12622736]]
[[-0.42639951 -0.85280372 -0.21320093  0.21320093]
 [-0.85280372  0.49013289 -0.12746678  0.12746678]
 [-0.21320093 -0.12746678  0.96813331  0.03186669]
 [ 0.21320093  0.12746678  0.03186669  0.96813331]]
[[-4.69041576e+00 -2.98481067e+00  0.00000000e+00]
 [-9.42234412e-06 -3.82405041e-01  0.00000000e+00]
 [-2.35558603e-06  2.40439874e+00  0.00000000e+00]
 [ 2.35558603e-06 -4.04398740e-01  0.00000000e+00]]
[[-0.42639951 -0.11036986 -0.83938528  0.31870469]
 [-0.85280372 -0.221187    0.4724754   0.02638448]
 [-0.21320093  0.95782325  0.05277685  0.18609212]
 [ 0.21320093 -0.14766325  0.26391715  0.92903588]]
[[ 2.          1.00053133  0.        ]
 [ 4.          1.99949094  0.        ]
 [ 1.          3.0007767   0.        ]
 [-1.         -1.0001969   0.        ]]


In [None]:
print(1)

In [1]:
import numpy as np

# Given singular values
singular_values = np.array([2.75, 2.3, 2.0, 1.98, 1.5, 1.0, 0.6])

# Compute the variance explained (square of singular values)
variance = singular_values ** 2
total_variance = np.sum(variance)

# Cumulative explained variance ratio
cumulative_ratio = np.cumsum(variance) / total_variance

# Determine the number of components needed to preserve at least 85% variance
threshold = 0.85
num_components = np.searchsorted(cumulative_ratio, threshold) + 1

print(f"Number of principal components to preserve at least 85% variance: {num_components}")


Number of principal components to preserve at least 85% variance: 4


In [2]:
cumulative_ratio

array([0.31015589, 0.52711121, 0.69116061, 0.85194542, 0.94422321,
       0.98523555, 1.        ])

In [120]:
def similarity_search(prompt, partitions, threshold=0.5):
    """
    Performs similarity search between a prompt and text partitions.

    Args:
        prompt (str): The search query.
        partitions (list of str): List of partitioned text.
        threshold (float): Minimum similarity score for filtering.

    Returns:
        list: Relevant partitions with similarity scores.
    """
    model = SentenceTransformer("all-MiniLM-L6-v2")  # Efficient embedding model
    # Generate embeddings
    prompt_embedding = model.encode([prompt])
    partition_embeddings = model.encode(partitions)
    # print(prompt_embedding.shape, partition_embeddings.shape)


    # Compute cosine similarity
    similarity = cosine_similarity(prompt_embedding, partition_embeddings)[0]

        # Filter partitions based on threshold
    filtered_partitions = [
        {"text": partitions[i]}
        for i in range(len(partitions))
        if similarity[i] > threshold
    ]
    print(similarity)

    # Sort by relevance (descending similarity)
    # filtered_partitions.sort(key=lambda x: x["score"], reverse=True)

    return filtered_partitions

In [7]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re

def get_sublinks_and_text(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
    except Exception as e:
        print(f"Failed to fetch {url}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a', href=True)

    result = []
    for link in links:
        full_url = urljoin(url, link['href'])  # Handle relative URLs
        text = link.get_text(strip=True)
        result.append((full_url, text))

    return result

def extract_minimal_text_for_similarity(url: str, timeout: int = 5) -> str:
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        session = requests.Session()
        session.headers.update(headers)

        response = session.get(url, timeout=timeout)
        response.raise_for_status()
    except Exception as e:
        print(f"[Error fetching {url}]: {e}")
        return ""

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract title
    title = soup.title.string.strip() if soup.title and soup.title.string else ""

    # Extract h1-h3 headers (which often summarize sections)
    headers = [h.get_text(strip=True) for h in soup.find_all(['h1', 'h2', 'h3'])]

    # Extract body paragraphs
    paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]
    body_text = " ".join(paragraphs)
    
    # Clean and truncate to ~500 words max
    body_text = re.sub(r'\s+', ' ', body_text)
    truncated_body = " ".join(body_text.split()[:500])

    # Combine all parts
    combined = "\n".join([title] + headers + [truncated_body])
    return combined

# Example usage:
url = "http://coursecatalog.web.cmu.edu/schools-colleges/schoolofcomputerscience/undergraduatecomputerscience/"
sublinks = get_sublinks_and_text(url)
for link, text in sublinks[0:20]:
    combined = extract_minimal_text_for_similarity(link)
    print(similarity_search("Gather Carnegie Mellon University Computer Science graduate program's application requirements specifics", [combined] ))
    print(f"Link: {link}\nText: {text}\n")

NameError: name 'similarity_search' is not defined

In [129]:
new = "https://www.qatar.cmu.edu/admission/how-to-apply/"
combined = extract_minimal_text_for_similarity(new)
print(similarity_search("Gather Carnegie Mellon University Computer Science graduate program's application requirements specifics", [combined] ))


[0.5835358]
[{'text': 'How to Apply - Carnegie Mellon University in Qatar\nExplore\nEngage\nApply\nEnroll\nAdmission\nHow to apply\nIs CMU-Q the right university for you?\nStart here\nSteps to apply\nOnline application form\nWriting Supplement Essay\nOfficial and complete academic transcripts\nOfficial TOEFL, IELTS or Duolingo results\nLetters of recommendation\nPassport and biographical information\nOfficial SAT or ACT results (optional)\nInterview (optional)\nApplication fee\nImportant application dates\nContact the Office of Admission\nHomeAdmissionHow to Apply The deadline for Fall 2025 applications was January 15.Learn more about submitting a late application. Applications for Fall 2026 admission will open on September 1, 2025. If you’ve already decided CMU-Q is your first choice among schools and meet the eligibility requirements. If you have completed at least one semester as a degree candidate at another college or university. If you are currently in your last year of high scho

In [63]:
api_key = get_secret()
os.environ["OPENAI_API_KEY"] = api_key
def rec_send(chatbot_response, original_prompt):
    # Define a secondary chatbot model for recommendations
    rec_chat = ChatOpenAI(model="gpt-4o", temperature=0.7)

    # System prompt to generate helpful recommendations
    rec_prompt = """You are assisting a user who is unsure how to respond to an advisor's question. 
    The advisor wants to help the user who wants to {original_prompt} . Thus, the advisor just asked the user: "{chatbot_response}".
    
    Your task is to provide six diverse and thoughtful example responses that the user might consider. 
    These examples should be concise yet informative, covering different angles or perspectives that 
    a user might take when answering the question. They should consist of around 50 characters or less than 6 words.
    
    Format your response exactly as a list of six distinct suggestions below.
    Recommendation 1 | Recommendataion 2 | Recommendation 3 | Recommendation 4 | Recommendation 5 | Recommendation 6

    Start your recommendation directly and don't add something like "Recommendation 1:".
    """

    rec_prompt_template = ChatPromptTemplate.from_messages(
        [("user", rec_prompt)]
    )

    parser = StrOutputParser()

    # Chain to elaborate the task
    task_elaboration_chain = rec_prompt_template | rec_chat | parser
    elaborated_task = task_elaboration_chain.invoke({"original_prompt":original_prompt, "chatbot_response": chatbot_response})
    
    return {"recommendations": elaborated_task.split("|")}

In [14]:
!pip3 install sentence_transformers

Collecting sentence_transformers
  Obtaining dependency information for sentence_transformers from https://files.pythonhosted.org/packages/05/89/7eb147a37b7f31d3c815543df539d8b8d0425e93296c875cc87719d65232/sentence_transformers-3.4.1-py3-none-any.whl.metadata
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence_transformers)
  Obtaining dependency information for transformers<5.0.0,>=4.41.0 from https://files.pythonhosted.org/packages/20/37/1f29af63e9c30156a3ed6ebc2754077016577c094f31de7b2631e5d379eb/transformers-4.49.0-py3-none-any.whl.metadata
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.20.0 (from sentence_transformers)
  Obtaining dependency information for huggingface-hub>=0.20.0 from https://files.pythonhosted.org/packages/ae/05/75b90

In [3]:
import pandas as pd
df = pd.read_csv("../../../../../../Downloads/haX_Testing_Prompt - Sheet1.csv")
df

Unnamed: 0,#,Prompt
0,1,Plan a recruitment event for the psychology cl...
1,2,Organize a fundraising campaign for an environ...
2,3,Develop a content calendar for a social media ...
3,4,Prepare a business pitch deck for a startup co...
4,5,Write a research paper on the impact of AI on ...
5,6,Plan a personal productivity routine to balanc...
6,7,Create a step-by-step guide for launching an o...
7,8,Organize a community clean-up event for a loca...
8,9,Develop a mobile app prototype for a time-mana...
9,10,Plan a corporate team-building retreat for bet...


In [44]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_favicon(url):
    """Fetch the favicon URL from a website."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            favicon_url = None
            for link in soup.find_all('link', rel=['icon', 'shortcut icon', 'apple-touch-icon']):
                favicon_url = link.get('href')
                if favicon_url:
                    break
            
            if favicon_url:
                return urljoin(url, favicon_url)  # Ensure the favicon URL is absolute
            else:
                return None
        else:
            return None
    except Exception as e:
        return None

def call_google_search_api(query):
    """
    Calls the Google Search API to retrieve search results and also includes the website favicon.
    Args:
        query (str): The search query.
    Returns:
        dict: A dictionary containing evidence or an error message, including favicons.
    """
    api_key = "AIzaSyDYO5BSod8opzI20moUfGLfcYO1ez1vMQU"
    search_engine_id = "c5297ee11db07449c"  # Replace with your custom search engine ID
    base_url = "https://www.googleapis.com/customsearch/v1"

    # Construct the request parameters
    params = {
        "key": api_key,
        "cx": search_engine_id,
        "q": query
    }

    try:
        # Make the API request
        response = requests.get(base_url, params=params)
        response.raise_for_status()  # Raise HTTPError for bad responses

        # Parse the JSON response
        data = response.json()

        # Extract relevant information, including favicon
        if "items" in data:
            results = []
            for item in data["items"]:
                title = item.get("title")
                link = item.get("link")
                snippet = item.get("snippet")

                # Fetch the favicon using the helper function
                favicon_url = get_favicon(link)
                
                results.append({
                    "title": title,
                    "link": link,
                    "snippet": snippet,
                    "favicon": favicon_url  # Store the actual favicon URL
                })

            return {"success": True, "evidence": results}
        else:
            return {"success": False, "error": "No results found."}

    except requests.exceptions.RequestException as e:
        return {"success": False, "error": str(e)}


In [112]:
def similarity_score(query, search):
    """
    Performs similarity search between a prompt and text partitions.

    Args:
        prompt (str): The search query.
        partitions (list of str): List of partitioned text.
        threshold (float): Minimum similarity score for filtering.

    Returns:
        list: Relevant partitions with similarity scores.
    """
    # Generate embeddings
    prompt_embedding = [model.encode(query)]
    partition_embeddings = [model.encode(search)]
    # print(prompt_embedding.shape, partition_embeddings.shape)


    # Compute cosine similarity
    similarity = cosine_similarity(prompt_embedding, partition_embeddings)

    # Sort by relevance (descending similarity)
    # filtered_partitions.sort(key=lambda x: x["score"], reverse=True)

    return similarity


In [118]:
api_key = "AIzaSyDYO5BSod8opzI20moUfGLfcYO1ez1vMQU"
search_engine_id = "c5297ee11db07449c"  # Replace with your custom search engine ID
base_url = "https://www.googleapis.com/customsearch/v1"

# Construct the request parameters
params = {
    "key": api_key,
    "cx": search_engine_id,
    "q": "CMU"
}


# Make the API request
response = requests.get(base_url, params=params)
response.raise_for_status()  # Raise HTTPError for bad responses

# Parse the JSON response
data = {"items": {"A"}}

# Extract relevant information, including favicon
# Extract relevant information, including favicon
if "items" in data:
    results = []
    # Compute similarity scores
    scores = [similarity_score("CMU", "AAA") for item in data["items"]]

    # Sort data["items"] based on scores in descending order
    sorted_items_with_scores = sorted(
        zip(scores, data["items"]), key=lambda x: x[0], reverse=True
    )

    for score, item in sorted_items_with_scores:
        print(score[0][0])

0.32609707


In [76]:
import random
import requests

for i in range(final_df.shape[0]):
    if final_df['Evidence'][i] == "forbidden":
        prompt = final_df['Prompt'][i]
        A = call_google_search_api(prompt)
        B = [A['evidence'][i]['link'] for i in range(len(A['evidence']))]
        print(len(B))
        urls = random.sample(B, k=3)
        for url in urls: 
            try:
                web_text = scrape_website(url)
                print("Scraped", end = "|")
                # Step 2: Partition text
                partitions = split_text(web_text)

                # Step 3: Perform similarity search
                results = similarity_search(prompt, partitions, threshold)
                print(results)
                print("Searched",  end = "|")

                final_results = random.sample(results, k=min(len(results), 3))
            except requests.exceptions.RequestException as e:
                print(e)






10
Scraped|[{'text': 'One fun fact about me is I love to paint banners for birthdays or other events! Caroline Readinger Vice President Member Development Ella McCutchin Vice President Membership Cate Laudadio Facilities Director Hometown: Houston, TX Major: Public Health Reese Venghaus Events Director Hometown: Houston, TX Major: Elementary Education My name is Reese Venghaus, and I am from Houston, Texas! I am an elementary education major, and I hope to someday be a 3rd or 4th grade teacher. I love working with children, and I cannot wait to design my own classroom someday and create a positive and fun learning environment for my future students. I have always had a passion for design as well, which has led me to recently become the Campus Manager of Fresh Prints at Texas A&M University. I will find clients and work hands-on with them in order to help them design their own custom apparel while organizing their orders. My other hobbies include Pilates, baking for friends and family, 

ValueError: Sample larger than population or is negative

In [24]:
import re

text_elements = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
text = " ".join(element.get_text() for element in text_elements)

# Remove extra spaces, newlines, and special characters
text = re.sub(r"\s+", " ", text).strip()

In [64]:
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

def scrape_website(url):
    """
    Scrapes text content from a given website URL.

    Args:
        url (str): The webpage URL.

    Returns:
        str: Extracted text content.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    session = requests.Session()
    session.headers.update(headers)

    response = session.get(url)

    response.raise_for_status()  # Ensure successful request

    soup = BeautifulSoup(response.text, "html.parser")

     # Extract meaningful content (paragraphs and headers)
    text_elements = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    text = " ".join(element.get_text() for element in text_elements)

    # Remove extra spaces, newlines, and special characters
    text = re.sub(r"\s+", " ", text).strip()

    return text

def split_text(text, chunk_size=300):
    """
    Splits a large text into smaller partitions, ensuring each partition ends with a period.
    
    Args:
        text (str): The full text.
        chunk_size (int): Approximate number of words per partition.
        
    Returns:
        list: List of text partitions.
    """
    # First, split text into sentences.
    # The regex splits after a period (keeping the period with the sentence).
    sentences = re.split(r'(?<=[.])\s+', text)
    
    partitions = []
    current_partition = ""
    current_word_count = 0

    for sentence in sentences:
        # Count words in the sentence.
        sentence_word_count = len(sentence.split())
        
        # If adding this sentence exceeds the chunk size and there is content in the current partition,
        # finalize the current partition.
        if current_partition and current_word_count + sentence_word_count > chunk_size:
            # Ensure the partition ends with a period.
            if not current_partition.endswith('.'):
                current_partition = current_partition.rstrip() + '.'
            partitions.append(current_partition.strip())
            # Start a new partition with the current sentence.
            current_partition = sentence
            current_word_count = sentence_word_count
        else:
            # Otherwise, add the sentence to the current partition.
            if current_partition:
                current_partition += " " + sentence
            else:
                current_partition = sentence
            current_word_count += sentence_word_count

    # Add the last partition.
    if current_partition:
        if not current_partition.endswith('.'):
            current_partition = current_partition.rstrip() + '.'
        partitions.append(current_partition.strip())

    return partitions

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))  # Store API key securely

def get_openai_embedding(text):
    """
    Gets an embedding vector for a given text using OpenAI's API.

    Args:
        text (str): Input text to embed.

    Returns:
        np.array: Embedding vector.
    """
    response = client.embeddings.create(
        input=text,
        model="text-embedding-ada-002"
    )
    return np.array(response.data[0].embedding)

def similarity_search(prompt, partitions, threshold=0.5):
    """
    Performs similarity search between a prompt and text partitions.

    Args:
        prompt (str): The search query.
        partitions (list of str): List of partitioned text.
        threshold (float): Minimum similarity score for filtering.

    Returns:
        list: Relevant partitions with similarity scores.
    """
    similarities = [0, 0]
    for i in range(2):
        if i == 0:
            model = SentenceTransformer("all-MiniLM-L6-v2")  # Efficient embedding model
            # Generate embeddings
            prompt_embedding = model.encode([prompt])
            partition_embeddings = model.encode(partitions)
            # print(prompt_embedding.shape, partition_embeddings.shape)
        else:
            prompt_embedding = get_openai_embedding(prompt).reshape(-1, 1).T
            partition_embeddings = np.array([get_openai_embedding(p) for p in partitions])
            # print(prompt_embedding.shape, partition_embeddings.shape)


        # Compute cosine similarity
        similarities[i] = cosine_similarity(prompt_embedding, partition_embeddings)[0]

        # Filter partitions based on threshold
    filtered_partitions = [
        {"text": partitions[i], "score": similarities[0][i], "score2": similarities[1][i]}
        for i in range(len(partitions))
        if similarities[0][i] > threshold
    ]

    # Sort by relevance (descending similarity)
    filtered_partitions.sort(key=lambda x: x["score"], reverse=True)

    return filtered_partitions


In [33]:
A = """/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Osmia californica/UCSB-IZC00043780_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Osmia californica/UCSB-IZC00043780_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Agapostemon texanus/UCSB-IZC00036306_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Agapostemon texanus/UCSB-IZC00030767_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Agapostemon texanus/UCSB-IZC00036306_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 4s 4s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Agapostemon texanus/UCSB-IZC00030767_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Agapostemon texanus/UCSB-IZC00030694_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 5s 5s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Agapostemon texanus/UCSB-IZC00030694_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Agapostemon texanus/UCSB-IZC00037256_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Agapostemon texanus/UCSB-IZC00037256_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 4s 4s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Agapostemon texanus/UCSB-IZC00041624_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Agapostemon texanus/UCSB-IZC00041624_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Andrena piperi/UCSB-IZC00044044_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Andrena piperi/UCSB-IZC00044044_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora californica/UCSB-IZC00009028_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora californica/UCSB-IZC00043095_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 5s 5s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora californica/UCSB-IZC00009028_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora californica/UCSB-IZC00043095_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 4s 4s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora californica/UCSB-IZC00064947_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora californica/UCSB-IZC00064947_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 4s 4s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora californica/SBMNHENT0116662_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora californica/SBMNHENT0116662_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora californica/SBMNHENT0116667_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora californica/SBMNHENT0116667_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 4s 4s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora curta/UCSB-IZC00030135_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora curta/UCSB-IZC00030135_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 4s 4s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora curta/UCSB-IZC00064964_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora curta/UCSB-IZC00064964_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora curta/UCSB-IZC00045340_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora curta/UCSB-IZC00045340_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora curta/UCSB-IZC00045302_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 5s 5s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora curta/UCSB-IZC00045302_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora curta/UCSB-IZC00038954_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 4s 4s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora curta/UCSB-IZC00038954_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora urbana/UCSB-IZC00009010_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 4s 4s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora urbana/UCSB-IZC00009010_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora urbana/UCSB-IZC00028548_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 4s 4s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora urbana/UCSB-IZC00028548_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora urbana/UCSB-IZC00010375_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora urbana/UCSB-IZC00010375_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 5s 5s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora urbana/UCSB-IZC00038622_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora urbana/UCSB-IZC00038622_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 4s 4s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora urbana/UCSB-IZC00040909_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Anthophora urbana/UCSB-IZC00040909_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Apis mellifera/UCSB-IZC00010835_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Apis mellifera/UCSB-IZC00010835_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Apis mellifera/UCSB-IZC00010448_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Apis mellifera/UCSB-IZC00010467_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 4s 4s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Apis mellifera/UCSB-IZC00010467_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Apis mellifera/UCSB-IZC00010824_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Apis mellifera/UCSB-IZC00010824_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Apis mellifera/UCSB-IZC00010537_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Apis mellifera/UCSB-IZC00010537_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Apis mellifera/UCSB-IZC00010448_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Bombus melanopygus/UCSB-IZC00009120_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Bombus melanopygus/UCSB-IZC00009120_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Bombus melanopygus/UCSB-IZC00028362_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Bombus melanopygus/UCSB-IZC00028362_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Bombus melanopygus/UCSB-IZC00035612_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 4s 4s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Bombus melanopygus/UCSB-IZC00035612_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Bombus melanopygus/UCSB-IZC00064955_L.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 5s 5s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Bombus melanopygus/UCSB-IZC00064955_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step
/content/drive/MyDrive/Bee Wing Variation and Imaging/Wing Venation Computer Vision/data/all_wings_isolated/Bombus melanopygus/UCSB-IZC00035570_R.JPG
1/1 ━━━━━━━━━━━━━━━━━━━━ 4s 4s/step
/content/drive/MyDrive/Bee Wing Variation """
print(len(A.split("\n")))

127


In [39]:
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import os
import json
import re
import random
import time
import traceback
import pymupdf
from urllib.parse import urlparse

def scrape_website(url):
    """
    Scrapes text content from a given website URL.

    Args:
        url (str): The webpage URL.

    Returns:
        str: Extracted text content.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    session = requests.Session()
    session.headers.update(headers)

    if url.lower().endswith(".pdf"):
        response = session.get(url)
        response.raise_for_status()

        with open("temp.pdf", "wb") as f:
            f.write(response.content)

        # Use PyMuPDF to extract text
        doc = pymupdf.open("temp.pdf") # open a document
        text = ""
        for page in doc: # iterate the document pages
            text += page.get_text()
        os.remove("temp.pdf")  # Clean up
        return re.sub(r"\s+", " ", text).strip()
    else:

        response = session.get(url)

        response.raise_for_status()  # Ensure successful request

        soup = BeautifulSoup(response.text, "html.parser")

        # Extract meaningful content (paragraphs and headers)
        text_elements = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        text = " ".join(element.get_text() for element in text_elements)

        # Remove extra spaces, newlines, and special characters
        text = re.sub(r"\s+", " ", text).strip()

        return text

def split_text(text, chunk_size=300):
    """
    Splits a large text into smaller partitions, ensuring each partition ends with a period.
    
    Args:
        text (str): The full text.
        chunk_size (int): Approximate number of words per partition.
        
    Returns:
        list: List of text partitions.
    """
    # First, split text into sentences.
    # The regex splits after a period (keeping the period with the sentence).
    sentences = re.split(r'(?<=[.])\s+', text)
    
    partitions = []
    current_partition = ""
    current_word_count = 0

    for sentence in sentences:
        # Count words in the sentence.
        sentence_word_count = len(sentence.split())
        
        # If adding this sentence exceeds the chunk size and there is content in the current partition,
        # finalize the current partition.
        if current_partition and current_word_count + sentence_word_count > chunk_size:
            # Ensure the partition ends with a period.
            if not current_partition.endswith('.'):
                current_partition = current_partition.rstrip() + '.'
            partitions.append(current_partition.strip())
            # Start a new partition with the current sentence.
            current_partition = sentence
            current_word_count = sentence_word_count
        else:
            # Otherwise, add the sentence to the current partition.
            if current_partition:
                current_partition += " " + sentence
            else:
                current_partition = sentence
            current_word_count += sentence_word_count

    # Add the last partition.
    if current_partition:
        if not current_partition.endswith('.'):
            current_partition = current_partition.rstrip() + '.'
        partitions.append(current_partition.strip())

    return partitions

def similarity_search(prompt, partitions, threshold=0.5):
    """
    Performs similarity search between a prompt and text partitions.

    Args:
        prompt (str): The search query.
        partitions (list of str): List of partitioned text.
        threshold (float): Minimum similarity score for filtering.

    Returns:
        list: Relevant partitions with similarity scores.
    """
    try:
        model = SentenceTransformer("all-MiniLM-L6-v2")  # Efficient embedding model
        # Generate embeddings
        prompt_embedding = model.encode([prompt])
        partition_embeddings = model.encode(partitions)
        # print(prompt_embedding.shape, partition_embeddings.shape)


        # Compute cosine similarity
        similarity = cosine_similarity(prompt_embedding, partition_embeddings)[0]

            # Filter partitions based on threshold
        filtered_partitions = [
            {"text": partitions[i], "score": similarity[i]}
            for i in range(len(partitions))
            if similarity[i] > threshold
        ]

        # Sort by relevance (descending similarity)
        # filtered_partitions.sort(key=lambda x: x["score"], reverse=True)
        filtered_partitions.sort(key=lambda item: item['score'], reverse=True)
        return filtered_partitions
    except Exception as e:
        print("Error during similarity search:", e)
        traceback.print_exc()
        return []


def retrieve_evidence(prompt, url, threshold):
    logs = {}
    print(prompt, url)
    try:
        start_time = time.time()
        
        # Step 1: Scrape
        scrape_start = time.time()
        web_text = scrape_website(url)
        scrape_end = time.time()
        logs["scrape_time"] = scrape_end - scrape_start
        print(1)
        # Step 2: Partition
        split_start = time.time()
        partitions = split_text(web_text)
        split_end = time.time()
        logs["split_time"] = split_end - split_start
        print(1)

        # Step 3: Similarity search
        sim_start = time.time()
        results = similarity_search(prompt, partitions, threshold)
        sim_end = time.time()
        logs["similarity_search_time"] = sim_end - sim_start
        print(1)

        # Step 4: Final selection
        final_start = time.time()
        final_results = random.sample(results, k=min(10, len(results)))
        final_end = time.time()
        logs["filtering_time"] = final_end - final_start
        print(1)

        total_time = time.time() - start_time
        logs["total_time"] = total_time
        logs["num_results"] = len(final_results)
        print(1)
        
        # Save log to file
        log_dir = "logs"
        os.makedirs(log_dir, exist_ok=True)
        timestamp = int(time.time())
        # log_path = os.path.join(log_dir, f"logs/log_{timestamp}.json")
        # with open(log_path, "w") as f:
        #     json.dump(logs, f, indent=2)

        return final_results

    except requests.exceptions.RequestException as e:
        print(e)



In [15]:
!pip3 install --upgrade pymupdf


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pymupdf
  Obtaining dependency information for pymupdf from https://files.pythonhosted.org/packages/4e/55/43b64fa6cd048d2ea4574c045b5ac05d023254b91c2c703185f6f8a77b30/pymupdf-1.25.5-cp39-abi3-macosx_11_0_arm64.whl.metadata
  Downloading pymupdf-1.25.5-cp39-abi3-macosx_11_0_arm64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.5-cp39-abi3-macosx_11_0_arm64.whl (18.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.6/18.6 MB[0m [31m306.7 kB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.5

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [41]:
results

[{'text': 'California Indian Food and Culture PHOEBE A. HEARST MUSEUM OF ANTHROPOLOGY Written and Designed by Nicole Mullen Contributors: Ira Jacknis, Barbara Takiguchi, and Liberty Winn. Sources Consulted The former exhibition: Food in California Indian Culture at the Phoebe A. Hearst Museum of Anthropology. Ortiz, Beverly, as told by Julia Parker. It Will Live Forever. Heyday Books, Berkeley, CA 1991. Jacknis, Ira. Food in California Indian Culture. Hearst Museum Publications, Berkeley, CA, 2004. Copyright © 2003. Phoebe A. Hearst Museum of Anthropology and the Regents of the University of California, Berkeley. All Rights Reserved. PHOEBE A. HEARST MUSEUM OF ANTHROPOLOGY Table of Contents 1. Glossary 2. Topics of Discussion for Lessons 3. Map of California Cultural Areas 4. General Overview of California Indians 5. Plants and Plant Processing 6. Animals and Hunting 7. Food from the Sea and Fishing 8. Insects 9. Beverages 10. Salt 11. Drying Foods 12. Earth Ovens 13. Serving Utensils 

In [40]:
prompt = "Step 1: Gather resources on Indian vegetarian cuisine for Research vegetarian dishes. for Weekly vegetarian meal plan with Indian and Italian dishes." 
url = "https://hearstmuseum.berkeley.edu/wp-content/uploads/TeachingKit_CaliforniaIndianFoodAndCulture-1.pdf"

logs = {}
start_time = time.time()

# Step 1: Scrape
scrape_start = time.time()
web_text = scrape_website(url)
scrape_end = time.time()
logs["scrape_time"] = scrape_end - scrape_start
print(1)
# Step 2: Partition
split_start = time.time()
partitions = split_text(web_text)
split_end = time.time()
logs["split_time"] = split_end - split_start
print(1)

# Step 3: Similarity search
sim_start = time.time()
results = similarity_search(prompt, partitions, threshold=0.2)
sim_end = time.time()
logs["similarity_search_time"] = sim_end - sim_start

1
1


In [25]:
partitions

['California Indian Food and Culture PHOEBE A. HEARST MUSEUM OF ANTHROPOLOGY Written and Designed by Nicole Mullen Contributors: Ira Jacknis, Barbara Takiguchi, and Liberty Winn. Sources Consulted The former exhibition: Food in California Indian Culture at the Phoebe A. Hearst Museum of Anthropology. Ortiz, Beverly, as told by Julia Parker. It Will Live Forever. Heyday Books, Berkeley, CA 1991. Jacknis, Ira. Food in California Indian Culture. Hearst Museum Publications, Berkeley, CA, 2004. Copyright © 2003. Phoebe A. Hearst Museum of Anthropology and the Regents of the University of California, Berkeley. All Rights Reserved. PHOEBE A. HEARST MUSEUM OF ANTHROPOLOGY Table of Contents 1. Glossary 2. Topics of Discussion for Lessons 3. Map of California Cultural Areas 4. General Overview of California Indians 5. Plants and Plant Processing 6. Animals and Hunting 7. Food from the Sea and Fishing 8. Insects 9. Beverages 10. Salt 11. Drying Foods 12. Earth Ovens 13. Serving Utensils 14. Food 

In [42]:
retrieve_evidence(prompt, url, threshold=0)

Step 1: Gather resources on Indian vegetarian cuisine for Research vegetarian dishes. for Weekly vegetarian meal plan with Indian and Italian dishes. https://hearstmuseum.berkeley.edu/wp-content/uploads/TeachingKit_CaliforniaIndianFoodAndCulture-1.pdf
1
1
1
1
1


[{'text': 'Southern tribes also stored large amounts of food in granaries made of twigs. Miwok granaries could hold up to 500 pounds of dry acorns! In the northern and eastern regions, pits were often used to store food. Pits were dug in the ground and lined with bark or grass. The Karuk stored dried fish in a pit at the back of the house. Miwok acorn cache, Calaveras Co.; 1906 PHOEBE A. HEARST MUSEUM OF ANTHROPOLOGY Feasts Native Californians looked forward to feasts where they had the chance to eat special foods. Neighboring groups met and socialized at feasts. Feasts were a time for trading, games, and dances. Many occasions called for feasts. Weddings were celebrated with feasts, as well as seasonal events such as the arrival of the first salmon or the acorn harvest. Feasts were often a time for trading foods after a harvest. Owens Valley Paiute traded salt, pine nuts, and other seeds for acorns and manzanita berries brought by the Western Mono, who lived nearby. Mary Eslick and Be