In [None]:
!pip install streamlit pyngrok pandas numpy faiss-cpu sentence-transformers aiohttp requests tenacity

from pyngrok import ngrok
import os
import streamlit as st
import pandas as pd
import faiss
import pickle
import numpy as np
import openai
import aiohttp
import asyncio
from sentence_transformers import SentenceTransformer
from tenacity import retry, stop_after_attempt, wait_exponential
import hashlib
import requests



  from tqdm.autonotebook import tqdm, trange


In [None]:
!pip install --upgrade openai



In [None]:
!pip install together



In [None]:
%%writefile streamlit_app.py
from pyngrok import ngrok
import os
import streamlit as st
import pandas as pd
import faiss
import pickle
import numpy as np
import openai
import aiohttp
import asyncio
from together import AsyncTogether
from sentence_transformers import SentenceTransformer
from tenacity import retry, stop_after_attempt, wait_exponential
import hashlib
from openai import OpenAI
from together import Together

# Credentials are read from the environment so that no secret is written into the
# generated script. Export OPENAI_API_KEY and TOGETHER_API_KEY before launching the app.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Fail early with a clear message: the Together client is built from this variable.
if not os.environ.get("TOGETHER_API_KEY"):
    raise RuntimeError("TOGETHER_API_KEY is not set; export it before starting the app.")

# Initialize models for embeddings
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# FAISS Similarity Search
def get_faiss_search_results(query, faiss_index, df, k=5):
    """Return the rows of ``df`` that are nearest to ``query`` in the FAISS index.

    Args:
        query: Free-text search string; embedded with the module-level
            ``embedding_model``.
        faiss_index: Index object exposing ``search(vectors, k)``.
            Assumed to have been built from the rows of ``df`` in order.
        df: DataFrame whose positional rows correspond to the index entries.
        k: Maximum number of neighbours to request (defaults to 5, the value
            that was previously hard-coded).

    Returns:
        A list of dicts, one per matched row (``row.to_dict()``), in the order
        FAISS returned them. May contain fewer than ``k`` entries.
    """
    # Generate embedding for the query as a (1, dim) float32 matrix
    query_embedding = embedding_model.encode([query])[0]
    query_embedding = np.array([query_embedding], dtype='float32')

    # Perform FAISS search for the top-k similar entries
    _, neighbour_ids = faiss_index.search(query_embedding, k=k)

    # FAISS pads the label array with -1 when the index holds fewer than k
    # vectors. A bare `idx < len(df)` check lets -1 through and `iloc[-1]`
    # would then silently return the last row, so reject negatives as well.
    row_count = len(df)
    return [
        df.iloc[idx].to_dict()  # Convert each row to a dictionary
        for idx in neighbour_ids[0]
        if 0 <= idx < row_count
    ]

# Function to handle file upload and create FAISS index dynamically
def handle_file_upload(uploaded_file):
    """Read an uploaded CSV, embed its rows and build a FAISS L2 index.

    A ``description`` column is added to the frame by joining, per row, the
    string form of every object-dtype column (or of every column when the CSV
    has no object-dtype columns). The descriptions are embedded with the
    module-level ``embedding_model`` and added to a ``faiss.IndexFlatL2``.
    The index and the column names are pickled to ``faiss_index.pkl`` and
    ``columns.pkl`` in the working directory.

    Args:
        uploaded_file: File-like object accepted by ``pd.read_csv``, or ``None``.

    Returns:
        ``(df, index)`` on success, or ``(None, None)`` when no file was given
        or the CSV contains nothing to index.
    """
    if uploaded_file is None:
        return None, None

    try:
        df = pd.read_csv(uploaded_file)
    except pd.errors.EmptyDataError:
        # A zero-byte / header-less upload: report it instead of crashing the app.
        st.error("The uploaded CSV file is empty.")
        return None, None

    st.write("Preview of the uploaded CSV:")
    st.dataframe(df.head())

    # Without rows there is nothing to embed, and `embeddings.shape[1]` below
    # would fail.
    if df.empty:
        st.error("The uploaded CSV has no rows to index.")
        return None, None

    # Create a combined description column
    description_columns = df.select_dtypes(include=['object']).columns
    if len(description_columns) == 0:
        # All-numeric CSV: fall back to every column so that the row-wise join
        # still yields one string per row.
        description_columns = df.columns
    df['description'] = df[description_columns].apply(
        lambda row: ' '.join(row.values.astype(str)), axis=1
    )

    # Generate embeddings and create FAISS index
    descriptions = df['description'].tolist()
    embeddings = embedding_model.encode(descriptions)

    # Create FAISS index
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings).astype('float32'))

    # Save FAISS index and column data
    with open('faiss_index.pkl', 'wb') as f:
        pickle.dump(index, f)
    with open('columns.pkl', 'wb') as f:
        pickle.dump(df.columns.tolist(), f)

    st.success("FAISS index created and saved successfully!")
    return df, index

# Google Custom Search credentials, read from the environment so that no secret
# is written into the script. Export GOOGLE_API_KEY (API key) and GOOGLE_CSE_ID
# (search engine "cx" id) before launching the app.
API_KEY = os.environ.get("GOOGLE_API_KEY")
CX = os.environ.get("GOOGLE_CSE_ID")

# Cache dictionary to store results for reuse
search_cache = {}

# Function to get a hash for the search query to store results in cache
def get_query_hash(query):
    """Return the hexadecimal MD5 digest of ``query``, used as the cache key."""
    digest = hashlib.md5()
    digest.update(query.encode())
    return digest.hexdigest()

# @retry(stop=stop_after_attempt(5), wait=wait_exponential(min=4, max=60))
# async def expand_query_with_llm(query):
#     # Create a more specific and detailed prompt for query expansion
#     expanded_query_prompt = f"Make the following query more specific and detailed but concise for a web search:\n\n'{query}'"

#     # Call LLM to expand the query
#     response = await async_client.chat.completions.create(
#         model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
#         messages=[
#             {"role": "system", "content": "You are a helpful assistant for search query expansion."},
#             {"role": "user", "content": expanded_query_prompt}
#         ],
#         max_tokens=100
#     )

#     # Extract the expanded query from the LLM response, ensuring it's a single line
#     expanded_query = response['choices'][0]['message']['content'].strip().replace("\n", " ")

#     return expanded_query


# Function to make a web search request using Google Custom Search API
async def fetch_web_data(query, session):
    """Run one Google Custom Search request for ``query`` and return its hits.

    Results are memoised in the module-level ``search_cache`` under the MD5
    hash of the query, so a repeated query is served without a new request.

    Args:
        query: Search string. It is URL-encoded before being placed in the
            request URL.
        session: Object providing ``session.get(url)`` as an async context
            manager (an ``aiohttp.ClientSession`` in this file).

    Returns:
        A list of ``{"title", "link", "snippet"}`` dicts. A single placeholder
        entry titled "No results found" when the response has no ``items``.
        An empty list when the request fails or returns a non-200 status.
    """
    from urllib.parse import urlencode  # local import keeps this block self-contained

    query_hash = get_query_hash(query)

    # If the query result is cached, return it from the cache
    if query_hash in search_cache:
        return search_cache[query_hash]

    endpoint = 'https://www.googleapis.com/customsearch/v1'
    # Encode the parameters: raw interpolation breaks on spaces, '&', '#', etc.
    url = f"{endpoint}?{urlencode({'q': query, 'key': API_KEY, 'cx': CX})}"

    # Show the request in the UI with the API key masked so the secret never
    # reaches the rendered page.
    redacted = urlencode({'q': query, 'key': 'REDACTED', 'cx': CX})
    st.write(f"Search URL: {endpoint}?{redacted}")

    try:
        async with session.get(url) as response:
            if response.status == 200:
                data = await response.json()
                if 'items' in data:
                    results = [
                        {
                            "title": item["title"],
                            "link": item["link"],
                            "snippet": item.get("snippet", "No snippet available"),
                        }
                        for item in data['items']
                    ]
                    # Cache the result
                    search_cache[query_hash] = results
                    return results
                else:
                    return [{"title": "No results found", "link": "", "snippet": ""}]
            else:
                raise Exception(f"Error fetching data: {response.status}")
    except Exception as e:
        # Best effort: log the failure and let the caller continue with no hits.
        print(f"Error in fetching data: {e}")
        return []

# Retry mechanism for async search queries
@retry(stop=stop_after_attempt(5), wait=wait_exponential(min=4, max=60))
async def async_search_queries(queries):
    """Fetch web results for each query in turn over one shared HTTP session.

    The queries are processed sequentially, with a one-second pause after each
    request. The whole call is wrapped by the ``retry`` decorator above (up to
    5 attempts with exponential waiting).

    Args:
        queries: Iterable of search strings.

    Returns:
        A list with one entry per query, each being whatever
        ``fetch_web_data`` returned for it.
    """
    collected = []
    async with aiohttp.ClientSession() as http:
        for search_term in queries:
            collected.append(await fetch_web_data(search_term, http))
            await asyncio.sleep(1)
    return collected

# client = OpenAI(
#     api_key = os.getenv("OPENAI_API_KEY"),
# )
# client = Together()

# async def llm_extract_info(text, detailed=False):
#     model_name = "gpt-4" if detailed else "gpt-3.5-turbo"

#     response = await client.chat.completions.create(
#     model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
#     messages = [
#             {"role": "system", "content": "Extract structured information from the following text."},
#             {"role": "user", "content": text}
#         ],
#     )
#     #max_tokens=150 if detailed else 50,
#     return response['choices'][0]['message']['content']

#     print(response._request_id)

# Shared asynchronous Together client; its API key is taken from the
# TOGETHER_API_KEY environment variable.
async_client = AsyncTogether(api_key=os.environ.get("TOGETHER_API_KEY"))

@retry(stop=stop_after_attempt(5), wait=wait_exponential(min=4, max=60))
async def llm_extract_info(text, detailed=False):
    """Ask the Together chat model to extract structured information from ``text``.

    Args:
        text: Raw text sent as the user message.
        detailed: When truthy, use the 70B model with a 150-token limit;
            otherwise the 8B turbo model with a 50-token limit.

    Returns:
        The message content of the first choice in the completion.
    """
    if detailed:
        chosen_model, token_budget = "meta-llama/Llama-3-70b-chat-hf", 150
    else:
        chosen_model, token_budget = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", 50

    conversation = [
        {"role": "system", "content": "Extract structured information from the following text."},
        {"role": "user", "content": text},
    ]

    # Await the chat completion from the shared async client
    completion = await async_client.chat.completions.create(
        model=chosen_model,
        messages=conversation,
        max_tokens=token_budget,
    )

    return completion.choices[0].message.content


# Define your Streamlit main function here
def main():
    """Streamlit entry point: upload a CSV, search it, and extract information.

    Flow:
      1. The user uploads a CSV, from which ``handle_file_upload`` builds a
         FAISS index.
      2. The user enters a query. The nearest CSV rows are looked up in the
         index and a web search is run for the same query.
      3. Each FAISS row and each non-empty web result batch is passed to
         ``llm_extract_info`` sequentially, with a one-second pause between
         calls.
      4. The extractions are shown in a table and offered as a CSV download.
    """
    st.set_page_config(page_title="Enhanced Data Extraction Tool", layout="wide")
    uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])

    df = None
    faiss_index = None

    if uploaded_file:
        df, faiss_index = handle_file_upload(uploaded_file)

    if faiss_index is not None:
        query = st.text_input("Enter search query:")
        if query:
            async def run_query():
                # FAISS Similarity Search
                faiss_results = get_faiss_search_results(query, faiss_index, df)

                # Async Web Search (one result batch per query)
                web_search_results = await async_search_queries([query])

                # LLM extraction runs sequentially to stay under rate limits
                faiss_extractions = []
                for result in faiss_results:
                    faiss_extractions.append(await llm_extract_info(str(result), detailed=True))
                    await asyncio.sleep(1)  # Delay to prevent hitting rate limits

                web_extractions = []
                for result in web_search_results:
                    if not result:
                        # An empty batch means the search request failed; sending
                        # the text "[]" to the LLM would only produce a
                        # meaningless row, so skip it.
                        continue
                    web_extractions.append(await llm_extract_info(str(result), detailed=True))
                    await asyncio.sleep(1)

                # Compile all results into a DataFrame
                structured_results = {
                    "Source": ["FAISS"] * len(faiss_extractions) + ["Web Search"] * len(web_extractions),
                    "Query": [query] * (len(faiss_extractions) + len(web_extractions)),
                    "Extracted Information": faiss_extractions + web_extractions
                }

                results_df = pd.DataFrame(structured_results)

                st.write("Structured Extraction Results:")
                st.dataframe(results_df)

                # Download structured results as CSV
                csv_data = results_df.to_csv(index=False)
                st.download_button("Download Structured Data", csv_data, "extracted_data.csv", "text/csv")

            asyncio.run(run_query())

# Invoke main() only when this file is executed as the main module.
if __name__ == "__main__":
    main()


Overwriting streamlit_app.py


In [None]:
import os
def run_streamlit():
    """Launch the Streamlit app in the background and open an ngrok tunnel to it.

    Prints the tunnel object returned by ``ngrok.connect`` for port 8501.
    """
    # The trailing '&' makes the shell start the server in the background
    launch_command = 'streamlit run streamlit_app.py --server.enableCORS=false &'
    os.system(launch_command)

    # Open the ngrok tunnel to port 8501
    tunnel = ngrok.connect(8501)
    print(f"Streamlit app is live at: {tunnel}")


In [None]:
!ngrok authtoken #enter your ngrok auth token

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
run_streamlit()

Streamlit app is live at: NgrokTunnel: "https://e904-34-106-20-10.ngrok-free.app" -> "http://localhost:8501"


In [None]:
!pkill ngrok
