# Python for Data Science

## Task 1: LLM integration

### 1.1 Single Text Translation

In [1]:
import json
import random
import time
import textwrap
import numpy as np
import requests
import bs4
from bs4 import BeautifulSoup
from time import sleep
from IPython.display import Markdown
import warnings

warnings.filterwarnings("ignore")

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import google.generativeai as genai

In [2]:
genai.configure(api_key="AIzaSyAeZkk_q3t9Z9_iy6rAReyfE3LsmOa-WtM")

In [3]:
generator_config = {
    "temperature": 0,
    "top_k": 24,
    "top_p": 0.8,
    "max_output_tokens": 8192,
}

model = genai.GenerativeModel(
    model_name="models/gemini-1.5-flash-8b-latest",
    generation_config=generator_config,
)


In [4]:
# Function to create the translation prompt
def create_translation_prompt(sentence, target_language):
    return (
        "Here is how to handle translations:  \n"
        "Input: \"Hello, my name is John\" in English. Target language: Vietnamese.  \n"
        "Output: \"Xin chào, tôi tên là John\".  \n\n"
        "Input: \"chào tôi tên Trang\" in Vietnamese. Target language: Vietnamese.  \n"
        "Output: \"chào tôi tên Trang\".  \n\n"
        "Now translate the following sentence:  \n"
        f"Input: \"{sentence}\". Target language: {target_language}.  \n"
        "Output:"
    )

# Clean up model's response to remove unwanted words
def clean_response(response_text):
    return response_text.replace("Output: ", "").strip()

# Single Text Translation
def translate_text(sentence, target_language):
    prompt = create_translation_prompt(sentence, target_language=target_language)

    try:
        response = model.generate_content(prompt)
        return clean_response(response.text)
    except Exception as e:
        print(f"Error translating text: {sentence}. Error: {e}")
        return f"Error: {e}"

# Multiple Text Translation
def translate_sentences(sentences, target_language="Vietnamese"):
    translations = []

    for sentence in sentences:
        prompt = create_translation_prompt(sentence, target_language)

        try:
            response = model.generate_content(prompt)
            translations.append(clean_response(response.text))
        except Exception as e:
            print(f"Error translating sentence: {sentence}. Error: {e}")
            translations.append(f"Error: {e}")

    return translations


In [None]:
list_language = ["English", "French", "Vietnamese", "Portuguese", "German", "Thai", "Russian"]
for i in range(len(list_language)):
    print(f"{i + 1}. {list_language[i]}")

target_language = input("Nhập vào ngôn ngữ cần dịch dịch từ ngôn ngữ trên (nhập số thứ tự):\n")

idx_target_language = int(target_language) - 1

target_lang = list_language[idx_target_language]

single_text = "Hello, my name is Robert, I live in a rural area and have never been to a place as beautiful as this, tôi đến từ Hà Nội."

translated_text = translate_text(single_text, target_lang)
print(f"\nCâu sau khi dịch: {translated_text}")

1. English
2. French
3. Vietnamese
4. Portuguese
5. German
6. Thai
7. Russian

Câu sau khi dịch: "Xin chào, tên tôi là Robert, tôi sống ở vùng nông thôn và chưa từng đến nơi nào đẹp như thế này, tôi đến từ Hà Nội."


## 1.2 Multiple Text Translation

In [10]:
sentenses = [
    "Helo, my name is Hiệp, you can call me Lucifer.",
    "I'm studying in University of Science.",
    "My english not good, vì thế mà tôi nói tiếng việt sẽ dễ hơn.",
    "저는 현재 데이터 과학을 공부하고 있습니다."
]

for i in range(len(list_language)):
    print(f"{i + 1}. {list_language[i]}")

idx_target_language = int(input("Nhập vào ngôn ngữ cần dịch dịch từ ngôn ngữ trên:\n")) - 1
target_language = list_language[idx_target_language]

translated_sentences = translate_sentences(sentenses, target_language)

for original, translated in zip(sentenses, translated_sentences):
    print(f"\nOriginal: {original}\nTranslated: {translated}")


1. English
2. French
3. Vietnamese
4. Portuguese
5. German
6. Thai
7. Russian

Original: Helo, my name is Hiệp, you can call me Lucifer.
Translated: "Chào, tên tôi là Hiệp, bạn có thể gọi tôi là Lucifer."

Original: I'm studying in University of Science.
Translated: "Tôi đang học tại Đại học Khoa học."

Original: My english not good, vì thế mà tôi nói tiếng việt sẽ dễ hơn.
Translated: "Tiếng Anh của tôi không tốt, vì thế mà tôi nói tiếng Việt sẽ dễ hơn."

Original: 저는 현재 데이터 과학을 공부하고 있습니다.
Translated: Tôi hiện đang học khoa học dữ liệu.


## Task 2

### 2.1 Data Access and Indexing

Parsing data

In [11]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
import random

# Configure Chrome options for headless browsing
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run Chrome in headless mode
chrome_options.add_argument("--no-sandbox")  # Bypass OS security model
chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource issues
chrome_options.add_argument("--disable-gpu")  # Disable GPU hardware acceleration

# Specify the Chrome driver path
chrome_options.add_argument("webdriver.chrome.driver=/usr/lib/chromium-browser/chromedriver")
driver = webdriver.Chrome(options=chrome_options)

# Open the target webpage
url = "https://www.presight.io/privacy-policy.html"
driver.get(url)

# Wait for a random duration to mimic human behavior
time.sleep(random.randint(5, 10))

# Locate the target div element using CSS selector
css_selector = "div.css-fugq39"
div_element = driver.find_element(By.CSS_SELECTOR, css_selector)

# Extract the inner HTML content of the div
div_html = div_element.get_attribute("innerHTML")

driver.quit()


In [12]:
soup = BeautifulSoup(div_html, "html.parser")
pretty_html = soup.prettify()
print(pretty_html)


<div class="chakra-stack css-1uji4px">
 <h2 class="chakra-heading css-dhb2ck">
  PRIVACY POLICY
 </h2>
 <h2 class="chakra-heading css-18j379d">
  Last updated 15 Sep 2023
 </h2>
 <hr aria-orientation="horizontal" class="chakra-divider css-svjswr"/>
 <div class="chakra-stack css-lcbvq9">
  <p class="chakra-text css-0">
   At Presight, we are committed to protecting the privacy of our customers and visitors to our website. This Privacy Policy explains how we collect, use, and disclose information about our customers and visitors.
  </p>
  <div class="chakra-stack css-o5l3sd">
   <h2 class="chakra-heading css-18j379d">
    Information Collection and Use
   </h2>
   <p class="chakra-text css-0">
    We collect several different types of information for various purposes to provide and improve our Service to you.
   </p>
  </div>
  <div class="chakra-stack css-o5l3sd">
   <h2 class="chakra-heading css-18j379d">
    Types of Data Collected
   </h2>
   <div class="chakra-stack css-bel3sh">
   

In [13]:
# Dictionary to store structured data
indexed_data = {}
current_heading = None

# Iterate through all relevant HTML elements
for element in soup.find_all(["h2", "p", "i", "ul"]):

    # Detect a new heading (h2) and initialize a new section
    if element.name == "h2":
        current_heading = element.get_text(strip=True)
        if current_heading not in indexed_data:
            indexed_data[current_heading] = {
                "content": [],  # Stores paragraph content and list items
                "subheaders": []  # Stores subheaders (italic text with details)
            }

    # If the element is a paragraph (p), add its text to the current section
    elif element.name == "p" and current_heading:
        indexed_data[current_heading]["content"].append(element.get_text(strip=True))

    # If the element is an unordered list (ul), extract all list items
    elif element.name == "ul" and current_heading:
        list_items = [li.get_text(strip=True) for li in element.find_all("li")]
        indexed_data[current_heading]["content"].extend(list_items)

    # If the element is an italic tag (i), treat it as a subheader
    elif element.name == "i" and current_heading:
        subheader_title = element.get_text(strip=True)
        subheader_content = element.find_next_sibling("p").get_text(strip=True) if element.find_next_sibling("p") else ""
        subheader_list = []

        # Check if there's a list (ul) following the subheader and extract items
        sibling_ul = element.find_next_sibling("ul")
        if sibling_ul:
            subheader_list = [li.get_text(strip=True) for li in sibling_ul.find_all("li")]

        # Append subheader details to the current heading's subheaders list
        indexed_data[current_heading]["subheaders"].append({
            "Title": subheader_title,
            "Content": subheader_content,
            "List": subheader_list
        })


In [14]:
# Convert the dictionary into a structured list format
indexed_list = [
    {
        "heading": heading,
        "content": " ".join(data["content"]),  # Merge paragraph texts and list items
        "subheaders": data["subheaders"]
    }
    for heading, data in indexed_data.items()
]


In [15]:
indexed_list

[{'heading': 'PRIVACY POLICY', 'content': '', 'subheaders': []},
 {'heading': 'Last updated 15 Sep 2023',
  'content': 'At Presight, we are committed to protecting the privacy of our customers and visitors to our website. This Privacy Policy explains how we collect, use, and disclose information about our customers and visitors.',
  'subheaders': []},
 {'heading': 'Information Collection and Use',
  'content': 'We collect several different types of information for various purposes to provide and improve our Service to you.',
  'subheaders': []},
 {'heading': 'Types of Data Collected',
  'content': 'While using our Service, we may ask you to provide us with certain personally identifiable information that can be used to contact or identify you ("Personal Data"). Personally identifiable information may include, but is not limited to: Email address First name and last name Phone number Address, State, Province, ZIP/Postal code, City Cookies and Usage Data We may also collect information t

### Indexed_List[7] và indexed_List[8] chính là các mục con của Indexed_list[6] (Tụi em check tại trang web). Nên cần chuyển 2 phần tử này vào subheaders của Indexed_list[6]

 {'heading': 'Access to Personal Information',
  'content': '',
  'subheaders': []},
 {'heading': 'Accessing Your Personal Information',
  'content': 'You have the right to access all of your personal information that we hold. Through the application, you can correct, amend, or append your personal information by logging into the application and navigating to your settings and profile.',
  'subheaders': []},
 {'heading': 'Automated Edit Checks',
  'content': 'Presight employs automated edit checks to ensure that data entry fields are completed properly when collecting personal information. These edit checks help maintain data integrity and accuracy. You are encouraged to provide complete and valid information to ensure the smooth processing of their personal data.',
  'subheaders': []},

In [16]:
# Merge the 7th and 8th headings into the subheaders of the 6th heading
indexed_list[6]['subheaders'].append({
    "Title": indexed_list[7]['heading'],
    "Content": indexed_list[7]['content'],
    "List": []
})

indexed_list[6]['subheaders'].append({
    "Title": indexed_list[8]['heading'],
    "Content": indexed_list[8]['content'],
    "List": []
})

# Remove redundant entries (7th and 8th headings)
del indexed_list[7:9]

# Clear content of the 4th heading
indexed_list[3]['content'] = ""

indexing data

In [17]:
# Print index list after clean
for item in indexed_list:
    print(f"Object: {item['heading']}")
    print(f"Content: {item['content']}")
    if item['subheaders']:
        print("Subheaders:")
        for sub in item['subheaders']:
            print(f"  - Title: {sub['Title']}")
            print(f"    Content: {sub['Content']}")
            if sub['List']:
                print(f"    List: {', '.join(sub['List'])}")
    print()


Object: PRIVACY POLICY
Content: 

Object: Last updated 15 Sep 2023
Content: At Presight, we are committed to protecting the privacy of our customers and visitors to our website. This Privacy Policy explains how we collect, use, and disclose information about our customers and visitors.

Object: Information Collection and Use
Content: We collect several different types of information for various purposes to provide and improve our Service to you.

Object: Types of Data Collected
Content: 
Subheaders:
  - Title: Personal Data
    Content: While using our Service, we may ask you to provide us with certain personally identifiable information that can be used to contact or identify you ("Personal Data"). Personally identifiable information may include, but is not limited to:
    List: Email address, First name and last name, Phone number, Address, State, Province, ZIP/Postal code, City, Cookies and Usage Data
  - Title: Usage Data
    Content: We may also collect information that your brows

In [18]:
with open('indexed_list.json', 'w', encoding='utf-8') as json_file:
    json.dump(indexed_list, json_file, ensure_ascii=False, indent=4)

print("Data has been saved to indexed_list.json")


Data has been saved to indexed_list.json


Embeddings

In [20]:
# 2.2 Create Embeddings for RAG
# Initialize embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Prepare content for embedding
content_for_embedding = []
for item in indexed_list:
    full_content = f"Heading: {item['heading']}\nContent: {item['content']}\n"
    
    if item['subheaders']:
        for subheader in item['subheaders']:
            full_content += f"Subheading: {subheader['Title']}\n"
            full_content += f"Subcontent: {subheader['Content']}\n"
            if subheader['List']:
                full_content += f"List items: {', '.join(subheader['List'])}\n"
    
    content_for_embedding.append(full_content)

# Create embeddings
embeddings = embedding_model.encode(content_for_embedding)
embeddings = np.array(embeddings)

print(f" Embeddings created with shape: {embeddings.shape}")

 Embeddings created with shape: (19, 384)


In [28]:
# 2.3 RAG System Implementation
class RAGSystem:
    def __init__(self, indexed_list, embeddings, embedding_model, generation_model):
        self.indexed_list = indexed_list
        self.embeddings = embeddings
        self.embedding_model = embedding_model
        self.generation_model = generation_model
        self.content_for_embedding = content_for_embedding
    
    def get_query_embedding(self, query):
        return self.embedding_model.encode([query])
    
    def find_best_match(self, query, top_k=5):
        query_embedding = self.get_query_embedding(query)
        similarities = cosine_similarity(query_embedding, self.embeddings).flatten()
        
        # Get top-k indices sorted by similarity (descending)
        top_k_indices = np.argsort(similarities)[-top_k:][::-1]
        
        matches = []
        for idx in top_k_indices:
            matches.append({
                'index': idx,
                'content': self.content_for_embedding[idx],
                'similarity': similarities[idx],
                'heading': self.indexed_list[idx]['heading'],
                'section_data': self.indexed_list[idx]
            })
        
        return matches
    
    def generate_answer_gemini(self, top_matches, question):
        # Create context với thông tin chi tiết
        context = "Relevant information from Presight Privacy Policy:\n\n"
        
        for i, match in enumerate(top_matches, 1):
            context += f"Section {i}: {match['heading']}\n"
            context += f"Content: {match['section_data']['content']}\n"
            
            # Thêm subheaders nếu có
            if match['section_data']['subheaders']:
                for subheader in match['section_data']['subheaders']:
                    context += f"  - {subheader['Title']}: {subheader['Content']}\n"
                    if subheader['List']:
                        context += f"    List: {', '.join(subheader['List'])}\n"
            
            context += f"\n(Relevance Score: {match['similarity']:.4f})\n\n"
        
        prompt = f"""
        Act as a professional assistant at company Presight in answering the question provided.
        Your job is to provide a clear and concise answer based only on the information provided in the context.
        Do not add any details or information beyond what is provided in the context.

        Context:
        {context}

        Question: {question}

        Requirements:
        1. Answer the question as detailed as possible from the provided context, make sure to provide all the details.
        2. If the answer is not in the context provided, just say "Your question is not in the company's database, please ask another question." without any further answer.
        3. Provide specific examples or lists when available in the context.
        4. Mention relevant section titles for reference.

        Answer:
        """
        
        try:
            response = self.generation_model.generate_content(prompt)
            return response.text
        except Exception as e:
            return f"Error generating answer: {e}"
    
    def query(self, user_question, top_k=5, show_details=True):
        """Main query function với logging chi tiết"""
        print(f" Processing Query: {user_question}")
        print("-" * 60)
        
        start_time = time.time()
        
        # Step 1: Find best matches
        top_matches = self.find_best_match(user_question, top_k=top_k)
        
        if show_details:
            print(f"\n Top {top_k} Most Relevant Sections:")
            for i, match in enumerate(top_matches, 1):
                print(f"{i}. {match['heading']} (Score: {match['similarity']:.4f})")
            print()
        
        # Step 2: Generate answer
        answer = self.generate_answer_gemini(top_matches, user_question)
        
        end_time = time.time()
        
        print(" Generated Answer:")
        print(answer)
        print(f"\n Execution Time: {end_time - start_time:.4f} seconds")
        print("=" * 60)
        
        return {
            'query': user_question,
            'top_matches': top_matches,
            'answer': answer,
            'execution_time': end_time - start_time
        }

# Initialize RAG system
rag_system = RAGSystem(indexed_list, embeddings, embedding_model, model)

In [29]:
# 2.4 RAG System Testing
test_queries = [
    "What personal data does Presight collect?",
    "How does Presight use my data?",
    "Can I access my personal information?",
    "What security measures does Presight implement?",
    "How long does Presight retain my data?",
    "How can I contact Presight about privacy concerns?"
]

for i, query in enumerate(test_queries[:3], 1):  # Test first 3 queries
    print(f"\n Test {i} ")
    result = rag_system.query(query, top_k=3, show_details=True)
    print("\n")


 Test 1 
 Processing Query: What personal data does Presight collect?
------------------------------------------------------------

 Top 3 Most Relevant Sections:
1. Use of Data (Score: 0.7550)
2. Access to Personal Information (Score: 0.6761)
3. Last updated 15 Sep 2023 (Score: 0.5931)

 Generated Answer:
Section 1: Use of Data describes that Presight collects data for various purposes, including providing and maintaining the service, notifying users of service changes, allowing participation in interactive features, providing customer support, gathering analysis for service improvement, monitoring service usage, and detecting/preventing technical issues.  Section 2: Access to Personal Information states that Presight collects personal information, and that users can access, correct, amend, or append their information through the application's settings and profile.  Automated edit checks are used to ensure data integrity.


 Execution Time: 1.8280 seconds



 Test 2 
 Processing Quer

In [1]:
!pip install streamlit

Collecting streamlit




  Downloading streamlit-1.48.1-py3-none-any.whl (9.9 MB)
     ---------------------------------------- 9.9/9.9 MB 2.1 MB/s eta 0:00:00
Collecting gitpython!=3.1.19,<4,>=3.0.7
  Downloading gitpython-3.1.45-py3-none-any.whl (208 kB)
     -------------------------------------- 208.2/208.2 kB 2.1 MB/s eta 0:00:00
Collecting altair!=5.4.0,!=5.4.1,<6,>=4.0
  Downloading altair-5.5.0-py3-none-any.whl (731 kB)
     -------------------------------------- 731.2/731.2 kB 2.4 MB/s eta 0:00:00
Collecting pydeck<1,>=0.8.0b4
  Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
     ---------------------------------------- 6.9/6.9 MB 2.0 MB/s eta 0:00:00
Collecting blinker<2,>=1.5.0
  Downloading blinker-1.9.0-py3-none-any.whl (8.5 kB)
Collecting tenacity<10,>=8.1.0
  Downloading tenacity-9.1.2-py3-none-any.whl (28 kB)
Collecting narwhals>=1.14.2
  Downloading narwhals-2.1.2-py3-none-any.whl (392 kB)
     -------------------------------------- 392.1/392.1 kB 1.2 MB/s eta 0:00:00
Collecting gitdb