In [5]:
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient


In [6]:
from bs4 import BeautifulSoup
import requests

# Get website from user
url = input("Enter a website to extract URLs from: ").strip()

# Ensure it starts with http/https
if not url.startswith(("http://", "https://")):
    url = "https://" + url  # safer default

# Set headers to avoid 403
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/124.0.0.0 Safari/537.36"
}

try:
    # Fetch webpage
    r = requests.get(url, headers=headers, timeout=10)
    r.raise_for_status()  # raises error for 4xx/5xx

    # Parse HTML
    soup = BeautifulSoup(r.text, 'lxml')

    # Extract and print title
    if soup.title:
        title = soup.title.string.strip()
        tag_name = soup.title.name
        print(f"\nPage Title: {title}\nTag Name: {tag_name}\n")
    else:
        print("No title found.\n")

    # Extract and print URLs
    print("All found URLs:\n")
    for link in soup.find_all('a', href=True):
        print(link['href'])

except requests.exceptions.RequestException as e:
    print(f"Error fetching the site: {e}")



Page Title: Google
Tag Name: title

All found URLs:

https://about.google/?fg=1&utm_source=google-IN&utm_medium=referral&utm_campaign=hp-header
https://store.google.com/IN?utm_source=hp_header&utm_medium=google_ooo&utm_campaign=GS100042&hl=en-IN
https://mail.google.com/mail/&ogbl
https://www.google.com/imghp?hl=en&ogbl
https://www.google.co.in/intl/en/about/products
https://accounts.google.com/ServiceLogin?hl=en&passive=true&continue=https://www.google.com/&ec=futura_exp_og_so_72776762_e
https://www.google.com/setprefs?sig=0_2NMF7clKURSWPfXn1iu1hEofHak%3D&hl=hi&source=homepage&sa=X&ved=0ahUKEwjOxLzqm4WRAxUjR2wGHTNIJZYQ2ZgBCBg
https://www.google.com/setprefs?sig=0_2NMF7clKURSWPfXn1iu1hEofHak%3D&hl=bn&source=homepage&sa=X&ved=0ahUKEwjOxLzqm4WRAxUjR2wGHTNIJZYQ2ZgBCBk
https://www.google.com/setprefs?sig=0_2NMF7clKURSWPfXn1iu1hEofHak%3D&hl=te&source=homepage&sa=X&ved=0ahUKEwjOxLzqm4WRAxUjR2wGHTNIJZYQ2ZgBCBo
https://www.google.com/setprefs?sig=0_2NMF7clKURSWPfXn1iu1hEofHak%3D&hl=mr&source=h

In [7]:
url = 'http://www.usamega.com/mega-millions-history.asp?p=1'

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/124.0.0.0 Safari/537.36"
}

req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.text, 'lxml')  


tables = soup.find_all('table')

if len(tables) > 4:
    rows = tables[4].find_all('tr')
    if len(rows) > 1:
        cells = rows[1].find_all('td')
        if len(cells) > 3:
            try:
                draw_date = cells[1].a.string.strip() if cells[1].a else "No link text"
                jackpot_b = cells[3].b.string.strip() if cells[3].b else "No <b> tag"
                jackpot_strong = cells[3].strong.string.strip() if cells[3].strong else "No <strong> tag"
                
                print("Draw Date:", draw_date)
                print("Jackpot (b tag):", jackpot_b)
                print("Jackpot (strong tag):", jackpot_strong)
            except Exception as e:
                print("Error extracting data:", e)
        else:
            print("Not enough <td> elements in the row.")
    else:
        print("Not enough rows in the table.")
else:
    print("Not enough tables on the page.")

Not enough tables on the page.


In [8]:
# MongoDB setup
MONGO_URI = "mongodb://localhost:27017/"
DATABASE = "lotto"
COLLECTION = "mega_millions"

def mongo_connection():
    client = MongoClient(MONGO_URI)
    col = client[DATABASE][COLLECTION]
    return col

def scrape_page(page_num):
    url = f"https://www.usamega.com/mega-millions-history.asp?p={page_num}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/124.0.0.0 Safari/537.36"
    }

    response = requests.get(url, headers=headers)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    tables = soup.find_all("table")
    if len(tables) < 5:
        print(f"Page {page_num}: expected table not found.")
        return []

    rows = tables[4].find_all("tr")
    results = []

    for row in rows[1:]:  # skip header
        tds = row.find_all("td")
        if len(tds) < 4:
            continue

        draw_link = tds[1].find("a")
        jackpot_cell = tds[3]

        if not draw_link or not jackpot_cell:
            continue

        draw_date = draw_link.get_text(strip=True)
        numbers_text = jackpot_cell.find("b")
        mega_text = jackpot_cell.find("strong")

        if not numbers_text or not mega_text:
            continue

        # Split numbers, BeautifulSoup already decodes &middot;
        numbers = [int(n) for n in numbers_text.get_text(strip=True).split("·")]
        mega_number = int(mega_text.get_text(strip=True))

        results.append({
            "date": draw_date,
            "numbers": numbers,
            "mega_number": mega_number
        })
    return results

def main():
    col = mongo_connection()
    total_pages = 63

    for page_num in range(1, total_pages + 1):
        print(f"Scraping page {page_num}...")
        data = scrape_page(page_num)
        if data:
            col.insert_many(data)
            print(f"Inserted {len(data)} records from page {page_num}")
        else:
            print(f"No data on page {page_num}")

if __name__ == "__main__":
    main()


Scraping page 1...


HTTPError: 403 Client Error: Forbidden for url: https://www.usamega.com/mega-millions-history.asp?p=1

In [9]:
from pymongo import MongoClient

# Connect to your Atlas cluster
uri = "MONGO_DB_URL"
client = MongoClient(uri)

# Select database and collection
db = client["lotto"]
collection = db["mega_millions"]

# Example data to insert
draws = [
    {
        "date": "2025-10-25",
        "numbers": [5, 12, 28, 36, 48],
        "mega_number": 17
    },
    {
        "date": "2025-10-22",
        "numbers": [1, 8, 19, 32, 45],
        "mega_number": 10
    },
    {
        "date": "2025-10-18",
        "numbers": [7, 16, 24, 39, 42],
        "mega_number": 6
    }
]

# Insert multiple documents
result = collection.insert_many(draws)
print("Inserted IDs:", result.inserted_ids)

# Optional: verify insertion
for doc in collection.find():
    print(doc)


Inserted IDs: [ObjectId('692163da095d3dd71747d9bf'), ObjectId('692163da095d3dd71747d9c0'), ObjectId('692163da095d3dd71747d9c1')]
{'_id': ObjectId('6903766be55cf07cfaf09fd3'), 'date': '2025-10-18', 'numbers': [7, 16, 24, 39, 42], 'mega_number': 6}
{'_id': ObjectId('6903766be55cf07cfaf09fd1'), 'date': '2025-10-25', 'numbers': [5, 12, 28, 36, 48], 'mega_number': 17}
{'_id': ObjectId('6903766be55cf07cfaf09fd2'), 'date': '2025-10-22', 'numbers': [1, 8, 19, 32, 45], 'mega_number': 10}
{'_id': ObjectId('692163da095d3dd71747d9bf'), 'date': '2025-10-25', 'numbers': [5, 12, 28, 36, 48], 'mega_number': 17}
{'_id': ObjectId('692163da095d3dd71747d9c0'), 'date': '2025-10-22', 'numbers': [1, 8, 19, 32, 45], 'mega_number': 10}
{'_id': ObjectId('692163da095d3dd71747d9c1'), 'date': '2025-10-18', 'numbers': [7, 16, 24, 39, 42], 'mega_number': 6}


In [10]:
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
import datetime

# === Configuration ===
MONGO_URI = "MONGO_DB_URL"
DATABASE = "lotto"
COLLECTION = "mega_millions"
# Website that lists the latest Mega Millions results
RESULTS_URL = "https://en.wikipedia.org/wiki/Computer"  # example source

# === Database connection ===
client = MongoClient(MONGO_URI)
db = client[DATABASE]
col = db[COLLECTION]

# === Function to fetch latest draw ===
def fetch_latest_draw():
    resp = requests.get(RESULTS_URL, headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/124.0.0.0 Safari/537.36"
    }, timeout=10)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "lxml")

    # Example: from lotteryusa page we might extract first draw info
    # (This part **must** be adapted to the actual HTML structure)
    # For example: the latest draw date might be in a <div> with class “results__header” etc
    # Here we make a guess:
    date_div = soup.select_one("section.results-section h3")  # this is speculative
    if not date_div:
        raise RuntimeError("Could not find draw date element")

    draw_date_str = date_div.get_text(strip=True)
    # parse date (depends on format)
    draw_date = datetime.datetime.strptime(draw_date_str, "%A, %b %d, %Y")  # adjust format accordingly

    numbers_div = soup.select_one("ul.drawresult-numbers")  # speculative selector
    if not numbers_div:
        raise RuntimeError("Could not find numbers element")

    number_spans = numbers_div.find_all("li")
    # We expect 5 white + 1 mega ball
    if len(number_spans) < 6:
        raise RuntimeError("Unexpected numbers count")

    whites = [int(span.get_text()) for span in number_spans[:5]]
    mega_ball = int(number_spans[5].get_text())

    return {
        "date": draw_date.strftime("%Y-%m-%d"),
        "numbers": whites,
        "mega_number": mega_ball
    }

def main():
    try:
        draw = fetch_latest_draw()
        print("Fetched draw:", draw)

        # Check if we already have this draw in DB
        existing = col.find_one({"date": draw["date"]})
        if existing:
            print("Draw for date", draw["date"], "already in database. Skipping insert.")
        else:
            result = col.insert_one(draw)
            print("Inserted draw with _id:", result.inserted_id)

    except Exception as e:
        print("Error during fetching/inserting:", e)

if __name__ == "__main__":
    main()



Error during fetching/inserting: Could not find draw date element


In [11]:
import pandas as pd
import folium

# Example data
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 22],
    'Location': ['37.7749,-122.4194', '34.0522,-118.2437', '31.2304,121.4737']
}

# Create DataFrame
df2 = pd.DataFrame(data)

# Create base map
world_map = folium.Map(location=[35, 100], zoom_start=4)

# Add markers
for i in range(len(df2)):
    lat, lon = map(float, df2.Location[i].split(','))
    popup_text = f"{df2.Name[i]}, age: {df2.Age[i]}"
    folium.Marker(location=[lat, lon], popup=popup_text).add_to(world_map)

# Display the map (in Jupyter) or save to file
world_map.save("world_map.html")
world_map
