In [1]:
# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup
import pandas as pd
import requests
import time

In [2]:
# Set up Splinter
browser = Browser('chrome')

In [3]:
# Visit the NUFORC REPORTS UFO SIGHTINGS site
url = 'https://nuforc.org/subndx/?id=cMexico'
browser.visit(url)

In [4]:
# Scrape the website
html = browser.html

# Create a BeautifulSoup object from the scraped HTML
soup = BeautifulSoup(html, 'html.parser')

In [5]:
# Find the table with id 'table_1'
table = soup.find('table', id='table_1')

# Extract data and links from the table into lists
table_rows = table.find_all('tr')
data = []
links = []  # List to store extracted links

for row in table_rows:
    row_data = []   # List to store data
    row_links = []  # List to store links in the current row

    for td in row.find_all('td'):
        # Extract text data from each <td> and append it to the row_data list
        row_data.append(td.text.strip())

        # Find all 'a' tags and extract the 'href' attribute
        for a in td.find_all('a', href=True):
            # Append the full link to the row_links list
            row_links.append(f"https://nuforc.org{a['href']}")

    # Append the row data to the data list if it's not empty
    if row_data:
        data.append(row_data)
    
    # Extend the links list with links from the current row
    links.extend(row_links)

# Extract headers from the table
headers = [header.text for header in table.find_all('th')]

# Convert the list of lists into a pandas DataFrame
table_df = pd.DataFrame(data, columns=headers)
links_df = pd.DataFrame(links, columns=['Link'])



In [6]:
    # Display the table_df
table_df.head()

Unnamed: 0,Link,Occurred,City,State,Country,Shape,Summary,Media,Explanation
0,Open,05/15/2024 02:20,Mazatlán,Sinaloa,Mexico,Light,Lights dropping from very high altitude and th...,,
1,Open !,05/12/2024 20:58,Los Médanos,Baja California,Mexico,Cylinder,"We observed a very large (500-1500 ft long), d...",,
2,Open,05/09/2024 21:38,San José del Cabo,Baja California Sur,Mexico,Triangle,We saw a triangular/round bright light in the ...,Y,Rocket
3,Open,02/26/2024 19:40,Ciudad de México,Ciudad de México,Mexico,Orb,I was walking down the street coming to my hom...,Y,
4,Open,12/28/2023 20:49,Acapulco de Juárez,Guerrero,Mexico,Circle,Maybe drone or maybe ufo,Y,Drone?


In [7]:
    # Display the links_df
links_df.head()

Unnamed: 0,Link
0,https://nuforc.org/sighting/?id=181515
1,https://nuforc.org/sighting/?id=181495
2,https://nuforc.org/sighting/?id=181446
3,https://nuforc.org/sighting/?id=180712
4,https://nuforc.org/sighting/?id=179884


In [8]:
# Drop the first column from the data_df and merge both dataframes to include the links column (replacing the dropped one)
# Drop the first column in the table_df
table_df.drop(columns=table_df.columns[0], inplace=True)
merged_df = pd.merge(table_df, links_df, left_index=True, right_index=True)
merged_df.head()

Unnamed: 0,Occurred,City,State,Country,Shape,Summary,Media,Explanation,Link
0,05/15/2024 02:20,Mazatlán,Sinaloa,Mexico,Light,Lights dropping from very high altitude and th...,,,https://nuforc.org/sighting/?id=181515
1,05/12/2024 20:58,Los Médanos,Baja California,Mexico,Cylinder,"We observed a very large (500-1500 ft long), d...",,,https://nuforc.org/sighting/?id=181495
2,05/09/2024 21:38,San José del Cabo,Baja California Sur,Mexico,Triangle,We saw a triangular/round bright light in the ...,Y,Rocket,https://nuforc.org/sighting/?id=181446
3,02/26/2024 19:40,Ciudad de México,Ciudad de México,Mexico,Orb,I was walking down the street coming to my hom...,Y,,https://nuforc.org/sighting/?id=180712
4,12/28/2023 20:49,Acapulco de Juárez,Guerrero,Mexico,Circle,Maybe drone or maybe ufo,Y,Drone?,https://nuforc.org/sighting/?id=179884


In [9]:

# Find the table with id 'table_1'
table = soup.find('table', id='table_1')

# Check if the table is found
if table:
    all_data = []
    while True:
        # Extract data from the table into a list of lists
        table_rows = table.find_all('tr')
        data = []
        for row in table_rows:
            row_data = []
            for td in row.find_all('td'):
                row_data.append(td.text.strip())
            if row_data:
                data.append(row_data)
        
        # Extend the list of all data with data from the current page
        all_data.extend(data)
        
        # Find the "Next" link
        next_link = browser.find_by_id('table_1_next')

        # Check if the "Next" link exists
        if next_link:
            # Click the "Next" link
            next_link.click()

            # Wait for the page to load (you may need to adjust the wait time)
            time.sleep(2)  # Adjust the sleep time as needed

            # Update the HTML content after navigating to the next page
            html = browser.html

            # Create a new BeautifulSoup object from the updated HTML
            soup = BeautifulSoup(html, 'html.parser')

            # Find the table with id 'table_1' on the new page
            table = soup.find('table', id='table_1')
        else:
            break

    # Convert the list of lists into a pandas DataFrame
    table_df = pd.DataFrame(all_data)

    # Display the DataFrame
    print(table_df.head())
else:
    print("Table not found.")


KeyboardInterrupt: 

In [None]:
browser.links.find_by_partial_text('Next').click()

In [None]:
base_url = "https://nuforc.org/subndx/?id=cMexico&page="
# Número de páginas a scraper
num_pages = 5  # Ajusta este valor según sea necesario
# Crear un DataFrame para almacenar los datos
df = pd.DataFrame()
# Iterar sobre las páginas
for page in range(1, num_pages + 1):
    # Construir la URL para la página actual
    url = base_url + str(page)
    # Enviar solicitud GET para la página actual
    response = requests.get(url)
    # Parsear el contenido HTML con BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    # Encontrar la tabla que contiene los datos
    table = soup.find('table', {'class': 'table-striped'})
    # Extraer los encabezados de la tabla
    headers = [th.text.strip() for th in table.find('tr').find_all('th')]
    # Extraer los datos de la tabla
    data = []
    for row in table.find_all('tr')[1:]:
        cols = row.find_all('td')
        cols = [col.text.strip() for col in cols]
        data.append([col for col in cols])
    # Convertir los datos en un DataFrame de Pandas
    page_df = pd.DataFrame(data, columns=headers)
    # Agregar los datos de la página actual al DataFrame principal
    df = pd.concat([df, page_df], ignore_index=True)
# Mostrar el DataFrame completo
print(df.head())
