In [1]:
# Import Libraries and Dependencies
from splinter import Browser
from bs4 import BeautifulSoup
from fuzzywuzzy import process
import pandas as pd
import requests
import time

In [2]:
# Set up Splinter
browser = Browser('chrome')

In [3]:
# Visit the NUFORC REPORTS UFO SIGHTINGS site
url = 'https://nuforc.org/subndx/?id=cMexico'
browser.visit(url)

In [4]:
# Scrape the website
html = browser.html

# Create a BeautifulSoup object from the scraped HTML
soup = BeautifulSoup(html, 'html.parser')

In [5]:
# Initialize variables to store scraped data
all_data = []  # List to store table data
links = []     # List to store links
# Loop until the "Next" link is clicked 4 times
# The underscore means "iterate four times, and I don't care about the loop variable."
for _ in range(4):
    
    ''' EXTRACT DATA AND LINKS FROM THE TABLE INTO LISTS
     '#table_1 tr' is a CSS selector used to select all <tr> (table row) elements within
     the HTML table that has an id attribute equal to 'table_1'.'''
    
    table_rows = soup.select('#table_1 tr')[1:]  # # the [1:] means skip the first row (headers)
    for row in table_rows:
        # Extract text data from each <td> element in the row and removes leading and trailing whitespace characters from each extracted text
        row_data = [td.get_text(strip=True) for td in row.find_all('td')]
        all_data.append(row_data)  # Append row data to the list of all data
        # Extract links from <a> elements in the row
        row_links = [f"https://nuforc.org{a['href']}" for a in row.find_all('a', href=True)]
        links.extend(row_links)  # Extend the list of links with links from the current row
    # Find the "Next" link and click it
    next_link = browser.find_by_id('table_1_next')
    next_link.click()
    # Wait for the page to load
    time.sleep(3)  # Adjust the sleep time as needed
    # Update the HTML content after navigating to the next page
    html = browser.html
    # Create a new BeautifulSoup object from the updated HTML
    soup = BeautifulSoup(html, 'html.parser')
# Extract headers from the table
headers = [header.get_text(strip=True) for header in soup.select('#table_1 th')]
# Convert the list of lists into a pandas DataFrame
table_df = pd.DataFrame(all_data, columns=headers)  # Create DataFrame for table data
links_df = pd.DataFrame(links, columns=['Link'])    # Create DataFrame for links


In [6]:
    # Display the table_df
table_df

Unnamed: 0,Link,Occurred,City,State,Country,Shape,Summary,Reported,Media,Explanation
0,Open,05/15/2024 02:20,Mazatlán,Sinaloa,Mexico,Light,Lights dropping from very high altitude and th...,,,
1,Open !,05/12/2024 20:58,Los Médanos,Baja California,Mexico,Cylinder,"We observed a very large (500-1500 ft long), d...",,,
2,Open,05/09/2024 21:38,San José del Cabo,Baja California Sur,Mexico,Triangle,We saw a triangular/round bright light in the ...,Y,Rocket,
3,Open,02/26/2024 19:40,Ciudad de México,Ciudad de México,Mexico,Orb,I was walking down the street coming to my hom...,Y,,
4,Open,12/28/2023 20:49,Acapulco de Juárez,Guerrero,Mexico,Circle,Maybe drone or maybe ufo,Y,Drone?,
...,...,...,...,...,...,...,...,...,...,...
399,Open,02/16/1984 23:00,"Guadalajara, Jalisco (Mexico)",,Mexico,Formation,it happend while on a road to guadalajara jali...,03/01/1998,,
400,Open,06/01/1983 20:30,Ciudad Lazaro Cardenas (Mexico),,Mexico,Sphere,We saw a gigantic sphere in the sky one Saturd...,05/14/1999,,
401,Open,05/03/1983 22:00,Highway (Mexico),,Mexico,Oval,Huge light formation spanned across (and above...,01/04/2003,,
402,Open,08/01/1982 13:00,Cozumel (Mexico),,Mexico,,Can someone give me an idea what this is?,08/02/2001,,


In [7]:
# Display the links_df
links_df.head()

Unnamed: 0,Link
0,https://nuforc.org/sighting/?id=181515
1,https://nuforc.org/sighting/?id=181495
2,https://nuforc.org/sighting/?id=181446
3,https://nuforc.org/sighting/?id=180712
4,https://nuforc.org/sighting/?id=179884


In [8]:
# Drop the first column in the table_df
table_df.drop(columns=table_df.columns[0], inplace=True)

# Merge both dataframes to include the links column (replacing the dropped one)
merged_df = pd.merge(table_df, links_df, left_index=True, right_index=True)

In [9]:
merged_df

Unnamed: 0,Occurred,City,State,Country,Shape,Summary,Reported,Media,Explanation,Link
0,05/15/2024 02:20,Mazatlán,Sinaloa,Mexico,Light,Lights dropping from very high altitude and th...,,,,https://nuforc.org/sighting/?id=181515
1,05/12/2024 20:58,Los Médanos,Baja California,Mexico,Cylinder,"We observed a very large (500-1500 ft long), d...",,,,https://nuforc.org/sighting/?id=181495
2,05/09/2024 21:38,San José del Cabo,Baja California Sur,Mexico,Triangle,We saw a triangular/round bright light in the ...,Y,Rocket,,https://nuforc.org/sighting/?id=181446
3,02/26/2024 19:40,Ciudad de México,Ciudad de México,Mexico,Orb,I was walking down the street coming to my hom...,Y,,,https://nuforc.org/sighting/?id=180712
4,12/28/2023 20:49,Acapulco de Juárez,Guerrero,Mexico,Circle,Maybe drone or maybe ufo,Y,Drone?,,https://nuforc.org/sighting/?id=179884
...,...,...,...,...,...,...,...,...,...,...
395,06/30/1988 23:00,Mexico (Pacific Ocean),,Mexico,Disk,WE WERE COMMERCIAL NET FISHING SWORDFISH ABOUT...,07/05/2008,,,https://nuforc.org/sighting/?id=98043
396,06/01/1985 18:00,Michoacan (Mexico),,Mexico,Disk,dark gray with lights around it,04/24/2007,,,https://nuforc.org/sighting/?id=3259
397,08/15/1984 22:30,Mexico,,Mexico,Light,8/15/84 22:30 intense white light with t...,10/13/2005,,,https://nuforc.org/sighting/?id=7737
398,07/01/1984 07:00,Morelia (Mexico),,Mexico,Sphere,Orange spheres close to homes in Mexico.,05/17/2013,,,https://nuforc.org/sighting/?id=26888


In [30]:
merged_df.drop(columns=['Media','Explanation','Reported'], inplace=True)

In [31]:
merged_df

Unnamed: 0,Occurred,City,State,Country,Shape,Summary,Link
0,05/15/2024 02:20,Mazatlán,Sinaloa,Mexico,Light,Lights dropping from very high altitude and th...,https://nuforc.org/sighting/?id=181515
1,05/12/2024 20:58,Los Médanos,Baja California,Mexico,Cylinder,"We observed a very large (500-1500 ft long), d...",https://nuforc.org/sighting/?id=181495
2,05/09/2024 21:38,San José del Cabo,Baja California Sur,Mexico,Triangle,We saw a triangular/round bright light in the ...,https://nuforc.org/sighting/?id=181446
3,02/26/2024 19:40,Ciudad de México,Ciudad de México,Mexico,Orb,I was walking down the street coming to my hom...,https://nuforc.org/sighting/?id=180712
4,12/28/2023 20:49,Acapulco de Juárez,Guerrero,Mexico,Circle,Maybe drone or maybe ufo,https://nuforc.org/sighting/?id=179884
...,...,...,...,...,...,...,...
395,06/30/1988 23:00,Mexico (Pacific Ocean),,Mexico,Disk,WE WERE COMMERCIAL NET FISHING SWORDFISH ABOUT...,https://nuforc.org/sighting/?id=98043
396,06/01/1985 18:00,Michoacan (Mexico),,Mexico,Disk,dark gray with lights around it,https://nuforc.org/sighting/?id=3259
397,08/15/1984 22:30,Mexico,,Mexico,Light,8/15/84 22:30 intense white light with t...,https://nuforc.org/sighting/?id=7737
398,07/01/1984 07:00,Morelia (Mexico),,Mexico,Sphere,Orange spheres close to homes in Mexico.,https://nuforc.org/sighting/?id=26888


In [32]:
# Export to CSV
merged_df.to_csv('./RESOURCES/nuforc_data.csv', index=False)

# World Cities DB Cleaning

In [10]:
# Read csv and create DF
worldcities_df = pd.read_csv("./RESOURCES/worldcities.csv")
worldcities_df.drop(columns=worldcities_df.columns[0], inplace=True)

worldcities_df.head()

Unnamed: 0,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,35.6897,139.6922,Japan,JP,JPN,Tōkyō,primary,37732000.0,1392685764
1,Jakarta,-6.175,106.8275,Indonesia,ID,IDN,Jakarta,primary,33756000.0,1360771077
2,Delhi,28.61,77.23,India,IN,IND,Delhi,admin,32226000.0,1356872604
3,Guangzhou,23.13,113.26,China,CN,CHN,Guangdong,admin,26940000.0,1156237133
4,Mumbai,19.0761,72.8775,India,IN,IND,Mahārāshtra,admin,24973000.0,1356226629


In [11]:
# Adjust the column name as necessary based on your dataset
filtered_df = worldcities_df[worldcities_df['country'] == 'Mexico'].copy()

# Display the first few rows of the filtered data
filtered_df.head()

Unnamed: 0,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
9,Mexico City,19.4333,-99.1333,Mexico,MX,MEX,Ciudad de México,primary,21804000.0,1484247881
109,Guadalajara,20.6767,-103.3475,Mexico,MX,MEX,Jalisco,admin,5525000.0,1484950208
115,Monterrey,25.6667,-100.3,Mexico,MX,MEX,Nuevo León,admin,5341171.0,1484559591
416,Tijuana,32.525,-117.0333,Mexico,MX,MEX,Baja California,minor,2002000.0,1484708778
427,Ecatepec,19.6097,-99.06,Mexico,MX,MEX,México,minor,1929926.0,1484003694


In [12]:
# Check how many rows there are in the filtered worldcities_df
num_rows = filtered_df.shape[0]
print(f"Number of rows in the filtered DataFrame: {num_rows}")

Number of rows in the filtered DataFrame: 983


In [13]:
filtered_df.columns

Index(['city_ascii', 'lat', 'lng', 'country', 'iso2', 'iso3', 'admin_name',
       'capital', 'population', 'id'],
      dtype='object')

In [14]:
filtered_df.drop(columns=['id','country', 'iso2', 'iso3', 'capital', 'population'], inplace=True)
filtered_df.columns

Index(['city_ascii', 'lat', 'lng', 'admin_name'], dtype='object')

In [15]:
filtered_df.head()

Unnamed: 0,city_ascii,lat,lng,admin_name
9,Mexico City,19.4333,-99.1333,Ciudad de México
109,Guadalajara,20.6767,-103.3475,Jalisco
115,Monterrey,25.6667,-100.3,Nuevo León
416,Tijuana,32.525,-117.0333,Baja California
427,Ecatepec,19.6097,-99.06,México


In [17]:
# Rename the columns
filtered_df.rename(columns={
    'city_ascii': 'City',
    'admin_name': 'State',
    'lat': 'Lat',
    'lng': 'Lng'
}, inplace=True)

# Reorder the columns
filtered_df = filtered_df[['City', 'State', 'Lat', 'Lng']]

filtered_df.head()

Unnamed: 0,City,State,Lat,Lng
9,Mexico City,Ciudad de México,19.4333,-99.1333
109,Guadalajara,Jalisco,20.6767,-103.3475
115,Monterrey,Nuevo León,25.6667,-100.3
416,Tijuana,Baja California,32.525,-117.0333
427,Ecatepec,México,19.6097,-99.06


In [18]:
# Save the modified DataFrame to a new CSV file
filtered_df.to_csv('./RESOURCES/mexico_cities.csv', index=False)

# NUFORC Mexico DB Cleaning

In [40]:
# Read csv and create DF
UFO_df = pd.read_csv("./RESOURCES/nuforc_data.csv")
UFO_df.drop(columns=UFO_df.columns[0], inplace=True)

UFO_df.head()

Unnamed: 0,City,State,Country,Shape,Summary,Link
0,Mazatlán,Sinaloa,Mexico,Light,Lights dropping from very high altitude and th...,https://nuforc.org/sighting/?id=181515
1,Los Médanos,Baja California,Mexico,Cylinder,"We observed a very large (500-1500 ft long), d...",https://nuforc.org/sighting/?id=181495
2,San José del Cabo,Baja California Sur,Mexico,Triangle,We saw a triangular/round bright light in the ...,https://nuforc.org/sighting/?id=181446
3,Ciudad de México,Ciudad de México,Mexico,Orb,I was walking down the street coming to my hom...,https://nuforc.org/sighting/?id=180712
4,Acapulco de Juárez,Guerrero,Mexico,Circle,Maybe drone or maybe ufo,https://nuforc.org/sighting/?id=179884


In [41]:
UFO_df.columns

Index(['City', 'State', 'Country', 'Shape', 'Summary', 'Link'], dtype='object')

In [42]:
len(UFO_df)

400

In [43]:
# Create a new column with the information that is in parenthesis in City
UFO_df['Notas'] = UFO_df['City'].str.extract(r'\((.*?)\)', expand=False)

# Delete the information in parenthesis in the column City
UFO_df['City'] = UFO_df['City'].str.replace(r'\((.*?)\)', '', regex=True).str.strip()

# Remove any accidental duplicates created
UFO_df = UFO_df.drop_duplicates()

UFO_df.head()

Unnamed: 0,City,State,Country,Shape,Summary,Link,Notas
0,Mazatlán,Sinaloa,Mexico,Light,Lights dropping from very high altitude and th...,https://nuforc.org/sighting/?id=181515,
1,Los Médanos,Baja California,Mexico,Cylinder,"We observed a very large (500-1500 ft long), d...",https://nuforc.org/sighting/?id=181495,
2,San José del Cabo,Baja California Sur,Mexico,Triangle,We saw a triangular/round bright light in the ...,https://nuforc.org/sighting/?id=181446,
3,Ciudad de México,Ciudad de México,Mexico,Orb,I was walking down the street coming to my hom...,https://nuforc.org/sighting/?id=180712,
4,Acapulco de Juárez,Guerrero,Mexico,Circle,Maybe drone or maybe ufo,https://nuforc.org/sighting/?id=179884,


In [44]:
len(UFO_df)

400

# Checking for typos and Assigning State

In [45]:
# Function to find the best match for each city name
def find_best_match(city, choices):
    if pd.isna(city):
        return city
    match, score = process.extractOne(str(city), choices)
    return match if score > 80 else city  # Return the original city if no good match is found

# Ensure all values in the 'City' column are treated as strings
UFO_df['City'] = UFO_df['City'].astype(str)

# List of known correct city names
known_cities = filtered_df['City'].tolist()

# Correct the city names in UFO_df
UFO_df['Corrected_City'] = UFO_df['City'].apply(lambda x: find_best_match(x, known_cities))

# Remove duplicates in filtered_df by keeping only the first occurrence of each city
filtered_df_unique = filtered_df.drop_duplicates(subset='City', keep='first')

# Create a dictionary from filtered_df_unique for quick lookup
city_state_dict = filtered_df_unique.set_index('City')['State'].to_dict()

# Assign the correct state based on the corrected city
UFO_df['State_corrected'] = UFO_df['Corrected_City'].map(city_state_dict)

# Replace the information in the 'City' column with 'Corrected_City'
UFO_df['City'] = UFO_df['Corrected_City']

# Replace the information in the 'State' column with 'State_corrected'
UFO_df['State'] = UFO_df['State_corrected']

# Drop rows where the 'State' column is empty
UFO_df.dropna(subset=['State'], inplace=True)

# Drop the redundant 'Corrected_City' and 'State_corrected' columns
UFO_df.drop(columns=['Corrected_City', 'State_corrected', 'Notas'], inplace=True)

# Save the modified DataFrame to a new CSV file
UFO_df.to_csv('./RESOURCES/NUFORC_Mexico_clean.csv', index=False)


In [29]:
# Display the final DataFrame
UFO_df

Unnamed: 0,City,State,Country,Shape,Summary,Reported,Media,Explanation,Link
0,Mazatlan,Sinaloa,Mexico,Light,Lights dropping from very high altitude and th...,,,,https://nuforc.org/sighting/?id=181515
1,Leon de los Aldama,Guanajuato,Mexico,Cylinder,"We observed a very large (500-1500 ft long), d...",,,,https://nuforc.org/sighting/?id=181495
2,San Jose del Cabo,Baja California Sur,Mexico,Triangle,We saw a triangular/round bright light in the ...,Y,Rocket,,https://nuforc.org/sighting/?id=181446
3,Xico,Veracruz,Mexico,Orb,I was walking down the street coming to my hom...,Y,,,https://nuforc.org/sighting/?id=180712
4,Acapulco de Juarez,Guerrero,Mexico,Circle,Maybe drone or maybe ufo,Y,Drone?,,https://nuforc.org/sighting/?id=179884
...,...,...,...,...,...,...,...,...,...
394,Ensenada,Baja California,Mexico,Disk,A circular shaped craft emiting three amber r...,04/14/2004,,,https://nuforc.org/sighting/?id=46984
395,Mexico City,Ciudad de México,Mexico,Disk,WE WERE COMMERCIAL NET FISHING SWORDFISH ABOUT...,07/05/2008,,,https://nuforc.org/sighting/?id=98043
397,Mexico City,Ciudad de México,Mexico,Light,8/15/84 22:30 intense white light with t...,10/13/2005,,,https://nuforc.org/sighting/?id=7737
398,Morelia,Michoacán,Mexico,Sphere,Orange spheres close to homes in Mexico.,05/17/2013,,,https://nuforc.org/sighting/?id=26888


In [53]:
UFO_df['City'].value_counts()

City
Mexico City        40
Cancun             39
Puerto Vallarta    26
Tijuana            21
Monterrey          12
                   ..
Guadalupe           1
Culiacan            1
Zapopan             1
Manzanillo          1
Zihuatanejo         1
Name: count, Length: 93, dtype: int64

In [46]:
len(UFO_df)

359

In [47]:
# Merge "merged_df" with "UFO_df" to add coordinates to the table corresponding to each city
sightings_with_coordinates = pd.merge(UFO_df, filtered_df, on='City')

In [48]:
sightings_with_coordinates

Unnamed: 0,City,State_x,Country,Shape,Summary,Link,State_y,Lat,Lng
0,Mazatlan,Sinaloa,Mexico,Light,Lights dropping from very high altitude and th...,https://nuforc.org/sighting/?id=181515,Sinaloa,23.2167,-106.4167
1,Mazatlan,Sinaloa,Mexico,Sphere,I noticed a perfectly round ball traveling sou...,https://nuforc.org/sighting/?id=161740,Sinaloa,23.2167,-106.4167
2,Mazatlan,Sinaloa,Mexico,Circle,We were sitting at the beach and looked up to ...,https://nuforc.org/sighting/?id=145434,Sinaloa,23.2167,-106.4167
3,Mazatlan,Sinaloa,Mexico,Light,about 300 saw a bright comet-like object flyin...,https://nuforc.org/sighting/?id=50175,Sinaloa,23.2167,-106.4167
4,Mazatlan,Sinaloa,Mexico,,I have 20 minutes of film but no one seams int...,https://nuforc.org/sighting/?id=12584,Sinaloa,23.2167,-106.4167
...,...,...,...,...,...,...,...,...,...
369,Teotihuacan,México,Mexico,Disk,A disk-shaped flying object appeared above the...,https://nuforc.org/sighting/?id=173550,México,19.6897,-98.8608
370,Aguascalientes,Aguascalientes,Mexico,Oval,"The witnesses of this sighting are my father,m...",https://nuforc.org/sighting/?id=74428,Aguascalientes,21.8760,-102.2960
371,Apaseo el Grande,Guanajuato,Mexico,Oval,UFO sighting and scanning sentsation in El Aju...,https://nuforc.org/sighting/?id=52033,Guanajuato,20.5531,-100.6347
372,Reynosa,Tamaulipas,Mexico,Formation,THEY WERE FLYING AT AT A FAST SPEED,https://nuforc.org/sighting/?id=64312,Tamaulipas,26.0922,-98.2778


In [56]:
# Drop unnecesary columns
sightings_with_coordinates.drop(columns=['State_y'], inplace=True)
sightings_with_coordinates

Unnamed: 0,City,State_x,Country,Shape,Summary,Link,Lat,Lng
0,Mazatlan,Sinaloa,Mexico,Light,Lights dropping from very high altitude and th...,https://nuforc.org/sighting/?id=181515,23.2167,-106.4167
1,Mazatlan,Sinaloa,Mexico,Sphere,I noticed a perfectly round ball traveling sou...,https://nuforc.org/sighting/?id=161740,23.2167,-106.4167
2,Mazatlan,Sinaloa,Mexico,Circle,We were sitting at the beach and looked up to ...,https://nuforc.org/sighting/?id=145434,23.2167,-106.4167
3,Mazatlan,Sinaloa,Mexico,Light,about 300 saw a bright comet-like object flyin...,https://nuforc.org/sighting/?id=50175,23.2167,-106.4167
4,Mazatlan,Sinaloa,Mexico,,I have 20 minutes of film but no one seams int...,https://nuforc.org/sighting/?id=12584,23.2167,-106.4167
...,...,...,...,...,...,...,...,...
369,Teotihuacan,México,Mexico,Disk,A disk-shaped flying object appeared above the...,https://nuforc.org/sighting/?id=173550,19.6897,-98.8608
370,Aguascalientes,Aguascalientes,Mexico,Oval,"The witnesses of this sighting are my father,m...",https://nuforc.org/sighting/?id=74428,21.8760,-102.2960
371,Apaseo el Grande,Guanajuato,Mexico,Oval,UFO sighting and scanning sentsation in El Aju...,https://nuforc.org/sighting/?id=52033,20.5531,-100.6347
372,Reynosa,Tamaulipas,Mexico,Formation,THEY WERE FLYING AT AT A FAST SPEED,https://nuforc.org/sighting/?id=64312,26.0922,-98.2778


In [60]:
sightings_with_coordinates['Shape']=sightings_with_coordinates['Shape'].fillna('Unknown')

In [None]:
# Save the modified DataFrame to a new CSV file
sightings_with_coordinates.to_csv('./RESOURCES/sightings_with_coordinates.csv', index=False)

In [61]:
sightings_with_coordinates['Shape'].unique()

array(['Light', 'Sphere', 'Circle', 'Unknown', 'Cylinder', 'Cigar',
       'Disk', 'Triangle', 'Other', 'Fireball', 'Orb', 'Chevron', 'Cone',
       'Oval', 'Diamond', 'Star', 'Changing', 'Egg', 'Cross', 'Formation',
       'Flash', 'Rectangle', 'Teardrop'], dtype=object)

In [62]:
sightings_with_coordinates.to_json('ufo_sightings_with_coordinates.json', orient='records', indent=4)