In [None]:
# Import Splinter, BeautifulSoup and dependencies
import requests
from bs4 import BeautifulSoup as bs
from pymongo import MongoClient
from splinter import Browser
from selenium import webdriver
import time
import pandas as pd
from pprint import pprint
import numpy as np

## Data Extraction: Web Scrape and Save Output

In [None]:
# Launch browser
browser = Browser('chrome') 
url = "https://nuforc.org/subndx/?id=all"
browser.visit(url)
time.sleep(4) 

# Initialize an empty list to store all sightings
sightings = []


In [None]:
record_count = 0
total_records_to_collect = 59000

# Main scraping loop
while record_count < total_records_to_collect:
    # Create Beautiful Soup object
    html = browser.html
    soup = bs(html, 'html.parser')
    time.sleep(2) 
    
    # Find the sightings data table on the current page
    table = soup.find('div', id='outer-wrap').find('table', id='table_1')
    time.sleep(2)
    
    rows = table.find_all('tr')
    for row in rows:
        data = row.find_all('td')
        record = [d.text for d in data]
        sightings.append(record)
        record_count += 1
        #print(record_count)
        
    time.sleep(2)
    
    # Click the next button using JavaScript to bypass interception
    try:
        next_button = browser.find_by_id('table_1_next')
        browser.execute_script("arguments[0].click();", next_button[0]._element)
        time.sleep(5)
    except Exception as e:
        print(f"Error clicking next button: {e}")
        break 


In [None]:
# Close the browser
browser.quit()

In [None]:
# Create Pandas dataframe for sightings data
sightings_df = pd.DataFrame(sightings,
                           columns = ["LINK",
                                     "OCCURRED_DATE",
                                     "CITY",
                                     "STATE",
                                     "COUNTRY",
                                     "SHAPE",
                                     "SUMMARY",
                                      "REPORTED",
                                     "MEDIA",
                                     "EXPLANATION"
                                     ])

In [None]:
# Remove unnecessary columns for planned data analysis
sightings_df = sightings_df.drop(columns = ["LINK", "MEDIA", "EXPLANATION"])

In [None]:
# Save sightings data for future use
sightings_df.to_csv('../Output/sightings_df.csv',index=False)

## Data Transformation: Read Output, Perform Data Filtering and Data Cleansing

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read sightings df in for further data filtering and cleansing
sightings_df = pd.read_csv('../Output/sightings_df.csv')
sightings_df.head()

Unnamed: 0,OCCURRED_DATE,CITY,STATE,COUNTRY,SHAPE,SUMMARY,REPORTED
0,,,,,,,
1,06/20/2024 09:30,Cranston,RI,USA,Flash,My wife and I as well as our 2 children witnes...,Y
2,06/20/2024 04:30,Oklahoma City,OK,USA,Light,Erratically moving light at high altitude,
3,06/20/2024 00:50,Key West,FL,USA,Light,Saw unexplainable lights slightly west of Nort...,Y
4,06/19/2024 17:50,Valley Stream,NY,USA,Sphere,Sphere in the sky that glowed like an LED ligh...,


In [3]:
# Remove unnecessary columns for planned data analysis
sightings_df = sightings_df.drop(columns = ["REPORTED"])

In [4]:
# Convert OCCURRED_DATE to Date format
sightings_df["OCCURRED_DATE"] = pd.to_datetime(sightings_df["OCCURRED_DATE"]).dt.date


In [5]:
# Create YEAR column based on OCCURRED_DATE
sightings_df['YEAR'] = pd.to_datetime(sightings_df['OCCURRED_DATE']).dt.year

In [6]:
# Filter dataframe to just USA sightings
usa_sightings_df = sightings_df.loc[sightings_df['COUNTRY'] == 'USA']


In [7]:
# Replace blank values from CITY with NaN and drop NaN columns for data cleanliness
usa_sightings_df['CITY'] = usa_sightings_df['CITY'].replace('', np.nan)
usa_sightings_df = usa_sightings_df.dropna()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  usa_sightings_df['CITY'] = usa_sightings_df['CITY'].replace('', np.nan)


In [8]:
# Create CITYSTATE combined column
usa_sightings_df['CITYSTATE'] = usa_sightings_df['CITY'].str.cat(usa_sightings_df['STATE'], sep=', ')

In [9]:
# Confirm there are no records without an OCCURRED_DATE
null_occurred_date = usa_sightings_df[usa_sightings_df["OCCURRED_DATE"].isnull()]
print(null_occurred_date)

Empty DataFrame
Columns: [OCCURRED_DATE, CITY, STATE, COUNTRY, SHAPE, SUMMARY, YEAR, CITYSTATE]
Index: []


In [10]:
# Sort USA dataframe by OCCURRED_DATE
usa_sightings_df = usa_sightings_df.sort_values(by=["OCCURRED_DATE"], ascending = False)
usa_sightings_df.head()

Unnamed: 0,OCCURRED_DATE,CITY,STATE,COUNTRY,SHAPE,SUMMARY,YEAR,CITYSTATE
1,2024-06-20,Cranston,RI,USA,Flash,My wife and I as well as our 2 children witnes...,2024.0,"Cranston, RI"
3,2024-06-20,Key West,FL,USA,Light,Saw unexplainable lights slightly west of Nort...,2024.0,"Key West, FL"
2,2024-06-20,Oklahoma City,OK,USA,Light,Erratically moving light at high altitude,2024.0,"Oklahoma City, OK"
4,2024-06-19,Valley Stream,NY,USA,Sphere,Sphere in the sky that glowed like an LED ligh...,2024.0,"Valley Stream, NY"
5,2024-06-19,Jacksonville,FL,USA,Triangle,Black triangle very low no noise,2024.0,"Jacksonville, FL"


In [11]:
usa_sightings_df['SIGHTING_ID'] = range(1, len(usa_sightings_df) + 1)
usa_sightings_df.set_index('SIGHTING_ID', inplace=True)
usa_sightings_df.head()

Unnamed: 0_level_0,OCCURRED_DATE,CITY,STATE,COUNTRY,SHAPE,SUMMARY,YEAR,CITYSTATE
SIGHTING_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,2024-06-20,Cranston,RI,USA,Flash,My wife and I as well as our 2 children witnes...,2024.0,"Cranston, RI"
2,2024-06-20,Key West,FL,USA,Light,Saw unexplainable lights slightly west of Nort...,2024.0,"Key West, FL"
3,2024-06-20,Oklahoma City,OK,USA,Light,Erratically moving light at high altitude,2024.0,"Oklahoma City, OK"
4,2024-06-19,Valley Stream,NY,USA,Sphere,Sphere in the sky that glowed like an LED ligh...,2024.0,"Valley Stream, NY"
5,2024-06-19,Jacksonville,FL,USA,Triangle,Black triangle very low no noise,2024.0,"Jacksonville, FL"


In [12]:
# Save USA sightings data for future use
usa_sightings_df.to_csv('../Output/usa_sightings_df_.csv')