# UFO Sightings Analysis

This notebook analyzes the UFO sightings dataset and creates visualizations to understand patterns in the data.

## Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import folium
from folium import plugins
from pathlib import Path

# Start from matplotlib's stock style, then set notebook-wide figure defaults
plt.style.use('default')
plt.rcParams.update({
    'figure.figsize': [12, 6],  # default width x height, in inches
    'figure.dpi': 100,
})

In [None]:
# Read the raw sightings CSV. Lines pandas cannot tokenize are reported as
# warnings (on_bad_lines='warn') instead of aborting the load.
data_path = Path('../data/raw/complete.csv')
df = pd.read_csv(
    data_path,
    encoding='utf-8',
    quotechar='"',
    doublequote=True,
    escapechar='\\',
    low_memory=False,
    on_bad_lines='warn',
)

# Quick overview: dimensions, column names, and per-column null counts
print("Dataset Shape:", df.shape)
print("\nColumns:", list(df.columns))
print("\nMissing Values:")
print(df.isna().sum())

## Data Preprocessing

Let's clean up the data and prepare it for analysis:

In [None]:
# Normalize column headers before any lookup: stray whitespace in the CSV header
# would make the column accesses below raise KeyError.
# NOTE(review): some published copies of this dataset reportedly ship a header
# with a trailing space after "longitude" -- this is a no-op if yours does not.
df.columns = df.columns.str.strip()

# Parse timestamps without letting a single malformed value abort the cell
# (mirrors the errors='coerce' handling used for the coordinates below).
# NOTE(review): midnight is assumed to sometimes be written as "24:00", which
# pandas cannot parse; such values are read as 00:00 and rolled to the next day.
# Confirm against the raw file.
timestamps = df['datetime']
if not pd.api.types.is_datetime64_any_dtype(timestamps):
    # Skipped when the column is already datetime, so re-running the cell is safe.
    timestamps = timestamps.astype(str).str.strip()
    rolls_over = timestamps.str.endswith('24:00')
    parsed = pd.to_datetime(
        timestamps.str.replace(r'24:00$', '00:00', regex=True),
        errors='coerce',
    )
    timestamps = parsed + pd.to_timedelta(rolls_over.astype(int), unit='D')
df['datetime'] = timestamps

# Ensure numeric coordinates; anything non-numeric becomes NaN
df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')
df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')

# Remove rows with invalid coordinates or unparseable timestamps. Keeping NaT
# rows would turn `.dt.year` into floats in the later yearly/decade groupings.
unparsed_timestamps = int(df['datetime'].isna().sum())
df_clean = df.dropna(subset=['datetime', 'latitude', 'longitude'])

print("Original dataset size:", len(df))
print("Clean dataset size:", len(df_clean))
print("Removed rows:", len(df) - len(df_clean))
print("Rows with missing or unparseable timestamps:", unparsed_timestamps)

## Interactive Map of UFO Sightings

Create a heatmap showing the concentration of UFO sightings across the globe:

In [None]:
# Base map on light CartoDB tiles, centered on US
m = folium.Map(
    tiles='CartoDB positron',
    location=[37.0902, -95.7129],
    zoom_start=4,
)

# One [latitude, longitude] pair per cleaned sighting feeds the heat layer
locations = df_clean[['latitude', 'longitude']].to_numpy().tolist()
heat_layer = plugins.HeatMap(
    locations,
    radius=8,
    blur=5,
    gradient={0.4: 'blue', 0.65: 'lime', 1: 'red'},
)
heat_layer.add_to(m)

# Leaving the map as the last expression renders it in the notebook
m

## Temporal Analysis

Let's analyze how UFO sightings have changed over time:

In [None]:
# Count sightings per calendar year
sighting_years = df_clean['datetime'].dt.year
yearly_sightings = df_clean.groupby(sighting_years).size()

# Line chart of the yearly counts on an explicitly created axes
fig, ax = plt.subplots(figsize=(15, 6))
yearly_sightings.plot(kind='line', marker='o', ax=ax)
ax.set_title('UFO Sightings by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Sightings')
ax.grid(True)
plt.show()

# Same counts, bucketed by decade (year floored to a multiple of 10)
print("\nSightings by Decade:")
decade_sightings = df_clean.groupby((sighting_years // 10) * 10).size()
print(decade_sightings)

## Geographic Distribution

Analyze where UFO sightings occur most frequently:

In [None]:
# Keep only rows whose country code is 'us', then take the 15 most frequent states
is_us = df_clean['country'].eq('us')
us_sightings = df_clean.loc[is_us]
state_counts = us_sightings['state'].value_counts().iloc[:15]

# Bar chart of the per-state counts
fig, ax = plt.subplots(figsize=(15, 6))
state_counts.plot(kind='bar', ax=ax)
ax.set_title('Top 15 States with Most UFO Sightings')
ax.set_xlabel('State')
ax.set_ylabel('Number of Sightings')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

print("\nTop 15 States for UFO Sightings:")
print(state_counts)

## UFO Characteristics

Analyze the most commonly reported UFO shapes (sighting durations are not analyzed in this section):

In [None]:
# Ten most frequently reported values of the 'shape' column
shape_counts = df_clean['shape'].value_counts().iloc[:10]

# Bar chart of those counts
fig, ax = plt.subplots(figsize=(12, 6))
shape_counts.plot(kind='bar', ax=ax)
ax.set_title('Most Common UFO Shapes')
ax.set_xlabel('Shape')
ax.set_ylabel('Number of Sightings')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

print("\nTop 10 Reported UFO Shapes:")
print(shape_counts)

## Summary Statistics

Key findings from the analysis:

In [None]:
# Headline figures for the cleaned dataset
earliest = df_clean['datetime'].min()
latest = df_clean['datetime'].max()
country_total = df_clean['country'].nunique()
shape_total = df_clean['shape'].nunique()

print("Dataset Overview:")
print(f"Total number of sightings: {len(df_clean):,}")
print(f"Date range: {earliest.year} to {latest.year}")
print(f"Number of countries: {country_total}")
print(f"Number of different shapes reported: {shape_total}")

# Year with the most sightings (uses yearly_sightings from the temporal analysis cell)
peak_year = yearly_sightings.idxmax()
peak_count = yearly_sightings.loc[peak_year]
print(f"\nPeak year for sightings: {peak_year} with {peak_count:,} sightings")

# Counts for the last five years present in the yearly index
recent_years = yearly_sightings.iloc[-5:]
print("\nRecent years sighting counts:")
print(recent_years)