# UFO sighting EDA


#### This EDA uses the database of sightings registered with the National UFO Reporting Center (NUFORC). The dataset contains more than 80,000 reports, and I present some interesting information about it below.

### Some information about the dataset

In [123]:
#Setup
import pandas as pd 
import matplotlib.pyplot as plt
import folium

# Load the raw NUFORC export, skipping malformed rows instead of aborting the load.
# `error_bad_lines` was removed in pandas 2.0; `on_bad_lines='skip'` is its replacement.
# Older pandas releases reject the new keyword with a TypeError, so fall back for them.
try:
    df = pd.read_csv('dataset/scrubbed.csv', on_bad_lines = 'skip')
except TypeError:
    df = pd.read_csv('dataset/scrubbed.csv', error_bad_lines = False)

# Path of the US state boundaries file handed to the choropleth map further down.
state_geo = 'us-states.json'

In [124]:
#Renaming some columns
# One in-place pass that swaps the raw headers (spaces, parentheses, a trailing
# blank after "longitude") for identifier-friendly names.
df.rename(
    columns = {
        'duration (seconds)': 'sec_duration',
        'duration (hours/min)': 'hrmin_duration',
        'date posted': 'date_posted',
        'longitude ': 'longitude',
    },
    inplace = True,
)

In [125]:
#Preparing data
# Timestamps that pandas cannot parse become NaT instead of raising.
datetime = pd.to_datetime(df['datetime'], errors = 'coerce')

# Work on the text form of each duration so a stray backtick can be stripped
# before the values are converted to numbers.
duration = pd.to_numeric(
    df['sec_duration'].map(str).str.replace('`', '', regex = False)
)

In [126]:
#creating new dataframe with just the necessary data
# Cleaned timestamp and duration come first, followed by the location columns
# taken unchanged from the raw frame; all pieces share the raw frame's index.
dataset = pd.concat(
    [
        datetime.rename('datetime'),
        duration.rename('duration'),
        df[['city', 'state', 'country', 'latitude', 'longitude']],
    ],
    axis = 1,
)

In [127]:
# Preview the first rows (head() shows five by default)
dataset.head()

Unnamed: 0,datetime,duration,city,state,country,latitude,longitude
0,1949-10-10 20:30:00,2700.0,san marcos,tx,us,29.8830556,-97.941111
1,1949-10-10 21:00:00,7200.0,lackland afb,tx,,29.38421,-98.581082
2,1955-10-10 17:00:00,20.0,chester (uk/england),,gb,53.2,-2.916667
3,1956-10-10 21:00:00,20.0,edna,tx,us,28.9783333,-96.645833
4,1960-10-10 20:00:00,900.0,kaneohe,hi,us,21.4180556,-157.803611


In [128]:
# Preview the last rows (tail() shows five by default)
dataset.tail()

Unnamed: 0,datetime,duration,city,state,country,latitude,longitude
80327,2013-09-09 21:15:00,600.0,nashville,tn,us,36.1658,-86.784444
80328,2013-09-09 22:00:00,1200.0,boise,id,us,43.6136,-116.2025
80329,2013-09-09 22:00:00,1200.0,napa,ca,us,38.2972,-122.284444
80330,2013-09-09 22:20:00,5.0,vienna,va,us,38.9011,-77.265556
80331,2013-09-09 23:00:00,1020.0,edmond,ok,us,35.6528,-97.477778


In [129]:
# Per-column overview: non-null counts, dtypes and the frame's memory usage
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80332 entries, 0 to 80331
Data columns (total 7 columns):
datetime     79638 non-null datetime64[ns]
duration     80332 non-null float64
city         80332 non-null object
state        74535 non-null object
country      70662 non-null object
latitude     80332 non-null object
longitude    80332 non-null float64
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 4.3+ MB


In [130]:
# Column dtypes at a glance.
# NOTE(review): latitude is reported as object while longitude is float64, so the
# numeric summaries further down leave latitude out. Converting it with
# pd.to_numeric(..., errors='coerce') would presumably fix that; confirm which raw values are non-numeric.
dataset.dtypes

datetime     datetime64[ns]
duration            float64
city                 object
state                object
country              object
latitude             object
longitude           float64
dtype: object

In [131]:
# (number of rows, number of columns) of the working frame
dataset.shape

(80332, 7)

In [132]:
#Does any column have missing data?
# True marks a column that contains at least one missing value
dataset.isnull().any()

datetime      True
duration     False
city         False
state         True
country       True
latitude     False
longitude    False
dtype: bool

In [133]:
#central tendency measures
# Mean, median and mode of the numeric columns, printed in that order and
# each followed by a blank line.
for summarise in (dataset.mean, dataset.median, dataset.mode):
    print(summarise(numeric_only = True), '\n')

duration     9016.889016
longitude     -86.772885
dtype: float64 

duration     180.000000
longitude    -87.903611
dtype: float64 

   duration   longitude
0     300.0 -122.330833 



In [134]:
#Analysing data amount per country
# Number of reports per country and each country's share of ALL rows in `dataset`.
# Rows without a country still count towards the total, so the shares need not sum to 100.
countr_per_country = dataset.groupby('country').size()
percentage_per_country = countr_per_country.map(
    lambda count: round(100 * count / float(len(dataset)), 2)
)

data = dict(
    Countries = countr_per_country.index,
    Count = countr_per_country.values,
    Percentage = percentage_per_country.values,
)
amount_dataframe = pd.DataFrame(data, columns = ['Countries', 'Count', 'Percentage'])

# Largest share first
amount_dataframe.sort_values(by = 'Percentage', ascending = False)

Unnamed: 0,Countries,Count,Percentage
4,us,65114,81.06
1,ca,3000,3.73
3,gb,1905,2.37
0,au,538,0.67
2,de,105,0.13


More than 80% of the data comes from the USA, as shown above. For this reason, we will use only this slice of the data from here on.

In [144]:
# Keep only the sightings reported in the USA.
# NOTE: this rebinds `dataset`, so re-running the per-country breakdown above
# after this cell would see US rows only.
dataset = dataset.loc[dataset['country'].eq('us')]
dataset.head(5)

Unnamed: 0,datetime,duration,city,state,country,latitude,longitude
0,1949-10-10 20:30:00,2700.0,san marcos,tx,us,29.8830556,-97.941111
3,1956-10-10 21:00:00,20.0,edna,tx,us,28.9783333,-96.645833
4,1960-10-10 20:00:00,900.0,kaneohe,hi,us,21.4180556,-157.803611
5,1961-10-10 19:00:00,300.0,bristol,tn,us,36.595,-82.188889
7,1965-10-10 23:45:00,1200.0,norwalk,ct,us,41.1175,-73.408333


In [None]:
# Choropleth of US sightings per state.
#
# A choropleth needs exactly one value per state, but `dataset` holds one row per
# sighting, so the frame is aggregated to a sighting count per state first.
# (To map a typical duration instead, replace `.size()` with `['duration'].median()`.)
#
# NOTE(review): state codes are upper-cased on the assumption that `feature.id` in
# us-states.json holds upper-case postal codes ('TX') while the data uses 'tx'.
# Confirm against the GeoJSON file.
sightings_per_state = (
    dataset.assign(state = dataset['state'].str.upper())
    .groupby('state')
    .size()
    .reset_index(name = 'sightings')
)

# Named `ufo_map` rather than `map` so the builtin `map` is not shadowed
# for the rest of the kernel session.
ufo_map = folium.Map(location = [37, -102], zoom_start = 3)

folium.Choropleth(
    geo_data = state_geo,
    name = 'choropleth',
    data = sightings_per_state,
    columns = ['state', 'sightings'],
    key_on = 'feature.id',
    fill_color = 'YlGn',
    fill_opacity = 0.7,
    line_opacity = 0.2,
    legend_name = 'UFO sightings per state'
).add_to(ufo_map)

ufo_map