In [1]:
#initial inspection of the resulting data
import pandas as pd
import numpy as np

#open the csv file as a data frame
df = pd.read_csv('UFO_observations_1994_2018.csv', header = None, index_col=None)
df.columns = ["Date and Time", "City", "State", "Shape", "Duration", "Summary", "Date Posted"]

pd.set_option('max_rows', 10)
df

Unnamed: 0,Date and Time,City,State,Shape,Duration,Summary,Date Posted
0,1/30/94 18:00,Chemnitz (Germany),,Oval,20sec,UFO during wintertime 1993/1994 in Chemnitz/Ge...,12/12/09
1,1/25/94 17:56,Spring Valley/LaMesa,CA,Fireball,5-7 minutes,"Fireball descending towards ground, changes tr...",11/6/14
2,1/24/94 22:00,Los Angeles,CA,Disk,5 minutes,my friend and Isaw saucer craft -silent -eight...,7/11/00
3,1/19/94 19:00,Summit,SD,Disk,10 minutes,Large Silver Disk explodes over SD Antenna Fie...,8/28/02
4,1/18/94 19:00,Milford,IA,Flash,1 hour,The objects hover to the west changing diffren...,1/22/04
...,...,...,...,...,...,...,...
117050,12/1/18 10:02,Salt Lake City,UT,,,MADAR Node 112,1/4/19
117051,12/1/18 09:10,Newington,CT,,,MADAR Node 106,12/6/18
117052,12/1/18 03:45,Lanexa,VA,Light,10 minutes,Numerous lights seen in early morning southeas...,12/6/18
117053,12/1/18 02:15,Kingman,AZ,Cigar,10 minutes,My friend was going out to her truck to get my...,12/6/18


In [2]:
#converts the date strings into date/time formatted objects
df["Date and Time"] = pd.to_datetime(df["Date and Time"])
df["Date Posted"] = pd.to_datetime(df["Date Posted"])

#scripts to clean up the dates as anything before 1970 is being automatically assigned to the future.
#i.e.: 1/1/46 is turning into 1/1/2047 - see source below 
#https://stackoverflow.com/questions/37766353/pandas-to-datetime-parsing-wrong-year

from datetime import datetime, timedelta, date
future = df["Date and Time"] > date(year=2019,month=1,day=1)
df.loc[future, "Date and Time"] -= timedelta(days=365*100)

#tidy up of any NaN values in any cells
for col in df.columns[1:5]:
    df[col] = df[col].replace(np.nan, "")

df

'datetime.date' is coerced to a datetime. In the future pandas will
not coerce, and a TypeError will be raised. To retain the current
behavior, convert the 'datetime.date' to a datetime with
'pd.Timestamp'.
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,Date and Time,City,State,Shape,Duration,Summary,Date Posted
0,1994-01-30 18:00:00,Chemnitz (Germany),,Oval,20sec,UFO during wintertime 1993/1994 in Chemnitz/Ge...,2009-12-12
1,1994-01-25 17:56:00,Spring Valley/LaMesa,CA,Fireball,5-7 minutes,"Fireball descending towards ground, changes tr...",2014-11-06
2,1994-01-24 22:00:00,Los Angeles,CA,Disk,5 minutes,my friend and Isaw saucer craft -silent -eight...,2000-07-11
3,1994-01-19 19:00:00,Summit,SD,Disk,10 minutes,Large Silver Disk explodes over SD Antenna Fie...,2002-08-28
4,1994-01-18 19:00:00,Milford,IA,Flash,1 hour,The objects hover to the west changing diffren...,2004-01-22
...,...,...,...,...,...,...,...
117050,2018-12-01 10:02:00,Salt Lake City,UT,,,MADAR Node 112,2019-01-04
117051,2018-12-01 09:10:00,Newington,CT,,,MADAR Node 106,2018-12-06
117052,2018-12-01 03:45:00,Lanexa,VA,Light,10 minutes,Numerous lights seen in early morning southeas...,2018-12-06
117053,2018-12-01 02:15:00,Kingman,AZ,Cigar,10 minutes,My friend was going out to her truck to get my...,2018-12-06


In [3]:
#removes any reports from an automated reporting system called MADAR.
#MADAR Data is sparse and adds no value to the overall reporting stet 

df = df[~df['Summary'].str.contains("MADAR", na=False)]
df

Unnamed: 0,Date and Time,City,State,Shape,Duration,Summary,Date Posted
0,1994-01-30 18:00:00,Chemnitz (Germany),,Oval,20sec,UFO during wintertime 1993/1994 in Chemnitz/Ge...,2009-12-12
1,1994-01-25 17:56:00,Spring Valley/LaMesa,CA,Fireball,5-7 minutes,"Fireball descending towards ground, changes tr...",2014-11-06
2,1994-01-24 22:00:00,Los Angeles,CA,Disk,5 minutes,my friend and Isaw saucer craft -silent -eight...,2000-07-11
3,1994-01-19 19:00:00,Summit,SD,Disk,10 minutes,Large Silver Disk explodes over SD Antenna Fie...,2002-08-28
4,1994-01-18 19:00:00,Milford,IA,Flash,1 hour,The objects hover to the west changing diffren...,2004-01-22
...,...,...,...,...,...,...,...
117048,2018-12-01 21:20:00,Rapid City,SD,Formation,20 minutes,Between 17 and 20 orange orbs drifted toward t...,2018-12-06
117049,2018-12-01 11:00:00,Clayton,NJ,Light,5 seconds,White dot flying at crazy speeds!!!,2019-01-04
117052,2018-12-01 03:45:00,Lanexa,VA,Light,10 minutes,Numerous lights seen in early morning southeas...,2018-12-06
117053,2018-12-01 02:15:00,Kingman,AZ,Cigar,10 minutes,My friend was going out to her truck to get my...,2018-12-06


In [4]:
#removes any data from outside the contiguous 48 US states as determined by the state value in the data 
US_states = ["AL", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", 
          "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", 
          "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", 
          "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", 
          "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]

df = df[df['State'].isin(US_states)]
df

Unnamed: 0,Date and Time,City,State,Shape,Duration,Summary,Date Posted
1,1994-01-25 17:56:00,Spring Valley/LaMesa,CA,Fireball,5-7 minutes,"Fireball descending towards ground, changes tr...",2014-11-06
2,1994-01-24 22:00:00,Los Angeles,CA,Disk,5 minutes,my friend and Isaw saucer craft -silent -eight...,2000-07-11
3,1994-01-19 19:00:00,Summit,SD,Disk,10 minutes,Large Silver Disk explodes over SD Antenna Fie...,2002-08-28
4,1994-01-18 19:00:00,Milford,IA,Flash,1 hour,The objects hover to the west changing diffren...,2004-01-22
5,1994-01-17 05:00:00,Los Angeles,CA,Fireball,few seconds,Fire ball after northidge quake .,2013-12-23
...,...,...,...,...,...,...,...
117048,2018-12-01 21:20:00,Rapid City,SD,Formation,20 minutes,Between 17 and 20 orange orbs drifted toward t...,2018-12-06
117049,2018-12-01 11:00:00,Clayton,NJ,Light,5 seconds,White dot flying at crazy speeds!!!,2019-01-04
117052,2018-12-01 03:45:00,Lanexa,VA,Light,10 minutes,Numerous lights seen in early morning southeas...,2018-12-06
117053,2018-12-01 02:15:00,Kingman,AZ,Cigar,10 minutes,My friend was going out to her truck to get my...,2018-12-06


In [5]:
#enforces lowercase on all reported UFO shapes to better group the data for further manipulation
df['Shape'] = df['Shape'].astype(str).str.lower()

shapes = df['Shape'].unique()

print (shapes)

['fireball' 'disk' 'flash' 'unknown' 'sphere' 'chevron' 'triangle' 'other'
 'rectangle' 'changing' 'light' 'cylinder' 'oval' 'circle' 'formation'
 'cigar' '' 'egg' 'diamond' 'teardrop' 'delta' 'cone' 'cross' 'round'
 'dome' 'changed' 'pyramid' 'crescent' 'flare' 'hexagon']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [6]:
# reclassifies all of the self-reported UFO shapes into a standard set of shapes that we can use for the analysis
# there has been some interpolation here of the shapes by the author

UFO_Shapes = {
 'fireball': 'Shapeless Light',
 'disk': 'Disk',
 'flash': 'Shapeless Light',
 'unknown': 'Unknown',
 'sphere': 'Spherical',
 'chevron': 'Triangular',
 'triangle': 'Triangular',
 'other': 'Unknown',
 'rectangle': 'Polygonal',
 'changing': 'Changing',
 'light': 'Shapeless Light', 
 'cylinder': 'Cylindrical',
 'oval': 'Oval',
 'circle': 'Disk',  
 'formation': 'Formation',
 'cigar': 'Cylindrical',
 '': 'Unknown',    
 'egg': "Oval",
 'diamond': 'Polygonal',
 'teardrop': 'Oval',
 'delta': 'Triangular',
 'cone': 'Triangular',
 'cross': 'Cross',
 'round': 'Circular',
 'dome': 'Spherical',
 'changed': 'Changing',
 'pyramid': "Triangular",
 'crescent': 'Circular',
 'flare': 'Shapeless Light',
 'hexagon': 'Polygonal'
}

df['Shape'].replace(UFO_Shapes, inplace=True)

df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


Unnamed: 0,Date and Time,City,State,Shape,Duration,Summary,Date Posted
1,1994-01-25 17:56:00,Spring Valley/LaMesa,CA,Shapeless Light,5-7 minutes,"Fireball descending towards ground, changes tr...",2014-11-06
2,1994-01-24 22:00:00,Los Angeles,CA,Disk,5 minutes,my friend and Isaw saucer craft -silent -eight...,2000-07-11
3,1994-01-19 19:00:00,Summit,SD,Disk,10 minutes,Large Silver Disk explodes over SD Antenna Fie...,2002-08-28
4,1994-01-18 19:00:00,Milford,IA,Shapeless Light,1 hour,The objects hover to the west changing diffren...,2004-01-22
5,1994-01-17 05:00:00,Los Angeles,CA,Shapeless Light,few seconds,Fire ball after northidge quake .,2013-12-23
...,...,...,...,...,...,...,...
117048,2018-12-01 21:20:00,Rapid City,SD,Formation,20 minutes,Between 17 and 20 orange orbs drifted toward t...,2018-12-06
117049,2018-12-01 11:00:00,Clayton,NJ,Shapeless Light,5 seconds,White dot flying at crazy speeds!!!,2019-01-04
117052,2018-12-01 03:45:00,Lanexa,VA,Shapeless Light,10 minutes,Numerous lights seen in early morning southeas...,2018-12-06
117053,2018-12-01 02:15:00,Kingman,AZ,Cylindrical,10 minutes,My friend was going out to her truck to get my...,2018-12-06


In [7]:
# converts the state values from stndard US Postal Service abbreviations to the full state name
# this is done to improve interoperability with US Map Shape Files for the visualisation

states = {
        'AK': 'Alaska',
        'AL': 'Alabama',
        'AR': 'Arkansas',
        'AS': 'American Samoa',
        'AZ': 'Arizona',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DC': 'District of Columbia',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'GU': 'Guam',
        'HI': 'Hawaii',
        'IA': 'Iowa',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'MA': 'Massachusetts',
        'MD': 'Maryland',
        'ME': 'Maine',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MO': 'Missouri',
        'MP': 'Northern Mariana Islands',
        'MS': 'Mississippi',
        'MT': 'Montana',
        'NA': 'National',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'NE': 'Nebraska',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NV': 'Nevada',
        'NY': 'New York',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'PR': 'Puerto Rico',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VA': 'Virginia',
        'VI': 'Virgin Islands',
        'VT': 'Vermont',
        'WA': 'Washington',
        'WI': 'Wisconsin',
        'WV': 'West Virginia',
        'WY': 'Wyoming'
}

df['State'].replace(states, inplace=True)

df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


Unnamed: 0,Date and Time,City,State,Shape,Duration,Summary,Date Posted
1,1994-01-25 17:56:00,Spring Valley/LaMesa,California,Shapeless Light,5-7 minutes,"Fireball descending towards ground, changes tr...",2014-11-06
2,1994-01-24 22:00:00,Los Angeles,California,Disk,5 minutes,my friend and Isaw saucer craft -silent -eight...,2000-07-11
3,1994-01-19 19:00:00,Summit,South Dakota,Disk,10 minutes,Large Silver Disk explodes over SD Antenna Fie...,2002-08-28
4,1994-01-18 19:00:00,Milford,Iowa,Shapeless Light,1 hour,The objects hover to the west changing diffren...,2004-01-22
5,1994-01-17 05:00:00,Los Angeles,California,Shapeless Light,few seconds,Fire ball after northidge quake .,2013-12-23
...,...,...,...,...,...,...,...
117048,2018-12-01 21:20:00,Rapid City,South Dakota,Formation,20 minutes,Between 17 and 20 orange orbs drifted toward t...,2018-12-06
117049,2018-12-01 11:00:00,Clayton,New Jersey,Shapeless Light,5 seconds,White dot flying at crazy speeds!!!,2019-01-04
117052,2018-12-01 03:45:00,Lanexa,Virginia,Shapeless Light,10 minutes,Numerous lights seen in early morning southeas...,2018-12-06
117053,2018-12-01 02:15:00,Kingman,Arizona,Cylindrical,10 minutes,My friend was going out to her truck to get my...,2018-12-06


In [8]:
#exports changes to a new csv file for 
df.to_csv('UFO_observations_1994_2018_2.csv')