# Bigfoot Data Cleaning
### The purpose of this notebook is to clean and compare three sets of Bigfoot siting reports, so that the data is free of redundancy, easy to follow, and usable for visualizations.

In [1]:
#Import libraries 

import pandas as pd
from sqlalchemy import create_engine
import json

In [2]:
def intersection(lst1, lst2): 
    lst3 = [value for value in lst1 if value in lst2] 
    return lst3 

In [3]:
#Store Data in Dataframe

csv_loc = "Resources/bfro_report_locations.csv"
csv_geo = "Resources/bfro_reports_geocoded.csv"
json_file = "Resources/bfro_reports.json"

bigfoot_locations_df = pd.read_csv(csv_loc)
bigfoot_geo_df = pd.read_csv(csv_geo)
bfro_rep_df = pd.read_json(json_file, lines=True)

### Cleaning of bfro_report_locations

In [4]:
# Display location CSV DataFrame
bigfoot_locations_df.head()

Unnamed: 0,number,title,classification,timestamp,latitude,longitude
0,637,Report 637: Campers' encounter just after dark...,Class A,2000-06-16T12:00:00Z,61.5,-142.9
1,2917,Report 2917: Family observes large biped from car,Class A,1995-05-15T12:00:00Z,55.1872,-132.7982
2,7963,Report 7963: Sasquatch walks past window of ho...,Class A,2004-02-09T12:00:00Z,55.2035,-132.8202
3,9317,"Report 9317: Driver on Alcan Highway has noon,...",Class A,2004-06-18T12:00:00Z,62.9375,-141.5667
4,13038,Report 13038: Snowmobiler has encounter in dee...,Class A,2004-02-15T12:00:00Z,61.0595,-149.7853


In [5]:
# Begin filtering data
# Drop null values on 'number'
bf_loc_transformed = bigfoot_locations_df[bigfoot_locations_df['number'].notna()]

# Split timestamp column into date and time column
bf_loc_transformed[['Date_loc','Time']] = bf_loc_transformed.timestamp.str.split("T",expand=True)

bf_loc_transformed.head()

Unnamed: 0,number,title,classification,timestamp,latitude,longitude,Date_loc,Time
0,637,Report 637: Campers' encounter just after dark...,Class A,2000-06-16T12:00:00Z,61.5,-142.9,2000-06-16,12:00:00Z
1,2917,Report 2917: Family observes large biped from car,Class A,1995-05-15T12:00:00Z,55.1872,-132.7982,1995-05-15,12:00:00Z
2,7963,Report 7963: Sasquatch walks past window of ho...,Class A,2004-02-09T12:00:00Z,55.2035,-132.8202,2004-02-09,12:00:00Z
3,9317,"Report 9317: Driver on Alcan Highway has noon,...",Class A,2004-06-18T12:00:00Z,62.9375,-141.5667,2004-06-18,12:00:00Z
4,13038,Report 13038: Snowmobiler has encounter in dee...,Class A,2004-02-15T12:00:00Z,61.0595,-149.7853,2004-02-15,12:00:00Z


In [6]:
# Remove report number from title
bf_loc_transformed[['Report', 'Title']] = bf_loc_transformed.title.str.split(":",1,expand=True)
bf_loc_transformed.head()

Unnamed: 0,number,title,classification,timestamp,latitude,longitude,Date_loc,Time,Report,Title
0,637,Report 637: Campers' encounter just after dark...,Class A,2000-06-16T12:00:00Z,61.5,-142.9,2000-06-16,12:00:00Z,Report 637,Campers' encounter just after dark in the Wra...
1,2917,Report 2917: Family observes large biped from car,Class A,1995-05-15T12:00:00Z,55.1872,-132.7982,1995-05-15,12:00:00Z,Report 2917,Family observes large biped from car
2,7963,Report 7963: Sasquatch walks past window of ho...,Class A,2004-02-09T12:00:00Z,55.2035,-132.8202,2004-02-09,12:00:00Z,Report 7963,Sasquatch walks past window of house at night
3,9317,"Report 9317: Driver on Alcan Highway has noon,...",Class A,2004-06-18T12:00:00Z,62.9375,-141.5667,2004-06-18,12:00:00Z,Report 9317,"Driver on Alcan Highway has noon, road encoun..."
4,13038,Report 13038: Snowmobiler has encounter in dee...,Class A,2004-02-15T12:00:00Z,61.0595,-149.7853,2004-02-15,12:00:00Z,Report 13038,Snowmobiler has encounter in deep snow near P...


In [7]:
# remove unwanted columns
# the time column does not provide beneficial data (all reports were at 12:00:00Z)
bf_loc_transformed = bf_loc_transformed[['number', 'classification', 'latitude', 'longitude', 'Date_loc', 'Title']]
bf_loc_transformed.head()

Unnamed: 0,number,classification,latitude,longitude,Date_loc,Title
0,637,Class A,61.5,-142.9,2000-06-16,Campers' encounter just after dark in the Wra...
1,2917,Class A,55.1872,-132.7982,1995-05-15,Family observes large biped from car
2,7963,Class A,55.2035,-132.8202,2004-02-09,Sasquatch walks past window of house at night
3,9317,Class A,62.9375,-141.5667,2004-06-18,"Driver on Alcan Highway has noon, road encoun..."
4,13038,Class A,61.0595,-149.7853,2004-02-15,Snowmobiler has encounter in deep snow near P...


In [8]:
# sort reports by report number
bf_loc_transformed.sort_values(by=['number'], inplace=True)
bf_loc_transformed.reset_index(drop=True, inplace=True)
bf_loc_transformed

Unnamed: 0,number,classification,latitude,longitude,Date_loc,Title
0,60,Class B,48.64056,-121.80530,1994-05-13,Missing Cattle and large footprints found
1,76,Class B,36.37139,-92.25139,2001-08-11,Three fisherman hear thrashing about and loud...
2,77,Class B,46.98333,-121.09220,1983-09-01,Couple hear vocalizations while camping at Mi...
3,80,Class B,43.69005,-122.37550,1983-08-12,"Late Arriving Campers Hear Running, Stomping,..."
4,83,Class A,37.35944,-119.64360,1988-10-01,Early dawn sighting by hunters near Bass Lake
...,...,...,...,...,...,...
3989,62352,Class B,38.17600,-95.34650,2017-04-17,Group of fisherman with a possible sighting o...
3990,62354,Class A,48.92982,-122.20180,2018-09-15,Son and father observe possible bigfoot famil...
3991,62393,Class A,37.79250,-92.05000,2017-07-08,Ozarks: Daylight sighting on eastern boundary...
3992,62403,Class B,48.56245,-121.85880,2018-09-15,Mushroom picker has ongoing activity in same ...


In [9]:
# double-check to see if any columns hold null values
bf_loc_transformed.count()

number            3994
classification    3994
latitude          3994
longitude         3994
Date_loc          3994
Title             3994
dtype: int64

### Cleaning of bfro_reports_geocoded
location_details, season, temperature_high, temperature_mid, temperature_low, dew_point, humidity, cloud_cover, moon_phase, precip_intensity, precip_probability, precip_type, pressure, summary, uv_index, visibility, wind_bearing, and wind_speed removed.
More accurate information regarding weather will be pulled from OpenWeatherMap later.

In [10]:
bigfoot_geo_df.head()

Unnamed: 0,observed,location_details,county,state,season,title,latitude,longitude,date,number,...,moon_phase,precip_intensity,precip_probability,precip_type,pressure,summary,uv_index,visibility,wind_bearing,wind_speed
0,Ed L. was salmon fishing with a companion in P...,East side of Prince William Sound,Valdez-Chitina-Whittier County,Alaska,Fall,,,,,1261.0,...,,,,,,,,,,
1,heh i kinda feel a little dumb that im reporti...,"the road is off us rt 80, i dont know the exit...",Warren County,New Jersey,Fall,,,,,438.0,...,,,,,,,,,,
2,I was on my way to Claremont from Lebanon on R...,Close to Claremont down 120 not far from Kings...,Sullivan County,New Hampshire,Summer,Report 55269: Dawn sighting at Stevens Brook o...,43.41549,-72.33093,2016-06-07,55269.0,...,0.1,0.001,0.7,rain,998.87,Mostly cloudy throughout the day.,6.0,9.7,262.0,0.49
3,I was northeast of Macy Nebraska along the Mis...,Latitude & Longitude : 42.158230 -96.344197,Thurston County,Nebraska,Spring,Report 59757: Possible daylight sighting of a ...,42.15685,-96.34203,2018-05-25,59757.0,...,0.38,0.0,0.0,,1008.07,Partly cloudy in the morning.,10.0,8.25,193.0,3.33
4,"While this incident occurred a long time ago, ...","Ward County, Just outside of a the Minuteman T...",Ward County,North Dakota,Spring,Report 751: Hunter describes described being s...,48.25422,-101.3166,2000-04-21,751.0,...,0.6,,,rain,1011.47,Partly cloudy until evening.,6.0,10.0,237.0,11.14


In [11]:
# Begin filtering data
bf_geo_cols = ['observed', 'county', 'state', 'latitude', 'longitude', 'date','number', 'classification', 'geohash']
bf_geo_transformed = bigfoot_geo_df[bf_geo_cols].copy()

# Drop null values on 'number'
bf_geo_transformed = bf_geo_transformed[bf_geo_transformed['number'].notna()]

bf_geo_transformed.head()

Unnamed: 0,observed,county,state,latitude,longitude,date,number,classification,geohash
0,Ed L. was salmon fishing with a companion in P...,Valdez-Chitina-Whittier County,Alaska,,,,1261.0,Class A,
1,heh i kinda feel a little dumb that im reporti...,Warren County,New Jersey,,,,438.0,Class B,
2,I was on my way to Claremont from Lebanon on R...,Sullivan County,New Hampshire,43.41549,-72.33093,2016-06-07,55269.0,Class A,drswfpd1x1
3,I was northeast of Macy Nebraska along the Mis...,Thurston County,Nebraska,42.15685,-96.34203,2018-05-25,59757.0,Class B,9z7rzdmv7y
4,"While this incident occurred a long time ago, ...",Ward County,North Dakota,48.25422,-101.3166,2000-04-21,751.0,Class A,c8xfw2rt0n


In [12]:
# sort reports by report number
bf_geo_transformed.sort_values(by=['number'], inplace=True)
bf_geo_transformed.reset_index(drop=True, inplace=True)
bf_geo_transformed.head()

Unnamed: 0,observed,county,state,latitude,longitude,date,number,classification,geohash
0,"These two gentlmen, brothers, recluses age 50'...",Skagit County,Washington,48.64056,-121.8053,1994-05-13,60.0,Class B,c29ksq8pfc
1,My wife and I were off the trail to take some ...,Washington County,New York,,,,70.0,Class B,
2,I DID NOT SEE ANYTHING. I DID HEAR A LOT. WH...,Baxter County,Arkansas,36.37139,-92.25139,2001-08-11,76.0,Class B,9yqquv0wdy
3,"We had driven into Milk pond, up Chinook pass,...",Kittitas County,Washington,46.98333,-121.0922,1983-09-01,77.0,Class B,c23gk3p3ep
4,"My veterinarian sister, former girlfriend, and...",Lane County,Oregon,43.69005,-122.3755,1983-08-12,80.0,Class B,9rbbx38bup


In [13]:
# double-check to make sure most necessary columns do not hold null values
# null values may be filled in merges
bf_geo_transformed.count()

observed          4711
county            4747
state             4747
latitude          3797
longitude         3797
date              3797
number            4747
classification    4747
geohash           3797
dtype: int64

### Cleaning of bfro_reports

In [14]:
bfro_rep_df.head()

Unnamed: 0,YEAR,SEASON,STATE,COUNTY,LOCATION_DETAILS,OBSERVED,OTHER_WITNESSES,TIME_AND_CONDITIONS,REPORT_NUMBER,REPORT_CLASS,MONTH,DATE,NEAREST_TOWN,NEAREST_ROAD,ALSO_NOTICED,OTHER_STORIES,ENVIRONMENT,A_&_G_References
0,Early 1990's,Fall,Alaska,Valdez-Chitina-Whittier County,East side of Prince William Sound,Ed L. was salmon fishing with a companion in P...,On a commercial fishing boat at anchor at the ...,"Early Fall, in the early 1990's.",1261.0,Class A,,,,,,,,
1,,,,,,,,,,,,,,,,,,
2,2000,Fall,New Jersey,Warren County,"the road is off us rt 80, i dont know the exit...",heh i kinda feel a little dumb that im reporti...,"my friend was asleep, and i was lying next to ...",lighting was from a camp fire that was dieing ...,438.0,Class B,September,2nd or 3rd,Allumuchy,not sure of the road,nothing,no but after the events stated i looked at you...,"good size river, steep hills, very rugged area...",
3,2016,Summer,New Hampshire,Sullivan County,Close to Claremont down 120 not far from Kings...,I was on my way to Claremont from Lebanon on R...,Just myself,5am. Dawn. Clear weather,55269.0,Class A,June,7,Claremont,Rte 120,,,swamp with forest behind it. There was a large...,
4,2018,Spring,Nebraska,Thurston County,Latitude & Longitude : 42.158230 -96.344197,I was northeast of Macy Nebraska along the Mis...,2 witnesses saw the creature. 3 people were pr...,1:35 pm on a clear and very hot sunny day. Tem...,59757.0,Class B,May,25,Macy,Highway 201,I had audio recorders set up in the area and r...,There are MANY encounters in this general area...,This is a heavily forested area bordering the ...,


In [15]:
# Begin filtering json data
bf_reports_cols = ['OBSERVED', 'COUNTY', 'STATE', 'REPORT_NUMBER', 'REPORT_CLASS']
bf_reports_transformed = bfro_rep_df[bf_reports_cols].copy()

# Rename column headers
bf_reports_transformed = bf_reports_transformed.rename(columns={"OBSERVED": 'observed',
                                                               'COUNTY': 'county',
                                                               'STATE': 'state',
                                                               'REPORT_NUMBER': 'number',
                                                               'REPORT_CLASS': 'classification'})

# Drop null values on 'number'
bf_reports_transformed = bf_reports_transformed[bf_reports_transformed['number'].notna()]

bf_reports_transformed.head()


Unnamed: 0,observed,county,state,number,classification
0,Ed L. was salmon fishing with a companion in P...,Valdez-Chitina-Whittier County,Alaska,1261.0,Class A
2,heh i kinda feel a little dumb that im reporti...,Warren County,New Jersey,438.0,Class B
3,I was on my way to Claremont from Lebanon on R...,Sullivan County,New Hampshire,55269.0,Class A
4,I was northeast of Macy Nebraska along the Mis...,Thurston County,Nebraska,59757.0,Class B
5,"While this incident occurred a long time ago, ...",Ward County,North Dakota,751.0,Class A


In [16]:
# sort reports by report number
bf_reports_transformed.sort_values(by=['number'], inplace=True)
bf_reports_transformed.reset_index(drop=True, inplace=True)
bf_reports_transformed.head()

Unnamed: 0,observed,county,state,number,classification
0,"These two gentlmen, brothers, recluses age 50'...",Skagit County,Washington,60.0,Class B
1,My wife and I were off the trail to take some ...,Washington County,New York,70.0,Class B
2,I DID NOT SEE ANYTHING. I DID HEAR A LOT. WH...,Baxter County,Arkansas,76.0,Class B
3,"We had driven into Milk pond, up Chinook pass,...",Kittitas County,Washington,77.0,Class B
4,"My veterinarian sister, former girlfriend, and...",Lane County,Oregon,80.0,Class B


In [17]:
# double-check to make sure most necessary columns do not hold null values
# null values may be filled in merges
bf_reports_transformed.count()

observed          4711
county            4747
state             4747
number            4747
classification    4747
dtype: int64

### Compare counts of all three DFs

In [18]:
# Compare counts of all three df
bf_loc_transformed.count()

number            3994
classification    3994
latitude          3994
longitude         3994
Date_loc          3994
Title             3994
dtype: int64

In [19]:
# Compare counts of all three df
bf_geo_transformed.count()

observed          4711
county            4747
state             4747
latitude          3797
longitude         3797
date              3797
number            4747
classification    4747
geohash           3797
dtype: int64

In [20]:
bf_reports_transformed.count()

observed          4711
county            4747
state             4747
number            4747
classification    4747
dtype: int64

## Merging of three dataframes

In [21]:
# Merge bf_geo_transformed and bf_reports_transformed DataFrames
bf_merge = pd.merge(bf_geo_transformed, bf_reports_transformed, on='number', how='outer', suffixes=("_geocoded", "_reports"))
bf_merge.head()

Unnamed: 0,observed_geocoded,county_geocoded,state_geocoded,latitude,longitude,date,number,classification_geocoded,geohash,observed_reports,county_reports,state_reports,classification_reports
0,"These two gentlmen, brothers, recluses age 50'...",Skagit County,Washington,48.64056,-121.8053,1994-05-13,60.0,Class B,c29ksq8pfc,"These two gentlmen, brothers, recluses age 50'...",Skagit County,Washington,Class B
1,My wife and I were off the trail to take some ...,Washington County,New York,,,,70.0,Class B,,My wife and I were off the trail to take some ...,Washington County,New York,Class B
2,I DID NOT SEE ANYTHING. I DID HEAR A LOT. WH...,Baxter County,Arkansas,36.37139,-92.25139,2001-08-11,76.0,Class B,9yqquv0wdy,I DID NOT SEE ANYTHING. I DID HEAR A LOT. WH...,Baxter County,Arkansas,Class B
3,"We had driven into Milk pond, up Chinook pass,...",Kittitas County,Washington,46.98333,-121.0922,1983-09-01,77.0,Class B,c23gk3p3ep,"We had driven into Milk pond, up Chinook pass,...",Kittitas County,Washington,Class B
4,"My veterinarian sister, former girlfriend, and...",Lane County,Oregon,43.69005,-122.3755,1983-08-12,80.0,Class B,9rbbx38bup,"My veterinarian sister, former girlfriend, and...",Lane County,Oregon,Class B


In [22]:
# Final merge with bf_loc_transformed into DataFrame and drop na
final_bf_merge = pd.merge(bf_merge, bf_loc_transformed, on='number', how='outer', suffixes=("_reports", "_location"))

final_bf_merge

Unnamed: 0,observed_geocoded,county_geocoded,state_geocoded,latitude_reports,longitude_reports,date,number,classification_geocoded,geohash,observed_reports,county_reports,state_reports,classification_reports,classification,latitude_location,longitude_location,Date_loc,Title
0,"These two gentlmen, brothers, recluses age 50'...",Skagit County,Washington,48.64056,-121.80530,1994-05-13,60.0,Class B,c29ksq8pfc,"These two gentlmen, brothers, recluses age 50'...",Skagit County,Washington,Class B,Class B,48.64056,-121.80530,1994-05-13,Missing Cattle and large footprints found
1,My wife and I were off the trail to take some ...,Washington County,New York,,,,70.0,Class B,,My wife and I were off the trail to take some ...,Washington County,New York,Class B,,,,,
2,I DID NOT SEE ANYTHING. I DID HEAR A LOT. WH...,Baxter County,Arkansas,36.37139,-92.25139,2001-08-11,76.0,Class B,9yqquv0wdy,I DID NOT SEE ANYTHING. I DID HEAR A LOT. WH...,Baxter County,Arkansas,Class B,Class B,36.37139,-92.25139,2001-08-11,Three fisherman hear thrashing about and loud...
3,"We had driven into Milk pond, up Chinook pass,...",Kittitas County,Washington,46.98333,-121.09220,1983-09-01,77.0,Class B,c23gk3p3ep,"We had driven into Milk pond, up Chinook pass,...",Kittitas County,Washington,Class B,Class B,46.98333,-121.09220,1983-09-01,Couple hear vocalizations while camping at Mi...
4,"My veterinarian sister, former girlfriend, and...",Lane County,Oregon,43.69005,-122.37550,1983-08-12,80.0,Class B,9rbbx38bup,"My veterinarian sister, former girlfriend, and...",Lane County,Oregon,Class B,Class B,43.69005,-122.37550,1983-08-12,"Late Arriving Campers Hear Running, Stomping,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4929,,,,,,,51949.0,,,,,,,Class B,44.90046,-63.84574,2015-10-11,Large rock flung at two hikers visiting Mount...
4930,,,,,,,55604.0,,,,,,,Class B,52.78074,-116.36510,2016-09-06,Two hunters experience prolonged tree shaking...
4931,,,,,,,58445.0,,,,,,,Class B,42.36689,-73.18228,2017-10-30,Camper has an unnerving night on October Moun...
4932,,,,,,,59610.0,,,,,,,Class B,50.81168,-114.78290,2018-06-12,Retired mail carrier reports sundown knocks a...


#### Dropping Duplicate/Incomplete Columns

##### By comparing observed_geocoded and observed_reports, we see that the only values that do not match are NaN or 'None'. 
Because of this, we will drop the observed_reports.

In [23]:
# check to see if values from separate datasets match
# checking observation
final_bf_merge[final_bf_merge['observed_geocoded']!=final_bf_merge['observed_reports']]

Unnamed: 0,observed_geocoded,county_geocoded,state_geocoded,latitude_reports,longitude_reports,date,number,classification_geocoded,geohash,observed_reports,county_reports,state_reports,classification_reports,classification,latitude_location,longitude_location,Date_loc,Title
266,,El Dorado County,California,38.93333,-119.98330,1996-04-10,919.0,Class A,9qftmxwg46,,El Dorado County,California,Class A,Class A,38.93333,-119.98330,1996-04-10,Car passenger has nighttime sighting near Lak...
592,,Morrow County,Ohio,40.38609,-82.69884,1977-06-01,1625.0,Class A,dpjjy0hd12,,Morrow County,Ohio,Class A,Class A,40.38609,-82.69884,1977-06-01,Children have daytime close encounter near Sp...
708,,Le Flore County,Oklahoma,34.84167,-94.63111,2001-03-20,1966.0,Class B,9yhwnwexrq,,Le Flore County,Oklahoma,Class B,Class B,34.84167,-94.63111,2001-03-20,May/June 83/84 Oklahoma Leflore
924,,Midland County,Michigan,43.79834,-84.21140,1972-11-15,2830.0,Class A,dpu15tppu4,,Midland County,Michigan,Class A,Class A,43.79834,-84.21140,1972-11-15,BF shakes bush and screams at 2 hunters
982,,Monroe County,Tennessee,35.25970,-84.28315,2000-08-03,3008.0,Class A,dnk0d4ffsk,,Monroe County,Tennessee,Class A,Class A,35.25970,-84.28315,2000-08-03,Gold dredgers have afternoon encounter in Che...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4929,,,,,,,51949.0,,,,,,,Class B,44.90046,-63.84574,2015-10-11,Large rock flung at two hikers visiting Mount...
4930,,,,,,,55604.0,,,,,,,Class B,52.78074,-116.36510,2016-09-06,Two hunters experience prolonged tree shaking...
4931,,,,,,,58445.0,,,,,,,Class B,42.36689,-73.18228,2017-10-30,Camper has an unnerving night on October Moun...
4932,,,,,,,59610.0,,,,,,,Class B,50.81168,-114.78290,2018-06-12,Retired mail carrier reports sundown knocks a...


##### Counties match except for NaN - will drop county_reports

In [24]:
# checking counties
final_bf_merge[final_bf_merge['county_geocoded']!=final_bf_merge['county_reports']]

Unnamed: 0,observed_geocoded,county_geocoded,state_geocoded,latitude_reports,longitude_reports,date,number,classification_geocoded,geohash,observed_reports,county_reports,state_reports,classification_reports,classification,latitude_location,longitude_location,Date_loc,Title
4747,,,,,,,211.0,,,,,,,Class A,50.40230,-114.44190,1995-11-18,Daylight sighting by a hunter east of Alberta...
4748,,,,,,,444.0,,,,,,,Class C,33.49960,-84.54220,1986-04-15,Family sightings over 100 years in Fayette an...
4749,,,,,,,483.0,,,,,,,Class C,33.94175,-86.17965,1996-04-15,Multiple witnesses describe multiple encounters
4750,,,,,,,857.0,,,,,,,Class B,49.70889,-125.25810,1968-09-24,Possible sighting around 2:00AM near Courtenay
4751,,,,,,,1528.0,,,,,,,Class B,50.06295,-100.81880,2000-10-15,Possible sasquatch seen by farmer near Hwy 21...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4929,,,,,,,51949.0,,,,,,,Class B,44.90046,-63.84574,2015-10-11,Large rock flung at two hikers visiting Mount...
4930,,,,,,,55604.0,,,,,,,Class B,52.78074,-116.36510,2016-09-06,Two hunters experience prolonged tree shaking...
4931,,,,,,,58445.0,,,,,,,Class B,42.36689,-73.18228,2017-10-30,Camper has an unnerving night on October Moun...
4932,,,,,,,59610.0,,,,,,,Class B,50.81168,-114.78290,2018-06-12,Retired mail carrier reports sundown knocks a...


##### States match except for NaN- will drop state_reports

In [25]:
# checking states
final_bf_merge[final_bf_merge['state_geocoded']!=final_bf_merge['state_reports']]

Unnamed: 0,observed_geocoded,county_geocoded,state_geocoded,latitude_reports,longitude_reports,date,number,classification_geocoded,geohash,observed_reports,county_reports,state_reports,classification_reports,classification,latitude_location,longitude_location,Date_loc,Title
4747,,,,,,,211.0,,,,,,,Class A,50.40230,-114.44190,1995-11-18,Daylight sighting by a hunter east of Alberta...
4748,,,,,,,444.0,,,,,,,Class C,33.49960,-84.54220,1986-04-15,Family sightings over 100 years in Fayette an...
4749,,,,,,,483.0,,,,,,,Class C,33.94175,-86.17965,1996-04-15,Multiple witnesses describe multiple encounters
4750,,,,,,,857.0,,,,,,,Class B,49.70889,-125.25810,1968-09-24,Possible sighting around 2:00AM near Courtenay
4751,,,,,,,1528.0,,,,,,,Class B,50.06295,-100.81880,2000-10-15,Possible sasquatch seen by farmer near Hwy 21...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4929,,,,,,,51949.0,,,,,,,Class B,44.90046,-63.84574,2015-10-11,Large rock flung at two hikers visiting Mount...
4930,,,,,,,55604.0,,,,,,,Class B,52.78074,-116.36510,2016-09-06,Two hunters experience prolonged tree shaking...
4931,,,,,,,58445.0,,,,,,,Class B,42.36689,-73.18228,2017-10-30,Camper has an unnerving night on October Moun...
4932,,,,,,,59610.0,,,,,,,Class B,50.81168,-114.78290,2018-06-12,Retired mail carrier reports sundown knocks a...


##### Latitude matches, but latitude_location is more complete - will drop latitude_reports

In [26]:
# checking latitude
final_bf_merge[final_bf_merge['latitude_reports']!=final_bf_merge['latitude_location']]

Unnamed: 0,observed_geocoded,county_geocoded,state_geocoded,latitude_reports,longitude_reports,date,number,classification_geocoded,geohash,observed_reports,county_reports,state_reports,classification_reports,classification,latitude_location,longitude_location,Date_loc,Title
1,My wife and I were off the trail to take some ...,Washington County,New York,,,,70.0,Class B,,My wife and I were off the trail to take some ...,Washington County,New York,Class B,,,,,
2,I DID NOT SEE ANYTHING. I DID HEAR A LOT. WH...,Baxter County,Arkansas,36.37139,-92.25139,2001-08-11,76.0,Class B,9yqquv0wdy,I DID NOT SEE ANYTHING. I DID HEAR A LOT. WH...,Baxter County,Arkansas,Class B,Class B,36.37139,-92.25139,2001-08-11,Three fisherman hear thrashing about and loud...
5,when we toped a small hill and started down th...,Lewis County,Tennessee,,,,81.0,Class A,,when we toped a small hill and started down th...,Lewis County,Tennessee,Class A,,,,,
12,It was September of 1997 and I was outside on ...,Allegheny County,Pennsylvania,,,,105.0,Class A,,It was September of 1997 and I was outside on ...,Allegheny County,Pennsylvania,Class A,,,,,
19,This incident occurred in late February of 198...,Chemung County,New York,,,,145.0,Class B,,This incident occurred in late February of 198...,Chemung County,New York,Class B,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4929,,,,,,,51949.0,,,,,,,Class B,44.90046,-63.84574,2015-10-11,Large rock flung at two hikers visiting Mount...
4930,,,,,,,55604.0,,,,,,,Class B,52.78074,-116.36510,2016-09-06,Two hunters experience prolonged tree shaking...
4931,,,,,,,58445.0,,,,,,,Class B,42.36689,-73.18228,2017-10-30,Camper has an unnerving night on October Moun...
4932,,,,,,,59610.0,,,,,,,Class B,50.81168,-114.78290,2018-06-12,Retired mail carrier reports sundown knocks a...


##### Longitude matches, but longitude_location is more complete - will drop longitude_reports

In [27]:
# checking longitude
final_bf_merge[final_bf_merge['longitude_reports']!=final_bf_merge['longitude_location']]

Unnamed: 0,observed_geocoded,county_geocoded,state_geocoded,latitude_reports,longitude_reports,date,number,classification_geocoded,geohash,observed_reports,county_reports,state_reports,classification_reports,classification,latitude_location,longitude_location,Date_loc,Title
1,My wife and I were off the trail to take some ...,Washington County,New York,,,,70.0,Class B,,My wife and I were off the trail to take some ...,Washington County,New York,Class B,,,,,
5,when we toped a small hill and started down th...,Lewis County,Tennessee,,,,81.0,Class A,,when we toped a small hill and started down th...,Lewis County,Tennessee,Class A,,,,,
11,This weekend 8/19/00 while hunting off of Salm...,Tillamook County,Oregon,45.73510,-123.46600,2000-08-19,102.0,Class B,c20hnmyf38,This weekend 8/19/00 while hunting off of Salm...,Tillamook County,Oregon,Class B,Class B,45.73510,-123.46600,2000-08-19,"Couple hears unknown howl, later finds possib..."
12,It was September of 1997 and I was outside on ...,Allegheny County,Pennsylvania,,,,105.0,Class A,,It was September of 1997 and I was outside on ...,Allegheny County,Pennsylvania,Class A,,,,,
14,"Although I, nor my boys have actually seen a b...",Neosho County,Kansas,37.44208,-95.35479,1975-10-01,113.0,Class B,9ysjn0hgbm,"Although I, nor my boys have actually seen a b...",Neosho County,Kansas,Class B,Class B,37.44208,-95.35479,1975-10-01,Son and friend find footprints. Later somethi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4929,,,,,,,51949.0,,,,,,,Class B,44.90046,-63.84574,2015-10-11,Large rock flung at two hikers visiting Mount...
4930,,,,,,,55604.0,,,,,,,Class B,52.78074,-116.36510,2016-09-06,Two hunters experience prolonged tree shaking...
4931,,,,,,,58445.0,,,,,,,Class B,42.36689,-73.18228,2017-10-30,Camper has an unnerving night on October Moun...
4932,,,,,,,59610.0,,,,,,,Class B,50.81168,-114.78290,2018-06-12,Retired mail carrier reports sundown knocks a...


##### Dates match, but date_loc is more complete - will drop date

In [28]:
# checking longitude
final_bf_merge[final_bf_merge['date']!=final_bf_merge['Date_loc']]

Unnamed: 0,observed_geocoded,county_geocoded,state_geocoded,latitude_reports,longitude_reports,date,number,classification_geocoded,geohash,observed_reports,county_reports,state_reports,classification_reports,classification,latitude_location,longitude_location,Date_loc,Title
1,My wife and I were off the trail to take some ...,Washington County,New York,,,,70.0,Class B,,My wife and I were off the trail to take some ...,Washington County,New York,Class B,,,,,
5,when we toped a small hill and started down th...,Lewis County,Tennessee,,,,81.0,Class A,,when we toped a small hill and started down th...,Lewis County,Tennessee,Class A,,,,,
12,It was September of 1997 and I was outside on ...,Allegheny County,Pennsylvania,,,,105.0,Class A,,It was September of 1997 and I was outside on ...,Allegheny County,Pennsylvania,Class A,,,,,
19,This incident occurred in late February of 198...,Chemung County,New York,,,,145.0,Class B,,This incident occurred in late February of 198...,Chemung County,New York,Class B,,,,,
21,A tremendous storm came in off the Pacific. M...,Trinity County,California,,,,160.0,Class B,,A tremendous storm came in off the Pacific. M...,Trinity County,California,Class B,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4929,,,,,,,51949.0,,,,,,,Class B,44.90046,-63.84574,2015-10-11,Large rock flung at two hikers visiting Mount...
4930,,,,,,,55604.0,,,,,,,Class B,52.78074,-116.36510,2016-09-06,Two hunters experience prolonged tree shaking...
4931,,,,,,,58445.0,,,,,,,Class B,42.36689,-73.18228,2017-10-30,Camper has an unnerving night on October Moun...
4932,,,,,,,59610.0,,,,,,,Class B,50.81168,-114.78290,2018-06-12,Retired mail carrier reports sundown knocks a...


##### Checking classifications - many holes in each.
Will remove to own dataframe.

In [29]:
# checking classification
final_bf_merge[final_bf_merge['classification']!=final_bf_merge['classification_geocoded']]

Unnamed: 0,observed_geocoded,county_geocoded,state_geocoded,latitude_reports,longitude_reports,date,number,classification_geocoded,geohash,observed_reports,county_reports,state_reports,classification_reports,classification,latitude_location,longitude_location,Date_loc,Title
1,My wife and I were off the trail to take some ...,Washington County,New York,,,,70.0,Class B,,My wife and I were off the trail to take some ...,Washington County,New York,Class B,,,,,
5,when we toped a small hill and started down th...,Lewis County,Tennessee,,,,81.0,Class A,,when we toped a small hill and started down th...,Lewis County,Tennessee,Class A,,,,,
12,It was September of 1997 and I was outside on ...,Allegheny County,Pennsylvania,,,,105.0,Class A,,It was September of 1997 and I was outside on ...,Allegheny County,Pennsylvania,Class A,,,,,
19,This incident occurred in late February of 198...,Chemung County,New York,,,,145.0,Class B,,This incident occurred in late February of 198...,Chemung County,New York,Class B,,,,,
21,A tremendous storm came in off the Pacific. M...,Trinity County,California,,,,160.0,Class B,,A tremendous storm came in off the Pacific. M...,Trinity County,California,Class B,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4929,,,,,,,51949.0,,,,,,,Class B,44.90046,-63.84574,2015-10-11,Large rock flung at two hikers visiting Mount...
4930,,,,,,,55604.0,,,,,,,Class B,52.78074,-116.36510,2016-09-06,Two hunters experience prolonged tree shaking...
4931,,,,,,,58445.0,,,,,,,Class B,42.36689,-73.18228,2017-10-30,Camper has an unnerving night on October Moun...
4932,,,,,,,59610.0,,,,,,,Class B,50.81168,-114.78290,2018-06-12,Retired mail carrier reports sundown knocks a...


In [30]:
# checking classification
final_bf_merge[final_bf_merge['classification']!=final_bf_merge['classification_reports']]

Unnamed: 0,observed_geocoded,county_geocoded,state_geocoded,latitude_reports,longitude_reports,date,number,classification_geocoded,geohash,observed_reports,county_reports,state_reports,classification_reports,classification,latitude_location,longitude_location,Date_loc,Title
1,My wife and I were off the trail to take some ...,Washington County,New York,,,,70.0,Class B,,My wife and I were off the trail to take some ...,Washington County,New York,Class B,,,,,
5,when we toped a small hill and started down th...,Lewis County,Tennessee,,,,81.0,Class A,,when we toped a small hill and started down th...,Lewis County,Tennessee,Class A,,,,,
12,It was September of 1997 and I was outside on ...,Allegheny County,Pennsylvania,,,,105.0,Class A,,It was September of 1997 and I was outside on ...,Allegheny County,Pennsylvania,Class A,,,,,
19,This incident occurred in late February of 198...,Chemung County,New York,,,,145.0,Class B,,This incident occurred in late February of 198...,Chemung County,New York,Class B,,,,,
21,A tremendous storm came in off the Pacific. M...,Trinity County,California,,,,160.0,Class B,,A tremendous storm came in off the Pacific. M...,Trinity County,California,Class B,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4929,,,,,,,51949.0,,,,,,,Class B,44.90046,-63.84574,2015-10-11,Large rock flung at two hikers visiting Mount...
4930,,,,,,,55604.0,,,,,,,Class B,52.78074,-116.36510,2016-09-06,Two hunters experience prolonged tree shaking...
4931,,,,,,,58445.0,,,,,,,Class B,42.36689,-73.18228,2017-10-30,Camper has an unnerving night on October Moun...
4932,,,,,,,59610.0,,,,,,,Class B,50.81168,-114.78290,2018-06-12,Retired mail carrier reports sundown knocks a...


##### Remove classifications to their own dataframe, since they do not match.

In [31]:
# select number and classifications to add to separate classification dataframe
classification_cols = ['number', 'classification', 'classification_geocoded', 'classification_reports']
classification = final_bf_merge[classification_cols].copy()
classification

Unnamed: 0,number,classification,classification_geocoded,classification_reports
0,60.0,Class B,Class B,Class B
1,70.0,,Class B,Class B
2,76.0,Class B,Class B,Class B
3,77.0,Class B,Class B,Class B
4,80.0,Class B,Class B,Class B
...,...,...,...,...
4929,51949.0,Class B,,
4930,55604.0,Class B,,
4931,58445.0,Class B,,
4932,59610.0,Class B,,


##### Drop unnecessary columns
* observed_reports
* county_reports
* state_reports
* latitude_reports
* longitude_reports
* date
* classification
* classification_geocoded
* classification_reports

In [32]:
# drop columns
bigfoot_data = final_bf_merge.drop(['observed_reports',
                                    'county_reports',
                                    'state_reports',
                                    'latitude_reports',
                                    'longitude_reports',
                                    'date',
                                    'classification',
                                    'classification_geocoded',
                                    'classification_reports'
                                       ], axis=1)

# Rename column headers
bigfoot_data = bigfoot_data.rename(columns={'observed_geocoded':'observation',
                                            'county_geocoded':'county',
                                            'state_geocoded':'state',
                                            'latitude_location':'latitude',
                                            'longitude_location':'longitude',
                                            'Date_loc':'date',
                                            'Title':'moreInfo'})

bigfoot_data

Unnamed: 0,observation,county,state,number,geohash,latitude,longitude,date,moreInfo
0,"These two gentlmen, brothers, recluses age 50'...",Skagit County,Washington,60.0,c29ksq8pfc,48.64056,-121.80530,1994-05-13,Missing Cattle and large footprints found
1,My wife and I were off the trail to take some ...,Washington County,New York,70.0,,,,,
2,I DID NOT SEE ANYTHING. I DID HEAR A LOT. WH...,Baxter County,Arkansas,76.0,9yqquv0wdy,36.37139,-92.25139,2001-08-11,Three fisherman hear thrashing about and loud...
3,"We had driven into Milk pond, up Chinook pass,...",Kittitas County,Washington,77.0,c23gk3p3ep,46.98333,-121.09220,1983-09-01,Couple hear vocalizations while camping at Mi...
4,"My veterinarian sister, former girlfriend, and...",Lane County,Oregon,80.0,9rbbx38bup,43.69005,-122.37550,1983-08-12,"Late Arriving Campers Hear Running, Stomping,..."
...,...,...,...,...,...,...,...,...,...
4929,,,,51949.0,,44.90046,-63.84574,2015-10-11,Large rock flung at two hikers visiting Mount...
4930,,,,55604.0,,52.78074,-116.36510,2016-09-06,Two hunters experience prolonged tree shaking...
4931,,,,58445.0,,42.36689,-73.18228,2017-10-30,Camper has an unnerving night on October Moun...
4932,,,,59610.0,,50.81168,-114.78290,2018-06-12,Retired mail carrier reports sundown knocks a...


##### Separate file into location data and observation data

In [33]:
sighting_location = bigfoot_data[['number','county','state','latitude','longitude','geohash', 'date']]
sighting_location.head()

Unnamed: 0,number,county,state,latitude,longitude,geohash,date
0,60.0,Skagit County,Washington,48.64056,-121.8053,c29ksq8pfc,1994-05-13
1,70.0,Washington County,New York,,,,
2,76.0,Baxter County,Arkansas,36.37139,-92.25139,9yqquv0wdy,2001-08-11
3,77.0,Kittitas County,Washington,46.98333,-121.0922,c23gk3p3ep,1983-09-01
4,80.0,Lane County,Oregon,43.69005,-122.3755,9rbbx38bup,1983-08-12


In [34]:
observations = bigfoot_data[['number','date','observation','moreInfo']]
observations.head()

Unnamed: 0,number,date,observation,moreInfo
0,60.0,1994-05-13,"These two gentlmen, brothers, recluses age 50'...",Missing Cattle and large footprints found
1,70.0,,My wife and I were off the trail to take some ...,
2,76.0,2001-08-11,I DID NOT SEE ANYTHING. I DID HEAR A LOT. WH...,Three fisherman hear thrashing about and loud...
3,77.0,1983-09-01,"We had driven into Milk pond, up Chinook pass,...",Couple hear vocalizations while camping at Mi...
4,80.0,1983-08-12,"My veterinarian sister, former girlfriend, and...","Late Arriving Campers Hear Running, Stomping,..."


In [35]:
# create output files
output_file = 'templates/bigfootData.json'
sighting_output = 'templates/sightingsLocation.json'
observation_output = 'templates/observationReports.json'

In [36]:
# write all data to json
bigfootData = bigfoot_data.to_json(output_file, orient='index')
bigfootData

In [37]:
# write sightings data to json
sightings = sighting_location.to_json(sighting_output, orient='index')
sightings

In [38]:
# write sightings data to json
observationReports = observations.to_json(observation_output, orient='index')
observationReports

Create database connection

In [39]:
connection_string = "postgres:postgres@localhost:5432/Bigfoot"
engine = create_engine(f'postgresql://{connection_string}')