## COVID-19 Modeling Pipeline

In [1]:
!bq --location=US mk --dataset covid_19_modeled

Dataset 'cs327e-sp2020:covid_19_modeled' successfully created.


In [2]:
%%bigquery
create or replace table covid_19_modeled.Cases
as select null as id, * from covid_19_staging.Cases

### Compute fingerprint of location fields (state, country) 

In [3]:
%%bigquery
update covid_19_modeled.Cases set id = FARM_FINGERPRINT(country) 
where state is null

#### Task 1: Compute fingerprint of state + country where state is not null

In [4]:
%%bigquery
update covid_19_modeled.Cases set id = FARM_FINGERPRINT(concat(state, country)) 
where state is not null

#### Make sure that id field has no null values

In [5]:
%%bigquery
select count(*) as null_id_count
from covid_19_modeled.Cases
where id is null

Unnamed: 0,null_id_count
0,0


In [6]:
%%bigquery
select id, state, country
from covid_19_modeled.Cases
order by state, country
limit 5

Unnamed: 0,id,state,country
0,8576431891811451300,,Azerbaijan
1,8778414404485170876,,Afghanistan
2,8778414404485170876,,Afghanistan
3,8778414404485170876,,Afghanistan
4,8778414404485170876,,Afghanistan


### Split Cases table

In [27]:
%%bigquery
create or replace table covid_19_modeled.Location_SQL_1
as select distinct id, state, country, latitude, longitude, fips, admin2, combined_key
from covid_19_modeled.Cases

#### Task 2: Get record count from table Location_SQL_1

In [28]:
%%bigquery
select count(*) as location_count from
covid_19_modeled.Location_SQL_1

Unnamed: 0,location_count
0,4431


In [29]:
%%bigquery
create or replace table covid_19_modeled.Event_SQL_1
as select id as location_id, last_update, confirmed, deaths, recovered, active
from covid_19_modeled.Cases

#### Task 3: Get record count from table Event_SQL_1

In [30]:
%%bigquery
select count(*) as event_count from
covid_19_modeled.Event_SQL_1

Unnamed: 0,event_count
0,98669


#### Sample older and recent Event records

In [31]:
%%bigquery
select * from covid_19_modeled.Event_SQL_1
order by last_update
limit 5

Unnamed: 0,location_id,last_update,confirmed,deaths,recovered,active
0,3093811823925351433,1/22/2020 17:00,1.0,,,
1,2544652828731166483,1/22/2020 17:00,1.0,,,
2,400699263222839825,1/22/2020 17:00,,,,
3,-8459520092734636284,1/22/2020 17:00,5.0,,,
4,3061248092517028102,1/22/2020 17:00,2.0,,,


In [32]:
%%bigquery
select * from covid_19_modeled.Event_SQL_1
order by last_update desc
limit 5

Unnamed: 0,location_id,last_update,confirmed,deaths,recovered,active
0,6443493987885756991,4/6/20 9:37,914,4,216,694
1,9155895331965305746,4/6/20 6:20,373,5,57,311
2,-3396123447326000985,4/6/20 5:30,536,6,389,141
3,-6225137598979003815,4/6/20 2:36,139,2,132,5
4,-4927258461359090633,4/6/20 2:21,67803,3212,64014,577


### Option A: Standardize timestamps with SQL

In [33]:
%%bigquery
create or replace table covid_19_modeled.Event_SQL_2 as
select *
from covid_19_modeled.Event_SQL_1
where strpos(last_update, '/') > 0

#### Note: Event_SQL_2 has all the records with '/'

#### Task 4: Get record count for timestamps containing '/' (i.e. mm/dd/yyyy or mm/dd/yy):

In [34]:
%%bigquery
select count(*) as count_ts_slash
from covid_19_modeled.Event_SQL_2

Unnamed: 0,count_ts_slash
0,22335


In [36]:
%%bigquery
create or replace table covid_19_modeled.Event_SQL_3 as
select location_id, cast(last_update as datetime) last_update, confirmed, deaths, recovered, active 
from covid_19_modeled.Event_SQL_1
where strpos(last_update, '-') > 0

#### Note: Event_SQL_3 has all the records with '-'

#### Task 5: Get record count for timestamps containing '-' (i.e. yyyy-mm-dd):

In [37]:
%%bigquery
select count(*) as count_ts_hyphen
from covid_19_modeled.Event_SQL_3

Unnamed: 0,count_ts_hyphen
0,76334


In [38]:
%%bigquery
select last_update, length(split(split(last_update, ' ')[offset(0)], '/')[offset(2)]) as year_length
from
(select distinct last_update
from covid_19_modeled.Event_SQL_2)
limit 12

Unnamed: 0,last_update,year_length
0,2/1/2020 19:53,4
1,1/24/20 17:00,2
2,1/23/20 17:00,2
3,2/1/2020 1:52,4
4,1/28/20 23:00,2
5,1/25/20 17:00,2
6,1/30/20 16:00,2
7,1/27/20 23:59,2
8,1/26/20 16:00,2
9,1/22/2020 17:00,4


In [39]:
%%bigquery
select last_update, parse_datetime('%m/%d/%y %H:%M', last_update) as last_update_datetime
from
(select distinct last_update
from covid_19_modeled.Event_SQL_2)
where length(split(split(last_update, ' ')[offset(0)], '/')[offset(2)]) = 2
limit 12

Unnamed: 0,last_update,last_update_datetime
0,1/24/20 17:00,2020-01-24 17:00:00
1,1/23/20 17:00,2020-01-23 17:00:00
2,1/28/20 23:00,2020-01-28 23:00:00
3,1/25/20 17:00,2020-01-25 17:00:00
4,1/30/20 16:00,2020-01-30 16:00:00
5,1/27/20 23:59,2020-01-27 23:59:00
6,1/26/20 16:00,2020-01-26 16:00:00
7,1/29/20 19:30,2020-01-29 19:30:00
8,4/2/20 23:32,2020-04-02 23:32:00
9,4/6/20 23:28,2020-04-06 23:28:00


In [43]:
%%bigquery
create or replace table covid_19_modeled.Event_SQL_4 as
select location_id, parse_datetime('%m/%d/%y %H:%M', last_update) as last_update, confirmed, deaths, recovered, active
from covid_19_modeled.Event_SQL_2
where length(split(split(last_update, ' ')[offset(0)], '/')[offset(2)]) = 2
union all
select location_id, parse_datetime('%m/%d/%Y %H:%M', last_update) as last_update, confirmed, deaths, recovered, active
from covid_19_modeled.Event_SQL_2
where length(split(split(last_update, ' ')[offset(0)], '/')[offset(2)]) = 4

In [44]:
%%bigquery
select * from
(select distinct last_update 
from covid_19_modeled.Event_SQL_4)
order by last_update
limit 8

Unnamed: 0,last_update
0,2020-01-22 17:00:00
1,2020-01-23 17:00:00
2,2020-01-24 17:00:00
3,2020-01-25 17:00:00
4,2020-01-26 16:00:00
5,2020-01-27 23:59:00
6,2020-01-28 23:00:00
7,2020-01-29 19:30:00


#### Task 6: Make new table Event_SQL_5 as the union of Event_SQL_3 + Event_SQL_4

In [48]:
%%bigquery
create or replace table covid_19_modeled.Event_SQL_5 as
select * from covid_19_modeled.Event_SQL_3
union all
select * from covid_19_modeled.Event_SQL_4

In [49]:
%%bigquery
select count(*) total_event_count
from covid_19_modeled.Event_SQL_5

Unnamed: 0,total_event_count
0,98669


#### Task 7: Remove duplicate events from Event_SQL_5 with select distinct

In [54]:
%%bigquery
create or replace table covid_19_modeled.Event_SQL_5 as
select distinct location_id, last_update, confirmed, deaths, recovered, active from covid_19_modeled.Event_SQL_5

In [55]:
%%bigquery
select count(*) as total_event_count from covid_19_modeled.Event_SQL_5

Unnamed: 0,total_event_count
0,53619


In [56]:
%%bigquery
select location_id, last_update, count(*) as duplicate_events
from covid_19_modeled.Event_SQL_5
group by location_id, last_update
having count(*) > 1
order by count(*) desc
limit 5

Unnamed: 0,location_id,last_update,duplicate_events
0,-5615092215703141091,2020-04-19 23:41:01,120
1,-5615092215703141091,2020-04-20 23:36:47,119
2,-5615092215703141091,2020-04-17 23:30:52,116
3,-5615092215703141091,2020-04-16 23:30:51,116
4,-5615092215703141091,2020-04-18 22:32:47,113


In [57]:
%%bigquery
select * from covid_19_modeled.Event_SQL_5
where location_id = -118082929383066870 and last_update = '2020-03-14 16:53:03'

Unnamed: 0,location_id,last_update,confirmed,deaths,recovered,active
0,-118082929383066870,2020-03-14 16:53:03,13,0,0,
1,-118082929383066870,2020-03-14 16:53:03,7,0,0,


In [58]:
%%bigquery
select *, 
rank() over (partition by location_id, last_update order by confirmed desc) as rank
from covid_19_modeled.Event_SQL_5
where location_id in (-118082929383066870, 5199822387082299175)
limit 30

Unnamed: 0,location_id,last_update,confirmed,deaths,recovered,active,rank
0,-118082929383066870,2020-03-10 02:33:04,4,0,0,,1
1,-118082929383066870,2020-03-11 10:13:20,5,0,0,,1
2,-118082929383066870,2020-03-11 20:00:00,6,0,0,,1
3,-118082929383066870,2020-03-12 21:39:10,6,0,0,,1
4,-118082929383066870,2020-03-14 16:53:03,13,0,0,,1
5,-118082929383066870,2020-03-14 16:53:03,7,0,0,,2
6,-118082929383066870,2020-03-16 22:33:03,17,0,0,,1
7,-118082929383066870,2020-03-17 23:13:10,26,0,0,,1
8,-118082929383066870,2020-03-19 23:43:04,44,0,0,,1
9,-118082929383066870,2020-03-21 00:13:22,55,0,0,,1


In [59]:
%%bigquery
create or replace table covid_19_modeled.Event_SQL_Final as
select location_id, last_update, confirmed, deaths, recovered, active
from
(select *, 
rank() over (partition by location_id, last_update order by confirmed desc) as rank
from covid_19_modeled.Event_SQL_5)
where rank = 1

#### Check primary key constraint on Event_SQL_Final

In [60]:
%%bigquery
select count(*) total_event_count
from covid_19_modeled.Event_SQL_Final

Unnamed: 0,total_event_count
0,13982


In [61]:
%%bigquery
select count(*) distinct_event_count
from
(select distinct location_id, last_update
from covid_19_modeled.Event_SQL_Final) 

Unnamed: 0,distinct_event_count
0,13980


### Option B: Standardize timestamps and remove duplicate event records with Beam

#### Remember to change kernel before running beam script

In [2]:
%run Event_beam.py

  experiments = p.options.view_as(DebugOptions).experiments or []


#### Note: Event_beam.py processes only 100 records and is used for testing and development

In [1]:
%run Event_beam_dataflow.py

  kms_key=transform.kms_key))


#### Check primary key constraint on Event_Beam_DF table

In [3]:
%%bigquery
select count(*) as event_count from covid_19_modeled.Event_Beam_DF

Unnamed: 0,event_count
0,13989


In [4]:
%%bigquery
select count(*) as distinct_event_count from
(select distinct last_update, location_id as distinct_event_count from covid_19_modeled.Event_Beam_DF)

Unnamed: 0,distinct_event_count
0,13980


#### We still have 9 duplicate records. TO DO: debug and fix duplicate bug in Beam

### Check for differences between Event_Beam_DF and Event_SQL_Final

In [6]:
%%bigquery
select *, count(*) as duplicate_records
from
(select * 
from covid_19_modeled.Event_Beam_DF
union all
select * from 
covid_19_modeled.Event_SQL_Final)
group by location_id, last_update, confirmed, deaths, recovered, active
having count(*) != 2
order by location_id, last_update

Unnamed: 0,location_id,last_update,confirmed,deaths,recovered,active,duplicate_records
0,-5471559963534614326,2020-03-29 00:14:00,579,6,570,3,3
1,-4911682739529236434,2020-04-02 00:16:00,254,2,252,0,3
2,-4688529123323169636,2020-03-22 23:45:00,102,2,0,0,1
3,-3161036227036728327,2020-03-18 01:37:00,146,2,144,0,3
4,-502071653873635964,2020-02-23 11:19:00,18,0,18,0,3
5,1660675897502087225,2020-04-18 22:32:47,106,0,0,106,1
6,4313167634023297110,2020-04-02 01:30:00,1019,4,1014,1,3
7,5158096975639556671,2020-03-08 05:31:00,76,3,73,0,3
8,6017228697697032702,2020-03-24 04:19:00,168,6,162,0,3
9,7324441919734091551,2020-03-16 08:47:00,75,0,75,0,3


#### Note: These 10 records are mismatches between Beam and SQL

In [8]:
%%bigquery
(select * 
from covid_19_modeled.Event_Beam_DF
union distinct
select * from 
covid_19_modeled.Event_SQL_Final)
except distinct
(select * from covid_19_modeled.Event_Beam_DF
intersect distinct
select * from covid_19_modeled.Event_SQL_Final)

Unnamed: 0,location_id,last_update,confirmed,deaths,recovered,active
0,-4688529123323169636,2020-03-22 23:45:00,102,2,0,0
1,1660675897502087225,2020-04-18 22:32:47,106,0,0,106


#### Use Event_SQL_Final as final version of Event table

In [7]:
%%bigquery
create or replace table covid_19_modeled.Event as
select * from covid_19_modeled.Event_SQL_Final

### Location table.  

In [2]:
%%bigquery
select * from covid_19_modeled.Location_SQL_1
limit 8

Unnamed: 0,id,state,country,latitude,longitude,fips,admin2,combined_key
0,7301308919198672302,,UK,55.0,-3.0,,,
1,7301308919198672302,,UK,,,,,
2,8573415270917882368,"Los Angeles, CA",US,,,,,
3,8573415270917882368,"Los Angeles, CA",US,34.0522,-118.2437,,,
4,-2512886450838328062,"Lackland, TX",US,,,,,
5,-1661025486387849213,"Wake County, NC",US,35.8032,-78.5661,,,
6,1889376381529737478,"Norfolk County, MA",US,42.1767,-71.1449,,,
7,8276317383438951174,"Kershaw County, SC",US,34.3672,-80.5883,,,


In [8]:
%%bigquery
select count(*) location_count
from covid_19_modeled.Location_SQL_1

Unnamed: 0,location_count
0,4431


### Option A: Standardize city, state with SQL

In [9]:
%%bigquery
select id, state, strpos(state, ',') as index, country, latitude, longitude, fips, admin2, combined_key
from covid_19_modeled.Location_SQL_1
where strpos(state, ',') > 0
limit 6

Unnamed: 0,id,state,index,country,latitude,longitude,fips,admin2,combined_key
0,8573415270917882368,"Los Angeles, CA",12,US,,,,,
1,8573415270917882368,"Los Angeles, CA",12,US,34.0522,-118.2437,,,
2,-2512886450838328062,"Lackland, TX",9,US,,,,,
3,-1661025486387849213,"Wake County, NC",12,US,35.8032,-78.5661,,,
4,1889376381529737478,"Norfolk County, MA",15,US,42.1767,-71.1449,,,
5,8276317383438951174,"Kershaw County, SC",15,US,34.3672,-80.5883,,,


In [10]:
%%bigquery
select state as orig_state, split(state, ',')[offset(0)] parsed_state, split(state, ',')[offset(1)] parsed_city,
from covid_19_modeled.Location_SQL_1
where strpos(state, ',') > 0
limit 10

Unnamed: 0,orig_state,parsed_state,parsed_city
0,"Los Angeles, CA",Los Angeles,CA
1,"Los Angeles, CA",Los Angeles,CA
2,"Lackland, TX",Lackland,TX
3,"Wake County, NC",Wake County,NC
4,"Norfolk County, MA",Norfolk County,MA
5,"Kershaw County, SC",Kershaw County,SC
6,"Orange County, CA",Orange County,CA
7,"Shasta County, CA",Shasta County,CA
8,"Pinal County, AZ",Pinal County,AZ
9,"Norwell County, MA",Norwell County,MA


In [11]:
%%bigquery
create or replace table covid_19_modeled.Location_SQL_2 as
select id, split(state, ',')[offset(0)] city, split(state, ',')[offset(1)] state, country, latitude, longitude, 
fips, admin2, combined_key
from covid_19_modeled.Location_SQL_1
where strpos(state, ',') > 0

In [12]:
%%bigquery
select count(*) as city_state_location_count
from covid_19_modeled.Location_SQL_2

Unnamed: 0,city_state_location_count
0,162


In [15]:
%%bigquery
select id, state, cast(null as string) as city, country, latitude, longitude, fips, admin2, combined_key
from covid_19_modeled.Location_SQL_1
where strpos(state, ',') = 0
limit 6

Unnamed: 0,id,state,city,country,latitude,longitude,fips,admin2,combined_key
0,-8514889717648639735,Maine,,US,43.95415864,-69.85074999,23023,Sagadahoc,"Sagadahoc, Maine, US"
1,-8514889717648639735,Maine,,US,45.83839062,-69.2860223,23021,Piscataquis,"Piscataquis, Maine, US"
2,-8514889717648639735,Maine,,US,44.1664747,-70.20380627,23001,Androscoggin,"Androscoggin, Maine, US"
3,-8514889717648639735,Maine,,US,44.05996956,-69.54227124,23015,Lincoln,"Lincoln, Maine, US"
4,-8514889717648639735,Maine,,US,43.8370751,-70.37226999,23005,Cumberland,"Cumberland, Maine, US"
5,-8514889717648639735,Maine,,US,44.4858305,-69.12061935,23027,Waldo,"Waldo, Maine, US"


In [16]:
%%bigquery
create or replace table covid_19_modeled.Location_SQL_3 as
select id, state, cast(null as string) as city, country, latitude, longitude, fips, admin2, combined_key
from covid_19_modeled.Location_SQL_1
where strpos(state, ',') = 0

In [17]:
%%bigquery
select count(*) as state_location_count
from covid_19_modeled.Location_SQL_2

Unnamed: 0,state_location_count
0,162


In [18]:
%%bigquery
select count(*) null_state_location_count
from covid_19_modeled.Location_SQL_1
where state is null

Unnamed: 0,null_state_location_count
0,491


#### Task 9: Create table Location_SQL_4 based on Location_SQL_2 + Location_SQL_3 + null states

In [19]:
%%bigquery
create or replace table covid_19_modeled.Location_SQL_4 as
(select id, state, city, country, latitude, longitude, fips, admin2, combined_key 
 from covid_19_modeled.Location_SQL_2
union all
select id, state, city, country, latitude, longitude, fips, admin2, combined_key 
 from covid_19_modeled.Location_SQL_3
union all
select id, state, null as city, country, latitude, longitude, fips, admin2, combined_key 
 from covid_19_modeled.Location_SQL_1
 where state is null
)


In [20]:
%%bigquery
select count(*) as location_count
from covid_19_modeled.Location_SQL_4

Unnamed: 0,location_count
0,4431


#### Check primary key constraint on Location table

In [9]:
%%bigquery
select count(distinct id) as distinct_location_count from covid_19_modeled.Location_SQL_4

Unnamed: 0,distinct_location_count
0,564


#### Remove duplicate location records with SQL

In [15]:
%%bigquery
select *, 
rank() over (partition by id order by combined_key desc) as rank
from covid_19_modeled.Location_SQL_4
where id in (-823610271364515484, -5175654635300698393, 7752613693761280071)

Unnamed: 0,id,state,city,country,latitude,longitude,fips,admin2,combined_key,rank
0,-5175654635300698393,,,Nepal,28.1667,84.25,,,Nepal,1
1,-5175654635300698393,,,Nepal,28.1667,84.25,,,,2
2,-5175654635300698393,,,Nepal,28.3949,84.124,,,,2
3,-5175654635300698393,,,Nepal,,,,,,2
4,-823610271364515484,,,Iran,32.427908,53.688046,,,Iran,1
5,-823610271364515484,,,Iran,,,,,,2
6,-823610271364515484,,,Iran,32.0,53.0,,,,2
7,-823610271364515484,,,Iran,32.4279,53.688,,,,2
8,7752613693761280071,,,Qatar,25.3548,51.1839,,,Qatar,1
9,7752613693761280071,,,Qatar,,,,,,2


In [19]:
%%bigquery
create or replace table covid_19_modeled.Location_SQL_5 as
select id, city, state, country, latitude, longitude, fips, admin2, combined_key from
(select *, 
rank() over (partition by id order by combined_key desc) as rank
from covid_19_modeled.Location_SQL_4)
where rank = 1

In [20]:
%%bigquery
create or replace table covid_19_modeled.Location_SQL_Final as
select id, city, state, country, latitude, longitude, fips, admin2, combined_key from
(select *, 
rank() over (partition by id order by latitude desc, longitude desc) as rank
from covid_19_modeled.Location_SQL_5)
where rank = 1

#### Recheck primary key on Location_SQL_Final

In [21]:
%%bigquery
select count(*) as location_count from covid_19_modeled.Location_SQL_Final

Unnamed: 0,location_count
0,570


In [22]:
%%bigquery
select count(distinct id) as distinct_location_count from covid_19_modeled.Location_SQL_Final

Unnamed: 0,distinct_location_count
0,564


#### Note: still have 6 duplicate records. TO DO: debug and fix duplicate record bug

### Option B: Remove duplicate location records with Beam 

In [3]:
%run Location_beam.py

#### Note: Location_Beam only process 100 records and is used for testing and development

In [2]:
%run Location_beam_dataflow.py

  kms_key=transform.kms_key))


### Check primary key on the Location_Beam_DF table

In [3]:
%%bigquery
select count(*) as total_location_count from covid_19_modeled.Location_Beam_DF

Unnamed: 0,total_location_count
0,564


In [4]:
%%bigquery
select count(distinct id) as distinct_location_count from covid_19_modeled.Location_Beam_DF

Unnamed: 0,distinct_location_count
0,564


### Check foreign key (location_id on Event_Beam_DF)

In [5]:
%%bigquery
select count(*) as foreign_key_violations
from covid_19_modeled.Event e left join covid_19_modeled.Location_Beam_DF l on e.location_id = l.id
where l.id is null

Unnamed: 0,foreign_key_violations
0,0


In [6]:
%%bigquery
select count(*) as foreign_key_violations
from covid_19_modeled.Event e left join covid_19_modeled.Location_SQL_Final l on e.location_id = l.id
where l.id is null

Unnamed: 0,foreign_key_violations
0,0


### Check for deltas between Location_Beam_DF and Location_SQL_Final 

In [7]:
%%bigquery
select *, count(*) as count
from
(select *
from covid_19_modeled.Location_SQL_Final
union all
select *
from covid_19_modeled.Location_Beam_DF)
group by id, city, state, country, latitude, longitude, fips, admin2, combined_key
having count(*) != 2
order by id

Unnamed: 0,id,city,state,country,latitude,longitude,fips,admin2,combined_key,count
0,-9159398279919354893,,Massachusetts,US,42.35026951,-71.90493363,25027.0,Worcester,"Worcester, Massachusetts, US",1
1,-9159398279919354893,,Massachusetts,US,42.6687626,-70.94687179,25009.0,Essex,"Essex, Massachusetts, US",1
2,-9129809244117411434,,Kentucky,US,38.04178222,-84.7417556,21239.0,Woodford,"Woodford, Kentucky, US",1
3,-9129809244117411434,,Kentucky,US,38.97065142,-84.72615357,21015.0,Boone,"Boone, Kentucky, US",1
4,-9014299851662934226,,,Barbados,13.1939,-59.5432,,,Barbados,1
...,...,...,...,...,...,...,...,...,...,...
375,8902200129014313670,,Maryland,US,39.62357628,-78.69280486,24001.0,Allegany,"Allegany, Maryland, US",1
376,9096347281618933621,,,Morocco,31.7917,-7.0926,,,Morocco,1
377,9096347281618933621,,,Morocco,31.7917,-7.0926,,,,1
378,9155895331965305746,,,Taiwan*,23.7,121,,,Taiwan*,1


In [11]:
%%bigquery
select *, count(*) as count
from
(select *
from covid_19_modeled.Location_SQL_Final 
union all
select *
from covid_19_modeled.Location_Beam_DF)
group by id, city, state, country, latitude, longitude, fips, admin2, combined_key
having count(*) != 2
order by id

Unnamed: 0,id,city,state,country,latitude,longitude,fips,admin2,combined_key,count
0,-9159398279919354893,,Massachusetts,US,42.6687626,-70.94687179,25009.0,Essex,"Essex, Massachusetts, US",1
1,-9159398279919354893,,Massachusetts,US,42.35026951,-71.90493363,25027.0,Worcester,"Worcester, Massachusetts, US",1
2,-9129809244117411434,,Kentucky,US,38.97065142,-84.72615357,21015.0,Boone,"Boone, Kentucky, US",1
3,-9129809244117411434,,Kentucky,US,38.04178222,-84.7417556,21239.0,Woodford,"Woodford, Kentucky, US",1
4,-9014299851662934226,,,Barbados,13.1939,-59.5432,,,,1
...,...,...,...,...,...,...,...,...,...,...
375,8902200129014313670,,Maryland,US,38.21274277,-75.33200012,24047.0,Worcester,"Worcester, Maryland, US",1
376,9096347281618933621,,,Morocco,31.7917,-7.0926,,,,1
377,9096347281618933621,,,Morocco,31.7917,-7.0926,,,Morocco,1
378,9155895331965305746,,,Taiwan*,23.7,121,,,,1


In [12]:
%%bigquery
(select * 
from covid_19_modeled.Location_Beam_DF
union distinct
select * from 
covid_19_modeled.Location_SQL_Final)
except distinct
(select * from covid_19_modeled.Location_Beam_DF
intersect distinct
select * from covid_19_modeled.Location_SQL_Final)

Unnamed: 0,id,city,state,country,latitude,longitude,fips,admin2,combined_key
0,-391570144277816991,,Tennessee,US,36.15496772,-86.29775884,47189.0,Wilson,"Wilson, Tennessee, US"
1,-40751580946174479,,,Ecuador,-1.8312,-78.1834,,,Ecuador
2,3966931624883821615,,Virgin Islands,US,18.3358,-64.8963,78.0,,"Virgin Islands, US"
3,-7671506513443445254,,Victoria,Australia,-37.8136,144.9631,,,"Victoria, Australia"
4,-6364447854744233741,,,Bulgaria,42.7339,25.4858,,,Bulgaria
...,...,...,...,...,...,...,...,...,...
375,-6843305877291741994,,Gibraltar,United Kingdom,36.1408,-5.3536,,,
376,6828782105295939729,,Montserrat,United Kingdom,16.7425,-62.1874,,,
377,3971402856878482589,,Isle of Man,United Kingdom,54.2361,-4.5481,,,
378,-2198268741802906303,,Cayman Islands,United Kingdom,19.3133,-81.2546,,,


#### Note: found 380 deltas between SQL and Beam tables

#### Task 9: Create Location table from Location_Beam_DF (choose Location_Beam_DF over Location_SQL_Final because Location_SQL_Final still has a few duplicate records)

In [13]:
%%bigquery
create table covid_19_modeled.Location
as select * 
from covid_19_modeled.Location_Beam_DF


### Done modeling Event and Location tables. 