# COVID19 Notebook
website: `https://github.com/CSSEGISandData/COVID-19`

Make sure HDFS is running. See instruction on `cisc-525-util` project repository.

```bash
cd ~/data
hdfs dfs -mkdir -p /user/student
hdfs dfs -copyFromLocal COVID-19 /user/student

jupyter notebook
```

*** 14 June 2021: Good to demonstrate ***

In [1]:
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc
import subprocess
from pyspark.sql.functions import col, max as max_
import datetime

In [2]:
spark = SparkSession.builder.appName("covid19-app").config("spark.config.option", "value").getOrCreate()
scfg = SparkConf().setAppName('covid19-app')


In [3]:
student_dir   = 'hdfs://localhost:9000/user/student'
covid_19_path = f'{student_dir}'
csse_data     = f'{covid_19_path}/csse_covid_19_data'
csse_daily    = f'{csse_data}/csse_covid_19_daily_reports'
csse_daily_us = f'{csse_data}/csse_covid_19_daily_reports_us'
csse_ts       = f'{csse_data}/csse_covid_19_time_series'

day = f'{csse_daily}/05-15-2020.csv'
ts_confirmed_us = f'{csse_ts}/time_series_covid19_confirmed_US.csv'

In [4]:
ts_df = spark.read.option('header', 'true').csv(ts_confirmed_us)
day_df = spark.read.option('header', 'true').csv(day)

## RDD Section

In [5]:
# Converting from a dataframe df to a resilient distributed data rdd
ts_rdd = ts_df.rdd

In [6]:
# Data Column number

# UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,
# 1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,2/1/20,
# 2/2/20,2/3/20,2/4/20,2/5/20,2/6/20,2/7/20,2/8/20,2/9/20,2/10/20,2/11/20,2/12/20,2/13/20,
# 2/14/20,2/15/20,2/16/20,2/17/20,2/18/20,2/19/20,2/20/20,2/21/20,2/22/20,2/23/20,2/24/20,
# 2/25/20,2/26/20,2/27/20,2/28/20,2/29/20,3/1/20,3/2/20,3/3/20,3/4/20,3/5/20,3/6/20,3/7/20,
# 3/8/20,3/9/20,3/10/20,3/11/20,3/12/20,3/13/20,3/14/20,3/15/20,3/16/20,3/17/20,3/18/20,
# 3/19/20,3/20/20,3/21/20,3/22/20,3/23/20,3/24/20,3/25/20,3/26/20,3/27/20,3/28/20,3/29/20,
# 3/30/20,3/31/20,4/1/20,4/2/20,4/3/20,4/4/20,4/5/20,4/6/20,4/7/20,4/8/20,4/9/20,4/10/20,
# 4/11/20,4/12/20,4/13/20,4/14/20,4/15/20,4/16/20,4/17/20,4/18/20,4/19/20,4/20/20,4/21/20,
# 4/22/20,4/23/20,4/24/20,4/25/20,4/26/20,4/27/20,4/28/20,4/29/20,4/30/20,5/1/20,5/2/20,
# 5/3/20,5/4/20,5/5/20,5/6/20,5/7/20,5/8/20,5/9/20,5/10/20,5/11/20,5/12/20,5/13/20,5/14/20,
# 5/15/20

TS_COLUMNS = ['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Province_State',
              'Country_Region', 'Lat', 'Long_', 'Combined_Key']

TS_DATE_START_COLUMN = 11

ts_first_row = ts_rdd.first()
ts_first_row


Row(UID='84001001', iso2='US', iso3='USA', code3='840', FIPS='1001.0', Admin2='Autauga', Province_State='Alabama', Country_Region='US', Lat='32.53952745', Long_='-86.64408227', Combined_Key='Autauga, Alabama, US', 1/22/20='0', 1/23/20='0', 1/24/20='0', 1/25/20='0', 1/26/20='0', 1/27/20='0', 1/28/20='0', 1/29/20='0', 1/30/20='0', 1/31/20='0', 2/1/20='0', 2/2/20='0', 2/3/20='0', 2/4/20='0', 2/5/20='0', 2/6/20='0', 2/7/20='0', 2/8/20='0', 2/9/20='0', 2/10/20='0', 2/11/20='0', 2/12/20='0', 2/13/20='0', 2/14/20='0', 2/15/20='0', 2/16/20='0', 2/17/20='0', 2/18/20='0', 2/19/20='0', 2/20/20='0', 2/21/20='0', 2/22/20='0', 2/23/20='0', 2/24/20='0', 2/25/20='0', 2/26/20='0', 2/27/20='0', 2/28/20='0', 2/29/20='0', 3/1/20='0', 3/2/20='0', 3/3/20='0', 3/4/20='0', 3/5/20='0', 3/6/20='0', 3/7/20='0', 3/8/20='0', 3/9/20='0', 3/10/20='0', 3/11/20='0', 3/12/20='0', 3/13/20='0', 3/14/20='0', 3/15/20='0', 3/16/20='0', 3/17/20='0', 3/18/20='0', 3/19/20='0', 3/20/20='0', 3/21/20='0', 3/22/20='0', 3/23/20='0'

In [7]:
# Iterating through the list of values of the first row
for val in ts_first_row:
    print(val)


84001001
US
USA
840
1001.0
Autauga
Alabama
US
32.53952745
-86.64408227
Autauga, Alabama, US
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
5
6
6
6
6
8
8
10
12
12
12
12
12
12
12
17
18
19
19
19
23
24
24
24
25
26
28
30
32
33
36
36
36
37
39
41
42
43
47
51
54
54
56
58
62
63
72
81
88
90
100
100
108
118
124
130
135
148
151
156
160
171
191
192
204
211
216
227
237
239
241
248
259
264
271
282
295
315
323
334
361
369
371
377
404
415
435
438
447
458
474
480
493
500
505
528
538
554
562
570
584
601
620
623
656
663
670
691
708
736
749
764
785
797
822
850
862
872
885
905
918
939
953
971
988
995
1006
1029
1042
1064
1078
1086
1086
1109
1126
1145
1175
1186
1200
1224
1229
1235
1245
1252
1258
1276
1281
1293
1304
1316
1318
1337
1343
1357
1365
1375
1391
1424
1429
1440
1442
1454
1462
1474
1477
1488
1494
1505
1526
1530
1543
1551
1567
1586
1601
1614
1650
1659
1675
1676
1697
1697
1711
1736
1750
1758
1770
1776
1785
1792
1799
1812
1821


In [8]:
# Iterate through the list of values of first row using column name with ROW data type
for key in TS_COLUMNS:
    print(key, '=', ts_first_row[key])
    
# ts_by_dates = ts_first[TS_DATE_START_COLUMN:]
# for ts_by_date in ts_by_dates:
#     print(ts_by_date)

UID = 84001001
iso2 = US
iso3 = USA
code3 = 840
FIPS = 1001.0
Admin2 = Autauga
Province_State = Alabama
Country_Region = US
Lat = 32.53952745
Long_ = -86.64408227
Combined_Key = Autauga, Alabama, US


In [9]:
# Something about date and time
start_date = datetime.date(2020, 1, 22)
print (start_date.strftime('%m/%d/%y'))
start_date += datetime.timedelta(days=1)
cur_date_str = '{}/{}/{}'.format(start_date.month, start_date.day, start_date.year-2000)
print (start_date.strftime('%0m/%d/%y'))
print(cur_date_str)
print(cur_date_str == '1/23/20')

cur_date = datetime.date(2020, 1, 22)
DATE_COLUMNS = []
while True:
    cur_date += datetime.timedelta(days=1)
    cur_date_str = '{}/{}/{}'.format(cur_date.month, cur_date.day, cur_date.year-2000)
    print(cur_date_str)
    DATE_COLUMNS.append(cur_date_str)
    if cur_date_str == '6/7/20':
        break

for date_str in DATE_COLUMNS:
    print(date_str)

01/22/20
01/23/20
1/23/20
True
1/23/20
1/24/20
1/25/20
1/26/20
1/27/20
1/28/20
1/29/20
1/30/20
1/31/20
2/1/20
2/2/20
2/3/20
2/4/20
2/5/20
2/6/20
2/7/20
2/8/20
2/9/20
2/10/20
2/11/20
2/12/20
2/13/20
2/14/20
2/15/20
2/16/20
2/17/20
2/18/20
2/19/20
2/20/20
2/21/20
2/22/20
2/23/20
2/24/20
2/25/20
2/26/20
2/27/20
2/28/20
2/29/20
3/1/20
3/2/20
3/3/20
3/4/20
3/5/20
3/6/20
3/7/20
3/8/20
3/9/20
3/10/20
3/11/20
3/12/20
3/13/20
3/14/20
3/15/20
3/16/20
3/17/20
3/18/20
3/19/20
3/20/20
3/21/20
3/22/20
3/23/20
3/24/20
3/25/20
3/26/20
3/27/20
3/28/20
3/29/20
3/30/20
3/31/20
4/1/20
4/2/20
4/3/20
4/4/20
4/5/20
4/6/20
4/7/20
4/8/20
4/9/20
4/10/20
4/11/20
4/12/20
4/13/20
4/14/20
4/15/20
4/16/20
4/17/20
4/18/20
4/19/20
4/20/20
4/21/20
4/22/20
4/23/20
4/24/20
4/25/20
4/26/20
4/27/20
4/28/20
4/29/20
4/30/20
5/1/20
5/2/20
5/3/20
5/4/20
5/5/20
5/6/20
5/7/20
5/8/20
5/9/20
5/10/20
5/11/20
5/12/20
5/13/20
5/14/20
5/15/20
5/16/20
5/17/20
5/18/20
5/19/20
5/20/20
5/21/20
5/22/20
5/23/20
5/24/20
5/25/20
5/26/20
5/27/

In [10]:
# Printing out the content of the values by date columns. 
# Date columns are to extend over time.

start_date = datetime.date(2020, 1, 22)
ts_first = ts_rdd.first();
ts_by_dates = ts_first[TS_DATE_START_COLUMN:]

# for ts_by_date in ts_by_dates:
#     print(ts_by_date)
    
for date_str in DATE_COLUMNS:
    print(date_str, '=', ts_first[date_str])

1/23/20 = 0
1/24/20 = 0
1/25/20 = 0
1/26/20 = 0
1/27/20 = 0
1/28/20 = 0
1/29/20 = 0
1/30/20 = 0
1/31/20 = 0
2/1/20 = 0
2/2/20 = 0
2/3/20 = 0
2/4/20 = 0
2/5/20 = 0
2/6/20 = 0
2/7/20 = 0
2/8/20 = 0
2/9/20 = 0
2/10/20 = 0
2/11/20 = 0
2/12/20 = 0
2/13/20 = 0
2/14/20 = 0
2/15/20 = 0
2/16/20 = 0
2/17/20 = 0
2/18/20 = 0
2/19/20 = 0
2/20/20 = 0
2/21/20 = 0
2/22/20 = 0
2/23/20 = 0
2/24/20 = 0
2/25/20 = 0
2/26/20 = 0
2/27/20 = 0
2/28/20 = 0
2/29/20 = 0
3/1/20 = 0
3/2/20 = 0
3/3/20 = 0
3/4/20 = 0
3/5/20 = 0
3/6/20 = 0
3/7/20 = 0
3/8/20 = 0
3/9/20 = 0
3/10/20 = 0
3/11/20 = 0
3/12/20 = 0
3/13/20 = 0
3/14/20 = 0
3/15/20 = 0
3/16/20 = 0
3/17/20 = 0
3/18/20 = 0
3/19/20 = 0
3/20/20 = 0
3/21/20 = 0
3/22/20 = 0
3/23/20 = 0
3/24/20 = 1
3/25/20 = 5
3/26/20 = 6
3/27/20 = 6
3/28/20 = 6
3/29/20 = 6
3/30/20 = 8
3/31/20 = 8
4/1/20 = 10
4/2/20 = 12
4/3/20 = 12
4/4/20 = 12
4/5/20 = 12
4/6/20 = 12
4/7/20 = 12
4/8/20 = 12
4/9/20 = 17
4/10/20 = 18
4/11/20 = 19
4/12/20 = 19
4/13/20 = 19
4/14/20 = 23
4/15/20 = 24
4/16

### Group By Province or State


In [11]:
# When we group by provice or state, we get a list of nodes. each node
# consists of the key (name of the state) and a list of da rows for each 
# of the states.

ts_states = ts_rdd.groupBy(lambda x: x['Province_State'])
# dir(ts_states)
sorted_by_states = ts_states.sortByKey('Province_State')
# print(sorted_by_states)

sorted_by_states.collect()

for row in sorted_by_states.collect():
    print(row[0], len(row[1]))

Alabama 69
Alaska 33
American Samoa 1
Arizona 17
Arkansas 77
California 60
Colorado 66
Connecticut 10
Delaware 5
Diamond Princess 1
District of Columbia 3
Florida 69
Georgia 161
Grand Princess 1
Guam 1
Hawaii 7
Idaho 46
Illinois 104
Indiana 94
Iowa 101
Kansas 107
Kentucky 122
Louisiana 66
Maine 18
Maryland 26
Massachusetts 17
Michigan 87
Minnesota 89
Mississippi 84
Missouri 118
Montana 58
Nebraska 95
Nevada 19
New Hampshire 12
New Jersey 23
New Mexico 35
New York 64
North Carolina 102
North Dakota 55
Northern Mariana Islands 1
Ohio 90
Oklahoma 79
Oregon 38
Pennsylvania 69
Puerto Rico 80
Rhode Island 7
South Carolina 48
South Dakota 68
Tennessee 97
Texas 256
Utah 37
Vermont 16
Virgin Islands 1
Virginia 135
Washington 41
West Virginia 57
Wisconsin 74
Wyoming 25


In [12]:

print(len(ts_states.collect()))
for state in sorted(ts_states.collect()):
    print(state[0], len(state[1]))
    for item in state[1]:
        print ('\t', item['Admin2'])

58
Alabama 69
	 Autauga
	 Baldwin
	 Barbour
	 Bibb
	 Blount
	 Bullock
	 Butler
	 Calhoun
	 Chambers
	 Cherokee
	 Chilton
	 Choctaw
	 Clarke
	 Clay
	 Cleburne
	 Coffee
	 Colbert
	 Conecuh
	 Coosa
	 Covington
	 Crenshaw
	 Cullman
	 Dale
	 Dallas
	 DeKalb
	 Elmore
	 Escambia
	 Etowah
	 Fayette
	 Franklin
	 Geneva
	 Greene
	 Hale
	 Henry
	 Houston
	 Jackson
	 Jefferson
	 Lamar
	 Lauderdale
	 Lawrence
	 Lee
	 Limestone
	 Lowndes
	 Macon
	 Madison
	 Marengo
	 Marion
	 Marshall
	 Mobile
	 Monroe
	 Montgomery
	 Morgan
	 Out of AL
	 Perry
	 Pickens
	 Pike
	 Randolph
	 Russell
	 Shelby
	 St. Clair
	 Sumter
	 Talladega
	 Tallapoosa
	 Tuscaloosa
	 Unassigned
	 Walker
	 Washington
	 Wilcox
	 Winston
Alaska 33
	 Aleutians East
	 Aleutians West
	 Anchorage
	 Bethel
	 Bristol Bay
	 Bristol Bay plus Lake and Peninsula
	 Chugach
	 Copper River
	 Denali
	 Dillingham
	 Fairbanks North Star
	 Haines
	 Hoonah-Angoon
	 Juneau
	 Kenai Peninsula
	 Ketchikan Gateway
	 Kodiak Island
	 Kusilvak
	 Matanuska-Susitn

	 Hidalgo
	 Lea
	 Lincoln
	 Los Alamos
	 Luna
	 McKinley
	 Mora
	 Otero
	 Out of NM
	 Quay
	 Rio Arriba
	 Roosevelt
	 San Juan
	 San Miguel
	 Sandoval
	 Santa Fe
	 Sierra
	 Socorro
	 Taos
	 Torrance
	 Unassigned
	 Union
	 Valencia
New York 64
	 Albany
	 Allegany
	 Bronx
	 Broome
	 Cattaraugus
	 Cayuga
	 Chautauqua
	 Chemung
	 Chenango
	 Clinton
	 Columbia
	 Cortland
	 Delaware
	 Dutchess
	 Erie
	 Essex
	 Franklin
	 Fulton
	 Genesee
	 Greene
	 Hamilton
	 Herkimer
	 Jefferson
	 Kings
	 Lewis
	 Livingston
	 Madison
	 Monroe
	 Montgomery
	 Nassau
	 New York
	 Niagara
	 Oneida
	 Onondaga
	 Ontario
	 Orange
	 Orleans
	 Oswego
	 Otsego
	 Out of NY
	 Putnam
	 Queens
	 Rensselaer
	 Richmond
	 Rockland
	 Saratoga
	 Schenectady
	 Schoharie
	 Schuyler
	 Seneca
	 St. Lawrence
	 Steuben
	 Suffolk
	 Sullivan
	 Tioga
	 Tompkins
	 Ulster
	 Unassigned
	 Warren
	 Washington
	 Wayne
	 Westchester
	 Wyoming
	 Yates
North Carolina 102
	 Alamance
	 Alexander
	 Alleghany
	 Anson
	 Ashe
	 Avery
	 Beaufort
	 Be

In [13]:
# ts_filtered = ts_rdd.filter(lambda x: x['Admin2'] != None)
ts_admin2s = ts_rdd.groupBy(lambda x: x['Admin2'])
# ts_states
ts_admin2s = ts_admin2s.filter(lambda x: x[0] != None)
# print(ts_admin2s.collect())
print(len(ts_admin2s.collect()))
for admin2 in sorted(ts_admin2s.collect()):
    print(admin2[0], len(admin2[1]))


1980
Abbeville 1
Acadia 1
Accomack 1
Ada 1
Adair 4
Adams 12
Addison 1
Adjuntas 1
Aguada 1
Aguadilla 1
Aguas Buenas 1
Aibonito 1
Aiken 1
Aitkin 1
Alachua 1
Alamance 1
Alameda 1
Alamosa 1
Albany 2
Albemarle 1
Alcona 1
Alcorn 1
Aleutians East 1
Aleutians West 1
Alexander 2
Alexandria 1
Alfalfa 1
Alger 1
Allamakee 1
Allegan 1
Allegany 2
Alleghany 2
Allegheny 1
Allen 5
Allendale 1
Alpena 1
Alpine 1
Amador 1
Amelia 1
Amherst 1
Amite 1
Anasco 1
Anchorage 1
Anderson 5
Andrew 1
Andrews 1
Androscoggin 1
Angelina 1
Anne Arundel 1
Anoka 1
Anson 1
Antelope 1
Antrim 1
Apache 1
Appanoose 1
Appling 1
Appomattox 1
Aransas 1
Arapahoe 1
Archer 1
Archuleta 1
Arecibo 1
Arenac 1
Arkansas 1
Arlington 1
Armstrong 2
Aroostook 1
Arroyo 1
Arthur 1
Ascension 1
Ashe 1
Ashland 2
Ashley 1
Ashtabula 1
Asotin 1
Assumption 1
Atascosa 1
Atchison 2
Athens 1
Atkinson 1
Atlantic 1
Atoka 1
Attala 1
Audrain 1
Audubon 1
Auglaize 1
Augusta 1
Aurora 1
Austin 1
Autauga 1
Avery 1
Avoyelles 1
Baca 1
Bacon 1
Bailey 1
Baker 3
Baldwi

## Dataframe

In [14]:
ts_df

DataFrame[UID: string, iso2: string, iso3: string, code3: string, FIPS: string, Admin2: string, Province_State: string, Country_Region: string, Lat: string, Long_: string, Combined_Key: string, 1/22/20: string, 1/23/20: string, 1/24/20: string, 1/25/20: string, 1/26/20: string, 1/27/20: string, 1/28/20: string, 1/29/20: string, 1/30/20: string, 1/31/20: string, 2/1/20: string, 2/2/20: string, 2/3/20: string, 2/4/20: string, 2/5/20: string, 2/6/20: string, 2/7/20: string, 2/8/20: string, 2/9/20: string, 2/10/20: string, 2/11/20: string, 2/12/20: string, 2/13/20: string, 2/14/20: string, 2/15/20: string, 2/16/20: string, 2/17/20: string, 2/18/20: string, 2/19/20: string, 2/20/20: string, 2/21/20: string, 2/22/20: string, 2/23/20: string, 2/24/20: string, 2/25/20: string, 2/26/20: string, 2/27/20: string, 2/28/20: string, 2/29/20: string, 3/1/20: string, 3/2/20: string, 3/3/20: string, 3/4/20: string, 3/5/20: string, 3/6/20: string, 3/7/20: string, 3/8/20: string, 3/9/20: string, 3/10/20:

In [15]:
dir(ts_df)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_collectAsArrow',
 '_jcols',
 '_jdf',
 '_jmap',
 '_jseq',
 '_lazy_rdd',
 '_repr_html_',
 '_sc',
 '_schema',
 '_sort_cols',
 '_support_repr_html',
 'agg',
 'alias',
 'approxQuantile',
 'cache',
 'checkpoint',
 'coalesce',
 'colRegex',
 'collect',
 'columns',
 'corr',
 'count',
 'cov',
 'createGlobalTempView',
 'createOrReplaceGlobalTempView',
 'createOrReplaceTempView',
 'createTempView',
 'crossJoin',
 'crosstab',
 'cube',
 'describe',
 'distinct',
 'drop',
 'dropDuplicates',
 'drop_duplicates',
 'dropna',
 'dtypes',
 'exceptAll',
 'explain',
 'fillna',
 'filter',
 'first',
 'foreach',
 'f

In [16]:
ts_df.count()

3342

In [17]:
ts_df.first()['UID']

'84001001'

In [18]:
out = ts_df.groupBy('Province_State').count().orderBy('Province_State')
out.collect()


[Row(Province_State='Alabama', count=69),
 Row(Province_State='Alaska', count=33),
 Row(Province_State='American Samoa', count=1),
 Row(Province_State='Arizona', count=17),
 Row(Province_State='Arkansas', count=77),
 Row(Province_State='California', count=60),
 Row(Province_State='Colorado', count=66),
 Row(Province_State='Connecticut', count=10),
 Row(Province_State='Delaware', count=5),
 Row(Province_State='Diamond Princess', count=1),
 Row(Province_State='District of Columbia', count=3),
 Row(Province_State='Florida', count=69),
 Row(Province_State='Georgia', count=161),
 Row(Province_State='Grand Princess', count=1),
 Row(Province_State='Guam', count=1),
 Row(Province_State='Hawaii', count=7),
 Row(Province_State='Idaho', count=46),
 Row(Province_State='Illinois', count=104),
 Row(Province_State='Indiana', count=94),
 Row(Province_State='Iowa', count=101),
 Row(Province_State='Kansas', count=107),
 Row(Province_State='Kentucky', count=122),
 Row(Province_State='Louisiana', count=66

In [19]:
ts_df.first()['Province_State']

'Alabama'

In [20]:
ts_df.dtypes

[('UID', 'string'),
 ('iso2', 'string'),
 ('iso3', 'string'),
 ('code3', 'string'),
 ('FIPS', 'string'),
 ('Admin2', 'string'),
 ('Province_State', 'string'),
 ('Country_Region', 'string'),
 ('Lat', 'string'),
 ('Long_', 'string'),
 ('Combined_Key', 'string'),
 ('1/22/20', 'string'),
 ('1/23/20', 'string'),
 ('1/24/20', 'string'),
 ('1/25/20', 'string'),
 ('1/26/20', 'string'),
 ('1/27/20', 'string'),
 ('1/28/20', 'string'),
 ('1/29/20', 'string'),
 ('1/30/20', 'string'),
 ('1/31/20', 'string'),
 ('2/1/20', 'string'),
 ('2/2/20', 'string'),
 ('2/3/20', 'string'),
 ('2/4/20', 'string'),
 ('2/5/20', 'string'),
 ('2/6/20', 'string'),
 ('2/7/20', 'string'),
 ('2/8/20', 'string'),
 ('2/9/20', 'string'),
 ('2/10/20', 'string'),
 ('2/11/20', 'string'),
 ('2/12/20', 'string'),
 ('2/13/20', 'string'),
 ('2/14/20', 'string'),
 ('2/15/20', 'string'),
 ('2/16/20', 'string'),
 ('2/17/20', 'string'),
 ('2/18/20', 'string'),
 ('2/19/20', 'string'),
 ('2/20/20', 'string'),
 ('2/21/20', 'string'),
 ('2

In [21]:
ts_df.columns

['UID',
 'iso2',
 'iso3',
 'code3',
 'FIPS',
 'Admin2',
 'Province_State',
 'Country_Region',
 'Lat',
 'Long_',
 'Combined_Key',
 '1/22/20',
 '1/23/20',
 '1/24/20',
 '1/25/20',
 '1/26/20',
 '1/27/20',
 '1/28/20',
 '1/29/20',
 '1/30/20',
 '1/31/20',
 '2/1/20',
 '2/2/20',
 '2/3/20',
 '2/4/20',
 '2/5/20',
 '2/6/20',
 '2/7/20',
 '2/8/20',
 '2/9/20',
 '2/10/20',
 '2/11/20',
 '2/12/20',
 '2/13/20',
 '2/14/20',
 '2/15/20',
 '2/16/20',
 '2/17/20',
 '2/18/20',
 '2/19/20',
 '2/20/20',
 '2/21/20',
 '2/22/20',
 '2/23/20',
 '2/24/20',
 '2/25/20',
 '2/26/20',
 '2/27/20',
 '2/28/20',
 '2/29/20',
 '3/1/20',
 '3/2/20',
 '3/3/20',
 '3/4/20',
 '3/5/20',
 '3/6/20',
 '3/7/20',
 '3/8/20',
 '3/9/20',
 '3/10/20',
 '3/11/20',
 '3/12/20',
 '3/13/20',
 '3/14/20',
 '3/15/20',
 '3/16/20',
 '3/17/20',
 '3/18/20',
 '3/19/20',
 '3/20/20',
 '3/21/20',
 '3/22/20',
 '3/23/20',
 '3/24/20',
 '3/25/20',
 '3/26/20',
 '3/27/20',
 '3/28/20',
 '3/29/20',
 '3/30/20',
 '3/31/20',
 '4/1/20',
 '4/2/20',
 '4/3/20',
 '4/4/20',
 '4/5

In [22]:
first_row = ts_df.first()
print(first_row.UID)
print(first_row['UID'])

84001001
84001001


In [23]:
ts_df.select('Province_State').show()

+--------------+
|Province_State|
+--------------+
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
+--------------+
only showing top 20 rows



In [24]:
ts_df.sort('Province_State').select('Province_State').show()

+--------------+
|Province_State|
+--------------+
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
|       Alabama|
+--------------+
only showing top 20 rows



In [25]:

ts_df.filter(ts_df['5/15/20'] != '0').select('Province_State', '5/15/20').orderBy(desc('5/15/20')).show()

+--------------+-------+
|Province_State|5/15/20|
+--------------+-------+
|       Georgia|    999|
|       Florida|    997|
|         Texas|    993|
|   Mississippi|     99|
|       Wyoming|     99|
|       Vermont|     99|
|          Ohio|    989|
|   Connecticut|   9881|
|North Carolina|    983|
|      New York|   9825|
|South Carolina|     98|
|      Michigan|     98|
|      Illinois|     98|
|   Puerto Rico|     98|
|       Alabama|     98|
|       Indiana|    977|
|   Mississippi|     97|
|          Ohio|     97|
|North Carolina|     97|
| West Virginia|     97|
+--------------+-------+
only showing top 20 rows



In [26]:
target_date = '6/7/20'
target = ts_df.select('Admin2', 'Province_State', target_date).where(ts_df['Admin2'] != 'null')

In [27]:
target.Admin2

Column<b'Admin2'>

In [28]:
from pyspark.sql.types import (StructField, StringType, StructType, IntegerType)
data_fields = [StructField('Admin2', StringType(), True), StructField('Province_State', StringType(), True),
               StructField(target_date, StringType(), True)]
data_schema = StructType(data_fields)
newDF = spark.createDataFrame(target.rdd, schema=data_schema)
newDF.dtypes

[('Admin2', 'string'), ('Province_State', 'string'), ('6/7/20', 'string')]

In [29]:

target_list = []
for row in target.collect():
    target_list.append(['{}.{}'.format(row['Province_State'], row['Admin2']), int(row[target_date])])
target_list

[['Alabama.Autauga', 271],
 ['Alabama.Baldwin', 327],
 ['Alabama.Barbour', 200],
 ['Alabama.Bibb', 81],
 ['Alabama.Blount', 78],
 ['Alabama.Bullock', 237],
 ['Alabama.Butler', 453],
 ['Alabama.Calhoun', 184],
 ['Alabama.Chambers', 398],
 ['Alabama.Cherokee', 42],
 ['Alabama.Chilton', 115],
 ['Alabama.Choctaw', 155],
 ['Alabama.Clarke', 157],
 ['Alabama.Clay', 26],
 ['Alabama.Cleburne', 18],
 ['Alabama.Coffee', 247],
 ['Alabama.Colbert', 217],
 ['Alabama.Conecuh', 54],
 ['Alabama.Coosa', 40],
 ['Alabama.Covington', 92],
 ['Alabama.Crenshaw', 95],
 ['Alabama.Cullman', 203],
 ['Alabama.Dale', 135],
 ['Alabama.Dallas', 343],
 ['Alabama.DeKalb', 276],
 ['Alabama.Elmore', 423],
 ['Alabama.Escambia', 88],
 ['Alabama.Etowah', 272],
 ['Alabama.Fayette', 21],
 ['Alabama.Franklin', 631],
 ['Alabama.Geneva', 47],
 ['Alabama.Greene', 104],
 ['Alabama.Hale', 210],
 ['Alabama.Henry', 90],
 ['Alabama.Houston', 247],
 ['Alabama.Jackson', 93],
 ['Alabama.Jefferson', 2099],
 ['Alabama.Lamar', 31],
 ['Ala

In [30]:
def get_key(x):
    return x[1]

target_list
sorted(target_list, key = get_key, reverse=True)

# 'New York.New York', 206969 Jun 7
# 'New York.New York', 206511 Jun 6
# 'Georgia.Fulton', 4823 Jun 7
# 'Georgia.Fulton', 4822 Jun 6

[['Illinois.Cook', 82427],
 ['New York.Queens', 64105],
 ['California.Los Angeles', 63844],
 ['New York.Kings', 58248],
 ['New York.Bronx', 47490],
 ['New York.Nassau', 40904],
 ['New York.Suffolk', 40329],
 ['New York.Westchester', 33954],
 ['New York.New York', 26064],
 ['Pennsylvania.Philadelphia', 23529],
 ['Massachusetts.Middlesex', 21725],
 ['Michigan.Wayne', 21521],
 ['Florida.Miami-Dade', 19547],
 ['New Jersey.Hudson', 18972],
 ['New Jersey.Bergen', 18593],
 ['Massachusetts.Suffolk', 18134],
 ['New Jersey.Essex', 18115],
 ['New Jersey.Passaic', 16482],
 ["Maryland.Prince George's", 16434],
 ['New Jersey.Union', 16232],
 ['New Jersey.Middlesex', 16224],
 ['Connecticut.Fairfield', 16056],
 ['Massachusetts.Essex', 14755],
 ['Texas.Harris', 14586],
 ['New York.Richmond', 13888],
 ['Arizona.Maricopa', 13498],
 ['New York.Rockland', 13325],
 ['Maryland.Montgomery', 12734],
 ['Virginia.Fairfax', 12455],
 ['Texas.Dallas', 12093],
 ['Connecticut.New Haven', 11828],
 ['Massachusetts.Worc

### End DF demonstration

In [31]:
def count_by_country_region (day):
    df = spark.read.option("header", "true").csv(day)
    return df.groupBy('Country_region').count().orderBy(desc('count'))

In [32]:

count_by_country_region(day).show()

+--------------+-----+
|Country_region|count|
+--------------+-----+
|            US| 2983|
|         China|   34|
|         Italy|   21|
|         Spain|   19|
|       Germany|   17|
|        Canada|   15|
|United Kingdom|   12|
|        France|   11|
|     Australia|    8|
|   Netherlands|    5|
|       Denmark|    3|
|    Cabo Verde|    1|
|      Maldives|    1|
|         Yemen|    1|
|        Sweden|    1|
|        Russia|    1|
|      Malaysia|    1|
|   Afghanistan|    1|
|      Cambodia|    1|
|       Comoros|    1|
+--------------+-----+
only showing top 20 rows



In [33]:
import subprocess

dir_in = "/user/student/covid19/daily"
args = "hdfs dfs -ls "+dir_in+" | awk '{print $8}'"
proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)

s_output, s_err = proc.communicate()
all_dart_dirs = s_output.split()