# Spatio Temporal Files using extracted locations from the GDELT Knowledge Graph
We read the CSV files with the extracted locations into a pandas dataframe.
The pandas dataframe is converted into a netcdf file using latitude, longitude and time.
We use xarray for the conversion from dataframe to netcdf.

In [None]:
# Notebook shell commands: install the third-party packages into the user site-packages.
# xarray is imported below; netcdf4 is presumably the backend xarray uses to write the .nc file (TODO confirm).
# NOTE(review): arcgis is not imported in the cells shown here - confirm it is still needed.
!pip install --user arcgis
!pip install --user netcdf4
!pip install --user xarray

In [331]:
from datetime import date
import os
import pandas
import tempfile
import xarray

# Read the extracted locations from the temp folder into a dataframe

In [332]:
def read_gkg_locations_from_temp():
    """Load every ``*.gkg.csv`` file of the system temp folder into one dataframe.

    Only regular files located directly in ``tempfile.gettempdir()`` are
    considered; sub-directories are not searched.

    Returns:
        A pandas DataFrame with the rows of all matching files stacked
        vertically (the per-file row labels are kept, so index values may
        repeat), or ``None`` when no matching file exists.
    """
    with os.scandir(tempfile.gettempdir()) as dir_scanner:
        # Sort by path so the row order does not depend on the arbitrary
        # order in which the operating system lists the directory.
        csv_paths = sorted(
            dir_entry.path
            for dir_entry in dir_scanner
            if dir_entry.is_file() and dir_entry.name.endswith(".gkg.csv")
        )
    if not csv_paths:
        return None
    # Concatenate once: growing the frame file by file copies all rows
    # accumulated so far again on every iteration.
    return pandas.concat([pandas.read_csv(csv_path) for csv_path in csv_paths], axis=0)

In [333]:
# Load all extracted GKG location CSV files; the result is None when the
# temp folder contains no "*.gkg.csv" file.
corona_locations = read_gkg_locations_from_temp()
# Bare expression: the notebook renders the dataframe as the cell output.
corona_locations

Unnamed: 0,GKGRECORDID,DATE,SourceCommonName,DocumentIdentifier,Location_Name,Location_Lat,Location_Lon
0,20200314000000-1,20200314000000,mykxlg.com,https://www.mykxlg.com/news/national/on-eve-of...,"Hollywood, California, United States",34.0983,-118.3270
1,20200314000000-1,20200314000000,mykxlg.com,https://www.mykxlg.com/news/national/on-eve-of...,"Anaheim, California, United States",33.8353,-117.9150
2,20200314000000-3,20200314000000,newportri.com,https://www.newportri.com/zz/news/20200313/sch...,"Philadelphia, Pennsylvania, United States",39.9523,-75.1638
3,20200314000000-3,20200314000000,newportri.com,https://www.newportri.com/zz/news/20200313/sch...,"Cleveland, Ohio, United States",41.4995,-81.6954
4,20200314000000-3,20200314000000,newportri.com,https://www.newportri.com/zz/news/20200313/sch...,"Valley High School, California, United States",33.7231,-117.9010
...,...,...,...,...,...,...,...
204435,20200315191500-1533,20200315191500,rockymounttelegram.com,https://www.rockymounttelegram.com/news/nation...,"Iditarod, Alaska, United States",62.5444,-158.0950
204436,20200315191500-1533,20200315191500,rockymounttelegram.com,https://www.rockymounttelegram.com/news/nation...,"Anchorage, Alaska, United States",61.2181,-149.9000
204437,20200315191500-1533,20200315191500,rockymounttelegram.com,https://www.rockymounttelegram.com/news/nation...,"Kaltag, Alaska, United States",64.3272,-158.7220
204438,20200315191500-1533,20200315191500,rockymounttelegram.com,https://www.rockymounttelegram.com/news/nation...,"Nulato, Alaska, United States",64.7194,-158.1030


# Convert the DATE column to datetime values
- Drop the original DATE column
- Rename the columns

In [334]:
# Parse the DATE stamps (format YYYYMMDDHHMMSS) with a single vectorized call.
# A row-wise apply() would invoke pandas.to_datetime once per record, which is
# far slower on a frame with hundreds of thousands of rows.
corona_locations["time"] = pandas.to_datetime(corona_locations["DATE"].astype(str), format="%Y%m%d%H%M%S")
# The raw stamp is redundant once the "time" column exists.
corona_locations.drop("DATE", axis=1, inplace=True)
# Short axis names: y holds the latitude, x holds the longitude.
corona_locations.rename(columns={"Location_Lat": "y", "Location_Lon": "x"}, inplace=True)

# Convert the dataframe to a multidimensional xarray
**Warning:** Compute intensive, going to stress your CPU and memory!
- Set the dataframe's index using longitude, latitude and date. This is reduced to Geometry and time, where Geometry is a string of the form `x#y`.
- We cannot use WKB because bytes are not supported by dataframes; xarray needs hashable objects, and netcdf does not support tuples or other high-level objects.
- Aggregate into a field named "count" and drop the duplicate multi-index entries.
- Convert to an xarray and fill "not a number" values in count with 0.

In [335]:
def index_by_coordinates_and_time(locations):
    """Count the records per (x, y, time) combination and convert them to an xarray.

    Args:
        locations: DataFrame providing the columns "x", "y" and "time".

    Returns:
        The result of ``DataFrame.to_xarray()`` for the de-duplicated,
        multi-indexed frame with its single "count" column; missing values
        are replaced by 0.
    """
    keys = ["x", "y", "time"]
    indexed = locations[keys].set_index(keys)
    # Number of rows sharing each index entry; the column assignment aligns
    # the sizes with the (still duplicated) index entries.
    occurrences = indexed.groupby(level=[0, 1, 2]).size()
    indexed["count"] = occurrences
    # Keep only the first row of every index entry.
    repeated = indexed.index.duplicated(keep="first")
    unique_entries = indexed.loc[~repeated]
    return unique_entries.to_xarray().fillna(0)

def index_by_location_and_time(locations):
    """Count the records per (Location_Name, time) combination and convert them to an xarray.

    Args:
        locations: DataFrame providing the columns "Location_Name" and "time".

    Returns:
        The result of ``DataFrame.to_xarray()`` for the de-duplicated,
        multi-indexed frame with its single "count" column; missing values
        are replaced by 0.
    """
    keys = ["Location_Name", "time"]
    indexed = locations[keys].set_index(keys)
    # Number of rows sharing each index entry; the column assignment aligns
    # the sizes with the (still duplicated) index entries.
    occurrences = indexed.groupby(level=[0, 1]).size()
    indexed["count"] = occurrences
    # Keep only the first row of every index entry.
    repeated = indexed.index.duplicated(keep="first")
    unique_entries = indexed.loc[~repeated]
    return unique_entries.to_xarray().fillna(0)

def to_plaintext(x, y):
    """Return both coordinates joined by "#" as a plain-text key, e.g. ``1.5#2``."""
    return f"{x}#{y}"

def index_by_geometry_and_time(locations):
    """Count the records per (Geometry, time) combination and convert them to an xarray.

    Geometry is the plain-text key produced by ``to_plaintext`` from the x and
    y value of each record.

    Args:
        locations: DataFrame providing the columns "x", "y" and "time".

    Returns:
        The result of ``DataFrame.to_xarray()`` for the de-duplicated,
        multi-indexed frame with its single "count" column; missing values
        are replaced by 0.
    """
    # Work on a copy so the caller's frame does not get the helper column.
    locations_multi = locations[["time"]].copy(deep=True)
    # Build the keys by walking the two coordinate columns in lockstep. A
    # row-wise DataFrame.apply() creates a Series per record, which is much
    # slower, and it does not yield a column for an empty frame.
    locations_multi["Geometry"] = [
        to_plaintext(x, y) for x, y in zip(locations["x"], locations["y"])
    ]
    locations_multi = locations_multi.set_index(["Geometry", "time"])
    # Number of rows sharing each index entry; the column assignment aligns
    # the sizes with the (still duplicated) index entries.
    locations_multi["count"] = locations_multi.groupby(level=[0, 1]).size()
    # Keep only the first row of every index entry.
    locations_multi = locations_multi.loc[~locations_multi.index.duplicated(keep="first")]
    return locations_multi.to_xarray().fillna(0)

# Alternative indexings kept for reference; enable exactly one of the three lines.
#corona_locations_xarray = index_by_coordinates_and_time(corona_locations)
#corona_locations_xarray = index_by_location_and_time(corona_locations)
corona_locations_xarray = index_by_geometry_and_time(corona_locations)
# Bare expression: the notebook renders the xarray as the cell output.
corona_locations_xarray

In [336]:
# Mean of "count" over all cells of the xarray, including the cells that were filled with 0.
corona_locations_xarray["count"].mean()

# Save the xarray as a netcdf file
- **Error:** `module 'dask.base' has no attribute 'get_scheduler'`
- **Note:** To resolve this error we had to update dask to version '2.12.0'

In [337]:
# Write the xarray to <temp folder>/corona_locations_<YYYYMMDD>.gkg.nc.
# compute is an argument of to_netcdf(), not of str.format(), which silently
# ignores unused keyword arguments.
corona_locations_xarray.to_netcdf(
    "{}/corona_locations_{}.gkg.nc".format(tempfile.gettempdir(), date.today().strftime("%Y%m%d")),
    compute=True,
)

# Drop the notebook-level references to the large objects.
del corona_locations
del corona_locations_xarray