# Spatio Temporal Files using extracted locations from the GDELT Knowledge Graph
We read the CSV files with the extracted locations into a pandas dataframe.
The dataframe is then converted into a NetCDF file using longitude, latitude and time as dimensions.
We use xarray for the conversion from dataframe to NetCDF.

In [1]:
!pip install --user netcdf4
!pip install --user xarray



You are using pip version 18.1, however version 20.0.2 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.




You are using pip version 18.1, however version 20.0.2 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [151]:
from datetime import date
import os
import pandas
import tempfile
import xarray

# Read the extracted locations from the temp folder into a dataframe

In [156]:
def read_gkg_locations_from_temp():
    """Load every ``*.gkg.csv`` file from the system temp directory into one dataframe.

    Only regular files located directly in ``tempfile.gettempdir()`` are
    considered; sub-directories are not searched. The files are read in
    file-name order so that the row order of the result is reproducible.

    Returns:
        pandas.DataFrame or None: The rows of all matching files stacked
        vertically (each file keeps its own row labels), or ``None`` when no
        matching file exists.
    """
    with os.scandir(tempfile.gettempdir()) as dir_scanner:
        # os.scandir yields entries in arbitrary order, hence the explicit sort.
        csv_paths = sorted(
            dir_entry.path
            for dir_entry in dir_scanner
            if dir_entry.is_file() and dir_entry.name.endswith(".gkg.csv")
        )

    if not csv_paths:
        return None

    # Concatenate once: growing a dataframe inside the loop copies all
    # previously read rows again for every additional file.
    return pandas.concat([pandas.read_csv(csv_path) for csv_path in csv_paths], axis=0)

In [158]:
# Load all "*.gkg.csv" files found in the temp folder into a single dataframe.
# The reader returns None when no such file exists.
corona_locations = read_gkg_locations_from_temp()
# Bare last expression so that the notebook renders the dataframe.
corona_locations

Unnamed: 0,GKGRECORDID,DATE,SourceCommonName,DocumentIdentifier,Location_Name,Location_Lat,Location_Lon
0,20200115001500-1165,20200115001500,7news.com.au,https://7news.com.au/lifestyle/health-wellbein...,"Wuhan, Hubei, China",30.5833,114.267000
1,20200115001500-1879,20200115001500,thejakartapost.com,https://www.thejakartapost.com/news/2020/01/14...,"Wuhan, Hubei, China",30.5833,114.267000
2,20200115003000-1503,20200115003000,nhk.or.jp,https://www3.nhk.or.jp/nhkworld/en/news/202001...,"Geneva, Genè, Switzerland",46.1956,6.148110
3,20200115003000-1503,20200115003000,nhk.or.jp,https://www3.nhk.or.jp/nhkworld/en/news/202001...,"Hubei, Guangdong, China",23.2656,116.054000
4,20200115003000-1503,20200115003000,nhk.or.jp,https://www3.nhk.or.jp/nhkworld/en/news/202001...,"Wuhan, Hubei, China",30.5833,114.267000
...,...,...,...,...,...,...,...
112413,20200314234500-914,20200314234500,kgw.com,https://www.kgw.com/article/news/health/corona...,"Washington, Washington, United States",38.8951,-77.036400
112414,20200314234500-914,20200314234500,kgw.com,https://www.kgw.com/article/news/health/corona...,"Linn County, Oregon, United States",44.4996,-122.585000
112415,20200314234500-919,20200314234500,guardian-series.co.uk,https://www.guardian-series.co.uk/news/nationa...,"London, London, City Of, United Kingdom",51.5000,-0.116667
112416,20200314234500-922,20200314234500,790wpic.com,https://www.790wpic.com/news/trump-says-he-was...,"Washington, Washington, United States",38.8951,-77.036400


# Convert the DATE column to datetime values
- Drop the original DATE column
- Rename the columns `Location_Lat` and `Location_Lon` to `y` and `x`

In [159]:
# Only convert while the raw DATE column is still present, so that re-running
# this cell is a no-op instead of a KeyError.
if "DATE" in corona_locations.columns:
    corona_locations = (
        corona_locations
        # Parse the whole column in one vectorised call instead of invoking
        # pandas.to_datetime once per row.
        .assign(
            time=pandas.to_datetime(
                corona_locations["DATE"].astype(str), format="%Y%m%d%H%M%S"
            )
        )
        .drop(columns="DATE")
        # Latitude becomes "y" and longitude becomes "x".
        .rename(columns={"Location_Lat": "y", "Location_Lon": "x"})
    )

# Convert the dataframe to a multidimensional xarray
- Set the dataframe's index using longitude, latitude and time
- Aggregate into a field named "count" and drop the duplicate multi-index entries
- Convert to an xarray dataset and fill "not a number" values in count with 0

In [130]:
# Keep only the three coordinate columns and promote them to a multi-index.
coordinate_columns = ["x", "y", "time"]
corona_locations_multi = corona_locations[coordinate_columns].set_index(coordinate_columns)

# Number of rows sharing each (x, y, time) key. The assignment aligns on the
# index, so every row carrying a given key receives that key's group size.
occurrences = corona_locations_multi.groupby(level=[0, 1, 2]).size()
corona_locations_multi["count"] = occurrences

# After counting, a single row per key is sufficient.
is_repeated_key = corona_locations_multi.index.duplicated(keep="first")
corona_locations_multi = corona_locations_multi.loc[~is_repeated_key]

# Convert to xarray and replace the resulting NaN entries with 0.
corona_locations_xarray = corona_locations_multi.to_xarray().fillna(0)
corona_locations_xarray

In [149]:
# Mean of "count" over all entries of the array, including the entries that
# were NaN before fillna(0) replaced them with 0.
corona_locations_xarray["count"].mean()

# Save the xarray as a netcdf file
**Warning:** Compute intensive, going to stress your CPU and memory!
- **Error:** Saving may fail with `module 'dask.base' has no attribute 'get_scheduler'`
- **Note:** We had to update dask to version '2.12.0' to get past this error

In [150]:
# Write the dataset to "<tempdir>/corona_locations_<YYYYMMDD>.gkg.nc".
netcdf_path = os.path.join(
    tempfile.gettempdir(),
    "corona_locations_{}.gkg.nc".format(date.today().strftime("%Y%m%d")),
)
# compute=True belongs to to_netcdf(); it was previously passed to str.format(),
# where the unused keyword argument was silently ignored.
corona_locations_xarray.to_netcdf(netcdf_path, compute=True)

# Drop the references to the large intermediates so their memory can be reclaimed.
del corona_locations
del corona_locations_multi
del corona_locations_xarray