<a href="https://colab.research.google.com/github/denniesbor/Module_3_Twist/blob/development/Knowledge_Graph_Queries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Requirements

In [None]:
!pip install --user gdelt

# Import modules

In [66]:
from datetime import date, timedelta
from gdelt import gdelt as gdelt_client
import matplotlib.pyplot as plot
import pandas as pd
import re
import os
import seaborn
import tempfile
import warnings
warnings.filterwarnings('ignore')

# Query the knowledge graph
Pass `coverage=True` to query all of the records published for the requested day. Without it, only the records from the latest 15-minute update are returned.
Use the `date` argument to choose which day is queried.

In [67]:
def get_graph(date, coverage=False):
    """Download the GDELT 2.0 knowledge graph ("gkg") table for a single day.

    Args:
        date: Day to query. Objects exposing ``strftime`` (``datetime.date``,
            ``datetime.datetime``) are formatted as ``"%Y %m %d"``; anything
            else is converted with ``str`` and handed to the client as is.
        coverage: Forwarded unchanged to ``client.Search``.

    Returns:
        The table returned by ``client.Search`` with its ``"DATE"`` column
        cast to ``str`` and truncated to the first 14 characters.
    """
    # The requested day must drive the query; a fixed date string here would
    # make every caller (today / yesterday helpers) fetch the same day.
    query_date = date.strftime("%Y %m %d") if hasattr(date, "strftime") else str(date)
    client = gdelt_client(version=2)
    graph = client.Search(query_date, table="gkg", coverage=coverage)
    graph = graph.astype({"DATE": str})
    graph["DATE"] = graph["DATE"].str[:14]
    return graph

def get_graph_range(from_date, to_date, coverage=False, max_rows=2000000):
    """Download and stack the GDELT "gkg" tables for every day in a date range.

    Args:
        from_date: First day to query (``datetime.date``).
        to_date: Last day to query, inclusive (``datetime.date``).
        coverage: Forwarded unchanged to ``client.Search``.
        max_rows: Stop querying further days once more than this many rows
            have been collected. ``None`` disables the limit.

    Returns:
        A DataFrame with the daily tables concatenated row-wise, each with its
        ``"DATE"`` column cast to ``str`` and truncated to 14 characters.
        ``None`` when the range spans less than one day or when no day could
        be downloaded.
    """
    date_range = to_date - from_date
    if date_range.days < 1:
        return None
    client = gdelt_client(version=2)
    daily_frames = []
    skipped_days = []
    total_rows = 0
    for offset in range(date_range.days + 1):
        day = from_date + timedelta(days=offset)
        try:
            day_graph = client.Search(day.strftime("%Y %m %d"), table="gkg", coverage=coverage)
        except Exception:
            # Best effort: a day that cannot be fetched is skipped. Catching
            # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
            skipped_days.append(day)
            continue
        day_graph = day_graph.astype({"DATE": str})
        day_graph["DATE"] = day_graph["DATE"].str[:14]
        daily_frames.append(day_graph)
        # Guard on the accumulated number of rows (axis 0), checked after every day.
        total_rows += len(day_graph)
        if max_rows is not None and total_rows > max_rows:
            break
    if skipped_days:
        print(f"get_graph_range: skipped {len(skipped_days)} day(s) that could not be downloaded")
    if not daily_frames:
        return None
    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(daily_frames, axis=0)

def get_today_graph(coverage=False):
    """Fetch the knowledge graph for the current local date via ``get_graph``."""
    today = date.today()
    return get_graph(today, coverage=coverage)

def get_yesterday_graph(coverage=False):
    """Fetch the knowledge graph for the day before the current local date via ``get_graph``."""
    one_day = timedelta(days=1)
    yesterday = date.today() - one_day
    return get_graph(yesterday, coverage=coverage)

In [None]:
# Download the knowledge graph backwards in time, one CSV file per window.
WINDOW_DAYS = 300   # number of calendar days stored in each CSV file
WINDOW_COUNT = 10   # number of consecutive windows to download

days = 0
for i in range(WINDOW_COUNT):
  end_date = date.today() - timedelta(days=days)
  # get_graph_range includes both endpoints, so a window of WINDOW_DAYS days
  # starts WINDOW_DAYS - 1 days before its end. Using a fixed length keeps the
  # windows adjacent instead of growing and overlapping older windows.
  start_date = end_date - timedelta(days=WINDOW_DAYS - 1)
  graph = get_graph_range(start_date, end_date, coverage=False)
  if graph is None:
    # get_graph_range returns None when nothing could be downloaded.
    print(f"No records for {start_date} .. {end_date}; twist_{i}.csv not written")
  else:
    graph.to_csv(f'/content/drive/MyDrive/Module3/twist/twist_{i}.csv', index=False)
  days += WINDOW_DAYS
  report_date = end_date

Text cleaning

# Location type exploding and filtering

In [4]:
from enum import Enum

class location_type(Enum):
    """Location type
        Defines the different location types.

        The integer value of each member is the type id that ``gdelt_location``
        receives as the first field of a location entry and converts with
        ``location_type(int(...))``.
    """
    # Default used by gdelt_location when no type id is supplied.
    UNKNOWN = 0
    COUNTRY = 1
    USSTATE = 2
    USCITY = 3
    WORLDCITY = 4
    WORLDSTATE = 5

class gdelt_location:
    """GDELT location
        Defines a GDELT location.

        Attributes:
            location_type: ``location_type`` member derived from the type id.
            location_name: Name of the location, or None.
            country_code: Country code field, stored as given.
            admin1_code: Admin-1 code field, stored as given.
            location_lat: Latitude field, stored as given (not converted).
            location_lon: Longitude field, stored as given (not converted).
            feature_id: Feature id field, stored as given.
    """
    def __init__(self, location_typeid=0, name=None, country_code=None, admin1_code=None, lat=None, lon=None, feature_id=None):
        try:
            self.location_type = location_type(int(location_typeid))
        except (TypeError, ValueError):
            # A non-numeric or unrecognised type id is treated as UNKNOWN
            # instead of aborting the processing of a whole batch of records.
            self.location_type = location_type.UNKNOWN
        self.location_name = name
        self.country_code = country_code
        self.admin1_code = admin1_code
        self.location_lat = lat
        self.location_lon = lon
        self.feature_id = feature_id

    def has_location_type(self, location_type):
        """Return True when this location is exactly of the given type."""
        return location_type == self.location_type

    def location_type_matches(self, location_types):
        """Return True when this location's type is contained in ``location_types``."""
        return self.location_type in location_types

    def __str__(self):
        # __str__ must return a str: a location without a name (the default
        # instance) is rendered as an empty string rather than raising TypeError.
        return "" if self.location_name is None else str(self.location_name)
    
class location_filter():
    """Location Filter
        Defines different filters which can be applied on the dataframes.

        Every filter expects a ``"GDELT_Locations"`` column holding one
        location object per row and returns the matching rows of the frame.
    """
    def _filter_rows(self, gkg_dataframe, predicate):
        """Return the rows whose ``"GDELT_Locations"`` value satisfies ``predicate``."""
        # Map over the single column instead of applying row-wise over the
        # whole frame: it avoids building a Series per row and it yields an
        # empty boolean mask (not a DataFrame) when the frame has no rows.
        mask = gkg_dataframe["GDELT_Locations"].map(predicate).astype(bool)
        return gkg_dataframe.loc[mask]

    def filter_by_type(self, gkg_dataframe, location_type):
        """Keep the rows whose location has exactly the given location type."""
        return self._filter_rows(gkg_dataframe, lambda location: location.has_location_type(location_type))

    def filter_by_types(self, gkg_dataframe, location_types):
        """Keep the rows whose location type is one of ``location_types``."""
        return self._filter_rows(gkg_dataframe, lambda location: location.location_type_matches(location_types))

def split_location_entries(locations):
    """Build one ``gdelt_location`` per entry of ``locations``.

    An entry with exactly seven fields is unpacked into the constructor;
    any other entry yields a default ``gdelt_location()``.
    """
    parsed = []
    for fields in locations:
        if len(fields) == 7:
            parsed.append(gdelt_location(*fields))
        else:
            parsed.append(gdelt_location())
    return parsed

def split_locations(record):
    """Parse the ``"Locations"`` value of a record into location objects.

    The value is converted to ``str``, split on ``";"`` into entries, each
    entry is split on ``"#"`` into fields, and the field lists are handed to
    ``split_location_entries``.
    """
    raw_entries = str(record["Locations"]).split(";")
    field_lists = [entry.split("#") for entry in raw_entries]
    return split_location_entries(field_lists)


# Filter the locations by location type & Extract the coordinates and the name from the GDELT location
- Explode the records on the locations column so that each row holds exactly one location
- Filter the exploded rows by location type (e.g. keep only cities)

In [59]:
# Every CSV in the twist folder except the file named 'twist.csv'.
csv_names = (name for name in os.listdir('/content/drive/MyDrive/Module3/twist') if name.endswith(r'.csv'))
files = [name for name in csv_names if name != 'twist.csv']

In [62]:
HEALTH_THEME_PATTERN = r'health|disease\w*|covid|gene$|genetic\w*'
POINT_COLUMNS = ["GKGRECORDID", "DATE","SourceCommonName","DocumentIdentifier","Sentiment", "Polarity","Extras"]


def _extract_kenya_health_records(chunk):
    """Return the health-themed records of ``chunk`` located in a Kenyan city.

    Steps:
      1. keep rows whose ``V2Themes`` or ``Themes`` match ``HEALTH_THEME_PATTERN``;
      2. derive ``Sentiment`` / ``Polarity`` from the 1st / 4th comma-separated
         field of ``V2Tone``, rounded to two decimals;
      3. explode the rows so each holds one parsed location, keep only
         WORLDCITY / USCITY locations whose name contains "Kenya".

    Returns a new frame restricted to ``POINT_COLUMNS``. A record appears once
    per matching location. The frame is empty when nothing matches.
    """
    is_health = (
        chunk['V2Themes'].str.contains(HEALTH_THEME_PATTERN, flags=re.IGNORECASE, regex=True, na=False)
        | chunk['Themes'].str.contains(HEALTH_THEME_PATTERN, flags=re.IGNORECASE, regex=True, na=False)
    )
    # Copy so the new columns are written to an independent frame, not to a
    # slice of the chunk.
    records = chunk[is_health].copy()
    if records.empty:
        return pd.DataFrame(columns=POINT_COLUMNS)

    # Missing or malformed tone values become NaN instead of aborting the chunk.
    tone_fields = records['V2Tone'].astype(str).str.split(',')
    records['Sentiment'] = pd.to_numeric(tone_fields.str[0], errors='coerce').round(decimals=2)
    records['Polarity'] = pd.to_numeric(tone_fields.str[3], errors='coerce').round(decimals=2)

    records["GDELT_Locations"] = records.apply(lambda record: split_locations(record), axis=1)
    exploded = records.explode("GDELT_Locations")

    city_filter = location_filter()
    cities = city_filter.filter_by_types(exploded, [location_type.WORLDCITY, location_type.USCITY])

    # Only the location name is needed for the Kenya filter; coordinates are
    # not part of POINT_COLUMNS, so they are not extracted here.
    location_names = cities["GDELT_Locations"].map(lambda location: location.location_name)
    in_kenya = location_names.str.contains(r'Kenya', flags=re.IGNORECASE, regex=True, na=False)
    return cities.loc[in_kenya, POINT_COLUMNS].copy(deep=True)


dfs = []
for file in files:
    # Stream each CSV in chunks to bound memory usage.
    chunks = pd.read_csv(f"/content/drive/MyDrive/Module3/twist/{file}", chunksize=100000, on_bad_lines='skip')
    for chunk in chunks:
        point_locations = _extract_kenya_health_records(chunk)
        # Chunks without any match contribute nothing to the final frame.
        if not point_locations.empty:
            dfs.append(point_locations)

# del graph

In [97]:
# Stack the per-chunk results row-wise into a single frame.
df_final = pd.concat(dfs, axis=0)
# save the dfs
# df_final[["GKGRECORDID", "DATE","SourceCommonName","DocumentIdentifier","Sentiment", "Polarity","Extras"]].to_csv('/content/drive/MyDrive/Module3/twist/final.csv', index=False)

In [98]:
# Work on an independent (deep) copy so df_final stays untouched.
df = df_final.copy()

In [99]:
# drop duplicates
# Rebind instead of mutating in place: the result is the same frame without
# fully duplicated rows, and the cell stays a plain expression of its input.
df = df.drop_duplicates()

In [100]:
# Keep only records whose GKGRECORDID starts with "2020" followed by digits.
is_2020_record = df['GKGRECORDID'].str.contains(r'^2020\d+', regex=True, flags=re.IGNORECASE, na=False)
df = df[is_2020_record]

In [101]:
# Record id of the first remaining row.
text = df['GKGRECORDID'].iat[0]

Unnamed: 0,GKGRECORDID,DATE,SourceCommonName,DocumentIdentifier,Sentiment,Polarity,Extras
455,20200206234500-455,20200210000000.0,businessinsider.sg,https://www.businessinsider.sg/desert-locust-p...,-4.54,5.87,<PAGE_LINKS>http://www.fao.org/ag/locusts/en/a...
1249,20200206234500-1249,20200210000000.0,standardmedia.co.ke,https://www.standardmedia.co.ke/article/200135...,2.95,6.49,<PAGE_LINKS>https://newsstand.standardmedia.co...
3233,20200207234500-882,20200210000000.0,standardmedia.co.ke,https://www.standardmedia.co.ke/article/200135...,-4.01,8.02,<PAGE_LINKS>https://newsstand.standardmedia.co...
3452,20200207234500-1101,20200210000000.0,africanews.com,http://www.africanews.com/2020/02/07/education...,0.74,7.36,<PAGE_AUTHORS>AfricaNews</PAGE_AUTHORS><PAGE_T...
5079,20200209234500-870,20200210000000.0,montrealgazette.com,https://montrealgazette.com/news/national/seco...,-1.2,5.39,<PAGE_ALTURL_AMP>https://montrealgazette.com/n...
