Skip to content

Commit

Permalink
Added a complex query notebook (#5)
Browse files Browse the repository at this point in the history
* #2 removed seaborn dependency

* #2 testing at esri

* added a sample space time cube

* Added a complex query using mentions table

Co-authored-by: Jan Tschada <jts@esri-de.com>
  • Loading branch information
esride-jts and Jan Tschada committed Aug 22, 2020
1 parent c62fad4 commit e25ee43
Show file tree
Hide file tree
Showing 4 changed files with 441 additions and 347 deletions.
Binary file added data/CORONA_Cube.nc
Binary file not shown.
210 changes: 210 additions & 0 deletions notebooks/Complex GDELT queries.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Query the mentions table"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"from datetime import date, timedelta\n",
"from gdelt import gdelt as gdelt_client\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 228,
"metadata": {},
"outputs": [],
"source": [
"def document_has_theme(document, themes=()):\n",
"    \"\"\"Tell whether a document carries at least one of the given themes.\n",
"\n",
"    Args:\n",
"        document: Mapping-like record (for example a DataFrame row) whose\n",
"            \"Themes\" entry holds the themes as one \";\"-separated string.\n",
"        themes: Iterable of theme names to look for.\n",
"\n",
"    Returns:\n",
"        True if any entry of ``themes`` exactly equals one of the document's\n",
"        themes, otherwise False (also when the \"Themes\" entry is missing).\n",
"    \"\"\"\n",
"    raw_themes = document[\"Themes\"]\n",
"    # A missing entry (None or float NaN) must not be stringified: str(NaN)\n",
"    # would otherwise create a bogus theme called \"nan\".\n",
"    if raw_themes is None or (isinstance(raw_themes, float) and raw_themes != raw_themes):\n",
"        return False\n",
"    # A set gives cheap membership checks for every requested theme.\n",
"    document_themes = set(str(raw_themes).split(\";\"))\n",
"    return any(theme in document_themes for theme in themes)\n",
"\n",
"def query_documents(graphs, mentions, themes=()):\n",
"    \"\"\"Select the documents that are referenced by the given mentions.\n",
"\n",
"    Args:\n",
"        graphs: DataFrame with a \"DocumentIdentifier\" column and, when\n",
"            ``themes`` is non-empty, a \"Themes\" column.\n",
"        mentions: DataFrame with a \"MentionIdentifier\" column.\n",
"        themes: Optional sequence of theme names; when non-empty, only\n",
"            documents carrying at least one of them are kept.\n",
"\n",
"    Returns:\n",
"        The rows of ``graphs`` whose \"DocumentIdentifier\" occurs among the\n",
"        mention identifiers, optionally narrowed down by theme.\n",
"    \"\"\"\n",
"    is_mentioned = graphs[\"DocumentIdentifier\"].isin(mentions[\"MentionIdentifier\"])\n",
"    mentioned_documents = graphs.loc[is_mentioned]\n",
"    # No theme filter requested, or no rows left to filter: skip the row-wise\n",
"    # pass (apply on an empty frame would not yield a boolean Series).\n",
"    if len(themes) == 0 or mentioned_documents.empty:\n",
"        return mentioned_documents\n",
"    # DataFrame.apply documents ``args`` as a tuple of extra positional arguments.\n",
"    has_theme = mentioned_documents.apply(document_has_theme, args=(themes,), axis=1)\n",
"    return mentioned_documents[has_theme]\n",
" \n",
"def query_mentions(mentions, events):\n",
"    \"\"\"Return the mentions that refer to one of the given events.\n",
"\n",
"    Both frames are matched on their \"GLOBALEVENTID\" column.\n",
"    \"\"\"\n",
"    event_ids = events[\"GLOBALEVENTID\"]\n",
"    refers_to_event = mentions[\"GLOBALEVENTID\"].isin(event_ids)\n",
"    return mentions.loc[refers_to_event]\n",
"\n",
"def query_top_most_events(mentions, events, top_n=1):\n",
"    \"\"\"Return the events that are mentioned most often.\n",
"\n",
"    Args:\n",
"        mentions: DataFrame with a \"GLOBALEVENTID\" column, one row per mention.\n",
"        events: DataFrame with a \"GLOBALEVENTID\" column, one row per event.\n",
"        top_n: How many of the most mentioned event ids to keep (default 1).\n",
"\n",
"    Returns:\n",
"        The rows of ``events`` whose \"GLOBALEVENTID\" is among the ``top_n``\n",
"        most frequent ids in ``mentions``.\n",
"    \"\"\"\n",
"    mention_counts = mentions[\"GLOBALEVENTID\"].value_counts(sort=True)\n",
"    # value_counts orders by descending count, so the head holds the most\n",
"    # mentioned ids; head() is positional regardless of the index dtype.\n",
"    top_most_event_ids = mention_counts.head(top_n).index\n",
"    return events.loc[events[\"GLOBALEVENTID\"].isin(top_most_event_ids)]\n",
"\n",
"def query_complex(date_of_interest=None, full_day=True):\n",
"    \"\"\"Find the most mentioned GDELT event of a day together with its mentions.\n",
"\n",
"    Only the \"events\" and \"mentions\" tables are downloaded; no other table\n",
"    is needed for the returned result.\n",
"\n",
"    Args:\n",
"        date_of_interest: Date to query; defaults to the current date at the\n",
"            time of the call.\n",
"        full_day: Forwarded as the client's ``coverage`` argument\n",
"            (presumably whether the entire day is pulled -- confirm against\n",
"            the gdelt client documentation).\n",
"\n",
"    Returns:\n",
"        A tuple ``(top_most_events, mentions_of_those_events)``.\n",
"    \"\"\"\n",
"    # Resolve the default per call: a ``date.today()`` default in the signature\n",
"    # is evaluated only once, when the function is defined.\n",
"    if date_of_interest is None:\n",
"        date_of_interest = date.today()\n",
"    search_date = date_of_interest.strftime(\"%Y %m %d\")\n",
"\n",
"    client = gdelt_client(version=2)\n",
"    events = client.Search(search_date, table=\"events\", coverage=full_day)\n",
"    mentions = client.Search(search_date, table=\"mentions\", coverage=full_day)\n",
"\n",
"    top_most_events = query_top_most_events(mentions, events)\n",
"    return (top_most_events, query_mentions(mentions, top_most_events))"
]
},
{
"cell_type": "code",
"execution_count": 229,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"( GLOBALEVENTID SQLDATE MonthYear Year FractionDate Actor1Code \\\n",
" 18196 941769106 20200820 202008 2020 2020.6301 USA \n",
" \n",
" Actor1Name Actor1CountryCode Actor1KnownGroupCode Actor1EthnicCode ... \\\n",
" 18196 AMERICAN USA NaN NaN ... \n",
" \n",
" ActionGeo_Type ActionGeo_FullName \\\n",
" 18196 3 White House, District of Columbia, United States \n",
" \n",
" ActionGeo_CountryCode ActionGeo_ADM1Code ActionGeo_ADM2Code \\\n",
" 18196 US USDC NaN \n",
" \n",
" ActionGeo_Lat ActionGeo_Long ActionGeo_FeatureID DATEADDED \\\n",
" 18196 38.8951 -77.0364 531871 20200820000000 \n",
" \n",
" SOURCEURL \n",
" 18196 https://www.mycentraloregon.com/2020/08/19/tru... \n",
" \n",
" [1 rows x 62 columns],\n",
" GLOBALEVENTID EventTimeDate MentionTimeDate MentionType \\\n",
" 4531 941769106 20200820000000 20200820000000 1 \n",
" 4532 941769106 20200820000000 20200820000000 1 \n",
" 4533 941769106 20200820000000 20200820000000 1 \n",
" 6041 941769106 20200820000000 20200820011500 1 \n",
" 6042 941769106 20200820000000 20200820011500 1 \n",
" ... ... ... ... ... \n",
" 485648 941769106 20200820000000 20200820204500 1 \n",
" 485649 941769106 20200820000000 20200820204500 1 \n",
" 485650 941769106 20200820000000 20200820204500 1 \n",
" 485651 941769106 20200820000000 20200820204500 1 \n",
" 485652 941769106 20200820000000 20200820204500 1 \n",
" \n",
" MentionSourceName \\\n",
" 4531 omaha.com \n",
" 4532 mycentraloregon.com \n",
" 4533 dothaneagle.com \n",
" 6041 pottsmerc.com \n",
" 6042 abc13.com \n",
" ... ... \n",
" 485648 poconorecord.com \n",
" 485649 journalstandard.com \n",
" 485650 galesburg.com \n",
" 485651 the-daily-record.com \n",
" 485652 dailyrecord.co.uk \n",
" \n",
" MentionIdentifier SentenceID \\\n",
" 4531 https://omaha.com/news/national/govt-and-polit... 5 \n",
" 4532 https://www.mycentraloregon.com/2020/08/19/tru... 14 \n",
" 4533 https://dothaneagle.com/news/national/govt-and... 8 \n",
" 6041 https://www.pottsmerc.com/news/national/democr... 5 \n",
" 6042 https://abc13.com/politics/kamala-harris-barac... 8 \n",
" ... ... ... \n",
" 485648 https://www.poconorecord.com/zz/news/20200820/... 9 \n",
" 485649 https://www.journalstandard.com/zz/news/202008... 9 \n",
" 485650 https://www.galesburg.com/zz/news/20200820/ex-... 9 \n",
" 485651 https://www.the-daily-record.com/zz/news/20200... 9 \n",
" 485652 https://www.dailyrecord.co.uk/news/uk-world-ne... 16 \n",
" \n",
" Actor1CharOffset Actor2CharOffset ActionCharOffset InRawText \\\n",
" 4531 2171 -1 2237 0 \n",
" 4532 4743 -1 4800 1 \n",
" 4533 3055 -1 3121 0 \n",
" 6041 -1 2177 2255 0 \n",
" 6042 -1 2958 3023 0 \n",
" ... ... ... ... ... \n",
" 485648 2215 -1 2234 0 \n",
" 485649 2215 -1 2234 0 \n",
" 485650 2216 -1 2235 0 \n",
" 485651 2215 -1 2234 0 \n",
" 485652 4603 -1 4622 0 \n",
" \n",
" Confidence MentionDocLen MentionDocTone MentionDocTranslationInfo \\\n",
" 4531 20 5338 0.466744 NaN \n",
" 4532 70 6684 -2.455146 NaN \n",
" 4533 20 6543 -0.943396 NaN \n",
" 6041 20 5507 0.454030 NaN \n",
" 6042 20 10034 -0.425532 NaN \n",
" ... ... ... ... ... \n",
" 485648 40 2499 -5.985037 NaN \n",
" 485649 40 2499 -5.985037 NaN \n",
" 485650 40 2499 -5.970149 NaN \n",
" 485651 40 2499 -5.970149 NaN \n",
" 485652 40 5282 -5.348837 NaN \n",
" \n",
" Extras \n",
" 4531 NaN \n",
" 4532 NaN \n",
" 4533 NaN \n",
" 6041 NaN \n",
" 6042 NaN \n",
" ... ... \n",
" 485648 NaN \n",
" 485649 NaN \n",
" 485650 NaN \n",
" 485651 NaN \n",
" 485652 NaN \n",
" \n",
" [1201 rows x 16 columns])"
]
},
"execution_count": 229,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Run the complex query for the previous calendar day.\n",
"yesterday = date.today() - timedelta(days=1)\n",
"query_complex(date_of_interest=yesterday)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit e25ee43

Please sign in to comment.