Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added a complex query notebook #5

Merged
merged 5 commits into from
Aug 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added data/CORONA_Cube.nc
Binary file not shown.
210 changes: 210 additions & 0 deletions notebooks/Complex GDELT queries.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Query the mentions table"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"from datetime import date, timedelta\n",
"from gdelt import gdelt as gdelt_client\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 228,
"metadata": {},
"outputs": [],
"source": [
"def document_has_theme(document, themes=[]):\n",
" document_themes = str(document[\"Themes\"]).split(\";\")\n",
" for theme in themes:\n",
" if theme in document_themes:\n",
" return True\n",
" \n",
" return False\n",
"\n",
"def query_documents(graphs, mentions, themes=[]):\n",
" mentioned_documents = graphs.loc[graphs[\"DocumentIdentifier\"].isin(mentions[\"MentionIdentifier\"])]\n",
" if 0 < len(themes):\n",
" return mentioned_documents[mentioned_documents.apply(document_has_theme, args=[themes], axis=1)]\n",
" else:\n",
" return mentioned_documents\n",
" \n",
"def query_mentions(mentions, events):\n",
" return mentions.loc[mentions[\"GLOBALEVENTID\"].isin(events[\"GLOBALEVENTID\"])]\n",
"\n",
"def query_top_most_events(mentions, events):\n",
" top_most = mentions[\"GLOBALEVENTID\"].value_counts(sort=True)[:1]\n",
" top_most_event_ids = top_most.index.tolist()\n",
" #print(top_most_event_ids)\n",
" return events.loc[events[\"GLOBALEVENTID\"].isin(top_most_event_ids)]\n",
"\n",
"def query_complex(date_of_interest=date.today(), full_day=True):\n",
" client = gdelt_client(version=2)\n",
" events = client.Search(date_of_interest.strftime(\"%Y %m %d\"), table=\"events\", coverage=full_day)\n",
" mentions = client.Search(date_of_interest.strftime(\"%Y %m %d\"), table=\"mentions\", coverage=full_day)\n",
" graphs = client.Search(date_of_interest.strftime(\"%Y %m %d\"), table=\"gkg\", coverage=full_day) \n",
" del client\n",
" #return mentions\n",
" #return query_documents(graphs, mentions, [\"MEDICAL\"]) #pd.merge(events, mentions, on=\"GLOBALEVENTID\")\n",
" top_most_events = query_top_most_events(mentions, events)\n",
" return (top_most_events, query_mentions(mentions, top_most_events))"
]
},
{
"cell_type": "code",
"execution_count": 229,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"( GLOBALEVENTID SQLDATE MonthYear Year FractionDate Actor1Code \\\n",
" 18196 941769106 20200820 202008 2020 2020.6301 USA \n",
" \n",
" Actor1Name Actor1CountryCode Actor1KnownGroupCode Actor1EthnicCode ... \\\n",
" 18196 AMERICAN USA NaN NaN ... \n",
" \n",
" ActionGeo_Type ActionGeo_FullName \\\n",
" 18196 3 White House, District of Columbia, United States \n",
" \n",
" ActionGeo_CountryCode ActionGeo_ADM1Code ActionGeo_ADM2Code \\\n",
" 18196 US USDC NaN \n",
" \n",
" ActionGeo_Lat ActionGeo_Long ActionGeo_FeatureID DATEADDED \\\n",
" 18196 38.8951 -77.0364 531871 20200820000000 \n",
" \n",
" SOURCEURL \n",
" 18196 https://www.mycentraloregon.com/2020/08/19/tru... \n",
" \n",
" [1 rows x 62 columns],\n",
" GLOBALEVENTID EventTimeDate MentionTimeDate MentionType \\\n",
" 4531 941769106 20200820000000 20200820000000 1 \n",
" 4532 941769106 20200820000000 20200820000000 1 \n",
" 4533 941769106 20200820000000 20200820000000 1 \n",
" 6041 941769106 20200820000000 20200820011500 1 \n",
" 6042 941769106 20200820000000 20200820011500 1 \n",
" ... ... ... ... ... \n",
" 485648 941769106 20200820000000 20200820204500 1 \n",
" 485649 941769106 20200820000000 20200820204500 1 \n",
" 485650 941769106 20200820000000 20200820204500 1 \n",
" 485651 941769106 20200820000000 20200820204500 1 \n",
" 485652 941769106 20200820000000 20200820204500 1 \n",
" \n",
" MentionSourceName \\\n",
" 4531 omaha.com \n",
" 4532 mycentraloregon.com \n",
" 4533 dothaneagle.com \n",
" 6041 pottsmerc.com \n",
" 6042 abc13.com \n",
" ... ... \n",
" 485648 poconorecord.com \n",
" 485649 journalstandard.com \n",
" 485650 galesburg.com \n",
" 485651 the-daily-record.com \n",
" 485652 dailyrecord.co.uk \n",
" \n",
" MentionIdentifier SentenceID \\\n",
" 4531 https://omaha.com/news/national/govt-and-polit... 5 \n",
" 4532 https://www.mycentraloregon.com/2020/08/19/tru... 14 \n",
" 4533 https://dothaneagle.com/news/national/govt-and... 8 \n",
" 6041 https://www.pottsmerc.com/news/national/democr... 5 \n",
" 6042 https://abc13.com/politics/kamala-harris-barac... 8 \n",
" ... ... ... \n",
" 485648 https://www.poconorecord.com/zz/news/20200820/... 9 \n",
" 485649 https://www.journalstandard.com/zz/news/202008... 9 \n",
" 485650 https://www.galesburg.com/zz/news/20200820/ex-... 9 \n",
" 485651 https://www.the-daily-record.com/zz/news/20200... 9 \n",
" 485652 https://www.dailyrecord.co.uk/news/uk-world-ne... 16 \n",
" \n",
" Actor1CharOffset Actor2CharOffset ActionCharOffset InRawText \\\n",
" 4531 2171 -1 2237 0 \n",
" 4532 4743 -1 4800 1 \n",
" 4533 3055 -1 3121 0 \n",
" 6041 -1 2177 2255 0 \n",
" 6042 -1 2958 3023 0 \n",
" ... ... ... ... ... \n",
" 485648 2215 -1 2234 0 \n",
" 485649 2215 -1 2234 0 \n",
" 485650 2216 -1 2235 0 \n",
" 485651 2215 -1 2234 0 \n",
" 485652 4603 -1 4622 0 \n",
" \n",
" Confidence MentionDocLen MentionDocTone MentionDocTranslationInfo \\\n",
" 4531 20 5338 0.466744 NaN \n",
" 4532 70 6684 -2.455146 NaN \n",
" 4533 20 6543 -0.943396 NaN \n",
" 6041 20 5507 0.454030 NaN \n",
" 6042 20 10034 -0.425532 NaN \n",
" ... ... ... ... ... \n",
" 485648 40 2499 -5.985037 NaN \n",
" 485649 40 2499 -5.985037 NaN \n",
" 485650 40 2499 -5.970149 NaN \n",
" 485651 40 2499 -5.970149 NaN \n",
" 485652 40 5282 -5.348837 NaN \n",
" \n",
" Extras \n",
" 4531 NaN \n",
" 4532 NaN \n",
" 4533 NaN \n",
" 6041 NaN \n",
" 6042 NaN \n",
" ... ... \n",
" 485648 NaN \n",
" 485649 NaN \n",
" 485650 NaN \n",
" 485651 NaN \n",
" 485652 NaN \n",
" \n",
" [1201 rows x 16 columns])"
]
},
"execution_count": 229,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"yesterday = date.today()-timedelta(days=1)\n",
"query_complex(yesterday)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading