<a href="https://colab.research.google.com/github/gylam/siads696-sprsum2024-team05/blob/main/GL_api_retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Document retrieval**
<br>
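Goal: Retrieve all English-language text documents (non-text formats such as infographics and maps are excluded) that are tagged with at least one theme, and output a table containing each document's text and its associated theme labels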



In [None]:
import requests
import pandas as pd
from functools import reduce
import typing
import time

## Query-string fragments for the ReliefWeb reports API
## (each fragment starts with "&" so they can be concatenated onto url_base)

# Endpoint plus global options: "list" profile, slim=1, and filter conditions
# combined with AND
url_base = (
    "https://api.reliefweb.int/v1/reports"
    "?appname=rwint-user-0"
    "&profile=list"
    "&slim=1"
    "&filter[operator]=AND"
)

# Results ordered by ascending document ID
url_sort = "&sort[]=id:asc"

# Condition 0: English-language reports only
url_lang = (
    "&filter[conditions][0][field]=language.id"
    "&filter[conditions][0][value]=267"
)

# Condition 1 (split over two fragments): negated OR over the format IDs of
# the non-text formats (Infographic, Interactive, Map, Other)
url_format1 = (
    "&filter[conditions][1][field]=format.id"
    "&filter[conditions][1][value][]=12570"
    "&filter[conditions][1][value][]=12"
)
url_format2 = (
    "&filter[conditions][1][value][]=9"
    "&filter[conditions][1][value][]=38974"
    "&filter[conditions][1][operator]=OR"
    "&filter[conditions][1][negate]=true"
)

# Conditions 2-4: document must be tagged with a theme, a format, and a source
url_theme_filt = (
    "&filter[conditions][2][field]=theme"
    "&filter[conditions][3][field]=format"
    "&filter[conditions][4][field]=source"
)

# Condition 5: Climate Change theme only - for unsupervised learning
url_theme_climate = (
    "&filter[conditions][5][field]=theme.id"
    "&filter[conditions][5][value]=4588"
)

# Fields explicitly requested for each document: body, theme, format,
# primary country, document ID
url_fields = (
    "&fields[include][]=body"
    "&fields[include][]=theme"
    "&fields[include][]=format"
    "&fields[include][]=primary_country"
    "&fields[include][]=id"
)

def _fetch_page(url: str, max_retries: int, timeout: float) -> dict:
  """ Request one page of results and return the decoded JSON payload
      Inputs:
        - url: Fully assembled request URL (including limit and offset)
        - max_retries: Maximum number of attempts before giving up
        - timeout: Seconds to wait for the server on each attempt
      Output:
        - Dictionary decoded from the JSON response body
      Raises:
        - RuntimeError if every attempt fails (connection problem, HTTP error
          status, or a body that is not valid JSON)
  """
  last_error = None
  for attempt in range(1, max_retries + 1):
    try:
      # No stream=True: the body is downloaded inside get(), so a broken
      # transfer surfaces here and the connection is released afterwards
      response = requests.get(url, timeout = timeout)
      response.raise_for_status()
      return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
      last_error = e
      print(f"Error {e}")
      # Pause before retrying, but not after the final attempt
      if attempt < max_retries:
        time.sleep(2)
  raise RuntimeError(f"No valid response after {max_retries} attempts: {url}") from last_error


def _page_to_frame(records: list) -> pd.DataFrame:
  """ Flatten one page of API records into a single dataframe
      Inputs:
        - records: List of record dictionaries (the "data" entry of a response)
      Output:
        - Dataframe with one row per theme of each document, left-joined with
          the document's source names and format names on "fields.id"
  """
  # One row per theme; document-level fields are repeated on every row.
  # Missing meta fields are tolerated (errors = "ignore") and everything is
  # cast to str, so "fields.id" is a string join key
  df_theme = pd.json_normalize(records,
                               record_path = ["fields", "theme"],
                               meta = [["fields", "id"], ["fields", "title"],
                                       ["fields", "body"],
                                       ["fields", "primary_country", "name"],
                                       ["fields", "primary_country", "location", "lat"],
                                       ["fields", "primary_country", "location", "lon"],
                                       ["fields", "date", "created"]],
                               record_prefix = "theme.",
                               errors = "ignore"
                               ).astype("str")
  # meta = ["id"] picks up the record's top-level "id", which meta_prefix
  # renames to "fields.id"; cast to str so it matches the key in df_theme
  df_source = pd.json_normalize(records,
                                record_path = ["fields", "source"],
                                meta = ["id"],
                                meta_prefix = "fields.",
                                record_prefix = "source.")
  df_format = pd.json_normalize(records,
                                record_path = ["fields", "format"],
                                meta = ["id"],
                                meta_prefix = "fields.",
                                record_prefix = "format.")
  df_to_merge = [df_theme,
                 df_source[["source.name", "fields.id"]].astype({"fields.id": "str"}),
                 df_format[["format.name", "fields.id"]].astype({"fields.id": "str"})]
  return reduce(lambda left, right: pd.merge(left, right, on = ["fields.id"],
                                             how = "left"), df_to_merge)


def extract_docs(doc_limit: int = 1000, tot_docs: typing.Optional[int] = None,
                 max_retries: int = 5, timeout: float = 60) -> pd.DataFrame:
  """ Call API to extract all English-language text documents with themes
      Inputs:
        - doc_limit: Number of documents to retrieve at a time (default and max is 1000)
        - tot_docs: Total number of documents to retrieve (default is to retrieve all matching results)
        - max_retries: Number of attempts per page before giving up (default 5)
        - timeout: Seconds to wait for each API response (default 60)
      Output:
        - doc_df: Dataframe containing document text and themes
                  (empty dataframe if no documents were returned)
      Raises:
        - ValueError if doc_limit or max_retries is smaller than 1
        - RuntimeError if a page cannot be retrieved within max_retries attempts
  """
  print(f"Initial args: doc_limit = {doc_limit}, tot_docs = {tot_docs}")
  if doc_limit < 1:
    raise ValueError("doc_limit must be at least 1")
  if max_retries < 1:
    raise ValueError("max_retries must be at least 1")

  # If no limit to tot_docs is specified, start with one page's worth; the
  # real total is read from the first response
  fetch_all = tot_docs is None
  if fetch_all:
    tot_docs = doc_limit

  # Everything except the paging parameters is identical for every request
  url_prefix = url_base + url_sort + url_lang + url_format1 + url_format2 + \
                 url_theme_filt + url_fields

  page_frames = []
  seen_ids = set()
  counter = 0
  offset = 0

  # Strict "<": an offset equal to the total means nothing is left to fetch
  while offset < tot_docs:
    print(f"counter = {counter} with {offset} documents elapsed")
    # Shrink the last page so no more than tot_docs documents are requested
    page_size = min(doc_limit, tot_docs - offset)
    full_url = f"{url_prefix}&limit={page_size}&offset={offset}"
    data = _fetch_page(full_url, max_retries, timeout)

    # First response tells us how many documents match in total
    if fetch_all and counter == 0:
      tot_docs = data["totalCount"]

    records = data["data"]
    # An empty page means the API has no more matching documents
    if not records:
      break

    page_df = _page_to_frame(records)
    page_frames.append(page_df)

    num_docs_prev = len(seen_ids)
    seen_ids.update(page_df["fields.id"])
    print(f"Number of docs in df: {len(seen_ids)}, diff = {len(seen_ids) - num_docs_prev}")

    offset += len(records)
    counter += 1
    print("\n")

  if not page_frames:
    return pd.DataFrame()
  # Single concatenation at the end instead of re-copying the frame per page
  return pd.concat(page_frames)



In [None]:
# Alternative small call kept for quick checks (doc_limit = 4, tot_docs = 8)
#output = extract_docs(4, 8)
# Default call: tot_docs is None, so all matching results are requested
output = extract_docs()
# Report the dimensions of the returned dataframe
print(f"output.shape: {output.shape}")
# Last expression of the cell: preview the first rows of the dataframe
output.head()

Initial args: doc_limit = 1000, tot_docs = None
counter = 0 with 0 documents elapsed


counter = 1 with 1000 documents elapsed
Number of docs in df: 2000, diff = 1000


counter = 2 with 2000 documents elapsed
Number of docs in df: 3000, diff = 1000


counter = 3 with 3000 documents elapsed
Error ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))
counter = 3 with 3000 documents elapsed
Number of docs in df: 4000, diff = 1000


counter = 4 with 4000 documents elapsed
Error ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))
counter = 4 with 4000 documents elapsed
Error ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))
counter = 4 with 4000 documents elapsed
Error ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes re

Unnamed: 0,theme.id,theme.name,fields.id,fields.title,fields.body,fields.primary_country.name,fields.primary_country.location.lat,fields.primary_country.location.lon,fields.date.created,source.name,format.name
0,4590,Coordination,5341,Weather Hazards Impacts Assessment for Africa:...,,World,,,2006-10-12T04:00:00+00:00,Famine Early Warning System Network,News and Press Release
1,4590,Coordination,5503,Weather Hazards Impacts Assessment for Africa:...,,World,,,2006-11-02T05:00:00+00:00,Famine Early Warning System Network,News and Press Release
2,4590,Coordination,5596,Weather Hazards Impacts Assessment for Africa:...,,World,,,2006-11-08T05:00:00+00:00,Famine Early Warning System Network,News and Press Release
3,4590,Coordination,5645,Somalia Flood Watch - Issued: 14 November 2006,,Somalia,5.79,47.33,2006-11-14T05:00:00+00:00,Food and Agriculture Organization of the Unite...,News and Press Release
4,4587,Agriculture,7494,Weather Hazards Impacts Assessment for Africa:...,"<font size=1 face=""Arial"">Note: Map production...",World,,,2007-08-09T04:00:00+00:00,Famine Early Warning System Network,News and Press Release
