<a href="https://colab.research.google.com/github/gylam/siads696-sprsum2024-team05/blob/main/1_Preprocessing_GL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Document retrieval**
<br>
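Goal: Retrieve all English-language text documents (i.e. excluding non-text formats such as infographics and maps) that are tagged with at least one theme, and output a table with each document's text and its associated theme labels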



In [None]:
import requests
import pandas as pd
from functools import reduce
import typing

## Query-string fragments for the ReliefWeb reports API
## (concatenated by the caller; one parameter per line for readability)

# Endpoint plus base options: latest-first preset, slim output, and an
# AND operator so that every filter condition below must hold
url_base = (
    "https://api.reliefweb.int/v1/reports"
    "?appname=rwint-user-0"
    "&profile=list"
    "&preset=latest"
    "&slim=1"
    "&filter[operator]=AND"
)

# Condition 0: English-language reports only
url_lang = (
    "&filter[conditions][0][field]=language.id"
    "&filter[conditions][0][value]=267"
)

# Condition 1 (split over two fragments): negated OR over the format IDs of
# the non-text formats (Infographic, Interactive, Map, Other)
url_format1 = (
    "&filter[conditions][1][field]=format.id"
    "&filter[conditions][1][value][]=12570"
    "&filter[conditions][1][value][]=12"
)
url_format2 = (
    "&filter[conditions][1][value][]=9"
    "&filter[conditions][1][value][]=38974"
    "&filter[conditions][1][operator]=OR"
    "&filter[conditions][1][negate]=true"
)

# Condition 2: document must be tagged with at least 1 theme
url_theme_filt = "&filter[conditions][2][field]=theme"

# Condition 3 (optional): Climate Change theme only - for unsupervised learning
url_theme_climate = (
    "&filter[conditions][3][field]=theme.id"
    "&filter[conditions][3][value]=4588"
)

# Fields to return per document: body, theme, format, primary country, document ID
url_fields = (
    "&fields[include][]=body"
    "&fields[include][]=theme"
    "&fields[include][]=format"
    "&fields[include][]=primary_country"
    "&fields[include][]=id"
)

def _normalize_page(records: list) -> pd.DataFrame:
  """ Flatten one page of API results into a single dataframe
      Inputs:
        - records: List of report dictionaries (the "data" entry of the API response)
      Output:
        - Dataframe with one row per (theme, source, format) combination of each
          document, built by left-joining the theme rows with the source and
          format names on "fields.id"
  """
  # One row per theme; document-level fields are repeated on every row.
  # errors = "ignore" leaves missing meta fields empty instead of raising.
  df_theme = pd.json_normalize(records,
                               record_path = ["fields", "theme"],
                               meta = [["fields", "id"], ["fields", "title"], ["fields", "body"],
                                       ["fields", "primary_country", "name"],
                                       ["fields", "primary_country", "location", "lat"],
                                       ["fields", "primary_country", "location", "lon"],
                                       ["fields", "date", "created"]],
                               record_prefix = "theme.",
                               errors = "ignore"
                               ).astype("str")
  # NOTE(review): meta = ["fields", "id"] below is a flat list, which pandas
  # appears to treat as two separate top-level keys ("fields" and "id"), so the
  # resulting "fields.id" column would come from the record's top-level "id".
  # The merge relies on that column matching the stringified "fields.id" of
  # df_theme - confirm before changing either side.
  df_source = pd.json_normalize(records,
                                record_path = ["fields", "source"],
                                meta = ["fields", "id"],
                                meta_prefix = "fields.",
                                record_prefix = "source.")
  df_format = pd.json_normalize(records,
                                record_path = ["fields", "format"],
                                meta = ["fields", "id"],
                                meta_prefix = "fields.",
                                record_prefix = "format.")
  df_to_merge = [df_theme,
                 df_source[["source.name", "fields.id"]],
                 df_format[["format.name", "fields.id"]]]
  return reduce(lambda left, right: pd.merge(left, right, on = ["fields.id"],
                                             how = "left"), df_to_merge)


def extract_docs(doc_limit: int = 1000, tot_docs: typing.Optional[int] = None) -> pd.DataFrame:
  """ Call API to extract all English-language text documents with themes
      Inputs:
        - doc_limit: Number of documents to retrieve at a time (default and max is 1000)
        - tot_docs: Total number of documents to retrieve (default is to retrieve all
                    matching results, using the "totalCount" reported by the API)
      Output:
        - doc_df: Dataframe containing document text and themes (empty dataframe if
                  no page could be retrieved)
      Raises:
        - ValueError: if doc_limit is smaller than 1
  """
  # A non-positive page size would never advance the offset
  if doc_limit < 1:
    raise ValueError(f"doc_limit must be at least 1, got {doc_limit}")
  print(f"doc_limit: {doc_limit}")

  # Number of documents still wanted; None means "not known yet, ask the API"
  target = tot_docs
  # Dataframes of the individual pages, concatenated once at the end
  pages = []
  # Counter to track number of iterations until reach last set of documents
  counter = 0

  # Call API until all requested documents have been retrieved.
  # Strict "<" so that no extra page is requested once the target is reached.
  while target is None or counter*doc_limit < target:
    offset = counter*doc_limit
    counter += 1
    print(f"counter = {counter - 1} with {offset} documents elapsed")

    # Shrink the last page so that no more than the target is returned
    page_size = doc_limit if target is None else min(doc_limit, target - offset)

    # Make API request
    full_url = url_base + url_lang + url_format1 + url_format2 + url_theme_filt + \
                url_fields + f"&limit={page_size}&offset={offset}"
    print(f"full_url: {full_url}")
    # No stream = True: the body is always read in full, and a timeout keeps a
    # stalled connection from blocking the loop forever
    response = requests.get(full_url, timeout = 60)
    print(response)

    # Check that valid API call was made
    if response.status_code != 200:
      # Without a known total there is nothing to iterate towards, so stop;
      # otherwise skip this page and carry on with the next one
      if target is None:
        break
      continue

    # Convert JSON output into dictionary
    data = response.json()
    print(f"total number of docs: {data['totalCount']}")

    # Never aim beyond what the API says is available
    if target is None:
      target = data["totalCount"]
    else:
      target = min(target, data["totalCount"])

    # An empty page means the results are exhausted
    records = data["data"]
    if not records:
      break

    pages.append(_normalize_page(records))

  if not pages:
    return pd.DataFrame()
  doc_df = pd.concat(pages)
  return doc_df



In [None]:
# Smoke test with a small page size (4) and a small requested total (8)
extract_docs(doc_limit=4, tot_docs=8)

doc_limit: 4
counter = 0 with 0 documents elapsed
full_url: https://api.reliefweb.int/v1/reports?appname=rwint-user-0&profile=list&preset=latest&slim=1&filter[operator]=AND&filter[conditions][0][field]=language.id&filter[conditions][0][value]=267&filter[conditions][1][field]=format.id&filter[conditions][1][value][]=12570&filter[conditions][1][value][]=12&filter[conditions][1][value][]=9&filter[conditions][1][value][]=38974&filter[conditions][1][operator]=OR&filter[conditions][1][negate]=true&filter[conditions][2][field]=theme&fields[include][]=body&fields[include][]=theme&fields[include][]=format&fields[include][]=primary_country&fields[include][]=id&limit=4&offset=0
<Response [200]>
total number of docs: 522238
counter = 1 with 4 documents elapsed
full_url: https://api.reliefweb.int/v1/reports?appname=rwint-user-0&profile=list&preset=latest&slim=1&filter[operator]=AND&filter[conditions][0][field]=language.id&filter[conditions][0][value]=267&filter[conditions][1][field]=format.id&filte

Unnamed: 0,theme.id,theme.name,fields.id,fields.title,fields.body,fields.primary_country.name,fields.primary_country.location.lat,fields.primary_country.location.lon,fields.date.created,source.name,format.name
0,4593,Food and Nutrition,4064599,"Sudan: Conflict in Al Fasher, North Darfur - F...",**HIGHLIGHTS**\n\n- The humanitarian situation...,Sudan,15.0,30.0,2024-05-23T16:15:34+00:00,UN Office for the Coordination of Humanitarian...,Situation Report
1,4595,Health,4064599,"Sudan: Conflict in Al Fasher, North Darfur - F...",**HIGHLIGHTS**\n\n- The humanitarian situation...,Sudan,15.0,30.0,2024-05-23T16:15:34+00:00,UN Office for the Coordination of Humanitarian...,Situation Report
2,4600,Protection and Human Rights,4064599,"Sudan: Conflict in Al Fasher, North Darfur - F...",**HIGHLIGHTS**\n\n- The humanitarian situation...,Sudan,15.0,30.0,2024-05-23T16:15:34+00:00,UN Office for the Coordination of Humanitarian...,Situation Report
3,4603,Shelter and Non-Food Items,4064599,"Sudan: Conflict in Al Fasher, North Darfur - F...",**HIGHLIGHTS**\n\n- The humanitarian situation...,Sudan,15.0,30.0,2024-05-23T16:15:34+00:00,UN Office for the Coordination of Humanitarian...,Situation Report
4,4593,Food and Nutrition,4064589,Upcoming hurricane season and persistent viole...,"- Experts predicting 23 named storms, out of t...",Haiti,19.18,-72.43,2024-05-23T15:31:08+00:00,World Vision,News and Press Release
5,4600,Protection and Human Rights,4064589,Upcoming hurricane season and persistent viole...,"- Experts predicting 23 named storms, out of t...",Haiti,19.18,-72.43,2024-05-23T15:31:08+00:00,World Vision,News and Press Release
6,4587,Agriculture,4064586,Afghanistan: Countrywide Weekly Market Report:...,**Highlights**\n\nIn the third week of May 202...,Afghanistan,33.84,66.03,2024-05-23T15:27:31+00:00,World Food Programme,Situation Report
7,4593,Food and Nutrition,4064586,Afghanistan: Countrywide Weekly Market Report:...,**Highlights**\n\nIn the third week of May 202...,Afghanistan,33.84,66.03,2024-05-23T15:27:31+00:00,World Food Programme,Situation Report
8,4593,Food and Nutrition,4064585,Impact of Cuts | Outcome of 2023 pilot study: ...,"**SUMMARY**\n\nThis study has highlighted how,...",Afghanistan,33.84,66.03,2024-05-23T15:22:10+00:00,World Food Programme,Assessment
9,4597,Humanitarian Financing,4064585,Impact of Cuts | Outcome of 2023 pilot study: ...,"**SUMMARY**\n\nThis study has highlighted how,...",Afghanistan,33.84,66.03,2024-05-23T15:22:10+00:00,World Food Programme,Assessment
