<a href="https://colab.research.google.com/github/gylam/siads696-sprsum2024-team05/blob/main/1_api_retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Document retrieval**
<br>
Goal: Retrieve all English text documents (non-text formats excluded) that carry theme labels, and output a table with each document's text and its associated theme labels



In [None]:
import pandas as pd
import numpy as np
import requests
import time
import os
from bs4 import BeautifulSoup
import warnings
import typing

from google.colab import drive
# Mount Google Drive at /content/drive so files stored on it can be read and written by path
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Suppress all Python warnings from here on (no category or module restriction is given)
warnings.filterwarnings('ignore')

In [None]:
## Query-string fragments that are concatenated to build ReliefWeb API request URLs

# Reports endpoint using the "full" profile; every filter condition below is AND-ed together
url_base = (
  'https://api.reliefweb.int/v1/reports'
  '?appname=apidoc'
  '&profile=full'
  '&filter[operator]=AND'
)

# Order results by document ID, ascending
url_sort = '&sort[]=id:asc'

# Condition 0: keep English-language reports only
url_lang = (
  '&filter[conditions][0][field]=language.id'
  '&filter[conditions][0][value]=267'
)

# Condition 1 (split across two fragments): list the non-text format ids
# (Infographic, Interactive, Map, Other), OR them together, then negate the match
url_format1 = (
  '&filter[conditions][1][field]=format.id'
  '&filter[conditions][1][value][]=12570'
  '&filter[conditions][1][value][]=12'
)
url_format2 = (
  '&filter[conditions][1][value][]=9'
  '&filter[conditions][1][value][]=38974'
  '&filter[conditions][1][operator]=OR'
  '&filter[conditions][1][negate]=true'
)

In [None]:
# Constants
# Data folder on the mounted Google Drive; the pickle files in this notebook are written beneath it
base_path = '/content/drive/MyDrive/_Course materials/S5M1-2 696 - Milestone II/Milestone 2 shared folder/data/'

In [None]:
# Format names treated as non-text; reports in these formats are dropped before saving
_NON_TEXT_FORMATS = ('Map', 'Interactive', 'Infographic', 'Other')


def _fetch_page(url: str, max_retries: int) -> typing.Optional[dict]:
  """ Request one page of API results and decode its JSON payload
      Inputs:
        - url: Full request URL
        - max_retries: Number of attempts before giving up
      Output:
        - Decoded JSON dictionary, or None if every attempt failed
  """
  for attempt in range(1, max_retries + 1):
    try:
      response = requests.get(url, timeout = 60)
      if response.status_code == 200:
        return response.json()
      print(f'Error with retrieving API: HTTP status {response.status_code}')
    except (requests.RequestException, ValueError) as e:
      # ValueError covers a 200 response whose body is not valid JSON
      print(f'Error with retrieving API: {e}')
    # Brief pause before the next attempt (skipped after the final one)
    if attempt < max_retries:
      time.sleep(1)
  return None


def _parse_report(data: dict) -> dict:
  """ Flatten the `fields` dictionary of one report into a single table row
      Inputs:
        - data: `fields` dictionary of one report from the API response
      Output:
        - Dictionary keyed by output column name
      Raises KeyError / IndexError / TypeError when a field read below is missing,
      which the caller uses to skip the report (e.g. reports without themes).
  """
  themes = data['theme']
  # Strip HTML tags from body and title, flattening newlines to spaces
  text = BeautifulSoup(data['body'], 'lxml').get_text().replace('\n', ' ')
  title = BeautifulSoup(data['title'], 'lxml').get_text().replace('\n', ' ')
  combined_text = title + ' ' + text
  country = data['primary_country']
  source = data['source'][0]
  return {
    'report_id': data['id'],
    # All themes of a report are kept as one list in a single row
    'theme_id': [theme['id'] for theme in themes],
    'theme_name': [theme['name'] for theme in themes],
    'title': title,
    'text': text,
    'combined_text': combined_text,
    'url': data['url'],
    'latitude': country['location']['lat'],
    'longitude': country['location']['lon'],
    'country_iso3': country['iso3'],
    'country_name': country['name'],
    'date_created': data['date']['created'],
    'source_id': str(source['id']),
    'source_name': source['name'],
    'format': data['format'][0]['name'],
    'word_count': len(combined_text.split(' ')),
  }


def _save_batch(rows: list, docs_elapsed: int, min_reports: int) -> int:
  """ Clean the parsed rows and pickle them once enough unique reports are present
      Inputs:
        - rows: Row dictionaries produced by _parse_report
        - docs_elapsed: Number of documents requested so far (used in the file name)
        - min_reports: Minimum number of unique reports required before writing
      Output:
        - Number of unique reports written (0 if nothing was written)
  """
  if not rows:
    return 0

  reports = pd.DataFrame(rows)
  # Drop rows that are non-text formats, then rows with any missing value
  reports = reports[~reports['format'].isin(_NON_TEXT_FORMATS)]
  reports = reports.dropna().reset_index(drop = True)
  if reports.empty:
    return 0

  unique_reports = reports['report_id'].nunique()
  if unique_reports < min_reports:
    return 0

  out_dir = base_path + 'gl_files_v2/'
  try:
    os.makedirs(out_dir, exist_ok = True)
    reports.to_pickle(out_dir + 'gl_pickle_' + str(unique_reports) + 'docs_' + str(docs_elapsed) + '.pickle')
  except OSError as e:
    # Report the failure; the caller keeps the rows so a later save can retry them
    print(f'Error (in appending curr_df) {e}')
    return 0
  return unique_reports


def extract_docs(doc_limit: int = 1000, tot_docs: typing.Optional[int] = None, *,
                 save_every: int = 1000, max_retries: int = 5) -> int:
  """ Call API to extract all English-language text documents with themes and
      save them to pickle files in batches
      Inputs:
        - doc_limit: Number of documents to retrieve at a time (default and max is 1000)
        - tot_docs: Total number of documents to retrieve (default is to retrieve all matching results)
        - save_every: Minimum number of unique reports accumulated before a pickle file is written
        - max_retries: Number of attempts per API request before the extraction stops early
      Output:
        - report_num: Total number of unique reports written to pickle files
      Raises:
        - ValueError: If doc_limit or max_retries is smaller than 1
  """
  if doc_limit < 1:
    raise ValueError(f'doc_limit must be at least 1, got {doc_limit}')
  if max_retries < 1:
    raise ValueError(f'max_retries must be at least 1, got {max_retries}')

  # If no limit to tot_docs is specified, initially set to doc_limit so the
  # first request is made; it is then replaced by the total reported by the API
  tot_docs_known = tot_docs is not None
  if not tot_docs_known:
    tot_docs = doc_limit

  counter = 0        # Number of pages requested so far
  report_num = 0     # Number of unique reports written to pickle so far
  pending_rows = []  # Parsed reports not yet written to pickle

  # First request is built from the URL fragments; later ones follow the API's "next" link
  full_url = url_base + url_sort + url_lang + f'&limit={doc_limit}&offset=' + str(0)

  # Call API until all documents have been retrieved
  while counter*doc_limit < tot_docs:
    print(f"counter = {counter} with {counter*doc_limit} documents elapsed")

    rw_dict = _fetch_page(full_url, max_retries)
    if rw_dict is None:
      print(f'Stopping early: no valid response after {max_retries} attempts for {full_url}')
      break

    # Update total number of documents to retrieve from the API's own count
    if not tot_docs_known:
      tot_docs = rw_dict.get('totalCount', 0)

    # Take at most doc_limit reports, and no more than are still wanted;
    # slicing also copes with a page that holds fewer reports than expected
    remaining = max(tot_docs - counter*doc_limit, 0)
    page = rw_dict.get('data', [])[:min(doc_limit, remaining)]

    for report in page:
      try:
        pending_rows.append(_parse_report(report['fields']))
      except Exception as e:
        # Deliberately broad: one malformed report must not abort a long-running extraction
        print(f'Error (in report extraction) {e}')

    # Increment counter for tracking number of pages processed
    counter += 1

    # When enough reports have accumulated, save to pickle and start a new batch
    saved = _save_batch(pending_rows, counter*doc_limit, save_every)
    if saved:
      report_num += saved
      pending_rows = []

    # Follow the link to the next page; its absence means there is nothing more to fetch
    next_link = rw_dict.get('links', {}).get('next')
    if not next_link:
      if counter*doc_limit < tot_docs:
        print('Stopping early: API response contains no link to a next page')
      break
    full_url = next(iter(next_link.values()))

  # Write whatever is left over, even if it is below the save_every threshold
  report_num += _save_batch(pending_rows, counter*doc_limit, 1)

  return report_num



In [None]:
# Retrieve documents 1000 at a time; extract_docs returns a report count, not a dataframe
output = extract_docs(1000)
print(f'number of unique documents: {output}')

Initial args: doc_limit = 1000, tot_docs = None
counter = 0 with 0 documents elapsed
counter = 1 with 1000 documents elapsed
counter = 2 with 2000 documents elapsed
counter = 3 with 3000 documents elapsed
counter = 4 with 4000 documents elapsed
counter = 5 with 5000 documents elapsed
counter = 6 with 6000 documents elapsed
counter = 7 with 7000 documents elapsed
counter = 8 with 8000 documents elapsed
counter = 9 with 9000 documents elapsed
counter = 10 with 10000 documents elapsed
counter = 11 with 11000 documents elapsed
counter = 12 with 12000 documents elapsed
counter = 13 with 13000 documents elapsed
counter = 14 with 14000 documents elapsed
counter = 15 with 15000 documents elapsed
counter = 16 with 16000 documents elapsed
counter = 17 with 17000 documents elapsed
counter = 18 with 18000 documents elapsed
counter = 19 with 19000 documents elapsed
counter = 20 with 20000 documents elapsed
counter = 21 with 21000 documents elapsed
counter = 22 with 22000 documents elapsed
counter =

KeyError: 'next'

In [None]:
# Function to read in all pickle files and concatenate to dataframe
def read_pickles(base_path):
  """ Load every pickle file in a folder and stack the contents into one dataframe
      Inputs:
        - base_path: Folder containing the pickle files (with or without a trailing slash)
      Output:
        - df: Dataframe holding the rows of all files, read in sorted file-name order.
              The standard report columns come first; any other columns found in the
              files follow. Empty (standard columns only) if the folder has no pickle files.
  """
  columns = ['report_id', 'theme_id', 'theme_name', 'title', 'text', 'combined_text',
             'url', 'latitude', 'longitude', 'country_iso3','country_name',
             'date_created', 'source_id', 'source_name', 'format', 'word_count']

  # Keep pickle files only (sub-folders and other files are skipped); sort for a reproducible row order
  paths = [os.path.join(base_path, filename)
           for filename in sorted(os.listdir(base_path))
           if filename.endswith(('.pickle', '.pkl'))]
  paths = [path for path in paths if os.path.isfile(path)]

  # NOTE: unpickling can execute arbitrary code - only point this at files this project wrote
  frames = [pd.read_pickle(path) for path in paths]
  if not frames:
    return pd.DataFrame(columns = columns)

  # Concatenate once rather than growing a dataframe file by file
  df = pd.concat(frames)
  extra_columns = [col for col in df.columns if col not in columns]
  return df.reindex(columns = columns + extra_columns)

# Combine the batch pickle files stored under gl_files_v2/ into one dataframe
df = read_pickles(base_path+'gl_files_v2/')
print(df.shape)
# NOTE(review): display is not imported in the visible cells - presumably the notebook built-in; confirm
display(df.head())

# Save full data frame to pickle file
df.to_pickle(base_path + 'gl_full_pickle.pickle')

(459937, 16)


Unnamed: 0,report_id,theme_id,theme_name,title,text,combined_text,url,latitude,longitude,country_iso3,country_name,date_created,source_id,source_name,format,word_count
0,10365,[4590],[Coordination],The MFEWS Central America Weather Hazards and ...,Note: document is two pages,The MFEWS Central America Weather Hazards and ...,https://reliefweb.int/node/10365,17.22,-88.69,blz,Belize,2008-07-23T04:00:00+00:00,529,Famine Early Warning System Network,News and Press Release,20.0
1,10419,[4590],[Coordination],The MFEWS Central America Weather Hazards and ...,Note: Document is two pages.,The MFEWS Central America Weather Hazards and ...,https://reliefweb.int/node/10419,17.22,-88.69,blz,Belize,2008-07-30T04:00:00+00:00,529,Famine Early Warning System Network,News and Press Release,21.0
2,10453,[4590],[Coordination],The MFEWS Central America Weather Hazards and ...,NOTE: Document is two pages.,The MFEWS Central America Weather Hazards and ...,https://reliefweb.int/node/10453,17.22,-88.69,blz,Belize,2008-08-06T04:00:00+00:00,529,Famine Early Warning System Network,News and Press Release,21.0
3,10506,[4590],[Coordination],Central America Weather Hazards Assessment: Au...,Note: Document is two pages,Central America Weather Hazards Assessment: Au...,https://reliefweb.int/node/10506,17.22,-88.69,blz,Belize,2008-08-14T04:00:00+00:00,529,Famine Early Warning System Network,News and Press Release,15.0
4,10657,[4590],[Coordination],Central America Weather Hazards Assessment: Au...,Note: Document is two pages,Central America Weather Hazards Assessment: Au...,https://reliefweb.int/node/10657,17.22,-88.69,blz,Belize,2008-08-28T04:00:00+00:00,529,Famine Early Warning System Network,News and Press Release,16.0
