# Relief Web API: fetch reports incrementally

In [None]:
import requests
import requests
import pandas as pd
import json
import os

In [None]:
# JSON file that remembers where the previous run stopped
OFFSET_FILE = "last_offset.json"

# ReliefWeb reports endpoint, including the search query.
# Build the query at https://reliefweb.int/updates, then use the API converter
# to turn it into an API URL --> https://reliefweb.int/search/converter?appname=owl1.0&search-url=https%3A%2F%2Freliefweb.int%2Fupdates%3Fadvanced-search%3D%2528L267%2529_%2528DA20160101-20250131%2529&submit=Convert
URL = "https://api.reliefweb.int/v1/reports?appname=owl1.0&profile=list&preset=latest&slim=1&query%5Bvalue%5D=date.created%3A%5B2016-01-01+TO+2025-01-31%7D+AND+language.id%3A267&query%5Boperator%5D=AND"

# Start from the beginning unless an earlier run saved an offset to resume from
last_offset = 0
if os.path.exists(OFFSET_FILE):
    with open(OFFSET_FILE, "r") as file:
        last_offset = json.load(file).get("last_offset", 0)

# Query-string parameters sent with every request (limit is 10 for testing).
# NOTE(review): URL already carries appname=owl1.0, so the 'appname' entry here
# adds a second appname value to the query string - confirm which is intended.
params = {
    "appname": "app_1",
    "limit": 10,  # Records per request while testing
    "offset": last_offset,  # Resume from the last fetched offset
    "sort[]": ["date.created:asc"],  # Oldest first, so fetching is incremental
    "fields[include][]": [
        "id",
        "title",
        "body",
        "country",
        "date.changed",
        "date.created",
        "date.original",
        "disaster",
        "feature.name",
        "file",
        "format",
        "headline",
        "language",
        "ocha_product",
        "origin",
        "primary_country.name",
        "primary_country.iso3",
        "source",
        "status",
        "theme.name",
        "url_alias",
    ],
}

# Fetch reports page by page, starting at the offset stored in params
new_reports = []
request_count = 0  # Number of API calls made during this run
max_requests = 5  # Cap on API calls per run while testing (adjust as needed)

while request_count < max_requests:  # Stop after max_requests for testing
    try:
        # The timeout keeps a stalled connection from hanging the run forever
        response = requests.get(URL, params=params, timeout=30)
    except requests.RequestException as exc:
        print(f"Error fetching data: {exc}")
        break
    request_count += 1  # Count API calls

    if response.status_code != 200:
        print(f"Error fetching data: {response.status_code}")
        break

    data = response.json()
    reports = data.get('data', [])

    if not reports:
        print("No new reports available.")
        break

    new_reports.extend(reports)

    # Advance by the number of reports actually received rather than by the
    # page size: after a short final page the saved offset must point at the
    # first report not yet fetched, otherwise later reports would be skipped.
    params['offset'] += len(reports)

    # ReliefWeb v1 list responses give the size of the whole result set as
    # `totalCount`; what is left is everything beyond the new offset. If the
    # key is missing this evaluates to 0 and the loop stops after this page.
    total_count = data.get('totalCount', 0)
    remaining_reports = max(total_count - params['offset'], 0)

    print(f"Fetched {len(reports)} new reports. Remaining: {remaining_reports}")

    # Stop if no more reports are available
    if remaining_reports <= 0:
        break

# Convert new reports to DataFrame
if new_reports:
    df = pd.json_normalize(new_reports)
    #display(df.head())
    print('files fetched')

    # Persist the next offset so the following run resumes where this one ended
    with open(OFFSET_FILE, "w") as file:
        json.dump({"last_offset": params['offset']}, file)

    print(f"Successfully fetched {len(new_reports)} new reports. Next offset: {params['offset']}")
else:
    print("No new reports to fetch today.")



Fetched 10 new reports. Remaining: 0
files fetched
✅ Successfully fetched 10 new reports. Next offset: 30


In [None]:
def _first_value(items, key):
    """Return ``items[0][key]`` when *items* is a non-empty list, else ``None``."""
    if isinstance(items, list) and items:
        return items[0].get(key)
    return None


# Keep only the columns needed downstream. reindex() is used instead of
# df[[...]] because json_normalize only creates a column when at least one
# fetched report has that field; a missing column is filled with NaN here
# instead of raising KeyError.
report_data = df.reindex(columns=[
    'id', 'fields.title', 'fields.body', 'fields.country',
    'fields.date.changed', 'fields.date.created', 'fields.date.original',
    'fields.disaster', 'fields.file', 'fields.format', 'fields.language',
    'fields.origin', 'fields.primary_country.name',
    'fields.primary_country.iso3', 'fields.source', 'fields.theme',
    'fields.status', 'fields.url_alias',
])

# Drop the "fields." prefix, then flatten the two dotted primary_country names
report_data.rename(columns={col: col.replace("fields.", "") for col in report_data.columns}, inplace=True)
report_data.rename(columns={
    'primary_country.name': 'primary_country_name',
    'primary_country.iso3': 'primary_country_iso3',
}, inplace=True)

# Reduce list-valued columns to one value taken from their first entry
report_data['country'] = report_data['country'].apply(lambda x: _first_value(x, 'name'))
# 'filename' must be derived before 'file' is overwritten with the URL
report_data['filename'] = report_data['file'].apply(lambda x: _first_value(x, 'filename'))
report_data['file'] = report_data['file'].apply(lambda x: _first_value(x, 'url'))
report_data['format'] = report_data['format'].apply(lambda x: _first_value(x, 'name'))
report_data['language'] = report_data['language'].apply(lambda x: _first_value(x, 'name'))
# Read from the 'source' column itself: 'language' has already been reduced to
# a plain string above, so deriving 'source' from it would always give None.
report_data['source'] = report_data['source'].apply(lambda x: _first_value(x, 'shortname'))
# Themes keep every entry, as a list of names
report_data['theme'] = report_data['theme'].apply(lambda x: [d.get('name', '') for d in x] if isinstance(x, list) else [])

In [None]:
def _nan_to_none(value):
    """Return *value* with float NaN replaced by ``None``, recursing into dicts and lists."""
    # NaN is the only float that does not compare equal to itself
    if isinstance(value, float) and value != value:
        return None
    if isinstance(value, dict):
        return {key: _nan_to_none(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_nan_to_none(item) for item in value]
    return value


def df_to_ndjson(input_df, output_ndjson_path):
    """Write *input_df* to *output_ndjson_path* as newline-delimited JSON.

    Each row becomes one JSON object on its own line. Missing values (float
    NaN) are written as ``null``, because the bare ``NaN`` token that
    ``json.dump`` emits by default is not valid JSON.

    Args:
        input_df: DataFrame whose rows are exported.
        output_ndjson_path: Destination file path; an existing file is
            overwritten.
    """
    # Explicit encoding so the output does not depend on the platform default
    with open(output_ndjson_path, 'w', encoding='utf-8') as ndjson_file:
        for record in input_df.to_dict('records'):
            ndjson_file.write(json.dumps(_nan_to_none(record)))
            ndjson_file.write('\n')  # One JSON object per line

In [None]:
# Name the export after today's date (YYYY-MM-DD) and write report_data to it
today_date = pd.to_datetime('today').strftime('%Y-%m-%d')
output_path = 'analysis_pages_{}.ndjson'.format(today_date)
df_to_ndjson(report_data, output_path)


In [None]:
# Load the NDJSON file into a DataFrame
# df = pd.read_json(output_path, lines=True)
# df