In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup, NavigableString
import re

In [3]:
# ReliefWeb disaster pages to scrape, one URL per disaster.
relief_web_urls = [
    'https://reliefweb.int/disaster/eq-2023-000015-tur',
    'https://reliefweb.int/disaster/dr-2021-000022-afg',
    'https://reliefweb.int/disaster/ff-2023-000133-afg',
    'https://reliefweb.int/disaster/eq-2023-000184-afg',
]


In [16]:
# Download the first disaster page and parse it for interactive exploration.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from silently parsing an HTTP error page.
res = requests.get(relief_web_urls[0], timeout=30)
res.raise_for_status()
html = res.text
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
soup = BeautifulSoup(html, 'html.parser')


In [None]:
<meta property="og:title" content="Afghanistan: Earthquakes - Oct 2023" />
<meta property="og:description" content="Humanitarian situation reports, response plans, news, analyses, evaluations, assessments, maps, infographics and more on Afghanistan: Earthquakes - Oct 2023" />


In [28]:
# Selector table: field name -> [tag name, attribute filter] used to locate
# that field's element in a ReliefWeb disaster page.
relief_web_fields = {
    'Disaster Description': [
        'h2',
        {'class': 'cd-block-title rw-entity-text__title', 'id': 'overview-title'},
    ],
    'glide': [
        'dd',
        {'class': 'rw-entity-meta__tag-value rw-entity-meta__tag-value--glide rw-entity-meta__tag-value--simple rw-entity-meta__tag-value--last'},
    ],
}

In [29]:
def get_discrete_tag_text(soup, tag, attributes=None):
    """Return the stripped text of every element matching a tag/attribute filter.

    Args:
        soup: Parsed document exposing ``find_all`` (e.g. a BeautifulSoup object).
        tag: Tag name to search for, such as ``'dd'``.
        attributes: Optional attribute filter passed to ``find_all``. ``None``
            (the default) means "no attribute filter".

    Returns:
        A list with one whitespace-stripped string per matching element, in the
        order ``find_all`` returned them; empty when nothing matches.
    """
    # Use None as the default instead of a shared mutable ``{}`` so one call
    # can never leak state into the next.
    attrs = {} if attributes is None else attributes
    return [match.text.strip() for match in soup.find_all(tag, attrs)]

 
# Scratch selector values (same tag/class as the 'glide' entry of relief_web_fields).
tag = 'dd'
attributes = {
    'class': 'rw-entity-meta__tag-value rw-entity-meta__tag-value--glide rw-entity-meta__tag-value--simple rw-entity-meta__tag-value--last'
}

# The 'glide' entry is [tag, attributes], so it unpacks straight into the call.
get_discrete_tag_text(soup, *relief_web_fields['glide'])




#<dd class="rw-entity-meta__tag-value--status--ongoing rw-entity-meta__tag-value rw-entity-meta__tag-value--status rw-entity-meta__tag-value--simple">

['EQ-2023-000015-TUR']

In [36]:
# Heading element of the overview section (the disaster description title).
target_tag = soup.find(
    'h2',
    attrs={'class': 'cd-block-title rw-entity-text__title', 'id': 'overview-title'},
)

# Every hyperlink (<a> element) in the page.
links = soup.find_all(name='a')


def extract_metadata(soup):
    """Return the Open Graph title and description of a parsed page.

    Scans every <meta> tag and reads the ``content`` attribute of those whose
    ``property`` is ``og:title`` or ``og:description``. If a property appears
    more than once, the last occurrence wins.

    Args:
        soup: Parsed document exposing ``find_all`` (e.g. a BeautifulSoup object).

    Returns:
        A ``(title, description)`` tuple. An element is ``None`` when the
        corresponding <meta> tag is not present.
    """
    # Start from None so a page lacking either tag yields None rather than
    # raising UnboundLocalError at the return statement.
    title = None
    description = None

    for tag in soup.find_all('meta'):
        prop = tag.get('property')
        if prop == 'og:title':
            title = tag.get('content')
        elif prop == 'og:description':
            description = tag.get('content')

    return title, description
    
# Try the extractor on the page currently held in `soup`.
title, description = extract_metadata(soup)
#print(title, description)

def extract_affected_countries(soup):
    """Return the country names listed in the page's ``countries`` section.

    Looks up ``<section id="countries">`` and collects the stripped text of
    each ``<h3 class="rw-river-article__title">`` inside it.

    Args:
        soup: Parsed document exposing ``find`` (e.g. a BeautifulSoup object).

    Returns:
        A list of country-name strings in document order; empty when the
        section is missing or contains no matching headings.
    """
    countries_section = soup.find('section', id='countries')

    # A page without the section would otherwise fail with AttributeError on None.
    if countries_section is None:
        return []

    country_headings = countries_section.find_all('h3', class_='rw-river-article__title')
    return [heading.text.strip() for heading in country_headings]

# Try the extractor on the page currently held in `soup` and show the result.
x = extract_affected_countries(soup)
print(x)

def extract_content(soup):
    """Return the overview text of a page, paragraph by paragraph, with links.

    Finds ``<div class="rw-entity-text__content" id="overview-content">`` and,
    for each ``<p>`` inside it, records the stripped paragraph text together
    with the ``href`` of every ``<a>`` in that paragraph.

    Args:
        soup: Parsed document exposing ``find`` (e.g. a BeautifulSoup object).

    Returns:
        A list of ``[text, hrefs]`` pairs in document order. ``hrefs`` is a list
        of the paragraph's link targets, or ``None`` when the paragraph has no
        links. The list is empty when the overview div is missing.
    """
    overview = soup.find('div', {'class': 'rw-entity-text__content', 'id': 'overview-content'})

    # Without this guard a page lacking the overview div would reach the return
    # with `content` never assigned and raise UnboundLocalError.
    if overview is None:
        return []

    content = []
    for paragraph in overview.find_all('p'):
        hrefs = [link.get('href') for link in paragraph.find_all('a')]
        # Use None (not an empty list) for "no links" so pandas treats the cell
        # as missing and fill operations can handle it later.
        content.append([paragraph.text.strip(), hrefs or None])
    return content


#x = extract_content(soup)
#x[1]

['Afghanistan']


In [41]:
# Scrape every disaster page and load its overview paragraphs into a DataFrame,
# one row per paragraph.
summary_columns = ['record_type','source_url','glide_id','source_level_country','source_title','source_desc','source_original_text','reference_url']

# Collect plain row lists and build the frame once at the end; assigning
# df.loc[len(df)] row by row re-allocates the frame on every insert.
summary_rows = []
for url in relief_web_urls:
    print(url)
    # Timeout + status check: fail loudly instead of hanging or parsing an error page.
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    # Explicit parser so results do not depend on which parsers are installed.
    soup = BeautifulSoup(res.text, 'html.parser')

    glide_ids = get_discrete_tag_text(soup, relief_web_fields['glide'][0], attributes=relief_web_fields['glide'][1])
    # A page without a glide tag yields an empty list; record None rather than crash.
    glide_id = glide_ids[0] if glide_ids else None

    title, description = extract_metadata(soup)
    countries_affected = extract_affected_countries(soup)
    content = extract_content(soup)

    # Each content item is [paragraph_text, hrefs_or_None]; these fill the last two columns.
    for c in content:
        summary_rows.append(['disaster summary', url, glide_id, countries_affected, title, description, *c])

df_reliefweb_disaster_summary = pd.DataFrame(summary_rows, columns=summary_columns)


    
    
    

https://reliefweb.int/disaster/eq-2023-000015-tur
https://reliefweb.int/disaster/dr-2021-000022-afg
https://reliefweb.int/disaster/ff-2023-000133-afg
https://reliefweb.int/disaster/eq-2023-000184-afg


In [42]:

# Trailing citation of the form "(<source>, <day> <month> <year>)", e.g.
# "(OCHA, 16 Feb 2023)". Group 1 is the source, group 2 the date. The source may
# contain anything except parentheses, so names such as "UN-Habitat" or
# "Govt. of Afghanistan" are accepted; trailing whitespace after ")" is tolerated.
_SUMMARY_REFERENCE_RE = re.compile(r'\(([^()]+), (\d+ \w+ \d{4})\)\s*$')


def extract_reliefweb_summary_reference(text):
    """Split a trailing "(source, date)" citation off a summary paragraph.

    ReliefWeb disaster summary paragraphs can end with a parenthetical
    reference to a detailed source, such as ``... (OCHA, 16 Feb 2023)``.
    This function finds that citation, extracts its parts and removes it
    from the text.

    Args:
        text: Paragraph text. Non-string values (e.g. ``None``/NaN cells) are
            passed through untouched.

    Returns:
        A ``pd.Series`` with keys ``'text'`` (the paragraph without the
        citation, stripped), ``'source'`` and ``'reported_date'``. The latter
        two are ``None`` when no trailing citation is found, in which case
        ``'text'`` is returned unchanged.
    """
    source = None
    reported_date = None

    # re.search raises TypeError on non-strings; skip them instead of crashing .apply().
    if isinstance(text, str):
        citation = _SUMMARY_REFERENCE_RE.search(text)
        if citation:
            source, reported_date = citation.groups()
            # Now that the metadata is in hand, remove the citation from the text.
            text = text[:citation.start()].strip()

    return pd.Series({'text': text, 'source': source, 'reported_date': reported_date})


# Example call: the trailing "(OCHA asdf, 16 Feb 2023)" should be split off into source and reported_date.
extract_reliefweb_summary_reference('is expected to increase in the coming days/weeks. (OCHA asdf, 16 Feb 2023)')

text             is expected to increase in the coming days/weeks.
source                                                   OCHA asdf
reported_date                                          16 Feb 2023
dtype: object

In [43]:
# Split each paragraph into text / authoring organisation / reported date.
df_reliefweb_disaster_summary[['text','authoring_org','reported_date']] = df_reliefweb_disaster_summary['source_original_text'].apply(extract_reliefweb_summary_reference)

# Paragraphs without their own citation take the next citation below them.
# Back-fill within each source page only: an ungrouped bfill() would let the
# trailing uncited paragraphs of one disaster inherit the first citation of
# the next disaster in the frame.
citation_columns = ['reference_url','authoring_org','reported_date']
df_reliefweb_disaster_summary[citation_columns] = (
    df_reliefweb_disaster_summary
    .groupby('source_url', sort=False)[citation_columns]
    .bfill()
)


In [47]:
# NOTE(review): machine-specific output location; adjust before running elsewhere.
output_csv_path = "c://temp//foo.csv"

# Write the summary without the index column; 'utf-8-sig' prepends a BOM.
df_reliefweb_disaster_summary.to_csv(
    output_csv_path,
    index=False,
    encoding='utf-8-sig',
)

In [45]:
# Spot-check two rows; .loc slices by label and includes both end points (rows 21 and 22).
df_reliefweb_disaster_summary.loc[21:22]

Unnamed: 0,record_type,source_url,glide_id,source_level_country,source_title,source_desc,source_original_text,reference_url,text,authoring_org,reported_date
21,disaster summary,https://reliefweb.int/disaster/eq-2023-000015-tur,EQ-2023-000015-TUR,"[Syrian Arab Republic, Türkiye]",Türkiye/Syria: Earthquakes - Feb 2023,"Humanitarian situation reports, response plans...",Humanitarian needs remain 100 days after devas...,[https://reliefweb.int/node/3963175/],Humanitarian needs remain 100 days after devas...,OCHA,17 May 2023
22,disaster summary,https://reliefweb.int/disaster/eq-2023-000015-tur,EQ-2023-000015-TUR,"[Syrian Arab Republic, Türkiye]",Türkiye/Syria: Earthquakes - Feb 2023,"Humanitarian situation reports, response plans...","To date, partners have directly provided 4 mil...",[https://reliefweb.int/node/3963175/],"To date, partners have directly provided 4 mil...",OCHA,17 May 2023


In [None]:
import requests

# Query parameters for the ReliefWeb reports endpoint: filter on
# format.name == 'Situation Report' and request a few extra fields.
# List values are sent by requests as repeated query keys.
params = {
    'appname': 'amcross',
    'profile': 'full',
    'preset': 'latest',
    'limit': 3,
    'query[fields][]': 'format.name',
    'query[value]': 'Situation Report',
    'fields[include][]': ['source.shortname', "format", "body-html"],
}

# appname is already supplied through `params`; keeping it in the URL as well
# would send the parameter twice.
api_endpoint = 'https://api.reliefweb.int/v1/reports'

# Make the API request (with a timeout so a stalled connection cannot hang the cell)
response = requests.get(api_endpoint, params=params, timeout=30)

# Check the status of the response
if response.status_code == 200:
    # Parse the JSON payload; the reports are under the 'data' key
    data = response.json()
    situation_reports = data['data']

    # Dump each raw report for inspection
    # (for example, a title could be read with report['fields']['title'])
    for report in situation_reports:
        print(report)

else:
    print(f"Error: {response.status_code} - {response.text}")
