In [54]:
import requests
import pandas as pd
import uuid
import json
import os

In [120]:
# Directory where the raw ReliefWeb report JSON files are written and later read back.
fpath = "D://projects//_external_files//reliefweb_situation_reports//"
# Directory that holds the parsed spreadsheet (reliefweb_situation_reports.xlsx).
output_fpath = "D://projects//_external_files//surveyor//rw_siturep_preprocessed//"
# ReliefWeb reports endpoint. NOTE: appname is already part of this query string and is
# sent a second time in the request params built by pull_situation_reports.
api_endpoint = 'https://api.reliefweb.int/v1/reports?appname=amcross'


def pull_situation_reports():
    """Download situation reports from ReliefWeb and save each one as a JSON file.

    Every invocation performs two pulls:

    1. "new": reports filtered from the highest id already on disk, with a
       high limit in case the job has not run for a long time;
    2. "historical": 50 reports filtered up to the lowest id on disk, so the
       archive reaches a little further back on every run.

    Ids come from the part of each filename in ``fpath`` before the first
    underscore. They are compared as numbers (a plain string sort would put a
    6-digit id above a 7-digit one), and files saved under the uuid fallback
    name are ignored when computing the bounds. When no report is on disk yet,
    only an unfiltered "new" pull is made.

    Returns nothing; the function prints how many reports each pull returned
    and writes one file per report into ``fpath``.
    """
    prefixes = (name.split("_")[0] for name in os.listdir(fpath))
    known_ids = sorted(int(prefix) for prefix in prefixes if prefix.isdecimal())
    min_id = known_ids[0] if known_ids else None
    max_id = known_ids[-1] if known_ids else None

    def build_params(limit, bound, value):
        """Return the query params; the id range filter is added only when a bound is known."""
        params = {
            'appname': 'amcross', 'profile': 'full', 'preset': 'latest', 'limit': limit,
            'query[fields][]': 'format.name', 'query[value]': 'Situation Report',
        }
        if value is not None:
            params['filter[field]'] = "id"
            params[f'filter[value][{bound}]'] = value
        return params

    def send_request(params):
        """GET the endpoint and return the list under 'data', or [] on a non-200 reply."""
        response = requests.get(api_endpoint, params=params)
        if response.status_code == 200:
            return response.json()['data']
        # Report the failure and hand back an empty list so callers can still
        # take len() of the result and iterate over it.
        print(f"Error: {response.status_code} - {response.text}")
        return []

    def generate_filename(report):
        """Build '<id>_<date>_reliefweb_<country>.json', with placeholders for missing parts."""
        lookup_errors = (KeyError, IndexError, TypeError, AttributeError)

        try:
            rec_id = report['id']
        except lookup_errors:
            # No usable id: fall back to a random hex name so the file is still written.
            rec_id = uuid.uuid4().hex

        try:
            rec_date = report['fields']['date']['original'].split("T")[0]
        except lookup_errors:
            rec_date = 'yyyy_mm_dd'

        try:
            p_country = report['fields']['primary_country']['name'].replace(" ", "_").lower()
        except lookup_errors:
            p_country = 'country_name'

        return f"{rec_id}_{rec_date}_reliefweb_{p_country}.json"

    def save_reports(label, params, bounded):
        """Run one pull, print its size and write every returned report to disk."""
        results = send_request(params)
        # NOTE(review): the id range filter appears to be inclusive, so the boundary
        # report that is already on disk comes back too; it is left out of the
        # printed count, as in the original "- 1". Confirm against the API docs.
        already_on_disk = 1 if bounded and results else 0
        print(f"{label} results: {len(results) - already_on_disk}")
        for report in results:
            fname = generate_filename(report)
            with open(f"{fpath}{fname}", 'w') as outfile:
                outfile.write(json.dumps(report))

    save_reports("new", build_params(800, 'from', max_id), bounded=max_id is not None)
    if min_id is not None:
        save_reports("historical", build_params(50, 'to', min_id), bounded=True)
    
# Run the pull now: this sends the API requests and writes the returned reports under fpath.
pull_situation_reports()

#,'filter[field]': {"date.created": {"from":"2023-11-22T13:37:21+00:00", "to":"2023-11-24T13:37:21+00:00"}}

new results: 0
historical results: 49


In [79]:
def extract_themes(j):
    """Return the theme names of a report, lower-cased and joined with '; '.

    ``j`` must already be the report's ``fields`` mapping, not the whole
    report. Returns None when there is no ``theme`` entry.
    """
    theme_entries = j.get('theme')
    if theme_entries is not None:
        return '; '.join(entry['name'].lower() for entry in theme_entries)
    return None

In [112]:
# Accumulator filled by parse_json: one row per paragraph of a report body.
df_reliefweb_situation_report = pd.DataFrame(columns = ['record_type','source_url','glide_id','source_level_country','source_title','source_desc',
                                                        'source_original_text','reference_url','text','authoring_org','reported_date'])
def parse_json(j):
    """Append one row per paragraph of a ReliefWeb report to df_reliefweb_situation_report.

    ``j`` is a single report as returned by the API (with ``href`` and
    ``fields`` keys). The body text is split on blank lines and every piece
    becomes its own row; the paragraph goes into both ``source_original_text``
    and ``text``. The report's ``href`` is stored in the ``source_url`` column,
    the first attachment's url in ``reference_url``, and the joined theme names
    in ``source_desc``.

    A report is skipped, with a printed note, when a field that is written
    out is missing or malformed. Fields that are never written out are no
    longer looked up, so their absence does not drop the report, and a
    disaster entry without a ``glide`` key simply yields an empty glide_id.
    """
    try:
        reference_url = j['href']
        fields = j['fields']

        glide_id = None
        disaster = fields.get('disaster')
        if disaster:
            glide_id = disaster[0].get('glide')

        title = fields['title']
        file_url = fields['file'][0]['url']
        primary_country = fields['primary_country']['shortname']
        author_org = fields['source'][0]['shortname']
        report_date = fields['date']['original']
        themes = extract_themes(fields)
        paragraphs = fields['body'].split("\n\n")
    except (KeyError, IndexError, TypeError, AttributeError) as err:
        # Best effort: one bad report must not stop the batch, but say which one was dropped.
        report_id = j.get('id') if isinstance(j, dict) else None
        print(f"skipping report {report_id}: missing or malformed field ({err!r})")
        return

    for paragraph in paragraphs:
        row = ['situation report', reference_url, glide_id, primary_country, title, themes,
               paragraph, file_url, paragraph, author_org, report_date]
        df_reliefweb_situation_report.loc[len(df_reliefweb_situation_report)] = row

In [121]:
# Fresh, empty frame that parse_json fills with the reports processed in this run.
df_reliefweb_situation_report = pd.DataFrame(columns = ['record_type','source_url','glide_id','source_level_country','source_title','source_desc',
                                                        'source_original_text','reference_url','text','authoring_org','reported_date'])

# Index the raw files on disk by report id (the filename part before the first "_").
files_on_disk = {}
for f in os.listdir(fpath):
    files_on_disk[f.split('_')[0]] = f

# Load the previously parsed spreadsheet, if any, to find out which reports are already done.
parse_file = f"{output_fpath}reliefweb_situation_reports.xlsx"
if os.path.exists(parse_file):
    df = pd.read_excel(parse_file)
    # The last path segment of source_url is the report id; empty cells are ignored.
    already_done = {str(w).split('/')[-1] for w in df['source_url'].dropna()}
else:
    print("file not found - will process everything")
    # Define df here as well, so the later concat cell also works on the very first run.
    df = pd.DataFrame(columns=df_reliefweb_situation_report.columns)
    already_done = set()

# Parse every file whose report id is not in the spreadsheet yet.
for f in files_on_disk:
    if f in already_done:
        continue
    with open(f"{fpath}{files_on_disk[f]}", 'r') as file:
        json_data = json.load(file)
    parse_json(json_data)
            


In [126]:
# Put this run's rows together with the previously saved rows (df) and order by source_url.
# NOTE: the result is assigned back to the same name, so re-running this cell appends df again.
df_reliefweb_situation_report = pd.concat(
    [df_reliefweb_situation_report, df],
    ignore_index=True,
).sort_values(by='source_url')

In [128]:
# Overwrite the parsed spreadsheet under output_fpath with the merged rows (index column not written).
df_reliefweb_situation_report.to_excel(f"{output_fpath}reliefweb_situation_reports.xlsx", index=False)

In [125]:
# Spot-check: display one randomly chosen row of the merged frame.
df_reliefweb_situation_report.sample()

Unnamed: 0,record_type,source_url,glide_id,source_level_country,source_title,source_desc,source_original_text,reference_url,text,authoring_org,reported_date
311,situation report,https://api.reliefweb.int/v1/reports/3968945,DR-2022-000238-DJI,Ethiopia,East Africa Cross Border Trade Bulletin (March...,agriculture; food and nutrition,- The share of maize in East Africa cross-bo...,https://reliefweb.int/attachments/54416ada-942...,- The share of maize in East Africa cross-bo...,FEWS NET,2023-06-07T00:00:00+00:00


In [None]:
, encoding='utf-8-sig'

## End

In [51]:


# Exploratory request: up to 5 situation reports, filtered on id with an upper bound of 4017723.
params = {
    'appname': 'amcross',
    'profile': 'full',
    'preset': 'latest',
    'limit': 5,
    'query[fields][]': 'format.name',
    'query[value]': 'Situation Report',
    'filter[field]': "id",
    'filter[value][to]': "4017723",
    # Alternatives that were tried and left disabled:
    #   'filter[field]': {"id": {"to":"4017723"}}
    #   'filter[field]': {"date.created": {"from":"2023-11-22T13:37:21+00:00", "to":"2023-11-24T13:37:21+00:00"}}
    # Without the next entry the API seems to return every field:
    #   'fields[include][]': ['source',"format",'title',"body",'disaster.glide','primary_country']
}

api_endpoint = 'https://api.reliefweb.int/v1/reports?appname=amcross'

# Send the request and keep the report list only when the call succeeded.
response = requests.get(api_endpoint, params=params)

if response.status_code != 200:
    print(f"Error: {response.status_code} - {response.text}")
else:
    # The body is JSON; the reports are under its 'data' key.
    data = response.json()
    situation_reports = data['data']

In [52]:
# Take the report list from the response of the request cell.
json_data = response.json()['data']
# Display how many reports came back.
len(json_data)

5

In [65]:
# Find the lowest and highest situation report ids already saved on disk.
fpath = "D://projects//_external_files//reliefweb_situation_reports//"
# The id is the filename part before the first "_". Keep numeric ids only and order them
# as numbers: a plain string sort would place a 6-digit id above a 7-digit one.
file_ids = sorted(
    (prefix for prefix in (f.split("_")[0] for f in os.listdir(fpath)) if prefix.isdecimal()),
    key=int,
)
print(file_ids)
if file_ids:
    # Named min_id / max_id so the built-in min() and max() are not shadowed in the kernel.
    min_id = file_ids[0]
    max_id = file_ids[-1]
    print(f"to: {min_id}")
    print(f"from: {max_id}")
else:
    print("no report files found")



['4017696', '4017713', '4017716', '4017720', '4017723', '4017728', '4017729', '4017737', '4017739', '4017764']
to: 4017696
from: 4017764


In [53]:
# Directory the report JSON files are written to.
fpath = "D://projects//_external_files//reliefweb_situation_reports//"

def generate_filename(json):
    """Build the file name '<id>_<date>_reliefweb_<country>.json' for one report.

    ``json`` is a single report mapping (the parameter name shadows the json
    module inside this function only). Every part falls back to a placeholder
    when the corresponding value is missing or malformed:

    * id      -> a random uuid4 hex string
    * date    -> 'yyyy_mm_dd' (otherwise the part of fields.date.original before "T")
    * country -> 'country_name' (otherwise fields.primary_country.name,
                 lower-cased with spaces replaced by underscores)

    Only lookup failures are tolerated; anything else, such as
    KeyboardInterrupt, propagates instead of being silently swallowed.
    """
    lookup_errors = (KeyError, IndexError, TypeError, AttributeError)

    try:
        rec_id = json['id']
    except lookup_errors:
        rec_id = uuid.uuid4().hex

    try:
        rec_date = json['fields']['date']['original'].split("T")[0]
    except lookup_errors:
        rec_date = 'yyyy_mm_dd'

    try:
        p_country = json['fields']['primary_country']['name'].replace(" ","_").lower()
    except lookup_errors:
        p_country = 'country_name'

    return f"{rec_id}_{rec_date}_reliefweb_{p_country}.json"


# Write every fetched report to its own JSON file under fpath.
for report in json_data:
    target_path = f"{fpath}{generate_filename(report)}"
    serialized = json.dumps(report)
    with open(target_path, 'w') as outfile:
        outfile.write(serialized)

In [100]:
def extract_themes(j):
    """Join the lower-cased theme names of a report with '; '.

    Pass the report's ``fields`` mapping, not the whole report. The result is
    None when ``fields`` has no ``theme`` entry.
    """
    themes = j.get('theme')
    if themes is None:
        return None

    lowered_names = [theme['name'].lower() for theme in themes]
    return '; '.join(lowered_names)

In [101]:
# Flatten the fetched reports into one row per paragraph of their body text.
# Rows are collected in a list and the frame is built once at the end; enlarging the
# frame with .loc for every paragraph copies it each time and gets slow quickly.
report_columns = ['record_type','source_url','glide_id','source_level_country','source_title','source_desc','source_original_text','reference_url','text','authoring_org','reported_date']
report_rows = []

for j in json_data:
    # href of the report itself; stored in the source_url column
    reference_url = j['href']
    fields = j['fields']

    # glide id of the first linked disaster, when there is one
    glide_id = None
    disaster = fields.get('disaster')
    if disaster:
        glide_id = disaster[0].get('glide')

    title = fields['title']
    original_text = fields['body']
    # url of the first attachment; stored in the reference_url column
    file_url = fields['file'][0]['url']
    primary_country = fields['primary_country']['shortname']
    author_org = fields['source'][0]['shortname']
    report_date = fields['date']['original']
    themes = extract_themes(fields)

    # Paragraphs are separated by blank lines; each one goes into both text columns.
    for paragraph in original_text.split("\n\n"):
        report_rows.append(['situation report', reference_url, glide_id, primary_country, title, themes,
                            paragraph, file_url, paragraph, author_org, report_date])

df_reliefweb_situation_report = pd.DataFrame(report_rows, columns=report_columns)

In [102]:
# Display the flattened frame built in the previous cell.
df_reliefweb_situation_report

Unnamed: 0,record_type,source_url,glide_id,source_level_country,source_title,source_desc,source_original_text,reference_url,text,authoring_org,reported_date
0,situation report,https://api.reliefweb.int/v1/reports/4017691,,World,Multi-country outbreak of mpox (monkeypox) - E...,health,**Highlights**,https://reliefweb.int/attachments/2e5a83c9-d6f...,**Highlights**,WHO,2023-11-25T00:00:00+00:00
1,situation report,https://api.reliefweb.int/v1/reports/4017691,,World,Multi-country outbreak of mpox (monkeypox) - E...,health,- The mpox surveillance reporting frequency ha...,https://reliefweb.int/attachments/2e5a83c9-d6f...,- The mpox surveillance reporting frequency ha...,WHO,2023-11-25T00:00:00+00:00
2,situation report,https://api.reliefweb.int/v1/reports/4017691,,World,Multi-country outbreak of mpox (monkeypox) - E...,health,- A total of 668 new laboratory-confirmed case...,https://reliefweb.int/attachments/2e5a83c9-d6f...,- A total of 668 new laboratory-confirmed case...,WHO,2023-11-25T00:00:00+00:00
3,situation report,https://api.reliefweb.int/v1/reports/4017691,,World,Multi-country outbreak of mpox (monkeypox) - E...,health,- Based on the data reported through global su...,https://reliefweb.int/attachments/2e5a83c9-d6f...,- Based on the data reported through global su...,WHO,2023-11-25T00:00:00+00:00
4,situation report,https://api.reliefweb.int/v1/reports/4017691,,World,Multi-country outbreak of mpox (monkeypox) - E...,health,- WHO has published a Disease Outbreak News ou...,https://reliefweb.int/attachments/2e5a83c9-d6f...,- WHO has published a Disease Outbreak News ou...,WHO,2023-11-25T00:00:00+00:00
...,...,...,...,...,...,...,...,...,...,...,...
1112,situation report,https://api.reliefweb.int/v1/reports/4016821,EP-2023-000181-SDN,South Sudan,South Sudan: Response to the Sudan Crisis Situ...,contributions; coordination; food and nutritio...,- Following the onset of the conflict in Sudan...,https://reliefweb.int/attachments/a94aa91b-f5d...,- Following the onset of the conflict in Sudan...,OCHA,2023-11-22T00:00:00+00:00
1113,situation report,https://api.reliefweb.int/v1/reports/4016821,EP-2023-000181-SDN,South Sudan,South Sudan: Response to the Sudan Crisis Situ...,contributions; coordination; food and nutritio...,- Of the 22 points of entry (PoEs) that are mo...,https://reliefweb.int/attachments/a94aa91b-f5d...,- Of the 22 points of entry (PoEs) that are mo...,OCHA,2023-11-22T00:00:00+00:00
1114,situation report,https://api.reliefweb.int/v1/reports/4016821,EP-2023-000181-SDN,South Sudan,South Sudan: Response to the Sudan Crisis Situ...,contributions; coordination; food and nutritio...,"- Currently, the onward transportation assista...",https://reliefweb.int/attachments/a94aa91b-f5d...,"- Currently, the onward transportation assista...",OCHA,2023-11-22T00:00:00+00:00
1115,situation report,https://api.reliefweb.int/v1/reports/4016821,EP-2023-000181-SDN,South Sudan,South Sudan: Response to the Sudan Crisis Situ...,contributions; coordination; food and nutritio...,- The number of Sudanese refugees and asylum s...,https://reliefweb.int/attachments/a94aa91b-f5d...,- The number of Sudanese refugees and asylum s...,OCHA,2023-11-22T00:00:00+00:00


In [104]:
# Export the flattened frame to CSV, encoded as utf-8-sig and without the index column.
df_reliefweb_situation_report.to_csv("c://temp//100_situation_reports.csv", encoding='utf-8-sig', index=False)

In [84]:
# Show what a uuid4 hex string looks like (generate_filename uses one as its fallback id).
x = uuid.uuid4().hex
x

'7a823776db334c81960785e42707cccd'