# Data Exploration, Cleaning and Visualization

We start with data acquisition, followed by preparation, cleaning, exploration, and visualization.

Importing the required libraries

In [31]:
import os
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

Below is the API code for fetching food security data for Kenya from 2023-01-01 to 2025-05-25.

In [2]:
def fetch_data_from_api(base_url, endpoint, params, headers=None, timeout=30):
    """
    Fetches JSON data from a REST API with a single GET request.

    Args:
        base_url (str): The base URL of the API.
        endpoint (str): The specific API endpoint; appended to base_url as-is.
        params (dict): A dictionary of query parameters.
        headers (dict, optional): A dictionary of HTTP headers (e.g., for authentication).
        timeout (float or tuple, optional): Seconds to wait for the server before
            giving up; passed straight to requests.get. Defaults to 30.

    Returns:
        The decoded JSON body (whatever structure the API returns), or None if the
        request fails or the body is not valid JSON. Failures are printed, not raised.
    """
    url = f"{base_url}{endpoint}"

    try:
        # Without a timeout, requests waits indefinitely on an unresponsive server.
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for 4xx/5xx responses.
        return response.json()

    except requests.exceptions.HTTPError as http_err:
        # Read the status from the exception so this handler never depends on
        # the local `response` name being bound.
        status = getattr(getattr(http_err, "response", None), "status_code", "unknown")
        print(f"HTTP error occurred: {http_err} - Response status: {status}")

    except requests.exceptions.ConnectionError as conn_err:
        print(f"Connection error occurred: {conn_err}")

    except requests.exceptions.Timeout as timeout_err:
        print(f"Timeout error occurred: {timeout_err}")

    except requests.exceptions.RequestException as req_err:
        print(f"An error occurred: {req_err}")

    except ValueError as json_err:
        # response.json() raises a ValueError subclass when the body is not JSON.
        print(f"Response body is not valid JSON: {json_err}")

    return None

In [32]:
# Country and date range for which data is requested
country_name = "Kenya"
start_date = "2023-01-01"
end_date = "2025-05-25"

In [33]:
# --- Configuration for FEWS NET food security data ---
fews_net_base_url = "https://fdw.fews.net"

# Endpoint for food security classifications.
# Provides data on IPC (Integrated Food Security Phase Classification) phases.
fews_net_endpoint = "/api/ipcclassification.json"


In [34]:
# Query parameters sent with the FEWS NET request
fews_net_params = {
    "country": country_name,
    "start_date": start_date,
    "end_date": end_date,
    "format": "json",
    "limit": 10000,
}

# Issue the request; `data` is None when the request fails
data = fetch_data_from_api(
    base_url=fews_net_base_url,
    endpoint=fews_net_endpoint,
    params=fews_net_params,
)

In [6]:
# Load the FEWS NET API response into a DataFrame.
# NOTE: fetch_data_from_api returns None on failure, in which case this frame is empty.
df = pd.DataFrame(data)

#### Cleaning the FEWS NET food security data frame

In [8]:
# Drop the date-range, administrative-level and geographic-group columns listed below.
# NOTE(review): 'admin_1' is removed here although it is the join key used when
# merging the other datasets - confirm this frame is not meant to be joined on it.
df = df.drop(
    labels=[
        'start',
        'end',
        'admin_0',
        'admin_2',
        'admin_1',
        'admin_3',
        'admin_4',
        'geographic_group',
    ],
    axis="columns",
)

## Loading the Data Files for FAO and ACLED for cleaning

In [10]:
# Read the FAO and ACLED CSV files from the local data folder
fao = pd.read_csv("../data/FAO.csv")
acled = pd.read_csv("../data/Acled.csv")

In [11]:
# Remove the column limit so wide DataFrames are displayed without truncation
pd.set_option("display.max_columns", None)

#### Cleaning the FAO data file

In [12]:
# Keep only the FAO columns listed below
required = [
    'admin_1',
    'market',
    'cpcv2_description',
    'is_staple_food',
    'longitude',
    'latitude',
    'collection_schedule',
    'created',
    'modified',
    'value',
    'common_unit_price',
]
fao = fao[required]

In [13]:
# Drop every row that has a missing value in any retained column.
# Rebinding (instead of inplace=True) avoids mutating the frame in place across cells.
fao = fao.dropna()

In [14]:
# Display the first 3 rows of the FAO frame
fao.head(3)

Unnamed: 0,admin_1,market,cpcv2_description,is_staple_food,longitude,latitude,collection_schedule,created,modified,value,common_unit_price
0,Isiolo,Baringo,"Goats, local quality",False,39.113776,0.858767,Monthly,2020-09-07T13:57:16,2024-12-27T13:41:51,1060.0,1060.0
1,Isiolo,Baringo,"Goats, local quality",False,39.113776,0.858767,Monthly,2020-09-07T13:57:19,2024-12-27T13:41:51,957.0,957.0
2,Isiolo,Baringo,"Goats, local quality",False,39.113776,0.858767,Monthly,2020-09-07T13:57:21,2024-12-27T13:41:51,876.0,876.0


#### Cleaning the ACLED data file

In [15]:
# Keep only the ACLED columns listed below
required = [
    'event_date',
    'country',
    'event_type',
    'sub_event_type',
    'actor1',
    'fatalities',
    'interaction',
    'location',
    'geo_precision',
    'admin1',
]
acled = acled[required]

In [16]:
# Rename ACLED's 'admin1' column to 'admin_1' so it matches the key name used by the FAO frame when merging
acled = acled.rename(columns={'admin1': 'admin_1'})

In [17]:
# Summarise the ACLED frame: column names, non-null counts and dtypes
acled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4643 entries, 0 to 4642
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   event_date      4643 non-null   object
 1   country         4643 non-null   object
 2   event_type      4643 non-null   object
 3   sub_event_type  4643 non-null   object
 4   actor1          4643 non-null   object
 5   fatalities      4643 non-null   int64 
 6   interaction     4643 non-null   object
 7   location        4643 non-null   object
 8   geo_precision   4643 non-null   int64 
 9   admin_1         4643 non-null   object
dtypes: int64(2), object(8)
memory usage: 362.9+ KB


## Loading the Rainfall Data Files

In [18]:
def load_and_merge_rainfall_data(folder_path, years=('2023', '2024')):
    """
    Load every CSV file in ``folder_path`` and combine them into wide and long tables.

    Each file is tagged with an ``admin_1`` value taken from its file name
    (without the extension). Every file must contain a 'Month' column plus one
    column per entry in ``years``.

    Args:
        folder_path (str): Directory containing the rainfall CSV files.
        years (sequence of str, optional): Names of the year columns to keep.
            Defaults to ('2023', '2024').

    Returns:
        tuple: ``(merged_data, long_data)`` where
            - merged_data is the wide frame produced by outer-merging all files
              on 'Month'; columns of every file after the first carry a
              ``_<file name>`` suffix.
            - long_data has one row per Month / admin_1 / year with the columns
              ['Month', 'admin_1', 'year', 'rainfall'].

    Raises:
        ValueError: If the folder contains no '.csv' files.
    """
    years = [str(year) for year in years]

    # Sort the listing: os.listdir returns names in arbitrary order, and the
    # merge order decides which file keeps the unsuffixed columns.
    data_frames = {}
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.csv'):
            continue

        # The file name (minus extension) becomes the admin_1 label
        county_name = os.path.splitext(file_name)[0]
        county_df = pd.read_csv(os.path.join(folder_path, file_name))
        county_df['admin_1'] = county_name

        # Drop the 'Unnamed: 4' column when a file has one
        county_df = county_df.drop(columns=['Unnamed: 4'], errors='ignore')

        data_frames[county_name] = county_df

    if not data_frames:
        raise ValueError(f"No CSV files found in {folder_path!r}")

    # Merge all frames into one wide frame keyed on 'Month'
    cols_to_merge = ['Month', *years, 'admin_1']
    merged_data = None
    for county_name, county_df in data_frames.items():
        if merged_data is None:
            # The first file keeps all of its columns, unsuffixed
            merged_data = county_df
        else:
            merged_data = merged_data.merge(
                county_df[cols_to_merge],
                on='Month',
                how='outer',
                suffixes=(None, f"_{county_name}"),
            )

    # Reshape the wide frame into long format, one block of rows per year column
    long_parts = []
    for col in merged_data.columns:
        # Split on the FIRST underscore only, so file names that themselves
        # contain underscores (e.g. "Tana_River") keep their full suffix.
        base, _, suffix = str(col).partition('_')
        if base not in years:
            continue

        admin_col = f"admin_1_{suffix}" if suffix else 'admin_1'
        # Fall back to the unsuffixed 'admin_1' if the specific column doesn't exist
        if admin_col not in merged_data.columns:
            admin_col = 'admin_1'

        long_parts.append(pd.DataFrame({
            'Month': merged_data['Month'],
            'admin_1': merged_data[admin_col],
            'year': base,
            'rainfall': merged_data[col],
        }))

    if long_parts:
        long_data = pd.concat(long_parts, ignore_index=True)
    else:
        long_data = pd.DataFrame(columns=['Month', 'admin_1', 'year', 'rainfall'])

    return merged_data, long_data


In [19]:
# Path to the folder containing the rainfall CSV files
folder_path = '../chirp'  

# Load and merge the rainfall data; returns the wide (merged_df) and long (long_df) frames
merged_df, long_df = load_and_merge_rainfall_data(folder_path)

## Merging the acled and fao datasets

In [20]:
"""
Convert the event_date column in the ACLED DataFrame to a datetime format and,
create a new column year_month that represents the year and month as a period
"""
# ACLED: parse 'event_date' and reduce it to a monthly period in 'year_month'
acled['year_month'] = (
    pd.to_datetime(acled['event_date'])
    .dt.to_period('M')
)

# FAO: parse the 'created' timestamp and reduce it to a monthly period,
# stored in a new 'year_month' column
fao['year_month'] = (
    pd.to_datetime(fao['created'])
    .dt.to_period('M')
)

In [21]:
# Write both frames to CSV files in the current working directory, without the index
fao.to_csv("fao.csv", index=False)
acled.to_csv("acled.csv", index=False)

In [22]:
"""
Merging the ACLED and FAO DataFrames by first creating a new 'year_month' column in both DataFrames by converting
their respective date columns to a period format (Year-Month)

"""

# Inner join of ACLED and FAO on admin_1 and month: only (admin_1, year_month)
# pairs present in both frames survive. 'year_month' is computed on both sides
# via assign, so this cell does not rely on the column already existing.
merged = (
    acled
    .assign(year_month=pd.to_datetime(acled['event_date']).dt.to_period('M'))
    .merge(
        fao.assign(year_month=pd.to_datetime(fao['created']).dt.to_period('M')),
        on=['admin_1', 'year_month'],
        how='inner',
    )
)

In [23]:
# Preview the first 3 rows of the ACLED/FAO merge
merged.head(3)

Unnamed: 0,event_date,country,event_type,sub_event_type,actor1,fatalities,interaction,location,geo_precision,admin_1,year_month,market,cpcv2_description,is_staple_food,longitude,latitude,collection_schedule,created,modified,value,common_unit_price
0,01 January 2025,Kenya,Riots,Mob violence,Rioters (Kenya),0,Rioters-Civilians,Nairobi - Embakasi North,1,Nairobi,2025-01,Nairobi,"Beans, dry, rosecoco",False,36.8317,-1.28255,Monthly,2025-01-13T12:44:28,2025-01-24T05:50:20,12690.0,141.0
1,01 January 2025,Kenya,Riots,Mob violence,Rioters (Kenya),0,Rioters-Civilians,Nairobi - Embakasi North,1,Nairobi,2025-01,Nairobi,"Beans, dry, rosecoco",False,36.8317,-1.28255,Monthly,2025-01-13T15:53:17,2025-01-24T05:50:20,11610.0,129.0
2,01 January 2025,Kenya,Riots,Mob violence,Rioters (Kenya),0,Rioters-Civilians,Nairobi - Embakasi North,1,Nairobi,2025-01,Nairobi,"Beans, dry, rosecoco",False,36.8317,-1.28255,Monthly,2025-01-13T17:24:16,2025-01-24T05:50:20,12240.0,136.0


## Merging Merged DataFrame and Rainfall DataFrame

In [24]:
# Build a monthly period column ('year_month') from the 'year' and 'Month' columns.
# The combined strings look like "2023-Apr", so the format is given explicitly
# instead of letting pandas guess it element by element.
# NOTE(review): assumes every Month value is a three-letter English abbreviation - confirm.
long_df['year_month'] = pd.to_datetime(
    long_df['year'].astype(str) + '-' + long_df['Month'].astype(str),
    format='%Y-%b',
).dt.to_period('M')

  long_df['year_month'] = pd.to_datetime(long_df['year'].astype(str) + '-' + long_df['Month'].astype(str)).dt.to_period('M')


In [25]:
#long_df.to_csv("rainfall.csv", index=False)

In [26]:
# Preview the first 2 rows of the long-format rainfall frame
long_df.head(2)

Unnamed: 0,Month,admin_1,year,rainfall,year_month
0,Apr,Mandera,2023,223.3755,2023-04
1,Aug,Mandera,2023,3.1516,2023-08


In [27]:
# Inner join of the ACLED/FAO frame with the long-format rainfall frame,
# matching rows on admin_1 and year_month
merged_data = merged.merge(
    long_df,
    on=['admin_1', 'year_month'],
    how='inner',
)

In [28]:
# Summarise the final merged frame: column names, non-null counts and dtypes
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13958 entries, 0 to 13957
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype    
---  ------               --------------  -----    
 0   event_date           13958 non-null  object   
 1   country              13958 non-null  object   
 2   event_type           13958 non-null  object   
 3   sub_event_type       13958 non-null  object   
 4   actor1               13958 non-null  object   
 5   fatalities           13958 non-null  int64    
 6   interaction          13958 non-null  object   
 7   location             13958 non-null  object   
 8   geo_precision        13958 non-null  int64    
 9   admin_1              13958 non-null  object   
 10  year_month           13958 non-null  period[M]
 11  market               13958 non-null  object   
 12  cpcv2_description    13958 non-null  object   
 13  is_staple_food       13958 non-null  bool     
 14  longitude            13958 non-null  float64  
 15  la

In [29]:
# Saving the merged data file 
#merged_data.to_csv("merged_data.csv", index=False)

In [30]:
# Preview the first 2 rows of the final merged frame
merged_data.head(2)

Unnamed: 0,event_date,country,event_type,sub_event_type,actor1,fatalities,interaction,location,geo_precision,admin_1,year_month,market,cpcv2_description,is_staple_food,longitude,latitude,collection_schedule,created,modified,value,common_unit_price,Month,year,rainfall
0,31 December 2024,Kenya,Protests,Peaceful protest,Protesters (Kenya),0,Protesters only,Mombasa,1,Mombasa,2024-12,Composite (Mombasa),"Tea, Mombasa",False,39.6606,-4.06154,Monthly,2024-12-17T19:59:02,2024-12-27T13:42:19,2.066,2.066,Dec,2024,4.2088
1,31 December 2024,Kenya,Protests,Peaceful protest,Protesters (Kenya),0,Protesters only,Mombasa,1,Mombasa,2024-12,Composite (Mombasa),"Tea, Mombasa",False,39.6606,-4.06154,Monthly,2024-12-17T20:00:08,2024-12-27T13:42:19,2.2375,2.2375,Dec,2024,4.2088
