# Data Preprocessing - Part 4 (Maps)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def load_data(file_path, sep=',', encoding='utf-8', **read_csv_kwargs):
    """Load data from a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.
    sep : str, default ','
        Field delimiter.
    encoding : str, default 'utf-8'
        Text encoding of the file.
    **read_csv_kwargs
        Extra keyword arguments forwarded unchanged to ``pd.read_csv``,
        e.g. ``decimal=','`` for files that write numbers with a decimal
        comma, so they are parsed as floats instead of strings.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` when reading failed. Failures are
        reported on stdout instead of being raised, so callers must be
        prepared to receive ``None``.
    """
    try:
        data = pd.read_csv(file_path, sep=sep, encoding=encoding, **read_csv_kwargs)
    except Exception as e:
        # Deliberate best-effort: report the problem and let the notebook
        # continue; the caller decides what to do with a missing frame.
        print(f"An error occurred while loading the data: {e}")
        return None

    # Only reached when read_csv succeeded.
    print("Data loaded successfully.")
    return data

def summarize_data(data):
    """Print and return the descriptive statistics of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame or None
        Frame to describe; ``None`` is accepted and means "nothing loaded".

    Returns
    -------
    pandas.DataFrame or None
        The result of ``data.describe()``, or ``None`` when ``data`` is
        ``None`` (a notice is printed in that case).
    """
    # Guard clause: nothing to describe when loading failed upstream.
    if data is None:
        print("No data to summarize.")
        return None

    stats = data.describe()
    print("Data summary:")
    print(stats)
    return stats

## Choropleth Map based on the ACLED Data

In [3]:
# Load the semicolon-delimited ACLED extract, then print its numeric summary.
# load_data returns None on failure, in which case summarize_data prints a
# notice and `summary` is None as well.
data = load_data(file_path="../data/apac_data.csv", sep=';')
summary = summarize_data(data=data)

Data loaded successfully.
Data summary:
              EVENTS     FATALITIES  POPULATION_EXPOSURE             ID
count  197730.000000  197730.000000         9.976600e+04  197730.000000
mean        3.459632       2.036479         9.523680e+04    1538.280291
std         7.029385       9.795868         2.026675e+05     993.437542
min         1.000000       0.000000         1.000000e+00       1.000000
25%         1.000000       0.000000         1.054400e+04     593.000000
50%         1.000000       0.000000         3.482700e+04    1314.000000
75%         3.000000       0.000000         9.525400e+04    2283.000000
max       190.000000     618.000000         5.030356e+06    4157.000000


In [4]:
# Preview the first rows to check the column layout after loading.
data.head()

Unnamed: 0,WEEK,REGION,COUNTRY,ADMIN1,EVENT_TYPE,SUB_EVENT_TYPE,EVENTS,FATALITIES,POPULATION_EXPOSURE,DISORDER_TYPE,ID,CENTROID_LATITUDE,CENTROID_LONGITUDE
0,31-dicembre-2016,Caucasus and Central Asia,Afghanistan,Badakhshan,Battles,Armed clash,6,15,,Political violence,1,36966,733417
1,07-gennaio-2017,Caucasus and Central Asia,Afghanistan,Badakhshan,Battles,Armed clash,5,28,,Political violence,1,36966,733417
2,21-gennaio-2017,Caucasus and Central Asia,Afghanistan,Badakhshan,Battles,Armed clash,1,4,,Political violence,1,36966,733417
3,04-febbraio-2017,Caucasus and Central Asia,Afghanistan,Badakhshan,Battles,Armed clash,1,4,,Political violence,1,36966,733417
4,11-febbraio-2017,Caucasus and Central Asia,Afghanistan,Badakhshan,Battles,Armed clash,4,41,,Political violence,1,36966,733417


In [5]:
def _week_to_year(week_label):
    """Return the year of a WEEK label such as '31-dicembre-2016' (third dash-separated token)."""
    return int(week_label.split('-')[2])


# Add a YEAR column derived from WEEK (this is done on the full frame,
# before any country filtering).
data['YEAR'] = data['WEEK'].map(_week_to_year)

# Filtering for Afghanistan
afg_data = data.loc[data['COUNTRY'] == 'Afghanistan']

# Total EVENTS per (ADMIN1, EVENT_TYPE). The centroid columns are part of the
# grouping key only so that they are carried through to the result.
group_keys = ['ADMIN1', 'EVENT_TYPE', 'CENTROID_LATITUDE', 'CENTROID_LONGITUDE']
grouped_data = afg_data.groupby(group_keys, as_index=False)['EVENTS'].sum()

# Number of event types recorded per ADMIN1 (reveals provinces missing a type).
grouped_data['ADMIN1'].value_counts()

ADMIN1
Badakhshan    6
Badghis       6
Wardak        6
Urozgan       6
Takhar        6
Samangan      6
Parwan        6
Panjshir      6
Paktika       6
Paktia        6
Nuristan      6
Nimruz        6
Nangarhar     6
Logar         6
Laghman       6
Kunduz        6
Kunar         6
Khost         6
Kapisa        6
Baghlan       6
Balkh         6
Bamyan        6
Daykundi      6
Farah         6
Faryab        6
Ghazni        6
Ghor          6
Helmand       6
Herat         6
Jowzjan       6
Kabul         6
Kandahar      6
Zabul         6
Sar-e Pol     5
Name: count, dtype: int64

In [6]:
# Preview the per-(ADMIN1, EVENT_TYPE) event totals held in grouped_data.
grouped_data.head()

Unnamed: 0,ADMIN1,EVENT_TYPE,CENTROID_LATITUDE,CENTROID_LONGITUDE,EVENTS
0,Badakhshan,Battles,36966,733417,842
1,Badakhshan,Explosions/Remote violence,36966,733417,233
2,Badakhshan,Protests,36966,733417,49
3,Badakhshan,Riots,36966,733417,15
4,Badakhshan,Strategic developments,36966,733417,74


In [7]:
# For each ADMIN1, find the EVENT_TYPE with the maximum EVENTS:
# idxmax yields one row label per ADMIN1 group, and .loc pulls those rows.
max_events = grouped_data.loc[grouped_data.groupby('ADMIN1')['EVENTS'].idxmax()]

# Upper bound for the normalization of EVENTS.
# NOTE: this is the largest total for a single (ADMIN1, EVENT_TYPE) pair,
# not the total over all event types of an ADMIN1, despite the message wording.
max_event = max_events['EVENTS'].max()
print(f"Maximum number of events in a single ADMIN1: {max_event}")

# Lower bound is pinned to 0 instead of being taken from the data.
min_event = 0

Maximum number of events in a single ADMIN1: 4119


In [8]:
# Look up the centroid recorded for Sar-e Pol so that a missing entry for this
# ADMIN1 can be added with coordinates. The first matching row is used.
is_sarepol = afg_data['ADMIN1'] == 'Sar-e Pol'
sarepol_data = afg_data[is_sarepol]
sarepol_lat, sarepol_lon = (
    sarepol_data[coord_column].iat[0]
    for coord_column in ('CENTROID_LATITUDE', 'CENTROID_LONGITUDE')
)

print(f"Sar-e Pol centroid: Latitude {sarepol_lat}, Longitude {sarepol_lon}")

Sar-e Pol centroid: Latitude 35,7152, Longitude 66,088


In [9]:
# Select every row of the full `data` frame (not afg_data) whose ADMIN1 is
# 'Sar-e Pol' and print its centroid columns.
# NOTE(review): despite its name, sarepol_row holds all matching rows, not one.
# The printed dtype is object: the coordinates are strings with a decimal comma.
sarepol_row = data[data['ADMIN1'] == 'Sar-e Pol']
print(sarepol_row['CENTROID_LATITUDE'])
print(sarepol_row['CENTROID_LONGITUDE'])

21766    35,7152
21767    35,7152
21768    35,7152
21769    35,7152
21770    35,7152
          ...   
22153    35,7152
22154    35,7152
22155    35,7152
22156    35,7152
22157    35,7152
Name: CENTROID_LATITUDE, Length: 392, dtype: object
21766    66,088
21767    66,088
21768    66,088
21769    66,088
21770    66,088
          ...  
22153    66,088
22154    66,088
22155    66,088
22156    66,088
22157    66,088
Name: CENTROID_LONGITUDE, Length: 392, dtype: object


In [10]:
# Add an explicit zero-count row for ADMIN1 == 'Sar-e Pol' and
# EVENT_TYPE == 'Riots', which has no row in grouped_data.
# The check makes the cell safe to re-run: the row is appended at most once.
riots_row_missing = not (
    (grouped_data['ADMIN1'] == 'Sar-e Pol') & (grouped_data['EVENT_TYPE'] == 'Riots')
).any()

if riots_row_missing:
    new_entry = pd.DataFrame({
        'ADMIN1': ['Sar-e Pol'],
        'EVENT_TYPE': ['Riots'],
        'CENTROID_LATITUDE': [sarepol_lat],
        'CENTROID_LONGITUDE': [sarepol_lon],
        'EVENTS': [0],
    })
    # Only fill NORMALIZED_EVENTS when grouped_data already has that column.
    # Setting it unconditionally would make concat create the column with NaN
    # in every pre-existing row before the normalization has been computed.
    if 'NORMALIZED_EVENTS' in grouped_data.columns:
        new_entry['NORMALIZED_EVENTS'] = 0.0
    grouped_data = pd.concat([grouped_data, new_entry], ignore_index=True)


In [11]:
# Rescale EVENTS relative to the [min_event, max_event] range so the values
# can drive a color scale.
event_range = max_event - min_event
grouped_data['NORMALIZED_EVENTS'] = grouped_data['EVENTS'].sub(min_event).div(event_range)
grouped_data.head()

Unnamed: 0,ADMIN1,EVENT_TYPE,CENTROID_LATITUDE,CENTROID_LONGITUDE,EVENTS,NORMALIZED_EVENTS
0,Badakhshan,Battles,36966,733417,842,0.204419
1,Badakhshan,Explosions/Remote violence,36966,733417,233,0.056567
2,Badakhshan,Protests,36966,733417,49,0.011896
3,Badakhshan,Riots,36966,733417,15,0.003642
4,Badakhshan,Strategic developments,36966,733417,74,0.017966


In [12]:
# Show all Sar-e Pol rows of grouped_data. Left as the cell's last expression
# (instead of print) so the frame renders as a table rather than as
# line-wrapped plain text.
grouped_data[grouped_data['ADMIN1'] == 'Sar-e Pol']

        ADMIN1                  EVENT_TYPE CENTROID_LATITUDE  \
174  Sar-e Pol                     Battles           35,7152   
175  Sar-e Pol  Explosions/Remote violence           35,7152   
176  Sar-e Pol                    Protests           35,7152   
177  Sar-e Pol      Strategic developments           35,7152   
178  Sar-e Pol  Violence against civilians           35,7152   
203  Sar-e Pol                       Riots           35,7152   

    CENTROID_LONGITUDE  EVENTS  NORMALIZED_EVENTS  
174             66,088     392           0.095169  
175             66,088     120           0.029133  
176             66,088       8           0.001942  
177             66,088      48           0.011653  
178             66,088      65           0.015781  
203             66,088       0           0.000000  


In [13]:
# Convert CENTROID_LATITUDE and CENTROID_LONGITUDE to numeric.
# Pay attention: the values are strings that use a comma as the decimal mark,
# so the comma is replaced with a dot before parsing.
for coord_column in ('CENTROID_LATITUDE', 'CENTROID_LONGITUDE'):
    # astype(str) keeps the cell re-runnable: once a column is already numeric
    # the .str accessor would otherwise raise. regex=False makes the literal
    # replacement explicit instead of relying on the pandas default.
    coord_text = grouped_data[coord_column].astype(str).str.replace(',', '.', regex=False)
    # errors='coerce' turns anything unparseable into NaN rather than raising.
    grouped_data[coord_column] = pd.to_numeric(coord_text, errors='coerce')

grouped_data.head()

Unnamed: 0,ADMIN1,EVENT_TYPE,CENTROID_LATITUDE,CENTROID_LONGITUDE,EVENTS,NORMALIZED_EVENTS
0,Badakhshan,Battles,36.966,73.3417,842,0.204419
1,Badakhshan,Explosions/Remote violence,36.966,73.3417,233,0.056567
2,Badakhshan,Protests,36.966,73.3417,49,0.011896
3,Badakhshan,Riots,36.966,73.3417,15,0.003642
4,Badakhshan,Strategic developments,36.966,73.3417,74,0.017966


In [15]:
# List the distinct event categories present in grouped_data.
grouped_data['EVENT_TYPE'].unique()

array(['Battles', 'Explosions/Remote violence', 'Protests', 'Riots',
       'Strategic developments', 'Violence against civilians'],
      dtype=object)

In [None]:
# Saving grouped_data to CSV (disabled; uncomment the line below to write the file)
#grouped_data.to_csv("../data/section_4/afg_choropleth.csv", index=False)

In [18]:
# afg_data['ADMIN1'].unique()

## Migrations

In [16]:
# Load the migration CSV; it is comma-separated, so load_data's default
# separator applies. Then preview the first rows.
migration_data = load_data(file_path="../data/hdx_hapi_refugees_afg.csv")
migration_data.head(n=5)

Data loaded successfully.


Unnamed: 0,origin_location_code,origin_has_hrp,origin_in_gho,asylum_location_code,asylum_has_hrp,asylum_in_gho,population_group,gender,age_range,min_age,max_age,population,reference_period_start,reference_period_end
0,AFG,True,True,AFG,True,True,OOC,f,0-4,0.0,4.0,8751,2020-01-01,2020-12-31
1,AFG,True,True,AFG,True,True,OOC,f,5-11,5.0,11.0,8775,2020-01-01,2020-12-31
2,AFG,True,True,AFG,True,True,OOC,f,12-17,12.0,17.0,5828,2020-01-01,2020-12-31
3,AFG,True,True,AFG,True,True,OOC,f,18-59,18.0,59.0,18958,2020-01-01,2020-12-31
4,AFG,True,True,AFG,True,True,OOC,f,60+,60.0,,587,2020-01-01,2020-12-31
