# Exploring ACLED data

In [1]:
# African dataset
# https://www.acleddata.com/curated-data-files/

In [18]:
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

## Exporting the Excel file as JSON

In [None]:
# import excel file
# df = pd.read_excel('../data/Africa_1997-2019_Oct12-1.xlsx')

In [None]:
# New column for date
# df['date'] = pd.to_datetime(df['EVENT_DATE']).dt.date

In [None]:
# export dataframe as json
# df.to_json('../data/africa-data.json')

## Importing data

In [3]:
# Load the event table from the JSON export.
data_path = '../data/africa-data.json'
df = pd.read_json(data_path)

In [4]:
# (rows, columns) of the frame that was just loaded.
df.shape

(193565, 30)

## Data formatting
- Rename columns
- Aggregate by country/month
- Encode values

In [5]:
# Keep only the columns used in this notebook and give them lower-case names.
# `rename` ignores labels that are not present, so re-running this cell on the
# already-renamed frame is a no-op instead of raising a KeyError that then has
# to be swallowed.
column_names = {
    'YEAR': 'year',
    'EVENT_TYPE': 'event_type',
    'SUB_EVENT_TYPE': 'sub_event_type',
    'ACTOR1': 'actor',
    'REGION': 'region',
    'COUNTRY': 'country',
    'SOURCE': 'source',
    'FATALITIES': 'fatalities',
    'date': 'date',
}
# Original labels, kept under the name the cell has always defined.
columns = list(column_names)

renamed = df.rename(columns=column_names)
missing = [name for name in column_names.values() if name not in renamed.columns]
if missing:
    # Fail here, with the offending names, rather than printing and carrying on
    # with a frame whose columns do not match what the following cells expect.
    raise KeyError('columns missing from the event table: {}'.format(missing))
df = renamed[list(column_names.values())]

df.columns

Index(['year', 'event_type', 'sub_event_type', 'actor', 'region', 'country',
       'source', 'fatalities', 'date'],
      dtype='object')

In [21]:
# Outline of the reshape steps. This is a bare string expression rather than a
# comment, so the notebook echoes it back as the cell's output.
'''Dataset reshape
- Count events by country
- Aggregate data by month
'''

'Dataset reshape\n- Count events by country\n- Aggregate data by month'

In [None]:
def country_data(country, data):
    """Return the events recorded for one country, ordered by date.

    Parameters
    ----------
    country : str
        Value matched exactly against the ``country`` column.
    data : pandas.DataFrame
        Event table using the renamed, lower-case column names.

    Returns
    -------
    pandas.DataFrame
        Rows of ``data`` whose ``country`` equals ``country``, restricted to
        the ``country``, ``event_type``, ``fatalities``, ``date`` and
        ``sub_event_type`` columns and sorted by ``date``.
    """
    wanted = ['country', 'event_type', 'fatalities', 'date', 'sub_event_type']
    subset = data.filter(wanted)
    # A boolean mask (rather than a query string) means names containing
    # quotes or apostrophes cannot break the filter expression.
    return subset[subset['country'] == country].sort_values(by='date')


df_country = country_data('Algeria', df)

df_country.head()

In [None]:
# Preview the first five rows of the reshaped table.
df.head(n=5)

## Data exploration

In [None]:
# Default figure size applied through matplotlib's rcParams.
plt.rcParams.update({'figure.figsize': (15, 8)})

In [None]:
# List the distinct event types and sub-types present in the data.
event_types = df['event_type'].unique()
event_subtypes = df['sub_event_type'].unique()
print('Event types: {}\n'.format(event_types))
print('Event subtypes: {}'.format(event_subtypes))

In [None]:
# Bar chart of how many rows fall under each event type.
sns.countplot(data=df, x='event_type', color='lightcoral')

In [None]:
# Time series of event counts, one value per distinct date.
# Approach adapted from https://www.kaggle.com/lordkada/acled-simple-analysis
event_type = df.groupby('date')['event_type'].count()
event_type.plot(style='.', alpha=0.5, figsize=(20, 10), subplots=True)

# Headline totals for the whole table.
n_countries = df['country'].nunique()
n_events = df['event_type'].count()
print("1997-2019 - Number of events in {} countries: {}".format(n_countries, n_events))

In [None]:
# Count rows per (date, sub_event_type) pair, then move `date` out of the index
# into a regular column; `sub_event_type` stays as the index.
# The column is selected with a list so the result is explicitly a one-column
# DataFrame, and the index reset returns a new frame instead of mutating in place.
sub_events = (
    df.groupby(['date', 'sub_event_type'])[['sub_event_type']]
    .count()
    .reset_index(level='date')
)
sub_events.head()

In [None]:
# Number of rows per value of `year`, drawn as a bar chart.
year = df.groupby('year')['event_type'].count()
year.plot.bar(color='lightcoral')

In [None]:
# Counts selected with the '1999' label (presumably a partial date-string match
# on a datetime index): raw values as crosses, then a 12-row rolling mean.
# NOTE(review): `event_type` holds one row per date, so `rolling(12)` averages
# 12 consecutive rows rather than 12 months — confirm that is the intended window.
events_1999 = event_type['1999']
events_1999.plot(style='x', alpha=0.5, linewidth=5, figsize=(20, 10), fontsize=20)
smoothed_1999 = events_1999.rolling(12).mean()
smoothed_1999.plot(alpha=0.5, linewidth=2, figsize=(20, 10), fontsize=20)


In [None]:
# Per-date counts as faint dots, followed by the row-to-row difference, which
# makes sudden jumps and drops stand out.
event_type.plot(style='.', alpha=0.2, linewidth=5, figsize=(20, 10), fontsize=20)
count_change = event_type.diff()
count_change.plot(alpha=0.4, linewidth=1, figsize=(20, 10), fontsize=20)

# Models: Finding outliers by month

In [None]:
# Using k-clustering to find anomalies compared with the previous month
# https://towardsdatascience.com/time-series-of-price-anomaly-detection-13586cd5ff46