# Using the datasets module
This notebook must be executed inside the examples/ directory.

In [1]:
import os
assert (os.getcwd().endswith('examples')), '%s\n%s "%s"' % (
    'Notebook must be executed inside examples directory\n', 
    'Current directory is', os.getcwd())

In [2]:
# fix import paths 
import sys
sys.path.insert(0, '../')

In [3]:
# fix data directory path in datasets module
import modules.datasets as datasets
datasets.DATADIR = '../' + datasets.DATADIR  

# The ACLED class

In [4]:
acled = datasets.ACLED()

In [5]:
type(acled)

modules.datasets.ACLED

In [6]:
## delete some data from mongodb to test database update below
#import datetime
#starttime = datetime.datetime(2017,3,1)  # March 1st, 2017
#acled.mongodb_delete_many(del_filter={'event_date': {'$gt':starttime}})

### Download new data from ACLED server (or everything if database does not exist)

In [7]:
acled.mongodb_update_database()

2017-03-25 20:26:21.071004 Querying ACLED API (one dot is 500 rows) ...
2017-03-25 20:26:31.795270 2 pages retrieved from ACLED API.
2017-03-25 20:26:31.795510 Make DataFrame...
2017-03-25 20:26:31.805392 Apply str->datetime on event_date...
595 records inserted to mongodb, 606 lines in csv. (Investigate difference)


### Getting the enitre ACLED database as a pandas.DataFrame

In [8]:
import pandas as pd

In [9]:
df = acled.mongodb_get_entire_database()

In [10]:
df.columns

Index(['_id', 'actor1', 'actor2', 'admin1', 'admin2', 'admin3', 'ally_actor_1',
       'ally_actor_2', 'country', 'data_id', 'event_date', 'event_id_cnty',
       'event_id_no_cnty', 'event_type', 'fatalities', 'geo_precision', 'gwno',
       'inter1', 'inter2', 'interaction', 'latitude', 'location', 'longitude',
       'notes', 'source', 'time_precision', 'year'],
      dtype='object')

In [11]:
df.head()

Unnamed: 0,_id,actor1,actor2,admin1,admin2,admin3,ally_actor_1,ally_actor_2,country,data_id,...,inter1,inter2,interaction,latitude,location,longitude,notes,source,time_precision,year
0,58d6c467b2c79a65fc8e6c64,Protesters (Malawi),,Lilongwe,Lilongwe City,n.a. (907),,,Malawi,142532,...,6,0,60,-13.9833,Lilongwe,33.7833,Malawian civil society organisations held a da...,Nyasa Times (Leeds),1,2017
1,58d6c467b2c79a65fc8e6c65,Rioters (Madagascar),Police Forces of Madagascar (2014-),Antananarivo,Analamanga,Andramasina,Students (Madagascar),,Madagascar,142520,...,5,1,15,-19.3,Ankatso,47.8333,Students staged a protest over the cutting of ...,L'Express de Madagascar,1,2017
2,58d6c467b2c79a65fc8e6c66,Unidentified Armed Group (Kenya),Civilians (Kenya),Garissa,Daadab,Dadaab,,,Kenya,142259,...,3,7,37,0.0514,Dadaab Refugee Camp,40.3142,Unidentified gunmen kidnapped three teachers f...,Associated Press,1,2017
3,58d6c467b2c79a65fc8e6c67,Protesters (Ethiopia),,Oromia,Misraq Harerge,Babile,,,Ethiopia,142086,...,6,0,60,9.2167,Babile,42.3333,"Protests reported in Babile, with demonstrator...",Oromiya Media Network,1,2017
4,58d6c467b2c79a65fc8e6c68,Police Forces of Egypt (2014-),Lewaa El-Thawra,Giza,Kardasa,,,,Egypt,142009,...,1,3,13,30.031,Kirdasah,31.1111,Egypt's Interior Ministry reportedly killed on...,Egypt Independent; Al-Ahram Gate,1,2017


### List unique countries

In [12]:
df['country'].unique()

array(['Malawi', 'Madagascar', 'Kenya', 'Ethiopia', 'Egypt', 'Chad',
       'Central African Republic', 'Libya', 'Ivory Coast', 'Ghana',
       'Democratic Republic of Congo', 'Burkina Faso', 'Zimbabwe',
       'Tunisia', 'Sudan', 'South Sudan', 'South Africa', 'Somalia',
       'Morocco', 'Mali', 'Nigeria', 'Niger', 'Burundi', 'Algeria',
       'Sierra Leone', 'Gambia', 'Cameroon', 'Republic of Congo', 'Uganda',
       'Togo', 'Angola', 'Namibia', 'Senegal', 'Guinea', 'Gabon', 'Rwanda',
       'Mauritania', 'Guinea-Bissau', 'Tanzania', 'Liberia', 'Mozambique',
       'Zambia', 'Benin', 'Botswana', 'Djibouti', 'Swaziland', 'Lesotho',
       'Eritrea', 'Equatorial Guinea'], dtype=object)

### Fetch data for Somalia

In [13]:
df_somalia = df[df['country'] == 'Somalia']

In [14]:
df_somalia['country'].unique()

array(['Somalia'], dtype=object)

### Append 'geo_point' column to DataFrame (for shapely.geometry)

In [15]:
# disable false positive warning
pd.options.mode.chained_assignment = None  # default='warn'
# apply geo_points
acled.append_geo_points(df_somalia)

In [16]:
df_somalia[['fatalities','location','latitude','longitude','geo_point']][:5]

Unnamed: 0,fatalities,location,latitude,longitude,geo_point
31,0,Daynile,2.05814,45.3003,POINT (2.05814 45.3003)
32,0,Belet Weyne,4.73598,45.2043,POINT (4.735980000000001 45.2043)
33,2,Mahadaay,2.97043,45.5347,POINT (2.97043 45.5347)
34,57,Afmadow,0.5172,42.071,POINT (0.5172 42.07100000000001)
70,1,Baidoa,3.11718,43.6469,POINT (3.11718 43.6469)


### Data newer than 2017-01-31

In [17]:
import datetime

In [18]:
starttime = datetime.datetime(2017, 1, 31)
df_somalia_feb = df[df['event_date'] >= starttime]

### Top ten fatalities

In [19]:
df_somalia_feb[['event_date','location','fatalities','interaction','event_type']].sort_values(['fatalities'],ascending=False)[:10]

Unnamed: 0,event_date,location,fatalities,interaction,event_type
1463,2017-02-08,Owachi,98,12,Battle-No change of territory
101,2017-03-04,Torit,77,23,Battle-No change of territory
144845,2017-03-04,Torit,77,23,Battle-No change of territory
34,2017-03-02,Afmadow,57,28,Battle-No change of territory
144931,2017-03-02,Afmadow,57,28,Battle-No change of territory
1175,2017-02-15,Mokolo,56,12,Battle-No change of territory
773,2017-02-25,Yuai,53,12,Battle-No change of territory
144352,2017-03-18,Ganfouda,48,13,Battle-Government regains territory
225,2017-03-18,Ganfouda,48,13,Battle-Government regains territory
849,2017-02-23,Ganfouda,44,13,Strategic development


In [20]:
df.columns

Index(['_id', 'actor1', 'actor2', 'admin1', 'admin2', 'admin3', 'ally_actor_1',
       'ally_actor_2', 'country', 'data_id', 'event_date', 'event_id_cnty',
       'event_id_no_cnty', 'event_type', 'fatalities', 'geo_precision', 'gwno',
       'inter1', 'inter2', 'interaction', 'latitude', 'location', 'longitude',
       'notes', 'source', 'time_precision', 'year'],
      dtype='object')

# The Gapminder class
Data from www.gapminder.org

In [21]:
gm = datasets.Gapminder()

In [22]:
gm.get_dataset_names()

dict_keys(['Body Mass Index (BMI), women, Kg/m2', 'Urban population growth (annual %)', 'Medical Doctors (per 1,000 people)', 'Births attended by skilled health staff (% of total)', 'Body Mass Index (BMI), men, Kg/m2', 'Aid received (% of GNI)', 'Cell phones (per 100 people)', 'Age 15-24 unemployment rate', 'Crude death rate (deaths per 1,000 population)'])

In [23]:
df_bmi = gm.get_dataset('Body Mass Index (BMI), men, Kg/m2')

In [24]:
df_bmi.columns

Index(['Country',      1980,      1981,      1982,      1983,      1984,
            1985,      1986,      1987,      1988,      1989,      1990,
            1991,      1992,      1993,      1994,      1995,      1996,
            1997,      1998,      1999,      2000,      2001,      2002,
            2003,      2004,      2005,      2006,      2007,      2008],
      dtype='object')

In [25]:
df_bmi['Country'].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia',
       'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda',
       'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'British Virgin Islands', 'Brunei', 'Bulgaria', 'Burkina Faso',
       'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde',
       'Central African Rep.', 'Chad', 'Chile', 'China', 'Colombia',
       'Comoros', 'Congo, Dem. Rep.', 'Congo, Rep.', 'Cook Islands',
       'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Cyprus',
       'Czech Rep.', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Rep.',
       'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea',
       'Estonia', 'Ethiopia', 'Fiji', 'Finland', 'France',
       'French Polynesia', 'Gabon', 'Gambia', 'Georgia', 'Germany',
       'Ghana', 'Greece', 'Greenland

In [26]:
df_bmi_sudan = df_bmi[df_bmi['Country'] == 'Sudan']
df_bmi_sudan

Unnamed: 0,Country,1980,1981,1982,1983,1984,1985,1986,1987,1988,...,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008
168,Sudan,20.71915,20.75761,20.79627,20.82779,20.8473,20.86213,20.8824,20.90864,20.93535,...,21.49895,21.57897,21.66078,21.75247,21.84463,21.9434,22.0473,22.16048,22.28084,22.40484


In [27]:
df_unemp = gm.get_dataset('Age 15-24 unemployment rate')

In [28]:
df_unemp.columns

Index(['Total 25-54 unemployment (%)',                           1981,
                                 1982,                           1983,
                                 1984,                           1985,
                                 1986,                           1987,
                                 1988,                           1989,
                                 1990,                           1991,
                                 1992,                           1993,
                                 1994,                           1995,
                                 1996,                           1997,
                                 1998,                           1999,
                                 2000,                           2001,
                                 2002,                           2003,
                                 2004,                           2005],
      dtype='object')

## How do download other datasets from Gapminder

- Go to http://www.gapminder.org/data/ and search for a dataset. 
- Get indicator name and url (link).
- The dataset will be downloaded and can be retrieved later without specifying the url.

![gapminder_download](gapminder_download.png)

In [29]:
name = 'Suicide (per 100,000 people)'
url = 'http://spreadsheets.google.com/pub?key=troMumuI0Y6Phpwnj6qXa_A&output=xls'
df_sui = gm.get_dataset(name,url=url)

In [30]:
df_sui.columns

Index(['Suicide, age adjusted, per 100 000 standard population',
                                                           1950,
                                                           1951,
                                                           1952,
                                                           1953,
                                                           1954,
                                                           1955,
                                                           1956,
                                                           1957,
                                                           1958,
                                                           1959,
                                                           1960,
                                                           1961,
                                                           1962,
                                                           1963,
                         

If the dataset is already downloaded, specifying an url is unnecessary (will not be used).

In [31]:
df_sui2 = gm.get_dataset('Suicide (per 100,000 people)')

In [32]:
df_sui2.size

11286