# Using the datasets module
This notebook must be executed inside the examples/ directory.

In [1]:
import os
assert (os.getcwd().endswith('examples')), '%s\n%s "%s"' % (
    'Notebook must be executed inside examples directory\n', 
    'Current directory is', os.getcwd())

In [2]:
# fix import paths 
import sys
sys.path.insert(0, '../')

In [3]:
# fix data directory path in datasets module
import modules.datasets as datasets
datasets.DATADIR = '../' + datasets.DATADIR  

# The ACLED class

In [4]:
acled = datasets.ACLED()

In [5]:
type(acled)

modules.datasets.ACLED

In [6]:
## delete some data from mongodb to test database update below
#import datetime
#starttime = datetime.datetime(2017,3,1)  # March 1st, 2017
#acled.mongodb_delete_many(del_filter={'event_date': {'$gt':starttime}})

### Download new data from ACLED server (or everything if database does not exist)

In [7]:
acled.mongodb_update_database()

2017-03-25 12:07:49.142884 Querying ACLED API (one dot is 500 rows) ...
2017-03-25 12:07:59.398884 2 pages retrieved from ACLED API.
2017-03-25 12:07:59.399119 Make DataFrame...
2017-03-25 12:07:59.405115 Apply str->datetime on event_date...
595 records inserted to mongodb, 606 lines in csv. (Why a difference?)


### Getting the enitre ACLED database as a pandas.DataFrame

In [8]:
import pandas as pd

In [9]:
df = acled.mongodb_get_entire_database()

In [10]:
df.columns

Index(['_id', 'actor1', 'actor2', 'admin1', 'admin2', 'admin3', 'ally_actor_1',
       'ally_actor_2', 'country', 'data_id', 'event_date', 'event_id_cnty',
       'event_id_no_cnty', 'event_type', 'fatalities', 'geo_precision', 'gwno',
       'inter1', 'inter2', 'interaction', 'latitude', 'location', 'longitude',
       'notes', 'source', 'time_precision', 'year'],
      dtype='object')

In [11]:
df.head()

Unnamed: 0,_id,actor1,actor2,admin1,admin2,admin3,ally_actor_1,ally_actor_2,country,data_id,...,inter1,inter2,interaction,latitude,location,longitude,notes,source,time_precision,year
0,58d64e80b2c79a33ea6ad1ad,Protesters (Liberia),,Montserrado,Greater Monrovia,Monrovia,PATEL: Patriotic Entrepreneurs of Liberia,,Liberia,142291,...,6,0,60,6.31,Monrovia,-10.8,Marketers and local business owners led by the...,Liberian Observer (Monrovia); FrontPageAfrica ...,1,2017
1,58d64e80b2c79a33ea6ad1af,Police Forces of Ethiopia (1991-) Liyu Division,Police Forces of Ethiopia (1991-),Oromia,Mirab Hararghe,Boke,,Oromo Ethnic Group (Ethiopia),Ethiopia,142060,...,1,1,11,8.59288,Nure Musa,41.077,Clashes yesterday between Liyu Police and Orom...,Oromiya Media Network,1,2017
2,58d64e80b2c79a33ea6ad1b0,Protesters (Egypt),,Beni Suef,Bani Swayf,,,,Egypt,141960,...,6,0,60,29.0744,Bani Suwayf,31.0979,"Workers at Beni Suef ""Titan"" Cement factory i...",Arab Trade Union Federation,1,2017
3,58d64e80b2c79a33ea6ad1b1,Rioters (Democratic Republic of Congo),Police Forces of Democratic Republic of Congo ...,Kinshasa,Kinshasa,Mont Ngafula,UDPS: Union for Democracy and Social Progress,,Democratic Republic of Congo,141814,...,5,1,15,-4.46417,Limete Njili,15.3489,Clashes were reported for a second day between...,Radio Okapi,1,2017
4,58d64e80b2c79a33ea6ad1b2,Police Forces of Democratic Republic of Congo ...,BDK: Bunda Dia Kongo,Bas-Congo,Cataractes,Songololo,,,Democratic Republic of Congo,141813,...,1,3,13,-5.55,Kimpese,14.4333,A BDK follower was killed in a clash with poli...,RFI,1,2017


### List unique countries

In [12]:
df['country'].unique()

array(['Liberia', 'Ethiopia', 'Egypt', 'Democratic Republic of Congo',
       'Central African Republic', 'Cameroon', 'Burundi', 'Burkina Faso',
       'Algeria', 'Tunisia', 'Sudan', 'South Africa', 'Somalia',
       'Sierra Leone', 'Nigeria', 'Mali', 'Madagascar', 'Libya', 'Guinea',
       'South Sudan', 'Senegal', 'Niger', 'Mauritania', 'Kenya',
       'Mozambique', 'Ivory Coast', 'Ghana', 'Zimbabwe',
       'Republic of Congo', 'Angola', 'Morocco', 'Malawi', 'Chad',
       'Zambia', 'Uganda', 'Benin', 'Tanzania', 'Gambia', 'Gabon',
       'Namibia', 'Togo', 'Guinea-Bissau', 'Mozambique ', 'Botswana',
       'Djibouti', 'Swaziland', 'Lesotho', 'Eritrea', 'Rwanda',
       'Equatorial Guinea'], dtype=object)

### Fetch data for Somalia

In [13]:
df_somalia = df[df['country'] == 'Somalia']

In [14]:
df_somalia['country'].unique()

array(['Somalia'], dtype=object)

### Data newer than 2017-01-31

In [15]:
import datetime

In [16]:
starttime = datetime.datetime(2017, 1, 31)
df_somalia_feb = df[df['event_date'] >= starttime]

### Top ten fatalities

In [17]:
df_somalia_feb[['event_date','location','fatalities','interaction','event_type']].sort_values(['fatalities'],ascending=False)[:10]

Unnamed: 0,event_date,location,fatalities,interaction,event_type
522,2017-02-08,Owachi,98,12,Battle-No change of territory
156492,2017-03-04,Torit,77,23,Battle-No change of territory
2565,2017-03-02,Afmadow,57,28,Battle-No change of territory
2525,2017-02-15,Mokolo,56,12,Battle-No change of territory
155993,2017-02-25,Yuai,53,12,Battle-No change of territory
155999,2017-03-18,Ganfouda,48,13,Battle-Government regains territory
2290,2017-02-23,Ganfouda,44,13,Strategic development
2459,2017-02-19,Medina,40,27,Violence against civilians
447,2017-02-10,Dikwa,37,12,Battle-No change of territory
156317,2017-03-08,Khor Adar,33,12,Battle-No change of territory


In [18]:
df.columns

Index(['_id', 'actor1', 'actor2', 'admin1', 'admin2', 'admin3', 'ally_actor_1',
       'ally_actor_2', 'country', 'data_id', 'event_date', 'event_id_cnty',
       'event_id_no_cnty', 'event_type', 'fatalities', 'geo_precision', 'gwno',
       'inter1', 'inter2', 'interaction', 'latitude', 'location', 'longitude',
       'notes', 'source', 'time_precision', 'year'],
      dtype='object')

# The Gapminder class
Data from www.gapminder.org

In [19]:
gm = datasets.Gapminder()

In [20]:
gm.get_dataset_names()

dict_keys(['Urban population growth (annual %)', 'Aid received (% of GNI)', 'Body Mass Index (BMI), women, Kg/m2', 'Medical Doctors (per 1,000 people)', 'Age 15-24 unemployment rate', 'Cell phones (per 100 people)', 'Births attended by skilled health staff (% of total)', 'Crude death rate (deaths per 1,000 population)', 'Body Mass Index (BMI), men, Kg/m2'])

In [21]:
df_bmi = gm.get_dataset('Body Mass Index (BMI), men, Kg/m2')

Downloaded: "../data/gapminder/Body Mass Index (BMI), men, Kg_m2.xlsx"
URL: http://spreadsheets.google.com/pub?key=0ArfEDsV3bBwCdF9saE1pWUNYVkVsNU1FdW1Yem81Nmc&output=xls


In [22]:
df_bmi.columns

Index(['Country',      1980,      1981,      1982,      1983,      1984,
            1985,      1986,      1987,      1988,      1989,      1990,
            1991,      1992,      1993,      1994,      1995,      1996,
            1997,      1998,      1999,      2000,      2001,      2002,
            2003,      2004,      2005,      2006,      2007,      2008],
      dtype='object')

In [23]:
df_bmi['Country'].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia',
       'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda',
       'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'British Virgin Islands', 'Brunei', 'Bulgaria', 'Burkina Faso',
       'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde',
       'Central African Rep.', 'Chad', 'Chile', 'China', 'Colombia',
       'Comoros', 'Congo, Dem. Rep.', 'Congo, Rep.', 'Cook Islands',
       'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Cyprus',
       'Czech Rep.', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Rep.',
       'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea',
       'Estonia', 'Ethiopia', 'Fiji', 'Finland', 'France',
       'French Polynesia', 'Gabon', 'Gambia', 'Georgia', 'Germany',
       'Ghana', 'Greece', 'Greenland

In [24]:
df_bmi_sudan = df_bmi[df_bmi['Country'] == 'Sudan']
df_bmi_sudan

Unnamed: 0,Country,1980,1981,1982,1983,1984,1985,1986,1987,1988,...,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008
168,Sudan,20.71915,20.75761,20.79627,20.82779,20.8473,20.86213,20.8824,20.90864,20.93535,...,21.49895,21.57897,21.66078,21.75247,21.84463,21.9434,22.0473,22.16048,22.28084,22.40484


In [25]:
df_unemp = gm.get_dataset('Age 15-24 unemployment rate')

Downloaded: "../data/gapminder/Age 15-24 unemployment rate.xlsx"
URL: http://spreadsheets.google.com/pub?key=rEMA-cbNPaOtpDyxTcwugnw&output=xls


In [26]:
df_unemp.columns

Index(['Total 25-54 unemployment (%)',                           1981,
                                 1982,                           1983,
                                 1984,                           1985,
                                 1986,                           1987,
                                 1988,                           1989,
                                 1990,                           1991,
                                 1992,                           1993,
                                 1994,                           1995,
                                 1996,                           1997,
                                 1998,                           1999,
                                 2000,                           2001,
                                 2002,                           2003,
                                 2004,                           2005],
      dtype='object')

## How do download other datasets from Gapminder

- Go to http://www.gapminder.org/data/ and search for a dataset. 
- Get indicator name and url (link).
- The dataset will be downloaded and can be retrieved later without specifying the url.

![gapminder_download](gapminder_download.png)

In [27]:
name = 'Suicide (per 100,000 people)'
url = 'http://spreadsheets.google.com/pub?key=troMumuI0Y6Phpwnj6qXa_A&output=xls'
df_sui = gm.get_dataset(name,url=url)

Downloaded: "../data/gapminder/Suicide (per 100,000 people).xlsx"
URL: http://spreadsheets.google.com/pub?key=troMumuI0Y6Phpwnj6qXa_A&output=xls


In [28]:
df_sui.columns

Index(['Suicide, age adjusted, per 100 000 standard population',
                                                           1950,
                                                           1951,
                                                           1952,
                                                           1953,
                                                           1954,
                                                           1955,
                                                           1956,
                                                           1957,
                                                           1958,
                                                           1959,
                                                           1960,
                                                           1961,
                                                           1962,
                                                           1963,
                         

If the dataset is already downloaded, specifying an url is unnecessary (will not be used).

In [30]:
df_sui2 = gm.get_dataset('Suicide (per 100,000 people)')

In [31]:
df_sui2.size

11286