# About notebook:

* Sandbox notebook with Børge's trials
* Updated to load from MongoDB

## Content:
* Currently playing with one-hot encoding


## Deleted
* Deleted old plot method (using tiles

# Imports

In [13]:
import pandas as pd
import numpy as np
import datetime

%matplotlib inline

## Importing ACLED data from MongoDB

In [14]:
import sys
sys.path.insert(0, '../')
import datasets

In [15]:
acled = datasets.ACLED()
acled.mongodb_update_database

<bound method ACLED.mongodb_update_database of <datasets.ACLED object at 0x101f43208>>

In [16]:
# Loading ACLED-data to pandas.Dataframe:
df = acled.mongodb_get_entire_database()

# Mini-dataset to play with

In [17]:
df_f = df[['event_date', 'country', 'event_type', 'fatalities']].copy()

# Load the new encoding function:

In [18]:

def _invert_dict_nonunique(d):
    """ Inverting nonunique dictionary 'd'
    
    Reference:
    [1] http://www.saltycrane.com/blog/2008/01/how-to-invert-dict-in-python/
    """
    newdict = {}
    for k, v in d.iteritems():
        newdict.setdefault(v, []).append(k)
    return newdict

def encoding_col_values_to_num(df, col, lower=True, strip=True, preset_dict=None):
    """Values of df['col'] encoded as numerical pandas.Series + dict
    
    Results in:
    category_col[category_dict] == df['col']           *
     
    *: Modulo str.lower() and str.strip(), if applied
    
    Keyword arguments:
        df -- pandas.Dataframe
        col -- Column (string) encoding is applied to,
               the column should contain strings.
        lower -- Ignore case? Default: True
        strip -- Strip strings? Default: True
        preset_dict -- Enables encoding using custom dictionary (custom grouping) 
                       
    
    Returns:
        category_col -- pandas.Series
        category_dict -- dict        

    Warning: Does not change 'column' in the passed dataframe.
    
    Todo:
        - Finalize 'presed_dict' functionality: Enabling customized dictionaries.
          currently inversion of dictionary will cause error.
        - Alternatively remove code related to 'prest_dict' (including function '_invert_dict_nonunique') 
    """
    column = df[col].copy()
    if lower:
        column = column.apply(str.lower)
    if strip:
        column = column.apply(str.strip)

    if preset_dict:
        mapping_dict = preset_dict
        # Inverting mapping_dict:
        category_dict = _invert_dict_nonunique(mapping_dict)
    else:
        categories = column.unique()
        mapping_dict = {category : i for i, category in enumerate(categories)}
        # Inverting mapping_dict:
        category_dict = dict([(v, k) for k, v in mapping_dict.items()])


    # Establishing 
    category_col = pd.Series(column.map(mapping_dict), dtype=int)

    # Verifies that all rows were successfully coded:
    assert(category_col.isnull().any()==False)
           
    return category_col, category_dict

# Apply the encoding function

In [19]:
event_cat_col, event_type_dict = encoding_col_values_to_num(df_f, 'event_type')

df_f['event_cat'] = event_cat_col

In [20]:
df_f[['event_type', 'event_cat']].head(10)

Unnamed: 0,event_type,event_cat
0,Violence against civilians,0
1,Battle-No change of territory,1
2,Riots/Protests,2
3,Violence against civilians,0
4,Battle-No change of territory,1
5,Battle-No change of territory,1
6,Riots/Protests,2
7,Battle-No change of territory,1
8,Riots/Protests,2
9,Battle-No change of territory,1


## Quick check to verify the above does the correct thing
... in essence: that the (new numerical categories + dict) in fact do correspond to the event type:

In [21]:
(df_f['event_cat'].map(event_type_dict) == df_f['event_type'].apply(str.lower).apply(str.strip)).all()

True

## One-hot encoding (using pandas)
... aaaand we're done:

In [22]:
pd.get_dummies(df_f, columns=['event_cat'], drop_first=True)

Unnamed: 0,event_date,country,event_type,fatalities,event_cat_1,event_cat_2,event_cat_3,event_cat_4,event_cat_5,event_cat_6,event_cat_7,event_cat_8
0,2017-02-18,Somalia,Violence against civilians,1,0,0,0,0,0,0,0,0
1,2017-02-18,Libya,Battle-No change of territory,1,1,0,0,0,0,0,0,0
2,2017-02-18,Ivory Coast,Riots/Protests,0,0,1,0,0,0,0,0,0
3,2017-02-18,Ethiopia,Violence against civilians,1,0,0,0,0,0,0,0,0
4,2017-02-18,Somalia,Battle-No change of territory,1,1,0,0,0,0,0,0,0
5,2017-02-18,Democratic Republic of Congo,Battle-No change of territory,0,1,0,0,0,0,0,0,0
6,2017-02-18,Algeria,Riots/Protests,0,0,1,0,0,0,0,0,0
7,2017-02-18,Egypt,Battle-No change of territory,1,1,0,0,0,0,0,0,0
8,2017-02-18,South Africa,Riots/Protests,0,0,1,0,0,0,0,0,0
9,2017-02-18,Somalia,Battle-No change of territory,3,1,0,0,0,0,0,0,0


### Todo (hot-encoding)
Will have to implement into function that does what we need with it :) :)

# Matching 'countries' in ACLED with 'name' in geo-frame:

In [26]:
country_names_acled = df['country'].unique()

# Loading shapefile:
sys.path.insert(0, '../modules/')
from ImportShapefile import ImportShapefile
# Update the link to where you have stored the shapefiles:
link = '../data/ne_110m_admin_0_countries/ne_110m_admin_0_countries.shp'
df_geo_shapefile = ImportShapefile(link).get_df()

mask = df_geo_shapefile['continent']=='Africa'
df_geo_africa = df_geo_shapefile.loc[mask,:].reset_index(drop=True)

country_names_geo = df_geo_africa['name'].unique()

We now have two lists containing the names of the countries as they are written in the two datasets:

In [31]:
print(country_names_acled) # Contain names of 
print(country_names_geo)

['Algeria' 'Angola' 'Benin' 'Botswana' 'Burkina Faso' 'Burundi' 'Cameroon'
 'Central African Republic' 'Chad' 'Democratic Republic of Congo'
 'Djibouti' 'Egypt' 'Equatorial Guinea' 'Eritrea' 'Ethiopia' 'Gabon'
 'Gambia' 'Ghana' 'Guinea' 'Guinea-Bissau' 'Ivory Coast' 'Kenya' 'Lesotho'
 'Liberia' 'Libya' 'Madagascar' 'Malawi' 'Mali' 'Mauritania' 'Morocco'
 'Mozambique' 'Mozambique ' 'Namibia' 'Niger' 'Nigeria' 'Republic of Congo'
 'Rwanda' 'Senegal' 'Sierra Leone' 'Somalia' 'South Africa' 'South Sudan'
 'Sudan' 'Swaziland' 'Tanzania' 'Togo' 'Tunisia' 'Uganda' 'Zambia'
 'Zimbabwe']
['Algeria' 'Angola' 'Benin' 'Botswana' 'Burkina Faso' 'Burundi' 'Cameroon'
 'Central African Rep.' 'Chad' 'Congo' "Côte d'Ivoire" 'Dem. Rep. Congo'
 'Djibouti' 'Egypt' 'Eq. Guinea' 'Eritrea' 'Ethiopia' 'Gabon' 'Gambia'
 'Ghana' 'Guinea' 'Guinea-Bissau' 'Kenya' 'Lesotho' 'Liberia' 'Libya'
 'Madagascar' 'Malawi' 'Mali' 'Mauritania' 'Morocco' 'Mozambique' 'Namibia'
 'Niger' 'Nigeria' 'Rwanda' 'S. Sudan' 'Senegal' 

### TODO for this section
Create pandas.DataFrame with two columns:
- 'ACLED country'
- 'Sharefile country'

Use pandas functions to align the same country, thereafter manually map the rest.

#### Result
As a result, we can map statistics from on country to the mapping functions (e.g. results on 'Ivory Coast' correctly represented on the map with name 'Côte d'Ivoire').

# WARNING: In work, nothing done below

# Plotting fatalities
Inspired by Dirk's examples in lecture 13.02.

### Creating pivot table
* Important to set 'aggfunc' to sum (standard is mean)

In [None]:
df_piv = df_f.pivot_table(index='event_date',
                              columns='country',
                              values='fatalities',
                              aggfunc=np.sum)

### Resampling pivot to monthly

In [None]:
df_piv = df_piv.resample('1M').sum()

### Extract countries
Extract 5 countries with highest total fatalities (for plotting)

In [None]:
most_fat = list(df_piv.sum().sort_values(
                     ascending=False)[0:5].index)

In [None]:
print("Total number of events:")
df_piv.sum().sort_values(
                     ascending=False)[0:5]

In [None]:
df_fat = df_piv[most_fat]

In [None]:
ax = df_fat.plot(figsize=(10,8))
ax.set_ylabel("Fatalities")
ax.set_xlabel("Month")

## Plotting using pandas
Pandas is more flexible and allows zooming ++

In [None]:
# TO BE FIXED

# Bokeh geo-plotting
Based on:
http://nbviewer.jupyter.org/github/bokeh/bokeh-notebooks/blob/master/tutorial/11%20-%20geo.ipynb

In [None]:
from bokeh.io import output_notebook, show
output_notebook()

In [None]:
from bokeh.plotting import figure
from bokeh.tile_providers import WMTSTileSource

# NOTE: This is a little off Africa, but can easily be moved:
dist = 6000000
x_min = -30000
y_min = -8000000

Africa = x_range,y_range = ((-x_min,x_min+dist), (-y_min,y_min+dist))

fig = figure(tools='pan, wheel_zoom', x_range=x_range, y_range=y_range)
fig.axis.visible = False

In [None]:
url = 'http://a.basemaps.cartocdn.com/dark_all/{Z}/{X}/{Y}.png'
attribution = "Map tiles by Carto, under CC BY 3.0. Data by OpenStreetMap, under ODbL"

fig.add_tile(WMTSTileSource(url=url, attribution=attribution))

In [None]:
show(fig)

## Adding some points:

In [None]:
# Function comes from tutorial (see section header):
def wgs84_to_web_mercator(df, lon="LONGITUDE", lat="LATITUDE"):
    """Converts decimal longitude/latitude to Web Mercator format"""
    k = 6378137
    df["x"] = df[lon] * (k * np.pi/180.0)
    df["y"] = np.log(np.tan((90 + df[lat]) * np.pi/360.0)) * k
    return df

Selecting nor or less random samples (statistically speaking

In [None]:
N_points = 10000

test_points = df.iloc[0:N_points][['COUNTRY','LATITUDE', 'LONGITUDE', 'FATALITIES']]

In [None]:
wgs84_to_web_mercator(test_points)

None # To surpress output from function call

#### Plotting the points from above
**Note**: Size of plot set proportional with fatalities, in a little dodgy way (for now)

In [None]:
fig.circle(x=test_points['x'], y=test_points['y'],fill_color='blue', size=4*np.log(1+test_points['FATALITIES']))
show(fig)