In [2]:
import pandas as pd
import os
import folium
from geopy.geocoders import Nominatim

# Allow up to 500 columns to be shown when a wide DataFrame is displayed.
pd.set_option('display.max_columns', 500)

In [3]:
# Relative path of the folder holding the Panama Papers CSV exports.
DATA_FOLDER = '../data/'

# File names of the individual CSV exports inside DATA_FOLDER.
edges_csv = 'panama_papers.edges.csv'
intermediary_csv = 'panama_papers.nodes.intermediary.csv' # companies or individuals
address_csv = 'panama_papers.nodes.address.csv'
officer_csv = 'panama_papers.nodes.officer.csv'
entity_csv = 'panama_papers.nodes.entity.csv' # tax haven companies

In [4]:
# Load every Panama Papers CSV export into its own raw DataFrame.
# low_memory=False disables chunked parsing, which avoids mixed-dtype inference.
df_edges_raw = pd.read_csv(os.path.join(DATA_FOLDER, edges_csv), low_memory=False)
df_address_raw = pd.read_csv(os.path.join(DATA_FOLDER, address_csv), low_memory=False)
df_entity_raw = pd.read_csv(os.path.join(DATA_FOLDER, entity_csv), low_memory=False)
df_intermediary_raw = pd.read_csv(os.path.join(DATA_FOLDER, intermediary_csv), low_memory=False)
df_officier_raw = pd.read_csv(os.path.join(DATA_FOLDER, officer_csv), low_memory=False)

In [5]:
# Jurisdictions with more than `threshold1` entities keep their own label;
# every other entity is relabelled 'Others'.
threshold1 = 1000
df_entity_S1 = df_entity_raw.copy()
vc = df_entity_S1['jurisdiction_description'].value_counts()

# Split the jurisdiction names into "large" and "small" by entity count.
tax_heavens_L = [name for name, n_entities in vc.items() if n_entities > threshold1]
tax_heavens_S = [name for name, n_entities in vc.items() if n_entities <= threshold1]

# Keep the label where it is a large jurisdiction, otherwise replace it.
df_entity_S1['jurisdiction_description'] = df_entity_S1['jurisdiction_description'].where(
    df_entity_S1['jurisdiction_description'].isin(tax_heavens_L), 'Others')

print(df_entity_S1['jurisdiction_description'].value_counts(), "\n")
print('Jurisdictions in Others category are: ', tax_heavens_S)

British Virgin Islands    113648
Panama                     48360
Bahamas                    15915
Seychelles                 15182
Niue                        9611
Samoa                       5307
British Anguilla            3253
Nevada                      1260
Others                      1098
Name: jurisdiction_description, dtype: int64 

Jurisdictions in Others category are:  ['Hong Kong', 'United Kingdom', 'Belize', 'Costa Rica', 'Cyprus', 'Uruguay', 'New Zealand', 'Jersey', 'Wyoming', 'Malta', 'Isle Of Man', 'Ras Al Khaimah', 'Singapore']


In [6]:
# Count entities per jurisdiction and flatten the result into an
# "Area" / "Count" table (most frequent jurisdiction first).
jurisdiction_df = (
    df_entity_raw['jurisdiction_description']
    .value_counts()
    .rename_axis("Area")
    .reset_index(name="Count")
)

In [7]:
# Look up latitude/longitude for every jurisdiction with the Nominatim geocoder.
geolocator = Nominatim(user_agent='ADA_AKBAR')
j_lat = []
j_long = []

for area in jurisdiction_df['Area']:
    # The dataset label 'British Anguilla' is looked up under the name 'Anguilla'.
    query = 'Anguilla' if area == 'British Anguilla' else area
    location = geolocator.geocode(query)
    if location is None:
        # geocode() returns None when nothing matches; fail with a message that
        # names the offending area instead of an AttributeError on `.latitude`.
        raise ValueError('Could not geocode jurisdiction: {!r}'.format(query))
    j_lat.append(location.latitude)
    j_long.append(location.longitude)

jurisdiction_df['Latitude'] = j_lat
jurisdiction_df['Longitude'] = j_long

In [8]:
# Alternative folium rendering of the jurisdiction markers (disabled; kept for reference).
# jur_m = folium.Map(tiles='CartoDB positron')
# j_lat_long = jurisdiction_df[['Latitude', 'Longitude']].values.tolist()
# for i in range(len(j_lat_long)):
#     folium.Marker(j_lat_long[i], 
#                   popup=jurisdiction_df['Area'][i] + ': ' + str(jurisdiction_df['Count'][i])).add_to(jur_m)
# jur_m

In [9]:
# since using Mercator Projection, need to convert
# lat/lon into x,y coordinates on map using following
# equation

import math
    
def merc(lat_lst, lon_lst):
    """Project latitude/longitude pairs to spherical Mercator x/y coordinates.

    Parameters
    ----------
    lat_lst : iterable of float
        Latitudes in degrees.
    lon_lst : iterable of float
        Longitudes in degrees, paired element-wise with ``lat_lst``.

    Returns
    -------
    (list of float, list of float)
        ``(x_lst, y_lst)`` in the same order as the inputs, expressed in the
        units of the projection radius (metres).

    Notes
    -----
    The y coordinate is computed directly from the radius rather than through
    an ``x / lon`` scale factor, so a longitude of exactly 0 no longer raises
    ``ZeroDivisionError``. The inputs are walked positionally with ``zip``, so
    a pandas Series with a non-default index is handled correctly.
    A latitude of -90 still raises ``ValueError`` from ``math.log``.
    """
    r_major = 6378137.000  # sphere radius used by the projection
    x_lst = []
    y_lst = []
    for lat, lon in zip(lat_lst, lon_lst):
        x_lst.append(r_major * math.radians(lon))
        # Mercator: y = R * ln(tan(pi/4 + lat/2)), with lat in radians.
        y_lst.append(r_major * math.log(math.tan(math.pi / 4.0 + math.radians(lat) / 2.0)))
    return x_lst, y_lst

# Project the geocoded jurisdictions and store the map coordinates as new columns.
x_y = merc(jurisdiction_df['Latitude'], jurisdiction_df['Longitude'])
jurisdiction_df['x'], jurisdiction_df['y'] = x_y

In [59]:
from bokeh.plotting import figure, show, output_file, output_notebook
from bokeh.tile_providers import CARTODBPOSITRON
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.layouts import gridplot, widgetbox
from bokeh.models.widgets import DataTable, TableColumn

def plot_map(df, title):
    """Show a tile map of the areas in ``df`` next to a table of their counts.

    Parameters
    ----------
    df : DataFrame
        Must provide the columns ``'x'``, ``'y'`` (map coordinates),
        ``'Count'`` and ``'Area'``.
    title : str
        Title shown on the map figure.

    The function returns nothing; it calls ``output_notebook()`` and then
    ``show()`` on a two-column grid holding the map and the data table.
    """
    # One shared data source feeds both the map glyphs and the table.
    field_for_column = (('x', 'x'), ('y', 'y'), ('Count', 'count'), ('Area', 'area'))
    source = ColumnDataSource(
        data={field: list(df[column]) for column, field in field_for_column})

    hover = HoverTool(tooltips=[("Area", "@area"), ("Count", "@count")])

    # NOTE(review): y_range spans only about 10,000 units while the projected
    # y values in this notebook are in the millions; confirm this initial
    # view is intended.
    p = figure(title=title,
               width=600,
               height=400,
               x_range=(-20000000, 10000000),
               y_range=(-100, 10000),
               x_axis_type="mercator",
               y_axis_type="mercator",
               tools=['pan', hover, 'wheel_zoom', 'save'])
    p.add_tile(CARTODBPOSITRON)

    marker_color = "#FF0000"
    p.circle(x='x', y='y', size=10, source=source,
             line_color=marker_color, fill_color=marker_color, fill_alpha=0.05)

    table_columns = [TableColumn(field=field, title=heading)
                     for field, heading in (("area", "Area"), ("count", "Count"))]
    data_table = DataTable(source=source, columns=table_columns, height=350, width=200)

    grid = gridplot([p, data_table], ncols=2)

    output_notebook()
    show(grid)

# Register the HTML file the jurisdiction map is saved to, then render it.
output_file(filename="../proj_jekyll/assets/jurisdictions_map.html", title="jurisdictions")
plot_map(df=jurisdiction_df, title="Top jurisdictions by count")

In [60]:
# Do the same for countries, keeping only those with more than 1000 entities.

s = df_entity_raw.countries.value_counts()
s = s[s > 1000]
country_df = s.rename_axis('Area').reset_index(name='Count')

c_lat = []
c_long = []

for area in country_df['Area']:
    location = geolocator.geocode(area)
    if location is None:
        # geocode() returns None when nothing matches; fail with a message that
        # names the offending area instead of an AttributeError on `.latitude`.
        raise ValueError('Could not geocode country: {!r}'.format(area))
    c_lat.append(location.latitude)
    c_long.append(location.longitude)

country_df['Latitude'] = c_lat
country_df['Longitude'] = c_long

# Project the geocoded countries to map coordinates for plotting.
x_y_c = merc(country_df.Latitude, country_df.Longitude)
country_df['x'] = x_y_c[0]
country_df['y'] = x_y_c[1]

In [61]:
# Register the HTML file the country map is saved to, then render it.
# The page title previously read "jurisdictions", copied from the jurisdiction map.
output_file("../proj_jekyll/assets/country_map.html", title="countries")
plot_map(country_df, "Top countries that have offshore entities")

In [13]:
# Display the country table, including the geocoded and projected coordinates.
country_df

Unnamed: 0,Area,Count,Latitude,Longitude,x,y
0,Switzerland,37911,46.798562,8.231974,916379.1,5909256.0
1,Hong Kong,37911,22.279328,114.162813,12708550.0,2545095.0
2,Panama,15811,8.309607,-81.306625,-9051012.0,928281.1
3,Jersey,14331,49.212307,-2.1256,-236620.7,6310962.0
4,Luxembourg,10840,49.815868,6.129675,682352.3,6414448.0
5,United Kingdom,9619,54.702354,-3.276575,-364746.7,7304312.0
6,Guernsey,7327,49.462291,-2.581202,-287338.1,6353670.0
7,United Arab Emirates,7269,24.000249,53.999483,6011195.0,2753438.0
8,Bahamas,4984,24.773655,-78.000055,-8682926.0,2847969.0
9,Uruguay,4906,-32.875555,-56.020153,-6236135.0,-3878798.0


In [14]:
# Preview the first rows only; the raw entity table is too large to dump in full.
df_entity_raw.head(10)

Unnamed: 0,node_id,name,jurisdiction,jurisdiction_description,country_codes,countries,incorporation_date,inactivation_date,struck_off_date,closed_date,ibcRUC,status,company_type,service_provider,sourceID,valid_until,note
0,10000001,"TIANSHENG INDUSTRY AND TRADING CO., LTD.",SAM,Samoa,HKG,Hong Kong,23-MAR-2006,18-FEB-2013,15-FEB-2013,,25221,Defaulted,,Mossack Fonseca,Panama Papers,The Panama Papers data is current through 2015,
1,10000002,"NINGBO SUNRISE ENTERPRISES UNITED CO., LTD.",SAM,Samoa,HKG,Hong Kong,27-MAR-2006,27-FEB-2014,15-FEB-2014,,25249,Defaulted,,Mossack Fonseca,Panama Papers,The Panama Papers data is current through 2015,
2,10000003,"HOTFOCUS CO., LTD.",SAM,Samoa,HKG,Hong Kong,10-JAN-2006,15-FEB-2012,15-FEB-2012,,24138,Defaulted,,Mossack Fonseca,Panama Papers,The Panama Papers data is current through 2015,
3,10000004,"SKY-BLUE GIFTS & TOYS CO., LTD.",SAM,Samoa,HKG,Hong Kong,06-JAN-2006,16-FEB-2009,15-FEB-2009,,24012,Defaulted,,Mossack Fonseca,Panama Papers,The Panama Papers data is current through 2015,
4,10000005,FORTUNEMAKER INVESTMENTS CORPORATION,SAM,Samoa,HKG,Hong Kong,19-APR-2006,15-MAY-2009,15-FEB-2008,,R25638,Changed agent,,Mossack Fonseca,Panama Papers,The Panama Papers data is current through 2015,
5,10000006,8808 HOLDING LIMITED,SAM,Samoa,HKG,Hong Kong,05-JAN-2006,,,,23835,Active,,Mossack Fonseca,Panama Papers,The Panama Papers data is current through 2015,
6,10000007,KENT DEVELOPMENT LIMITED,SAM,Samoa,HKG,Hong Kong,26-JAN-2004,03-MAY-2006,15-FEB-2006,,15757,Defaulted,,Mossack Fonseca,Panama Papers,The Panama Papers data is current through 2015,
7,10000008,BONUS TRADE LIMITED,SAM,Samoa,HKG,Hong Kong,13-FEB-2004,16-FEB-2009,15-FEB-2009,,15910,Defaulted,,Mossack Fonseca,Panama Papers,The Panama Papers data is current through 2015,
8,10000009,AMARANDAN LTD.,SAM,Samoa,HKG,Hong Kong,26-JAN-2004,03-MAY-2006,15-FEB-2006,,15759,Defaulted,,Mossack Fonseca,Panama Papers,The Panama Papers data is current through 2015,
9,10000010,NEW IDEA LIMITED,SAM,Samoa,HKG,Hong Kong,30-MAR-2004,27-FEB-2007,15-FEB-2007,,16462,Defaulted,,Mossack Fonseca,Panama Papers,The Panama Papers data is current through 2015,


In [34]:
# Top five countries by entity count (head() defaults to five rows).
country_df.loc[:, ['Area', 'Count']].head()

Unnamed: 0,Area,Count
0,Switzerland,37911
1,Hong Kong,37911
2,Panama,15811
3,Jersey,14331
4,Luxembourg,10840
