In [1]:
import pandas as pd
import os
import folium
import numpy as np
from geopy.geocoders import Nominatim
from math import pi

pd.set_option('display.max_columns', 500)

In [2]:
# Relative path of the folder that holds the Panama Papers CSV files.
DATA_FOLDER = '../data/'

# File names of the individual edge/node tables; they are appended to
# DATA_FOLDER when the files are read.
edges_csv = 'panama_papers.edges.csv'
intermediary_csv = 'panama_papers.nodes.intermediary.csv' # companies or individuals
address_csv = 'panama_papers.nodes.address.csv'
officer_csv = 'panama_papers.nodes.officer.csv'
entity_csv = 'panama_papers.nodes.entity.csv' # tax haven companies

In [3]:
# Read each Panama Papers table from DATA_FOLDER into its own raw DataFrame.
df_edges_raw = pd.read_csv(os.path.join(DATA_FOLDER, edges_csv), low_memory=False)
df_address_raw = pd.read_csv(os.path.join(DATA_FOLDER, address_csv), low_memory=False)
df_entity_raw = pd.read_csv(os.path.join(DATA_FOLDER, entity_csv), low_memory=False)
df_intermediary_raw = pd.read_csv(os.path.join(DATA_FOLDER, intermediary_csv), low_memory=False)
df_officier_raw = pd.read_csv(os.path.join(DATA_FOLDER, officer_csv), low_memory=False)

In [4]:
# Jurisdictions with at most `threshold1` entities are relabelled 'Others'
# in a copy of the entity table; the raw table is left untouched.
threshold1 = 1000
df_entity_S1 = df_entity_raw.copy()
vc = df_entity_S1['jurisdiction_description'].value_counts()
tax_heavens_L = vc[vc.gt(threshold1)].index.tolist()
tax_heavens_S = vc[vc.le(threshold1)].index.tolist()

# Keep the label where it is one of the large jurisdictions, else 'Others'.
df_entity_S1['jurisdiction_description'] = df_entity_S1['jurisdiction_description'].where(
    df_entity_S1['jurisdiction_description'].isin(tax_heavens_L), 'Others')

print(df_entity_S1.jurisdiction_description.value_counts(),"\n")
print('Jurisdictions in Others category are: ',tax_heavens_S)

British Virgin Islands    113648
Panama                     48360
Bahamas                    15915
Seychelles                 15182
Niue                        9611
Samoa                       5307
British Anguilla            3253
Nevada                      1260
Others                      1098
Name: jurisdiction_description, dtype: int64 

Jurisdictions in Others category are:  ['Hong Kong', 'United Kingdom', 'Belize', 'Costa Rica', 'Cyprus', 'Uruguay', 'New Zealand', 'Jersey', 'Wyoming', 'Malta', 'Isle Of Man', 'Ras Al Khaimah', 'Singapore']


In [5]:
# Number of entities per jurisdiction as a two-column frame (name, count).
jurisdiction_df = df_entity_raw['jurisdiction_description'].value_counts().reset_index()
jurisdiction_df.columns = ["Area", "Count"]

In [6]:
# getting long, lat for all jurisdictions
geolocator = Nominatim(user_agent='ADA_AKBAR')
j_lat = []
j_long = []

for area in jurisdiction_df['Area']:
    # The dataset's 'British Anguilla' label is looked up as plain 'Anguilla'.
    query = 'Anguilla' if area == 'British Anguilla' else area
    location = geolocator.geocode(query)
    if location is None:
        # geocode() returns None when nothing matches; store NaN so the lists
        # stay aligned with the rows instead of failing on `.latitude`.
        print('Could not geocode %r; storing NaN coordinates' % (area,))
        j_lat.append(np.nan)
        j_long.append(np.nan)
        continue
    j_lat.append(location.latitude)
    j_long.append(location.longitude)

jurisdiction_df['Latitude'] = j_lat
jurisdiction_df['Longitude'] = j_long

In [7]:
# FOLIUM KEEP FOR NOW
# jur_m = folium.Map(tiles='CartoDB positron')
# j_lat_long = jurisdiction_df[['Latitude', 'Longitude']].values.tolist()
# for i in range(len(j_lat_long)):
#     folium.Marker(j_lat_long[i], 
#                   popup=jurisdiction_df['Jurisdiction'][i] + ': ' + str(jurisdiction_df['Count'][i])).add_to(jur_m)
# jur_m

In [8]:
# since using Mercator Projection, need to convert
# lat/lon into x,y coordinates on map using following
# equation

import math
    
def merc(lat_lst, lon_lst):
    """Convert latitude/longitude pairs (degrees) to Mercator x/y coordinates.

    Args:
        lat_lst: iterable of latitudes in degrees.
        lon_lst: iterable of longitudes in degrees, paired positionally
            with `lat_lst`.

    Returns:
        A tuple `(x_lst, y_lst)` of two Python lists holding the projected
        coordinates, in the units of the hard-coded radius `r_major`.
    """
    r_major = 6378137.000
    # Projected units per degree. This is a constant; deriving it per point as
    # x / lon would divide by zero for any point with lon == 0.
    scale = r_major * math.pi / 180.0

    x_lst = []
    y_lst = []
    # zip pairs the values positionally, so inputs whose labels are not
    # 0..n-1 (e.g. a filtered pandas Series) are handled as well.
    for lat, lon in zip(lat_lst, lon_lst):
        x = r_major * math.radians(lon)
        y = 180.0/math.pi * math.log(math.tan(math.pi/4.0 +
            lat * (math.pi/180.0)/2.0)) * scale
        x_lst.append(x)
        y_lst.append(y)
    return x_lst, y_lst

# Project the jurisdiction coordinates and store them as map x/y columns.
x_y = merc(jurisdiction_df['Latitude'], jurisdiction_df['Longitude'])
jurisdiction_df['x'], jurisdiction_df['y'] = x_y

In [9]:
from bokeh.plotting import figure, show, output_file, output_notebook
from bokeh.tile_providers import CARTODBPOSITRON
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.layouts import gridplot, widgetbox
from bokeh.models.widgets import DataTable, TableColumn

In [10]:
def plot_map(df, title):
    """Show a tile map of the points in `df` beside a table of their counts.

    Args:
        df: frame providing the columns 'x', 'y', 'Count' and 'Area'.
        title: text used as the figure title.

    The layout is displayed with bokeh's show(); nothing is returned.
    """
    # A single data source is shared by the map glyphs and the table.
    point_source = ColumnDataSource(data={
        'x': list(df['x']),
        'y': list(df['y']),
        'count': list(df['Count']),
        'area': list(df['Area']),
    })

    hover_tool = HoverTool(tooltips=[("Area", "@area"), ("Count", "@count")])

    map_fig = figure(
        title=title,
        width=600,
        height=400,
        x_range=(-20000000, 10000000),
        y_range=(-100, 10000),
        x_axis_type="mercator",
        y_axis_type="mercator",
        tools=['pan', hover_tool, 'wheel_zoom', 'save'],
    )
    map_fig.add_tile(CARTODBPOSITRON)

    # One faint red circle per row of `df`.
    map_fig.circle(x='x', y='y', size=10, source=point_source,
                   line_color="#FF0000", fill_color="#FF0000", fill_alpha=0.05)

    table_columns = [TableColumn(field="area", title="Area"),
                     TableColumn(field="count", title="Count")]
    count_table = DataTable(source=point_source, columns=table_columns,
                            height=350, width=200)

    # Map and table side by side.
    layout = gridplot([map_fig, count_table], ncols=2)

    output_notebook()
    show(layout)

# Register a standalone HTML output file, then draw the jurisdiction map.
# plot_map() additionally calls output_notebook() before showing the layout.
output_file("../proj_jekyll/assets/jurisdictions_map.html", title="jurisdictions")
plot_map(jurisdiction_df, "Top jurisdictions by count")

In [11]:
# do the same for countries, keeping only countries with more than 1000 entities

s = df_entity_raw.countries.value_counts()
s = s[s > 1000]
country_df = pd.DataFrame(s).reset_index()
country_df.columns = ['Area', 'Count']

c_lat = []
c_long = []

for area in country_df['Area']:
    location = geolocator.geocode(area)
    if location is None:
        # geocode() returns None when nothing matches; store NaN so the lists
        # stay aligned with the rows instead of failing on `.latitude`.
        print('Could not geocode %r; storing NaN coordinates' % (area,))
        c_lat.append(np.nan)
        c_long.append(np.nan)
        continue
    c_lat.append(location.latitude)
    c_long.append(location.longitude)

country_df['Latitude'] = c_lat
country_df['Longitude'] = c_long

# Project to map coordinates with the same helper used for jurisdictions.
x_y_c = merc(country_df.Latitude, country_df.Longitude)
country_df['x'] = x_y_c[0]
country_df['y'] = x_y_c[1]

In [12]:
# Register a standalone HTML output file for the country map. The page title
# names the countries view rather than reusing the jurisdictions title.
output_file("../proj_jekyll/assets/country_map.html", title="countries")
plot_map(country_df, "Top countries that have offshore entities")

In [13]:
# part 2: calculating effect on home country wrt tax

# File names of the additional per-country tables inside DATA_FOLDER.
GDP_tot = 'API_NY.GDP.MKTP.CD_DS2_en_csv_v2_10224782.csv'
corp_tax_rate = 'corp_tax_rate.csv'
business_country = 'business_country.csv'

# Only the country name and the 2015 column are kept from the first two tables.
GDP_tot_raw_df = pd.read_csv(
    os.path.join(DATA_FOLDER, GDP_tot), skiprows=4, low_memory=False
)[['Country Name', '2015']]
corp_tax_rate_df = pd.read_csv(
    os.path.join(DATA_FOLDER, corp_tax_rate)
)[['Location', '2015']]
business_country_df = pd.read_csv(os.path.join(DATA_FOLDER, business_country))

In [14]:
# Join the corporate-tax table with the GDP table on the country name.
m1 = pd.merge(corp_tax_rate_df, GDP_tot_raw_df, how='inner',
              left_on='Location', right_on='Country Name')

# Add the number of businesses per country and keep only the needed columns.
m2 = pd.merge(m1, business_country_df, how='inner',
              left_on='Location', right_on='Country Name')
m2 = m2[['Location', '2015_x', '2015_y', 'Number of Businesses']]
m2.columns = ['Location', '2015 Corp Tax Rate % of GDP', '2015 GDP', 'Number of Businesses']

# Corporate tax revenue = (rate as % of GDP) * GDP / 100.
m2['Corporate Tax Revenue'] = m2['2015 Corp Tax Rate % of GDP'].mul(m2['2015 GDP']).div(100)

# Average corporate tax paid per business.
m2['Avg tax per business'] = m2['Corporate Tax Revenue'].div(m2['Number of Businesses'])

# Entity count per country in the Panama Papers, renamed to match m2's
# spelling for South Korea before joining.
country_entries = df_entity_raw['countries'].value_counts().reset_index()
country_entries.columns = ['Location', 'Count in PP']
country_entries['Location'] = country_entries['Location'].replace('South Korea', 'Korea, Rep.')
m3 = pd.merge(m2, country_entries, on='Location', how='inner')

# Potential tax evasion: average tax per business times the entity count.
m3['Potential Tax Lost'] = m3['Avg tax per business'].mul(m3['Count in PP'])
m3['Potential Real Corporate Tax Revenue'] = m3['Potential Tax Lost'].add(m3['Corporate Tax Revenue'])
m3['% Tax Lost'] = m3['Potential Tax Lost'].div(m3['Corporate Tax Revenue']).mul(100)

In [15]:
m3.head(10)

Unnamed: 0,Location,2015 Corp Tax Rate % of GDP,2015 GDP,Number of Businesses,Corporate Tax Revenue,Avg tax per business,Count in PP,Potential Tax Lost,Potential Real Corporate Tax Revenue,% Tax Lost
0,Australia,4.25,1350000000000.0,1873101,57375000000.0,30631.0231,94,2879316.0,57377880000.0,0.005018
1,Austria,2.22,382000000000.0,547330,8480400000.0,15494.12603,24,371859.0,8480772000.0,0.004385
2,Belgium,3.33,455000000000.0,1597753,15151500000.0,9483.005195,59,559497.3,15152060000.0,0.003693
3,Canada,3.41,1560000000000.0,1536428,53196000000.0,34623.164899,347,12014240.0,53208010000.0,0.022585
4,Chile,4.29,244000000000.0,775308,10467600000.0,13501.215001,140,1890170.0,10469490000.0,0.018057
5,Czech Republic,3.59,187000000000.0,1442568,6713300000.0,4653.714764,34,158226.3,6713458000.0,0.002357
6,Denmark,2.71,301000000000.0,495961,8157100000.0,16447.059345,14,230258.8,8157330000.0,0.002823
7,Estonia,2.05,22566960000.0,112271,462622600.0,4120.588737,880,3626118.0,466248700.0,0.783818
8,Finland,2.17,232000000000.0,564997,5034400000.0,8910.48979,65,579181.8,5034979000.0,0.011504
9,France,2.08,2440000000000.0,3497529,50752000000.0,14510.816065,285,4135583.0,50756140000.0,0.008149


In [16]:
from bokeh.models.widgets import MultiSelect, Div
from bokeh.application.handlers import FunctionHandler
from bokeh.application import Application
from bokeh.layouts import column, row, widgetbox

In [17]:
def modify_doc(doc):
    """Populate a Bokeh document with the interactive tax-comparison app.

    The app consists of a MultiSelect listing the countries in `m3`, a bar
    chart comparing 'Corporate Tax Revenue' with 'Potential Real Corporate
    Tax Revenue' for the selected countries, and a short text summary.

    Args:
        doc: the Bokeh document that receives the layout as a root.

    Returns:
        The same `doc`, after the layout has been added.
    """
    def make_dataset(countries):
        """Return (ColumnDataSource, DataFrame) restricted to `countries`."""
        # select the rows of the chosen countries
        df = m3.loc[m3['Location'].isin(countries)]
        df = df[['Location', 'Corporate Tax Revenue', 'Potential Tax Lost',
                'Potential Real Corporate Tax Revenue', '% Tax Lost']]
        return ColumnDataSource(df), df

    def make_plot(src, df):
        """Build the bar chart for the rows of `df` (`src` is not used)."""
        # max() of an empty column raises ValueError, which would break the
        # callback when every country is deselected; fall back to a unit axis.
        if df.empty:
            max_height = 1
        else:
            max_height = max(df['Potential Real Corporate Tax Revenue'])
        p = figure(plot_width=500, plot_height=800, y_range=(0,max_height*1.5))
        N = df.shape[0]

        ind = np.arange(N)
        width = 0.3

        # Map each tick position (as a string) to the country shown there.
        d = {}
        countries = list(df.Location)

        for i in ind:
            j = str(i+width)
            d[j] = countries[i]

        yvals1 = list(df['Corporate Tax Revenue'])
        p.vbar(x=ind, width=width, top=yvals1, bottom=0,
               color='red', legend='Corporate Tax Revenue')

        # Second bar of each pair, shifted right by one bar width.
        yvals2 = list(df['Potential Real Corporate Tax Revenue'])
        p.vbar(x=ind+width, width=width, top=yvals2,
               color='blue', legend='Potential Real Corporate Tax Revenue')

        p.xaxis.axis_label = 'Countries'
        p.yaxis.axis_label = 'Dollars (USD)'
        p.xaxis.ticker = [i + width for i in ind]
        p.xaxis.major_label_overrides = d
        p.xaxis.major_label_orientation = pi/2
        p.legend.label_text_font_size = "12px"

        return p

    def update(attr, old, new):
        """Rebuild the chart and summary whenever the selection changes."""
        countries = multi_select.value
        src, df = make_dataset(countries)
        p = make_plot(src, df)
        text = get_stats(df)
        div = Div(text=text, width=200, height=100)
        controls = widgetbox(multi_select, div)
        layout.children[0] = controls
        layout.children[1] = p

    def get_stats(df):
        """Return the HTML summary naming the countries with the largest losses."""
        # idxmax() raises ValueError on an empty frame, so an empty selection
        # gets a plain message instead.
        if df.empty:
            return "No countries selected."
        most_tax_lost = df.loc[df['Potential Tax Lost'].idxmax()]
        most_tax_lost_as_perc = df.loc[df['% Tax Lost'].idxmax()]
        c_tax = most_tax_lost['Location']
        c_tax_perc = most_tax_lost_as_perc['Location']
        text = """
        <b>%s</b> has the most potential corporate tax lost by value at %s.<br><br>
        <b>%s</b> has the most potential corporate tax lost by percentage at %s.
        """ % (c_tax, '${:,.0f}'.format(most_tax_lost['Potential Tax Lost']),
               c_tax_perc, '{:,.2f}%'.format(most_tax_lost_as_perc['% Tax Lost']))
        return text

    country_lst = [(x, x) for x in m3.Location]
    multi_select = MultiSelect(title="Options", value=['Switzerland', 'Turkey', 'United Kingdom'],
                               options=country_lst)
    multi_select.on_change('value', update)
    multi_select.size = 35

    # Initial render with the default selection.
    countries = multi_select.value
    src, df = make_dataset(countries)
    p = make_plot(src, df)

    text = get_stats(df)
    div = Div(text=text, width=200, height=100)
    controls = widgetbox(multi_select, div)

    layout = row([controls, p])
    doc.add_root(layout)
    return doc
    
# Set up an application: wrap modify_doc in a FunctionHandler, build an
# Application from that handler, and display it with show().
handler = FunctionHandler(modify_doc)
app = Application(handler)
show(app)



In [20]:
from bokeh.embed import server_document
# Build the embed <script> snippet for a Bokeh server app at the given URL.
# NOTE(review): the URL points at demo.bokehplots.com/apps/slider, not at the
# tax app defined in this notebook -- confirm this is intended.
script = server_document("https://demo.bokehplots.com/apps/slider")

In [22]:
print(script)


<script
    src="https://demo.bokehplots.com/apps/slider/autoload.js?bokeh-autoload-element=25f2d5ea-9f67-4ae9-a397-9778c5bd2fc2&bokeh-app-path=/apps/slider&bokeh-absolute-url=https://demo.bokehplots.com/apps/slider"
    id="25f2d5ea-9f67-4ae9-a397-9778c5bd2fc2"
    data-bokeh-model-id=""
    data-bokeh-doc-id=""
></script>


In [23]:
bokeh

NameError: name 'bokeh' is not defined

In [24]:
import bokeh

In [25]:
bokeh.__version__

'0.12.16'

In [26]:
import numpy
numpy.version.version

'1.14.3'

In [28]:
import pandas
pandas.__version__

'0.23.0'

In [29]:
import geopy
geopy.__version__

'1.17.0'