![Dakar, Senegal](https://i.imgur.com/5Wmn3yB.jpg)

With over 23 thousand responses from all over the world, the 2018 Kaggle ML & DS Survey has a lot of data from a variety of income classes. In this Kernel, I'll look at how Kagglers from the developing world differ from those in the developed world, and also at the common ground they share. I'll use the World Bank's development indicator dataset to classify countries as either developed or developing.

# Where everyone is from
Let's start by getting a picture of which countries are represented in the 2018 Kaggle ML & DS Survey and which income group each of them falls into.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import plotly as py
import os
from plotly.graph_objs import *
from plotly.graph_objs import Scatter, Layout
import plotly.graph_objs as go
from IPython.display import display

# Load the World Bank country metadata and the 2018 Kaggle survey answers.
income = pd.read_csv('../input/world-development-indicators/Country.csv', low_memory=False)
data_multiple_choice = pd.read_csv('../input/kaggle-survey-2018/multipleChoiceResponses.csv', low_memory=False)

# A row is "developed" when its IncomeGroup is one of the two high-income
# groups; any other value (a missing one included) makes it "developing".
high_income_groups = ["High income: OECD", "High income: nonOECD"]
is_high_income = income['IncomeGroup'].isin(high_income_groups)

# Split first: these two frames are taken before the helper columns below are
# added, so they only carry the columns read from the CSV.
developing = income.loc[~is_high_income]
developed = income.loc[is_high_income]

# Numeric flag (0 = developing, 1 = developed) used as the map's z value,
# and the matching hover label.
income["Developement_State"] = is_high_income.astype(int)
income["Developement"] = np.where(is_high_income, "Developed Nation", "Developing Nation")

# The survey's country answer (Q3) may match any of the three name spellings
# the World Bank file provides.
name_columns = ['TableName', 'LongName', 'ShortName']
survey_countries = data_multiple_choice['Q3']

# World Bank rows for countries that appear among the survey answers.
in_survey = pd.Series(False, index=income.index)
for name_column in name_columns:
    in_survey = in_survey | income[name_column].isin(survey_countries)
counties = income.loc[in_survey]

# Survey responses split by the development class of the respondent's country.
data_multiple_choice_developing = data_multiple_choice.loc[
    survey_countries.isin(pd.concat([developing[name_column] for name_column in name_columns]))
]
data_multiple_choice_developed = data_multiple_choice.loc[
    survey_countries.isin(pd.concat([developed[name_column] for name_column in name_columns]))
]

# Two-stop colour scale for the binary development flag: 0 -> yellow, 1 -> orange.
metricscale = [[0, '#FFEB3B'], [1, '#FF9800']]

# One choropleth trace: every responding country is coloured by its
# Developement_State value and carries its Developement label as text.
data = [{
    'type': 'choropleth',
    'locationmode': 'country names',
    'locations': counties['TableName'],
    'z': counties['Developement_State'],
    'text': counties['Developement'],
    'colorscale': metricscale,
    'autocolorscale': False,
    'reversescale': False,
    'showlegend': False,
    'showscale': False,
    # Colour-bar settings are kept even though showscale is False.
    'colorbar': {
        'tick0': 0,
        'tickmode': 'array',
        'nticks': 2,
        'tickvals': [0, 1],
        'ticktext': ["Developing Country", "Developed Country"],
        'len': 0.5,
        'y': 1,
    },
    'marker': {
        'line': {
            'color': 'rgb(180,180,180)',
            'width': 0.5,
        },
    },
}]

# Orthographic globe, rotated to lon 60 / lat 10, with a blue ocean.
layout = {
    'title': 'Responding Countries',
    'geo': {
        'showframe': True,
        'showocean': True,
        'oceancolor': 'rgb(28,107,160)',
        'projection': {
            'type': 'orthographic',
            'rotation': {'lon': 60, 'lat': 10},
        },
        'lonaxis': {
            'showgrid': False,
            'gridcolor': 'rgb(202, 202, 202)',
            'width': '0.05',
        },
        'lataxis': {
            'showgrid': False,
            'gridcolor': 'rgb(102, 102, 102)',
        },
    },
}

fig = {'data': data, 'layout': layout}
py.offline.init_notebook_mode(connected=True)
py.offline.iplot(fig, validate=False, filename='d3-world-map')

We can see from the above map that Western Europe and North America are well represented; in fact, the majority of the developed countries are represented. However, many developing countries from Africa, the Middle East, Asia and Latin America are missing. If we take a closer look at the raw numbers, kagglers in the developed world outnumber those in the developing world not only by countries represented but also by number of respondents.

In [None]:
from plotly import tools
# Number of respondents per country (Q3) for each development class,
# sorted ascending so the largest country ends up at the top of its chart.
developed_country_responses = (
    data_multiple_choice_developed.groupby(['Q3']).size()
    .reset_index(name='count')
    .sort_values('count')
)
developing_country_responses = (
    data_multiple_choice_developing.groupby(['Q3']).size()
    .reset_index(name='count')
    .sort_values('count')
)

# Side-by-side horizontal bar charts. Each panel gets its own title and each
# trace a name, so the reader can tell which panel shows which group.
fig = tools.make_subplots(
    rows=1, cols=2, print_grid=False,
    subplot_titles=('Developing Countries', 'Developed Countries'),
)

fig.append_trace(go.Bar(
    x=developing_country_responses['count'],
    y=developing_country_responses['Q3'],
    orientation='h',
    name='Developing Countries',
), 1, 1)
fig.append_trace(go.Bar(
    x=developed_country_responses['count'],
    y=developed_country_responses['Q3'],
    orientation='h',
    name='Developed Countries',
), 1, 2)

fig['layout'].update(height=900, width=800, title='Developing Countries vs Developed Countries')
py.offline.iplot(fig, filename='simple-subplot-with-annotations')


As shown above, most Kagglers are from India and the USA, with India leading the developing world with 4417 respondents and the USA leading the developed world with 4716 respondents. 22 countries from the developing world are represented out of 168 developing nations. Let's now break down this data and see what similarities and differences kagglers have.

# Age, Education level and Years in Experience
We'll begin by having a look at how young and old kagglers are in the developing world and the developed world.
A population pyramid will give us the gist of this, so let's build one.


In [None]:
# Survey-wide age answers (Q2) without the question-text header row.
data_multiple_choice_age = data_multiple_choice.loc[
    (data_multiple_choice['Q2'] != "What is your age (# years)?")
    & (data_multiple_choice['Q2'] != "nan")
]

# Replace missing answers with the literal string 'nan' so that the
# `!= "nan"` filters applied to these two frames actually drop them.
data_multiple_choice_developing = data_multiple_choice_developing.fillna('nan')
data_multiple_choice_developed = data_multiple_choice_developed.fillna('nan')
SanKeyChartValues = []


def _age_group_counts(responses):
    """Return one row per age group (Q2) with its respondent count in `count`."""
    answered = responses.loc[
        (responses['Q2'] != "What is your age (# years)?")
        & (responses['Q2'] != "nan")
    ]
    # groupby already returns the age groups in sorted order.
    return answered.groupby(['Q2']).size().reset_index(name='count')


data_multiple_choice_developed_age = _age_group_counts(data_multiple_choice_developed)
data_multiple_choice_developing_age = _age_group_counts(data_multiple_choice_developing)

# Keep the real head-counts for the hover text, then mirror the developing
# bars to the left of the axis by negating their counts.
developing_headcount = data_multiple_choice_developing_age['count'].copy()
data_multiple_choice_developing_age['count'] = -1 * data_multiple_choice_developing_age['count']

layout = dict(
    yaxis=go.layout.YAxis(title='Age'),
    xaxis=go.layout.XAxis(
        range=[-5000, 5000],
        # Tick labels are positive on both sides of the mirrored axis.
        tickvals=[-2000, -4000, 0, 2000, 4000],
        ticktext=[2000, 4000, 0, 2000, 4000],
        title='Kagglers',
    ),
    barmode='overlay',
    bargap=0.1,
)
data = [
    go.Bar(
        y=data_multiple_choice_developed_age['Q2'],
        x=data_multiple_choice_developed_age['count'],
        orientation='h',
        name='Developed Countries',
        hoverinfo='x',
        xaxis='x1',
        yaxis='y1',
        marker=dict(color='#E91E63'),
    ),
    go.Bar(
        y=data_multiple_choice_developing_age['Q2'],
        x=data_multiple_choice_developing_age['count'],
        orientation='h',
        name='Developing Countries',
        # hoverinfo='text' shows only `text`; without it the hover is empty.
        # Supplying the un-negated counts makes the hover show real numbers.
        text=developing_headcount,
        hoverinfo='text',
        xaxis='x1',
        yaxis='y1',
        marker=dict(color='#9C27B0'),
    ),
]

py.offline.iplot(dict(data=data, layout=layout), validate=False, filename='EXAMPLES/bar_pyramid')


From the above pyramid we can see that kagglers in developing countries are younger than those in developed countries. This shouldn't come as a surprise, as the population of the developing world is younger than that of the developed world. The largest age group in the developed world is 25-29, whilst the largest age group in the developing world is 22-24. The 18-21 age group is mostly in the developing world, whilst senior kagglers mostly reside in the developed world. We can break down this data further by looking at which industry each age group participates in.

In [None]:
def getPyramid(industry):
    """Build the two bar traces for one industry's age breakdown.

    For each development class, every age group (Q2) present in `industry`
    (Q7) is given as a percentage of all respondents of that age group in
    the same class.

    Reads the module-level frames `data_multiple_choice_developed` and
    `data_multiple_choice_developing`.

    Args:
        industry: the Q7 answer to select.

    Returns:
        A tuple `(developed_bar, developing_bar)` of `go.Bar` traces.
    """
    age_question = "What is your age (# years)?"

    def percent_of_age_group(responses):
        # Drop the question-text header row and the 'nan' placeholders.
        answered = responses.loc[(responses['Q2'] != age_question)
                                 & (responses['Q2'] != "nan")]
        totals = answered.groupby('Q2').size()
        in_industry = answered.loc[answered['Q7'] == industry].groupby('Q2').size()
        # Divide by the total of the SAME age group. Both series are indexed
        # by Q2, so an age group missing from this industry can no longer
        # shift the remaining counts onto another group's total.
        share = 100 * in_industry / totals.reindex(in_industry.index)
        return share.reset_index(name='count')

    developed_share = percent_of_age_group(data_multiple_choice_developed)
    developing_share = percent_of_age_group(data_multiple_choice_developing)

    developed_bar = go.Bar(
        x=developed_share['Q2'],
        y=developed_share['count'],
        marker=dict(color='#E91E63'),
        name='Developed Countries',
        showlegend=False,
    )
    developing_bar = go.Bar(
        x=developing_share['Q2'],
        y=developing_share['count'],
        marker=dict(color='#9C27B0'),
        name='Developing Countries',
        showlegend=False,
    )
    return developed_bar, developing_bar



# Industries (Q7 answers) laid out as they appear in the 5x4 grid: one inner
# list per subplot row, the last row holding three panels.
industries_list = [
    ['Computers/Technology', 'I am a student', 'Academics/Education', 'Accounting/Finance'],
    ['Medical/Pharmaceutical', 'Government/Public Service', 'Insurance/Risk Assessment', 'Retail/Sales'],
    ['Marketing/CRM', 'Energy/Mining', 'Broadcasting/Communications', 'Non-profit/Service'],
    ['Shipping/Transportation', 'Hospitality/Entertainment/Sports', 'Military/Security/Defense',
     'Manufacturing/Fabrication'],
    ['Online Service/Internet-based Services', 'Online Business/Internet-based Sales', 'Other'],
]

# Subplot titles are assigned in row-major order, so they are derived from the
# same grid that drives the plotting loop. A separately typed list can drift
# out of step with the data and label panels with the wrong industry.
subplot_titles = tuple(industry for row in industries_list for industry in row)
fig = tools.make_subplots(rows=5, cols=4, print_grid=False,
                          subplot_titles=subplot_titles)

for row_number, industries in enumerate(industries_list, start=1):
    for col_number, industry in enumerate(industries, start=1):
        hig, low = getPyramid(industry)
        fig.append_trace(low, row_number, col_number)
        fig.append_trace(hig, row_number, col_number)

# Dropdown that would toggle between the two groups. It is built here but
# left unattached: the line that adds it to the layout is disabled below.
labels = ["Developing Nations", "Developed Nations"]
buttons = []
for position, label in enumerate(labels):
    visibility = [position == other for other in range(len(labels))]
    buttons.append(dict(
        label=label,
        method='update',
        args=[{'visible': visibility},
              {'title': label}],
    ))

updatemenus = [
    dict(active=1,
         x=-0.15,
         buttons=buttons)
]
#fig['layout']['updatemenus'] = updatemenus

# Shrink the subplot titles so they fit above the small panels.
for annotation in fig['layout']['annotations']:
    annotation['font'] = dict(size=8, color='#000000')

fig['layout'].update(
    height=1000,
    width=800,
    title='Kagglers by Industry and Age Group (Percentage)',
    legend=dict(
        traceorder='normal',
        font=dict(
            family='sans-serif',
            size=8,
            color='#000',
        ),
        bgcolor='#E2E2E2',
        bordercolor='#FFFFFF',
        borderwidth=2,
    ),
)
py.offline.iplot(fig, filename='simple-subplot-with-annotations')

The above chart shows, for each industry, the share of each age group working in it. We can see that the majority of 18-21 year olds are students, whether in developing or developed countries. The computer and tech industry is the most popular industry for kagglers aged 22 to 50. After that, however, academia becomes the most popular industry for kagglers in developing countries who are older than 50 and younger than 70. We can also see that some industries, such as insurance and shipping, are nearly exclusive to kagglers in the developed world.

## Education 
Let's now turn our focus to the highest education level attained by kagglers in the developed and developing world.

In [None]:
# Silence pandas' chained-assignment warning for the rest of the notebook.
pd.options.mode.chained_assignment = None

# Q4 answer labels. The apostrophe is U+2019; the previous literals held the
# mis-decoded character sequence instead.
education = ['Master’s degree', 'Bachelor’s degree', 'Doctoral degree',
             'Some college/university study without earning a bachelor’s degree',
             'Professional degree',
             'No formal education past high school']

metricscale = [[0, '#11B2AF'], [1, '#F2CF26']]

# Shared layout for the education figure: the map occupies the top part of
# the canvas (geo domain y 0.55-1).
layout = dict(
    width=800,
    height=1000,
    autosize=False,
    margin=dict(t=100),
    paper_bgcolor='#8B4B62',
    plot_bgcolor='#8B4B62',
    title='Kagglers By Highest Education Level',
    font=dict(color='#fafafa'),
    geo=dict(
        domain=dict(x=[0, 1], y=[0.55, 1]),
        countrywidth=0,
        showcountries=False,
        showframe=False,
        showocean=True,
        showcoastlines=False,
        landcolor='#BB6F6B',
        oceancolor='#8B4B62',
        bgcolor='#BB6F6B',
        projection=dict(type='equirectangular'),
    ),
)


def _education_counts(responses):
    """Count respondents per (country Q3, education level Q4) pair.

    The question-text header row, "I prefer not to answer" and the 'nan'
    placeholder are excluded before counting.
    """
    answered = responses.loc[
        (responses['Q4'] != "What is the highest level of formal education that you have attained or plan to attain within the next 2 years?")
        & (responses['Q4'] != "I prefer not to answer")
        & (responses['Q4'] != "nan")
    ]
    return answered.groupby(['Q3', 'Q4']).size().reset_index(name='count')


data_multiple_choice_edu = _education_counts(data_multiple_choice)
data_multiple_choice_developed_edu = _education_counts(data_multiple_choice_developed)
data_multiple_choice_developing_edu = _education_counts(data_multiple_choice_developing)
def func(group):
    """Return the rows of `group` whose `count` equals the group's maximum.

    Every row tied for the maximum is kept.
    """
    highest = group['count'].max()
    is_top = group['count'].eq(highest)
    return group[is_top]
def transformDataFrame(df):
    """Pivot long (Q3, Q4, count) rows into one row per country.

    The result has a `Q3` column followed by one column per Q4 value;
    combinations absent from `df` are filled with 0.
    """
    by_count = df.sort_values(by=['count'], ascending=False)
    wide = by_count.pivot_table(index=['Q3'], columns='Q4', values='count')
    return wide.reset_index().fillna(0)

# Wide (one row per country) versions of the three count frames.
data_multiple_choice_edu_all = transformDataFrame(data_multiple_choice_edu)
data_multiple_choice_developed_edu = transformDataFrame(data_multiple_choice_developed_edu)
data_multiple_choice_developing_edu = transformDataFrame(data_multiple_choice_developing_edu)

# Per-country table placed in the middle band of the canvas (domain y 0.22-0.5).
# The header labels follow the column order of data_multiple_choice_edu_all.
table = go.Table(
    domain=dict(x=[0, 1], y=[0.22, 0.5]),
    columnwidth=[0.8, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
    header=dict(
        values=['Country', 'Bachelor’s degree', 'Doctoral degree', 'Master’s degree',
                'High school', 'Professional degree', 'Some College/University'],
        font=dict(size=10, color="#424242"),
        line=dict(color='rgba(50, 50, 50,0.1)'),
        align='left',
        fill=dict(color='#FCBC80'),
    ),
    cells=dict(
        values=[data_multiple_choice_edu_all[k].tolist() for k in data_multiple_choice_edu_all.columns[:]],
        line=dict(color='rgba(50, 50, 50,0.1)'),
        align='left',
        fill=dict(color='#F7E29C'),
        font=dict(color="#424242"),
    ),
)

# Q4 answer labels; the apostrophe is U+2019 (the previous literals held the
# mis-decoded character sequence, which matches nothing in the survey data).
education = ['Master’s degree', 'Bachelor’s degree', 'Doctoral degree',
             'Some college/university study without earning a bachelor’s degree',
             'Professional degree',
             'No formal education past high school']

# Keep, for every country, the education level(s) with the highest count.
data_multiple_choice_edu = data_multiple_choice_edu.groupby('Q3', as_index=False).apply(func).reset_index(drop=True)

# Map colour flag: 1 where the most common level is a bachelor's degree,
# 0 otherwise (master's degree and every other level).
data_multiple_choice_edu["edu"] = (data_multiple_choice_edu['Q4'] == "Bachelor’s degree").astype(int)

data = dict(
    type='choropleth',
    locationmode='country names',
    locations=data_multiple_choice_edu['Q3'],
    z=data_multiple_choice_edu['edu'],
    text=data_multiple_choice_edu['Q4'],
    colorscale=metricscale,
    autocolorscale=False,
    reversescale=False,
    showlegend=False,
    showscale=False,
    marker=dict(
        line=dict(
            color='rgb(0,0,0,0)',
            width=0,
        ),
    ),
)

# Education columns of the wide frames, in their (alphabetical) pivot order.
cols = ['Bachelor’s degree', 'Doctoral degree', 'Master’s degree',
        'No formal education past high school', 'Professional degree',
        'Some college/university study without earning a bachelor’s degree']
def sumDataFrame(df, nation, columns=None):
    """Collapse a per-country education frame into one percentage row.

    The counts in `columns` are summed over all rows of `df` and expressed as
    percentages of their combined total, rounded to two decimals. `df` itself
    is left untouched.

    Args:
        df: wide frame with one row per country and one column per
            education level.
        nation: label written into the `Q3` column of the result.
        columns: education columns to aggregate; defaults to the
            module-level `cols` list.

    Returns:
        A single-row DataFrame indexed 'Total' with columns `Q3` followed by
        `columns`. If the combined total is zero the percentages are NaN.
    """
    if columns is None:
        columns = cols
    columns = list(columns)

    # Sum only the numeric education columns; summing the whole frame would
    # also "add up" the country names.
    totals = df[columns].sum()
    shares = (totals / totals.sum() * 100).round(2)

    row = {'Q3': nation}
    row.update(shares.to_dict())
    # Build a fresh frame instead of appending a 'Total' row to the caller's
    # frame and then label-slicing it with integers.
    return pd.DataFrame([row], columns=['Q3'] + columns, index=['Total'])
# One percentage row per development class, stacked into a two-row frame.
edu = pd.concat(
    [sumDataFrame(data_multiple_choice_developing_edu, "Developing Nation"),
     sumDataFrame(data_multiple_choice_developed_edu, "Developed Nation")],
    axis=0,
)

# Summary table in the bottom band of the canvas (domain y 0-0.2). The header
# apostrophes are U+2019; the previous literals held mis-decoded characters.
table2 = go.Table(
    domain=dict(x=[0, 1], y=[0, 0.2]),
    columnwidth=[0.8, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
    header=dict(
        values=['Income Level', 'Bachelor’s degree', 'Doctoral degree', 'Master’s degree',
                'High school', 'Professional degree', 'Some college/university'],
        font=dict(size=10, color="#424242"),
        line=dict(color='rgba(50, 50, 50,0.1)'),
        align='left',
        fill=dict(color='#FCBC80'),
    ),
    cells=dict(
        values=[edu[k].tolist() for k in edu.columns[:]],
        line=dict(color='rgba(50, 50, 50,0.1)'),
        align='left',
        fill=dict(color='#F7E29C'),
        font=dict(color="#424242"),
    ),
)

# Map on top, per-country table in the middle, summary table at the bottom.
fig2 = dict(data=[data, table, table2], layout=layout)
py.offline.iplot(fig2, filename='vertical-stacked-subplot-tables')

The majority of Kagglers in the developed world hold a master's degree, whilst those in the developing world hold a bachelor's. Moreover, there is a larger proportion of professional degree holders from the developing world than from the developed world. Those with some college and high school experience have nearly the same proportions in both the developing and the developed world. Let's see how this breaks down by industry.

In [None]:
def getPyramid(industry):
    """Build the two bar traces for one industry's education breakdown.

    For each development class, every education level (Q4) is given as the
    percentage of that level's respondents who work in `industry` (Q7).
    Levels with nobody in the industry are reported as 0.

    Reads the module-level frames `data_multiple_choice_developed` and
    `data_multiple_choice_developing`.

    Args:
        industry: the Q7 answer to select.

    Returns:
        A tuple `(developed_bar, developing_bar)` of `go.Bar` traces.
    """
    education_question = "What is the highest level of formal education that you have attained or plan to attain within the next 2 years?"
    # Short tick labels, applied by position to the sorted Q4 levels.
    # NOTE(review): assumes each group contains exactly these six levels in
    # this sorted order - confirm against the survey data.
    short_labels = ['Bachelor', 'Doctoral', 'Master',
                    'High school', 'Professional', 'Some Tertiary']

    def percent_of_level(responses):
        # Drop the header row, "prefer not to answer" and 'nan' placeholders.
        answered = responses.loc[(responses['Q4'] != education_question)
                                 & (responses['Q4'] != "I prefer not to answer")
                                 & (responses['Q4'] != "nan")]
        totals = answered.groupby('Q4').size()
        in_industry = answered.loc[answered['Q7'] == industry].groupby('Q4').size()
        # Align on the education level, padding missing levels with 0, so
        # every bar is divided by its own level's total and stays under the
        # right tick label even when the industry lacks a level.
        aligned = in_industry.reindex(totals.index, fill_value=0)
        return (100 * aligned / totals).tolist()

    developed_bar = go.Bar(
        x=short_labels,
        y=percent_of_level(data_multiple_choice_developed),
        marker=dict(color='#F27F0F'),
        name='Developed Countries',
        showlegend=False,
        xaxis='x1',
        yaxis='y1',
    )
    developing_bar = go.Bar(
        x=short_labels,
        y=percent_of_level(data_multiple_choice_developing),
        marker=dict(color='#157EC4'),
        name='Developing Countries',
        showlegend=False,
        xaxis='x1',
        yaxis='y1',
    )
    return developed_bar, developing_bar



# Industries (Q7 answers) laid out as they appear in the 5x4 grid: one inner
# list per subplot row, the last row holding three panels.
industries_list = [
    ['Computers/Technology', 'I am a student', 'Academics/Education', 'Accounting/Finance'],
    ['Medical/Pharmaceutical', 'Government/Public Service', 'Insurance/Risk Assessment', 'Retail/Sales'],
    ['Marketing/CRM', 'Energy/Mining', 'Broadcasting/Communications', 'Non-profit/Service'],
    ['Shipping/Transportation', 'Hospitality/Entertainment/Sports', 'Military/Security/Defense',
     'Manufacturing/Fabrication'],
    ['Online Service/Internet-based Services', 'Online Business/Internet-based Sales', 'Other'],
]

# Subplot titles are assigned in row-major order, so they are derived from the
# same grid that drives the plotting loop. A separately typed list can drift
# out of step with the data and label panels with the wrong industry.
subplot_titles = tuple(industry for row in industries_list for industry in row)
fig = tools.make_subplots(rows=5, cols=4, print_grid=False,
                          subplot_titles=subplot_titles)

for row_number, industries in enumerate(industries_list, start=1):
    for col_number, industry in enumerate(industries, start=1):
        hig, low = getPyramid(industry)
        fig.append_trace(low, row_number, col_number)
        fig.append_trace(hig, row_number, col_number)

# Dropdown that would toggle between the two groups. It is built here but
# left unattached: the line that adds it to the layout is disabled below.
labels = ["Developing Nations", "Developed Nations"]
buttons = []
for position, label in enumerate(labels):
    visibility = [position == other for other in range(len(labels))]
    buttons.append(dict(
        label=label,
        method='update',
        args=[{'visible': visibility},
              {'title': label}],
    ))

updatemenus = [
    dict(active=1,
         x=-0.15,
         buttons=buttons)
]
#fig['layout']['updatemenus'] = updatemenus

# Shrink the subplot titles so they fit above the small panels.
for annotation in fig['layout']['annotations']:
    annotation['font'] = dict(size=8, color='#000000')

# Small tick font, applied to the x and y axis of each of the 19 panels.
axis = dict(tickfont=dict(
    family='sans-serif',
    size=8,
    color='#000',
))
axis_styles = {}
for panel in range(1, 20):
    axis_styles['xaxis{}'.format(panel)] = axis
    axis_styles['yaxis{}'.format(panel)] = axis

fig['layout'].update(
    height=1000,
    width=800,
    title='Kagglers by Qualification and Industry (Percentage)',
    legend=dict(
        traceorder='normal',
        font=dict(
            family='sans-serif',
            size=8,
            color='#000',
        ),
        bgcolor='#E2E2E2',
        bordercolor='#FFFFFF',
        borderwidth=2,
    ),
    **axis_styles
)
py.offline.iplot(fig, filename='simple-subplot-with-annotations')

From the above chart we can see that in both the developing and the developed world, the majority of those with doctoral degrees are in the academic industry. The technology industry has the most Kagglers in all qualification categories except high school and doctoral.
Another way in which developing countries differ from developed countries is the perceived quality of online learning platforms and in-person bootcamps compared with traditional institutions. This is answered in Q39, and the results are shown below.

In [None]:
# Q39 question text.  getPyramid below does not reference it.
exclude = 'How do you perceive the quality of online learning platforms and in-person bootcamps as compared to the quality of the education provided by traditional brick and mortar institutions? - Online learning platforms and MOOCs:'


def getPyramid(industry):
    """Build the education-level bar traces for one Q39_Part_1 answer.

    Args:
        industry: the Q39_Part_1 answer to select (the parameter name is kept
            for compatibility with the calling loop).

    Returns:
        A ``(developed, developing)`` tuple of ``go.Bar`` traces.  Each trace
        holds, per Q4 education level, the number of respondents in
        ``data_multiple_choice_developed`` / ``data_multiple_choice_developing``
        whose Q39_Part_1 equals ``industry``.
    """
    question_text = "What is the highest level of formal education that you have attained or plan to attain within the next 2 years?"
    # Short axis labels, paired positionally with the sorted Q4 levels.
    short_labels = ['Bachelo', 'Doctoral', 'Master',
                    'High school', 'Professional', 'Some Tertiary']

    def with_education(frame):
        # Drop the question-text row, declined answers and missing answers.
        return frame.loc[(frame['Q4'] != question_text)
                         & (frame['Q4'] != "I prefer not to answer")
                         & (frame['Q4'] != "nan")]

    developed = with_education(data_multiple_choice_developed)
    developing = with_education(data_multiple_choice_developing)

    # Every education level present in either region, in the sorted order that
    # groupby produces.  Counts are reindexed onto this list so a level with no
    # respondents becomes an explicit 0 instead of silently shifting all later
    # counts under the wrong label.
    levels = sorted(pd.concat([developed['Q4'], developing['Q4']]).dropna().unique())
    # Fall back to the full level names if the short labels cannot be paired
    # one-to-one with the levels found in the data.
    x_labels = short_labels if len(levels) == len(short_labels) else levels

    def counts_for(frame):
        chosen = frame.loc[frame['Q39_Part_1'] == industry]
        return chosen.groupby('Q4').size().reindex(levels, fill_value=0).tolist()

    developed_bar = go.Bar(x=x_labels,
                           y=counts_for(developed),
                           marker=dict(color='#F27F0F'),
                           name='Developed Countries',
                           showlegend=False,
                           xaxis='x1',
                           yaxis='y1')
    developing_bar = go.Bar(x=x_labels,
                            y=counts_for(developing),
                            marker=dict(color='#157EC4'),
                            name='Developing Countries',
                            showlegend=False,
                            xaxis='x1',
                            yaxis='y1')
    return developed_bar, developing_bar



# Q39 answers in the order they are drawn on the 2 x 3 grid: one inner list
# per grid row, read left to right.
industries_list = [
    ['Slightly better', 'Much better', 'Neither better nor worse'],
    ['Slightly worse', 'No opinion; I do not know', 'Much worse'],
]

# Subplot titles are the same answers, flattened row by row.
fig = tools.make_subplots(
    rows=2, cols=3, print_grid=False,
    subplot_titles=tuple(answer
                         for grid_row in industries_list
                         for answer in grid_row))

# Grid positions are 1-based.  getPyramid returns (developed, developing);
# the developing trace is appended first.
for row_number, grid_row in enumerate(industries_list, start=1):
    for col_number, industry in enumerate(grid_row, start=1):
        hig, low = getPyramid(industry)
        fig.append_trace(low, row_number, col_number)
        fig.append_trace(hig, row_number, col_number)

labels = ["Developing Nations", "Developed Nations"]


# Placeholder title; replaced by the update() call further down.
fig['layout']['title'] = 'Title'

# Subplot titles are stored as layout annotations: render them in a small font.
for subplot_title in fig['layout']['annotations']:
    subplot_title['font'] = dict(size=8, color='#000000')

# Tick-label styling shared by every axis.
axis = dict(tickfont=dict(family='sans-serif', size=8, color='#000'))

# Assign that styling to xaxis1..xaxis19 and yaxis1..yaxis19.
axis_settings = {}
for axis_number in range(1, 20):
    for prefix in ('xaxis', 'yaxis'):
        axis_settings[prefix + str(axis_number)] = axis

legend_style = dict(
    traceorder='normal',
    font=dict(family='sans-serif', size=8, color='#000'),
    bgcolor='#E2E2E2',
    bordercolor='#FFFFFF',
    borderwidth=2,
)

fig['layout'].update(
    height=1000,
    width=800,
    title='Kagglers by Qualification and Perception of online learning platforms',
    legend=legend_style,
    **axis_settings
)
py.offline.iplot(fig, filename='simple-subplot-with-annotations')
#print(data_multiple_choice['Q39_Part_1'].value_counts(normalize=True))

Kagglers in the developing world hold online platforms in high regard, as seen above. The majority of bachelor's degree holders in the developing world think they are much better than brick-and-mortar institutions. On the other hand, those with master's degrees think they are only slightly better, in both developing and developed countries.

## Status of ML Methods in the Industry


In [None]:
#print(data_multiple_choice['Q10'].value_counts(normalize=True))

# Q7 question text; rows whose Q7 holds it are not survey answers.
exclude ='In what industry is your current employer/contract (or your most recent employer if retired)? - Selected Choice'


def getPyramid(industry):
    """Build the per-industry percentage bar traces for one Q10 answer.

    Args:
        industry: the Q10 answer to select (the parameter name is kept for
            compatibility with the calling loop).

    Returns:
        A ``(developed, developing)`` tuple of ``go.Bar`` traces.  For each Q7
        industry that has at least one matching respondent, the bar height is
        the percentage of that industry's non-student respondents (within the
        region) whose Q10 equals ``industry``.
    """
    def industry_shares(frame):
        # Students, the question-text row and missing answers are left out.
        employed = frame.loc[(frame['Q7'] != "I am a student")
                             & (frame['Q7'] != exclude)
                             & (frame['Q7'] != "nan")]
        totals = employed.groupby('Q7').size()
        matching = employed.loc[employed['Q10'] == industry].groupby('Q7').size()
        # Divide with both Series indexed by Q7 so each industry is compared
        # against its own total.  Dividing after reset_index() pairs rows by
        # position, which uses the wrong denominator as soon as one industry
        # has no matching respondents.
        shares = 100 * (matching / totals.reindex(matching.index))
        return shares.reset_index(name='count')

    developed_shares = industry_shares(data_multiple_choice_developed)
    developing_shares = industry_shares(data_multiple_choice_developing)

    developed_bar = go.Bar(x=developed_shares['Q7'],
                           y=developed_shares['count'],
                           marker=dict(color='#8B4B62'),
                           name='Developed Countries',
                           showlegend=False,
                           xaxis='x1',
                           yaxis='y1')
    developing_bar = go.Bar(x=developing_shares['Q7'],
                            y=developing_shares['count'],
                            marker=dict(color='#FCBC80'),
                            name='Developing Countries',
                            showlegend=False,
                            xaxis='x1',
                            yaxis='y1')
    return developed_bar, developing_bar



# Q10 answers in the order they are drawn on the 3 x 2 grid: one inner list
# per grid row, read left to right.
industries_list = [
    ["We are exploring ML methods (and may one day put a model into production)",
     "No (we do not use ML methods)"],
    ["We recently started using ML methods (i.e., models in production for less than 2 years)",
     "We have well established ML methods (i.e., models in production for more than 2 years)"],
    ["I do not know"],
]

# Derive the subplot titles from the placement list so that each chart is
# titled with the answer it actually shows.  The hand-written title tuple
# listed "I do not know" fourth and "well established" fifth, the opposite of
# where the traces were placed, so those two charts were mislabelled.
fig = tools.make_subplots(
    rows=3, cols=2, print_grid=False,
    subplot_titles=tuple(answer
                         for grid_row in industries_list
                         for answer in grid_row))

# Grid positions are 1-based: i is the subplot row, k the subplot column.
# getPyramid returns (developed, developing); the developing trace is
# appended first.
for i, industries in enumerate(industries_list, start=1):
    for k, industry in enumerate(industries, start=1):
        hig, low = getPyramid(industry)
        fig.append_trace(low, i, k)
        fig.append_trace(hig, i, k)
labels = ["Developing Nations", "Developed Nations"]


def _toggle_button(position, label):
    """Return the dropdown entry that shows only the trace at *position*."""
    mask = [position == other for other in range(len(labels))]
    return dict(label=label,
                method='update',
                args=[{'visible': mask}, {'title': label}])


# One button per nation group.  The menu is built here but is not attached to
# the figure in this cell.
buttons = [_toggle_button(position, label)
           for position, label in enumerate(labels)]

updatemenus = [dict(active=1, x=-0.15, buttons=buttons)]

# Placeholder title; replaced by the update() call further down.
fig['layout']['title'] = 'Title'

# Subplot titles are stored as layout annotations: render them in a small font.
for subplot_title in fig['layout']['annotations']:
    subplot_title['font'] = dict(size=8, color='#000000')

# Tick-label styling shared by every axis.
axis = dict(tickfont=dict(family='sans-serif', size=8, color='#000'))

# Assign that styling to xaxis1..xaxis19 and yaxis1..yaxis19.
axis_settings = {}
for axis_number in range(1, 20):
    axis_settings['xaxis%d' % axis_number] = axis
    axis_settings['yaxis%d' % axis_number] = axis

legend_style = dict(
    traceorder='normal',
    font=dict(family='sans-serif', size=8, color='#000'),
    bgcolor='#E2E2E2',
    bordercolor='#FFFFFF',
    borderwidth=2,
)

fig['layout'].update(
    height=1200,
    width=800,
    title='Kagglers by use of ML and Industry (Percentage)',
    legend=legend_style,
    **axis_settings
)
py.offline.iplot(fig, filename='simple-subplot-with-annotations')


Close to 50 percent of Kagglers in the hospitality and entertainment industry in the developing world do not use machine learning, versus 25 percent in the developed world. Moreover, about 40 percent of non-profits in both the developed and developing world do not use machine learning. The academic industry in both the developing and developed world has the highest proportion of well-established machine learning methods.

## What Industry is Everyone In
The next section looks at the industries Kagglers work in, according to whether they live in a developed or a developing country.

In [None]:
# Q7 question text; rows whose Q7 holds it are not survey answers.
exclude ='In what industry is your current employer/contract (or your most recent employer if retired)? - Selected Choice'


def _industry_percentages(frame):
    """Return each Q7 industry's share, in percent, of the answered rows."""
    answered = frame.loc[(frame['Q7'] != exclude) & (frame['Q7'] != "nan")]
    tally = answered.groupby(['Q7']).size().reset_index(name='count')
    tally['count'] = 100 * (tally['count'] / tally['count'].sum())
    return tally


data_multiple_choice_developed_ind = _industry_percentages(data_multiple_choice_developed)
data_multiple_choice_developing_ind = _industry_percentages(data_multiple_choice_developing)

# Side-by-side bars: developed first, developing second.
layout = dict(barmode='group', title='Kagglers by Industry (Percentage)')
developed_bar = go.Bar(
    x=data_multiple_choice_developed_ind['Q7'],
    y=data_multiple_choice_developed_ind['count'],
    marker=dict(color='#3B8B88'),
    name='Developed Countries',
    showlegend=False,
    xaxis='x1',
    yaxis='y1',
)
developing_bar = go.Bar(
    x=data_multiple_choice_developing_ind['Q7'],
    y=data_multiple_choice_developing_ind['count'],
    marker=dict(color='#F7473B'),
    name='Developing Countries',
    showlegend=False,
    xaxis='x1',
    yaxis='y1',
)
data = [developed_bar, developing_bar]
fig = dict(data=data, layout=layout)
py.offline.iplot(fig, validate=False, filename='_indsutr')

Students are the largest group of Kagglers in the developing world, representing about 29 percent, while students in the developed world represent only 16 percent. In the developed world there are more Kagglers in the computer and tech industry than there are students, while in the developing world the two groups are nearly the same size. The academic industry comes next in both the developed and developing world.

## Primary Tool Used To Analyze Data
Does where a person lives affect the primary tool they use to analyze data? Let's find out by visualizing question 12 according to whether a Kaggler is from a developed or a developing nation.

In [None]:
# Q12 question text; rows whose Q12_MULTIPLE_CHOICE holds it are not answers.
exclude ='What is the primary tool that you use at work or school to analyze data? (include text response) - Selected Choice'


def _tool_percentages(frame):
    """Return each Q12 primary tool's share, in percent, of the answered rows."""
    answered = frame.loc[(frame['Q12_MULTIPLE_CHOICE'] != exclude)
                         & (frame['Q12_MULTIPLE_CHOICE'] != "nan")]
    tally = answered.groupby(['Q12_MULTIPLE_CHOICE']).size().reset_index(name='count')
    tally['count'] = 100 * (tally['count'] / tally['count'].sum())
    return tally


data_multiple_choice_developed_tool = _tool_percentages(data_multiple_choice_developed)
data_multiple_choice_developing_tool = _tool_percentages(data_multiple_choice_developing)

# Side-by-side bars: developed first, developing second.
layout = dict(barmode='group', title='Kagglers by Primary tool (Percentage)')
developed_bar = go.Bar(
    x=data_multiple_choice_developed_tool['Q12_MULTIPLE_CHOICE'],
    y=data_multiple_choice_developed_tool['count'],
    marker=dict(color='#DBE1E7'),
    name='Developed Countries',
    showlegend=False,
    xaxis='x1',
    yaxis='y1',
)
developing_bar = go.Bar(
    x=data_multiple_choice_developing_tool['Q12_MULTIPLE_CHOICE'],
    y=data_multiple_choice_developing_tool['count'],
    marker=dict(color='#3F5C7D'),
    name='Developing Countries',
    showlegend=False,
    xaxis='x1',
    yaxis='y1',
)
data = [developed_bar, developing_bar]
fig = dict(data=data, layout=layout)
py.offline.iplot(fig, validate=False, filename='_indsutr')

There doesn't seem to be much of a difference between the kagglers in the developed world and those in the developing world on the primary tool used to analyze data. However, there seems to be more of a preference for basic statistical software by kagglers in the developing world. 

### Visualizing the Relationship between Annual Income and Primary Tool Used To Analyze Data
We continue from the previous section but this time we add the annual income

In [None]:
# Q12 question text.  This cell filters the Q12_MULTIPLE_CHOICE column, so the
# sentinel must be the Q12 question; the Q39 text used here before could
# never match a value in that column.
exclude ='What is the primary tool that you use at work or school to analyze data? (include text response) - Selected Choice'


def getPyramid(industry):
    """Build the primary-tool percentage bar traces for one Q9 income bracket.

    Args:
        industry: the Q9 yearly-compensation bracket to select (the parameter
            name is kept for compatibility with the calling loop).

    Returns:
        A ``(developed, developing)`` tuple of ``go.Bar`` traces.  Each bar is
        the percentage of the region's respondents in that bracket who chose
        the given Q12 primary tool; long tool names are shortened for the axis.
    """
    # Axis-friendly replacements for the long Q12 answers.
    short_names = {
        'Local or hosted development environments (RStudio, JupyterLab, etc.)': 'Local hosted',
        'Basic statistical software (Microsoft Excel, Google Sheets, etc.)': 'Basic stats software',
        'Advanced statistical software (SPSS, SAS, etc.)': 'Advanced stats software',
        'Cloud-based data software & APIs (AWS, GCP, Azure, etc.)': 'Cloud-based',
        'Business intelligence software (Salesforce, Tableau, Spotfire, etc.)': 'Intelligence software',
    }

    def tool_shares(frame):
        in_bracket = frame.loc[(frame['Q12_MULTIPLE_CHOICE'] != exclude)
                               & (frame['Q12_MULTIPLE_CHOICE'] != "nan")
                               & (frame['Q9'] == industry)]
        tally = in_bracket.groupby(['Q12_MULTIPLE_CHOICE']).size().reset_index(name='count')
        tally['count'] = 100 * (tally['count'] / tally['count'].sum())
        tally['Q12_MULTIPLE_CHOICE'] = tally['Q12_MULTIPLE_CHOICE'].replace(short_names)
        return tally

    developed_shares = tool_shares(data_multiple_choice_developed)
    developing_shares = tool_shares(data_multiple_choice_developing)

    developed_bar = go.Bar(x=developed_shares['Q12_MULTIPLE_CHOICE'],
                           y=developed_shares['count'],
                           marker=dict(color='#DBE1E7'),
                           name='Developed Countries',
                           showlegend=False,
                           xaxis='x1',
                           yaxis='y1')
    developing_bar = go.Bar(x=developing_shares['Q12_MULTIPLE_CHOICE'],
                            y=developing_shares['count'],
                            marker=dict(color='#3F5C7D'),
                            name='Developing Countries',
                            showlegend=False,
                            xaxis='x1',
                            yaxis='y1')
    return developed_bar, developing_bar



# Q9 yearly-compensation brackets in the order they are drawn on the 5 x 4
# grid: one inner list per grid row, read left to right.
industries_list = [
    ['0-10,000', '10-20,000', '20-30,000', '30-40,000'],
    ['40-50,000', '50-60,000', '60-70,000', '70-80,000'],
    ['80-90,000', '90-100,000', '100-125,000', '125-150,000'],
    ['150-200,000', '200-250,000', '250-300,000', '300-400,000'],
    ['400-500,000', '500,000+'],
]

# Subplot titles are the same brackets, flattened row by row.
fig = tools.make_subplots(
    rows=5, cols=4, print_grid=False,
    subplot_titles=tuple(bracket
                         for grid_row in industries_list
                         for bracket in grid_row))

# Grid positions are 1-based.  getPyramid returns (developed, developing);
# the developing trace is appended first.
for row_number, grid_row in enumerate(industries_list, start=1):
    for col_number, industry in enumerate(grid_row, start=1):
        hig, low = getPyramid(industry)
        fig.append_trace(low, row_number, col_number)
        fig.append_trace(hig, row_number, col_number)

labels = ["Developing Nations", "Developed Nations"]


# Placeholder title; replaced by the update() call further down.
fig['layout']['title'] = 'Title'

# Subplot titles are stored as layout annotations: render them in a small font.
for subplot_title in fig['layout']['annotations']:
    subplot_title['font'] = dict(size=8, color='#000000')

# Tick-label styling shared by every axis.
axis = dict(tickfont=dict(family='sans-serif', size=8, color='#000'))

# Assign that styling to xaxis1..xaxis19 and yaxis1..yaxis19.
axis_settings = {}
for axis_number in range(1, 20):
    axis_settings['xaxis{}'.format(axis_number)] = axis
    axis_settings['yaxis{}'.format(axis_number)] = axis

legend_style = dict(
    traceorder='normal',
    font=dict(family='sans-serif', size=8, color='#000'),
    bgcolor='#E2E2E2',
    bordercolor='#FFFFFF',
    borderwidth=2,
)

fig['layout'].update(
    height=1000,
    width=800,
    title=' Relationship between Annual Income and Primary Tool Used To Analyze Data',
    legend=legend_style,
    **axis_settings
)
py.offline.iplot(fig, filename='simple-subplot-with-annotations')

Again, regardless of income group, Kagglers in both the developed and developing world seem to prefer local or hosted development environments for their data analysis. However, the preference for cloud-based data analysis goes up with yearly income in both the developing and developed world.

## Data Source
Let's visualize the results from question 33, which deals with where Kagglers get their public data. As in the rest of this notebook, we will look at things from the perspective of those from developing and developed countries.


In [None]:
# Readable names for the ten Q33 parts, listed in the same order as the raw
# column names in old_columns.
new_columns = [
    'Government websites',
    'University research group websites',
    'Non-profit research group websites',
    'Dataset aggregator/platform ',
    'I collect my own data',
    ' Publicly released data from private companies',
    'Google Search',
    'Google Dataset Search',
    'GitHub',
    'None/I do not work with public data',
]
old_columns = ['Q33_Part_{}'.format(part) for part in range(1, 11)]

# Take the Q33 columns of each region and relabel them with the readable names.
_q33_labels = dict(zip(old_columns, new_columns))
data_multiple_choice_developed_data_source = (
    data_multiple_choice_developed[old_columns].rename(columns=_q33_labels))
data_multiple_choice_developing_data_source = (
    data_multiple_choice_developing[old_columns].rename(columns=_q33_labels))

def mergeDataSource(df):
    """Count how often each data source was selected across the Q33 columns.

    The first ten columns named in the module-level ``new_columns`` list are
    stacked into a single ``data_source`` column, the ``'nan'`` placeholder is
    dropped, and the remaining values are tallied.

    Args:
        df: frame holding one column per entry of ``new_columns``.

    Returns:
        A DataFrame with a ``data_source`` column and a ``count`` column.
    """
    stacked = pd.concat([df[new_columns[position]] for position in range(10)])
    answers = pd.DataFrame({'data_source': stacked})
    answers = answers.loc[answers['data_source'] != 'nan']
    return answers.groupby('data_source').size().reset_index(name='count')
# Collapse each region's Q33 columns into one (data_source, count) table.
data_multiple_choice_developed_data_source = mergeDataSource(
    data_multiple_choice_developed_data_source)
data_multiple_choice_developing_data_source = mergeDataSource(
    data_multiple_choice_developing_data_source)

# Side-by-side bars of raw selection counts: developed first, developing second.
layout = dict(barmode='group', title='Kagglers Data Source')
data = [
    go.Bar(x=source_counts['data_source'],
           y=source_counts['count'],
           marker=dict(color=bar_colour),
           name=trace_name,
           showlegend=False,
           xaxis='x1',
           yaxis='y1')
    for source_counts, bar_colour, trace_name in (
        (data_multiple_choice_developed_data_source, '#C06C84', 'Developed Countries'),
        (data_multiple_choice_developing_data_source, '#6C5B7B', 'Developing Countries'),
    )
]
fig = dict(data=data, layout=layout)
py.offline.iplot(fig, validate=False, filename='_indsutr')


Dataset aggregators are the most popular source of public data for Kagglers in both developing and developed nations. However, fewer Kagglers in the developing world collect their own data than in the developed world. Moreover, Kagglers in the developing world have a near-identical preference for collecting data from GitHub and Google Search, whereas in the developed world Kagglers prefer Google Search to GitHub.



# Income
Now let's look at how much kagglers earn if they are from a developed country or developing country

In [None]:
# Missing answers become the literal string 'nan' so they can be filtered
# with ordinary string comparisons.
data_multiple_choice_developing = data_multiple_choice_developing.fillna('nan')
data_multiple_choice_developed = data_multiple_choice_developed.fillna('nan')

# Q9 yearly-compensation brackets, lowest to highest.
income_levels = ['0-10,000', '10-20,000', '20-30,000', '30-40,000', '40-50,000',
                 '50-60,000', '60-70,000', '70-80,000', '80-90,000', '90-100,000',
                 '100-125,000', '125-150,000', '150-200,000', '200-250,000',
                 '250-300,000', '300-400,000', '400-500,000', '500,000+']


def _disclosed_salaries(frame):
    """Return the non-student rows of *frame* that disclose a Q9 bracket."""
    return frame.loc[(frame['Q9'] != "I do not wish to disclose my approximate yearly compensation")
                     & (frame['Q9'] != "nan")
                     & (frame['Q6'] != "Student")]


data_multiple_choice_developed_salary = _disclosed_salaries(data_multiple_choice_developed)
data_multiple_choice_developing_salary = _disclosed_salaries(data_multiple_choice_developing)

# Percentage of respondents per bracket: the 18 developed-country values
# first, followed by the 18 developing-country values.
SanKeyChartValues = []
for salary_frame in (data_multiple_choice_developed_salary,
                     data_multiple_choice_developing_salary):
    values = dict(salary_frame['Q9'].value_counts(normalize=True))
    # A bracket nobody in the group selected is absent from value_counts();
    # report it as 0 percent instead of raising KeyError.
    SanKeyChartValues.extend(values.get(income_level, 0) * 100
                             for income_level in income_levels)

# Sankey trace for the overall income distribution.
# Nodes 0 and 1 are the two country groups; nodes 2-19 are the 18 salary
# brackets. Every group is linked to every bracket, and the link widths come
# from SanKeyChartValues (developed values first, then developing).
trace1 = dict(
    domain=dict(x=[0, 1], y=[0, 1]),
    link=dict(
        # One colour per salary bracket, repeated for the second group.
        color=[
            "rgba(244,67,54,0.6)",
            "rgba(233,30,99,0.6)",
            "rgba(156,39,176,0.6)",
            "rgba(103,58,183,0.6)",
            "rgba(63,81,181,0.6)",
            "rgba(33,150,243,0.6)",
            "rgba(0,188,212,0.6)",
            "rgba(0,150,136,0.6)",
            "rgba(76,175,80,0.6 )",
            "rgba(139,195,74,0.6)",
            "rgba(205,220,57,0.6)",
            "rgba(255,235,59,0.6)",
            "rgba(255,152,0 ,0.6)",
            "rgba(100,221,23,0.6)",
            "rgba(174,234,0 ,0.6)",
            "rgba(255,87,34,0.6)",
            "rgba(121,85,72,0.6)",
            "rgba(96,125,139,0.6)",
        ] * 2,
        # 18 links leaving node 0, followed by 18 links leaving node 1.
        source=[0] * 18 + [1] * 18,
        # Each group points at bracket nodes 2..19 in order.
        target=list(range(2, 20)) * 2,
        value=SanKeyChartValues,
    ),
    node=dict(
        color=[
            "#F44336",
            "#E91E63",
            "#9C27B0",
            "#673AB7",
            "#3F51B5",
            "#2196F3",
            "#00BCD4",
            "#009688",
            "#4CAF50",
            "#8BC34A",
            "#CDDC39",
            "#FFEB3B",
            "#FF9800",
            "#64DD17",
            "#AEEA00",
            "#FF5722",
            "#795548",
            "#607D8B",
            "#80D8FF",
        ],
        label=[
            "Developed Countries", "Developing Countries",
            '0-10,000', '10-20,000', '20-30,000', '30-40,000', '40-50,000', '50-60,000',
            '60-70,000', '70-80,000', '80-90,000', '90-100,000', '100-125,000', '125-150,000',
            '150-200,000', '200-250,000', '250-300,000', '300-400,000', '400-500,000', '500,000+',
        ],
        line=dict(color="black", width=0),
        pad=6,
        thickness=18,
    ),
    orientation="h",
    type="sankey",
    valueformat=".0f",
)
data = Data([trace1])
layout = dict(
    font=dict(size=10),
    height=744,
    title="Income Distribution",
)
fig = Figure(data=data, layout=layout)

# Render the figure inline in the notebook.
py.offline.iplot(fig, validate=False, filename='Sankey Salary')

From the above Sankey diagram we can see that 4 out of 10 Kagglers in developing countries earn less than \$10,000, while 1 out of 10 Kagglers in developed countries earn between \$100,000 and \$125,000. In fact, more than 63 percent of Kagglers in developing countries earn less than \$20,000. We can now dig a bit deeper and see what income looks like based on experience and the education level attained.

In [None]:
from plotly.widgets import GraphWidget
from IPython.display import display
from ipywidgets import widgets
# we will define a function that will handle the input from the dropdown widget

# Distinct, sorted Q4 (education level) answers, excluding the 'nan'
# placeholder, "I prefer not to answer", and the question text itself
# (which appears as a value in the Q4 column).
edu = (
    data_multiple_choice['Q4']
    .loc[lambda answers: ~answers.isin([
        "nan",
        "I prefer not to answer",
        "What is the highest level of formal education that you have attained or plan to attain within the next 2 years?",
    ])]
    .drop_duplicates()
    .sort_values()
)
def update_plot(education):
    """Build the Sankey ``link`` definition for a single education level.

    Reads the module-level frames ``data_multiple_choice_developed`` and
    ``data_multiple_choice_developing``. For each of them, the rows are
    restricted to respondents whose Q4 answer equals ``education``, who gave a
    compensation answer in Q9, and whose Q6 answer is not "Student".

    Args:
        education: Q4 answer to filter both frames on.

    Returns:
        A dict with the keys ``color``, ``source``, ``target`` and ``value``.
        ``value`` holds 36 numbers: the percentage of respondents per salary
        bracket for the developed frame (18 values) followed by the same for
        the developing frame (18 values). Brackets without respondents are 0.
    """
    undisclosed = "I do not wish to disclose my approximate yearly compensation"
    income_levels = [
        '0-10,000', '10-20,000', '20-30,000', '30-40,000', '40-50,000', '50-60,000',
        '60-70,000', '70-80,000', '80-90,000', '90-100,000', '100-125,000', '125-150,000',
        '150-200,000', '200-250,000', '250-300,000', '300-400,000', '400-500,000', '500,000+',
    ]

    def bracket_percentages(responses):
        # Percentage of eligible respondents in each salary bracket, in
        # `income_levels` order.
        eligible = responses.loc[
            (responses['Q9'] != undisclosed)
            & (responses['Q9'] != "nan")
            & (responses['Q4'] == education)
            & (responses['Q6'] != "Student")
        ]
        shares = dict(eligible['Q9'].value_counts(normalize=True))
        return [shares.get(level, 0) * 100 for level in income_levels]

    # Developed values first, then developing, matching the source order below.
    SanKeyChartValues = (
        bracket_percentages(data_multiple_choice_developed)
        + bracket_percentages(data_multiple_choice_developing)
    )

    # One colour per salary bracket; reused for both groups.
    bracket_colors = [
        "rgba(244,67,54,0.6)",
        "rgba(233,30,99,0.6)",
        "rgba(156,39,176,0.6)",
        "rgba(103,58,183,0.6)",
        "rgba(63,81,181,0.6)",
        "rgba(33,150,243,0.6)",
        "rgba(0,188,212,0.6)",
        "rgba(0,150,136,0.6)",
        "rgba(76,175,80,0.6 )",
        "rgba(139,195,74,0.6)",
        "rgba(205,220,57,0.6)",
        "rgba(255,235,59,0.6)",
        "rgba(255,152,0 ,0.6)",
        "rgba(100,221,23,0.6)",
        "rgba(174,234,0 ,0.6)",
        "rgba(255,87,34,0.6)",
        "rgba(121,85,72,0.6)",
        "rgba(96,125,139,0.6)",
    ]

    return {
        "color": bracket_colors * 2,
        # 18 links from node 0 followed by 18 links from node 1 ...
        "source": [0] * 18 + [1] * 18,
        # ... each set pointing at bracket nodes 2..19 in order.
        "target": list(range(2, 20)) * 2,
        "value": SanKeyChartValues,
    }
# One dropdown button per education level. Selecting a button restyles the
# trace's 'link' with the values computed for that level.
btns = []
for education in edu:
    # Non-string entries (e.g. missing values) cannot be used as a label.
    if not isinstance(education, str):
        continue
    btns.append({
        'args': ['link', update_plot(education)],
        'label': education,
        'method': 'restyle',
    })

# Dropdown menu holding the buttons, placed above the plot.
updatemenus = [
    {
        'buttons': btns,
        'active': 0,
        'direction': 'down',
        'pad': {'r': 10, 't': 10},
        'showactive': True,
        'x': 0.2,
        'xanchor': 'left',
        'y': 1.1,
        'yanchor': 'top',
    },
]

# Caption shown next to the dropdown.
annotations = [
    {
        'text': 'Education Level:',
        'x': 0,
        'y': 1.085,
        'yref': 'paper',
        'align': 'left',
        'showarrow': False,
    },
]

layout = {
    "font": {"size": 10},
    "height": 744,
    "autosize": False,
    "title": "Income Distribution By Education Level",
    "updatemenus": updatemenus,
    "annotations": annotations,
}

# Sankey trace for income by education level. The initial links are those of
# the first entry in `edu`; the dropdown buttons swap in the other levels.
# Nodes 0 and 1 are the two country groups, nodes 2-19 the salary brackets.
trace1 = dict(
    domain=dict(x=[0, 1], y=[0, 1]),
    link=update_plot(edu.iloc[0]),
    node=dict(
        color=[
            "#F44336",
            "#E91E63",
            "#9C27B0",
            "#673AB7",
            "#3F51B5",
            "#2196F3",
            "#00BCD4",
            "#009688",
            "#4CAF50",
            "#8BC34A",
            "#CDDC39",
            "#FFEB3B",
            "#FF9800",
            "#64DD17",
            "#AEEA00",
            "#FF5722",
            "#795548",
            "#607D8B",
            "#80D8FF",
        ],
        label=[
            "Developed Countries", "Developing Countries",
            '0-10,000', '10-20,000', '20-30,000', '30-40,000', '40-50,000', '50-60,000',
            '60-70,000', '70-80,000', '80-90,000', '90-100,000', '100-125,000', '125-150,000',
            '150-200,000', '200-250,000', '250-300,000', '300-400,000', '400-500,000', '500,000+',
        ],
        line=dict(color="black", width=0),
        pad=6,
        thickness=18,
    ),
    orientation="h",
    type="sankey",
    valueformat=".0f",
)
data = Data([trace1])

# Combine the trace with the layout (dropdown and caption) and render inline.
fig = {'data': data, 'layout': layout}
py.offline.iplot(fig, validate=False, filename='Sankey Salary')

#edu.head(12)

We can see the same story repeated when education levels are taken into account: Kagglers in the developed world out-earn those in the developing world by a large margin.

In conclusion, we can see that Kagglers in the developing world differ from Kagglers in the developed world. The differences range from education to income; however, there are so many similarities part