# Exploratory Analysis and Visualisation of Nigerian Kagglers using Kaggle's 2021 Data Science and Machine Learning survey

# Import relevant libraries 

In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


In [2]:
df = pd.read_csv("kaggle_survey_2021_responses.csv", low_memory=False)
df = df.drop([0], axis=0) #this removes the row with the questions

# enable showing all columns and rows
pd.set_option('display.max_columns', None) #show all columns
#pd.set_option('display.max_rows', None)
#pd.set_option('display.width', None)
#pd.set_option('display.max_colwidth', None)

colors_ng = "green" #chart color for Nigerian Kagglers.
colors_rotw = "gold" #chart color for the rest of the world.

# The total number of responses globally was 25973

In [3]:
df.shape[0]

25973

In [4]:
df[df.columns[3]].replace({'United Kingdom of Great Britain and Northern Ireland':'UK',
                           'Iran, Islamic Republic of...':'Iran',
                           'United Arab Emirates':'UAE',
                           'United States of America':'USA',
                           'Viet Nam':'Vietnam'}, inplace=True)

In [5]:
print(f" List of countries that participated in the Kaggle's 2021 Data Science and Machine Learning survey:\n{np.sort(df[df.columns[3]].unique())}\n")

 List of countries that participated in the Kaggle's 2021 Data Science and Machine Learning survey:
['Algeria' 'Argentina' 'Australia' 'Austria' 'Bangladesh' 'Belarus'
 'Belgium' 'Brazil' 'Canada' 'Chile' 'China' 'Colombia' 'Czech Republic'
 'Denmark' 'Ecuador' 'Egypt' 'Ethiopia' 'France' 'Germany' 'Ghana'
 'Greece' 'Hong Kong (S.A.R.)' 'I do not wish to disclose my location'
 'India' 'Indonesia' 'Iran' 'Iraq' 'Ireland' 'Israel' 'Italy' 'Japan'
 'Kazakhstan' 'Kenya' 'Malaysia' 'Mexico' 'Morocco' 'Nepal' 'Netherlands'
 'Nigeria' 'Norway' 'Other' 'Pakistan' 'Peru' 'Philippines' 'Poland'
 'Portugal' 'Romania' 'Russia' 'Saudi Arabia' 'Singapore' 'South Africa'
 'South Korea' 'Spain' 'Sri Lanka' 'Sweden' 'Switzerland' 'Taiwan'
 'Thailand' 'Tunisia' 'Turkey' 'UAE' 'UK' 'USA' 'Uganda' 'Ukraine'
 'Vietnam']



In [6]:
print(f" Total number of countries that participated in the Kaggle's 2021 Data Science and Machine Learning survey:\n{(len(np.sort(df[df.columns[3]].unique())) - 2)}\n")

#(-2 for "Other" & "I do not wish to disclose my location")

 Total number of countries that participated in the Kaggle's 2021 Data Science and Machine Learning survey:
64



# Top 10 countries with the most Kagglers

# In comparison to other countries,  Nigeria Kagglers make up about 2.7% and Nigeria ranks 7th

In [108]:
top10_countries = df['Q3'].value_counts(normalize=True, ascending=False) #Count the number of countries and express it as a fraction
top10_countries = top10_countries.drop(["Other"], axis=0) #drop "Other"
top10_countries = top10_countries.mul(100) #multiply the values by 100 (i.e. percentage)
top10_countries = top10_countries.rename("percentage (%)").reset_index()
top10_countries = top10_countries.rename(columns={"index":"country"})
top10_countries = top10_countries.head(10).round(decimals=1)



top10_countries["color"] = "gold" #color the others gold
top10_countries.loc[top10_countries["country"] == "Nigeria", ["color"]] = "green" #color Nigeria green

fig = px.bar(top10_countries, x="country", y="percentage (%)", text='percentage (%)', 
             title="Top 10 countries with the most Kagglers")
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside', marker_color=top10_countries.color) 
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor = "white")

fig.show()


In [112]:
top10_countries

Unnamed: 0,country,percentage (%),color
0,India,28.6,gold
1,USA,10.2,gold
2,Japan,3.5,gold
3,China,3.1,gold
4,Brazil,2.9,gold
5,Russia,2.9,gold
6,Nigeria,2.7,green
7,UK,2.1,gold
8,Pakistan,2.0,gold
9,Egypt,1.9,gold


In [114]:
top10_countries = df['Q3'].value_counts(normalize=True,
                                        ascending=False).drop(["Other"], 
                                        axis=0).mul(100).rename("percentage (%)").reset_index(
).rename(columns={"index":"country"}).head(10).round(decimals=1)




#top10_countries = top10_countries.drop(["Other"], axis=0) #drop "Other"
#top10_countries = top10_countries.mul(100) #multiply the values by 100 (i.e. percentage)
#top10_countries = top10_countries.rename("percentage (%)").reset_index()
#top10_countries = top10_countries.rename(columns={"index":"country"})
#top10_countries = top10_countries.head(10).round(decimals=1)

top10_countries

Unnamed: 0,country,percentage (%)
0,India,28.6
1,USA,10.2
2,Japan,3.5
3,China,3.1
4,Brazil,2.9
5,Russia,2.9
6,Nigeria,2.7
7,UK,2.1
8,Pakistan,2.0
9,Egypt,1.9


# Africa

In [111]:
africa = ['Algeria', 'Egypt', 'Ethiopia','Ghana', 'Kenya', 'Morocco', 'Nigeria', 'South Africa', 
          'Tunisia', 'Uganda'] #List of African countries

africa_countries = df[df['Q3'].isin(africa)]
africa_countries = africa_countries["Q3"].value_counts(normalize=True, ascending=False) 
#Count the number of countries within Africa and express it as a fraction
africa_countries = africa_countries.mul(100) #multiply the values by 100 (i.e. percentage)
africa_countries = africa_countries.rename("percentage (%)").reset_index()
africa_countries = africa_countries.rename(columns={"index":"country"})
africa_countries = africa_countries.round(decimals=1) 


africa_countries["color"] = "gold" #color the others gold
africa_countries.loc[africa_countries["country"] == "Nigeria", ["color"]] = "green" #color Nigeria green

fig = px.bar(africa_countries, x="country", y="percentage (%)", text='percentage (%)', 
             title="Ranking of African countries with the most Kagglers")
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside', marker_color=africa_countries.color) 
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor = "white")

fig.show()

In [109]:
africa_countries

Unnamed: 0,country,percentage (%)
0,Nigeria,34.1
1,Egypt,23.4
2,Kenya,12.0
3,South Africa,7.1
4,Morocco,6.8
5,Tunisia,5.3
6,Ghana,4.8
7,Uganda,2.3
8,Algeria,2.1
9,Ethiopia,2.1


In [None]:
africa_countries["color"] = "gold" #color the others gold
africa_countries.loc[africa_countries["country"] == "Nigeria", ["color"]] = "green" #color Nigeria green

In [9]:
Cumulative_cases_plot = px.choropleth(africa_countries,
                    locations="country", #Spatial coordinates and corrseponds to a column in dataframe
                    color="percentage (%)", #Corresponding data in the dataframe
                    locationmode = 'country names', #location mode == One of ‘ISO-3’, ‘USA-states’, or ‘country names’ 
                    #locationmode == should match the type of data entries in "locations"
                    scope="africa", #limits the scope of the map to Africa
                    title ="African Kagglers",
                    hover_name="country",
                    color_continuous_scale = "Greens",
                   )
Cumulative_cases_plot.update_traces(marker_line_color="black") # line markers between states
Cumulative_cases_plot.show()

- Out of the 64 countries that took part in the Kaggle Survey, 10 countries are from Africa.


- There are 54 countries within Africa, which means that less than 20 % completed the Kaggle survey.


- Within Africa, Nigeria Kagglers make up 34% and and ranked number one in Africa. Fun fact - Nigeria is also the most populated country in Africa. 


- The top three African countries on Kaggle are Nigeria, Egypt and Kenya.

In [10]:
nigeria = df[df["Q3"] =="Nigeria"]

# Who are Nigerian Kagglers

# The total number of respondents from Nigeria was 702

In [11]:
nigeria.shape

(702, 369)

# Gender distribution of Nigerian Kagglers compared to the rest of the world

In [12]:
gender_distribution_ng = nigeria[nigeria["Q2"].isin(["Man", "Woman"])]
gender_distribution_ng = gender_distribution_ng['Q2'].value_counts(normalize=True, ascending=False)
gender_distribution_ng = gender_distribution_ng.mul(100)
gender_distribution_ng = gender_distribution_ng.round(decimals=1)
gender_distribution_ng = pd.DataFrame(data=gender_distribution_ng)
gender_distribution_ng.reset_index(inplace=True)
gender_distribution_ng.columns = ["gender","percentage(%)"] 

rest_of_the_world = df[df["Q3"] !="Nigeria"]
gender_distribution_rotw = rest_of_the_world[rest_of_the_world["Q2"].isin(["Man", "Woman"])]
gender_distribution_rotw = gender_distribution_rotw['Q2'].value_counts(normalize=True, ascending=False)
gender_distribution_rotw = gender_distribution_rotw.mul(100)
gender_distribution_rotw = gender_distribution_rotw.round(decimals=1) 
gender_distribution_rotw = pd.DataFrame(data=gender_distribution_rotw)
gender_distribution_rotw.reset_index(inplace=True)
gender_distribution_rotw.columns = ["gender","percentage(%)"] 

fig = go.Figure()

fig.add_trace(go.Bar(x=gender_distribution_ng["gender"], y=gender_distribution_ng["percentage(%)"], 
                     name="Nigerian Kagglers", marker_color=colors_ng,
                     text=gender_distribution_ng["percentage(%)"],
                     textposition='outside', texttemplate='%{text:.2s}'))

fig.add_trace(go.Bar(x=gender_distribution_rotw["gender"], y=gender_distribution_rotw["percentage(%)"], 
                     name="Rest of the world", marker_color=colors_rotw,
                     text=gender_distribution_rotw["percentage(%)"],
                     textposition='outside', texttemplate='%{text:.2s}'))


fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor = "white")
fig.show()

In [13]:
# 15% of Nigerian Kagglers are women and 85% are men 

#Compared to the rest of the world, the percentage of female Kagglers in Nigeria (15%) 
#is lower than the rest of the world (19%)

# What do Nigerian Kagglers do?

In [14]:
job_ng = nigeria["Q5"].value_counts(normalize=True, ascending=False) 
job_ng = job_ng.mul(100)
job_ng = job_ng.round(decimals=1)
job_ng = pd.DataFrame(data=job_ng)
job_ng.reset_index(inplace=True)
job_ng.columns = ["role","percentage (%)"] 

fig = px.bar(job_ng, x="role", y="percentage (%)", text='percentage (%)', 
             title="Job roles of Nigerian Kagglers")
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside', marker_color=colors_ng, ) 
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor = "white")
fig.show()

In [76]:
job_ng

Unnamed: 0,role,percentage (%)
0,Student,19.8
1,Data Analyst,14.5
2,Data Scientist,13.5
3,Currently not employed,13.1
4,Other,8.5
5,Software Engineer,6.1
6,Machine Learning Engineer,6.1
7,Research Scientist,5.4
8,Business Analyst,4.3
9,Program/Project Manager,2.8


In [79]:
nigeria["Q5"].value_counts(normalize=True, ascending=False).mul(100).round(decimals=1).reset_index()

Unnamed: 0,index,Q5
0,Student,19.8
1,Data Analyst,14.5
2,Data Scientist,13.5
3,Currently not employed,13.1
4,Other,8.5
5,Software Engineer,6.1
6,Machine Learning Engineer,6.1
7,Research Scientist,5.4
8,Business Analyst,4.3
9,Program/Project Manager,2.8


In [15]:
#Most Nigerian Kagglers are Students, Data Analysts and Data Scientists

In [16]:
job_rotw = rest_of_the_world["Q5"].value_counts(normalize=True, ascending=False) 
job_rotw = job_rotw.mul(100)
job_rotw = job_rotw.round(decimals=1)
job_rotw = pd.DataFrame(data=job_rotw)
job_rotw.reset_index(inplace=True)
job_rotw.columns = ["role","percentage (%)"] 

fig = px.bar(job_rotw, x="role", y="percentage (%)", text='percentage (%)',
            title="Job roles for the rest of the world")
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside', marker_color=colors_rotw,) 
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor = "white")
fig.show()

In [17]:
#Similar to the  rest of the world, Students form the majority of Nigerian Kagglers.

#Also, Software Engineers rank higher (3rd) compared to Kagglers from Nigeria (6th).

# Age distribution of Nigerian Kagglers 

In [18]:
age_ng = nigeria["Q1"].value_counts(normalize=True) 
age_ng = age_ng.mul(100)
age_ng = age_ng.round(decimals=1)
age_ng = pd.DataFrame(data=age_ng)
age_ng.reset_index(inplace=True)
age_ng.columns = ["age","percentage (%)"] 
age_ng.sort_values(by="age", inplace=True)


fig = px.bar(age_ng, x="age", y="percentage (%)", text='percentage (%)', 
             title="Age of Nigerian Kagglers")
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside', marker_color=colors_ng, ) 
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor = "white")

fig.show()

In [19]:
#The largest proportion (30%) of Nigerian Kagglers are in the age range of 25 - 29 

#About 65% of Nigerian Kagglers are within the age of range 22 - 34

#91% of Nigerian Kagglers are younger than 40. 

In [20]:
age_rotw = rest_of_the_world["Q1"].value_counts(normalize=True) 
age_rotw = age_rotw.mul(100)
age_rotw = age_rotw.round(decimals=1)
age_rotw = pd.DataFrame(data=age_rotw)
age_rotw.reset_index(inplace=True)
age_rotw.columns = ["age","percentage (%)"] 
age_rotw.sort_values(by="age", inplace=True)


fig = px.bar(age_rotw, x="age", y="percentage (%)", text='percentage (%)', 
             title="Age range for the rest of the world") 
fig.update_traces(texttemplate='%{text:.3s}', textposition='outside', marker_color=colors_rotw, ) 
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor = "white")
fig.show()

In [21]:
#Look at the rest of the world in general, the higher the age, the lower the proportion/percentage. 

#The largest proportion (19%) for the rest of the world falls between 18 and 21.

#In general, Kagglers worldwide are relatively young. 

# Education Level of Nigerian Kagglers

In [22]:
education_ng = nigeria["Q4"].value_counts(normalize=True) 
education_ng = education_ng.mul(100)
education_ng = education_ng.round(decimals=1)
education_ng = pd.DataFrame(data=education_ng)
education_ng.reset_index(inplace=True)
education_ng.columns = ["education", "percentage (%)"]

education_order = ["No formal education past high school", 
                             "Some college/university study without earning a bachelor’s degree",
"Bachelor’s degree", "Master’s degree", "Doctoral degree", "Professional doctorate", "I prefer not to answer"]

education_ng["education"] = pd.Categorical(education_ng["education"], categories=education_order)

education_ng.sort_values(by="education", inplace=True)

labels_ng = education_ng['education']
values_ng = education_ng["percentage (%)"]


fig = px.pie(values=values_ng, names=labels_ng, color_discrete_sequence=px.colors.sequential.Greens_r, 
            title="Highest formal education of Nigerian Kagglers") 

fig.update_traces(textposition='inside', textinfo='percent+label')  

fig.update_layout(uniformtext_minsize=10, uniformtext_mode='hide')

fig.show()

In [23]:
education_rotw = rest_of_the_world["Q4"].value_counts(normalize=True) 
education_rotw  = education_rotw .mul(100)
education_rotw  = education_rotw .round(decimals=1)
education_rotw  = pd.DataFrame(data=education_rotw )
education_rotw .reset_index(inplace=True)
education_rotw.columns = ["education", "percentage (%)"]

education_order = ["No formal education past high school", 
                             "Some college/university study without earning a bachelor’s degree",
"Bachelor’s degree", "Master’s degree", "Doctoral degree", "Professional doctorate", "I prefer not to answer"]

education_rotw["education"] = pd.Categorical(education_rotw["education"], categories=education_order)

education_rotw.sort_values(by="education", inplace=True)


labels_rotw = education_rotw['education']
values_rotw = education_rotw["percentage (%)"]


fig = px.pie(values=values_rotw, names=labels_rotw, color_discrete_sequence=px.colors.sequential.YlOrBr_r, 
            title="Highest formal education for the rest of the world") 

fig.update_traces(textposition='inside', textinfo='percent+label')  

fig.update_layout(uniformtext_minsize=10, uniformtext_mode='hide')

fig.show()

In [24]:
#The most common education level for Nigerian Kagglers is a Bachelor's degree (52%). 

#About 93% of Nigerian Kagglers have at least a Bachelor's degree.

#This is slightly higher than the rest of the world of 89%

# Industry of Nigeria Kagglers

In [25]:
industry_ng = nigeria["Q20"].value_counts(normalize=True) 
industry_ng = industry_ng.mul(100)
industry_ng = industry_ng.round(decimals=1)
industry_ng = pd.DataFrame(data=industry_ng)
industry_ng.reset_index(inplace=True)
industry_ng.columns = ["industry","percentage (%)"] 

fig = px.bar(industry_ng, x="industry", y="percentage (%)", text='percentage (%)', 
             title="Industry of Nigerian Kagglers")
fig.update_traces(texttemplate='%{text:.2s}', textposition='auto', marker_color=colors_ng, ) 
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor = "white")
fig.show()

In [26]:
#Most Nigeria Kagglers work in the following sectors:
    #Academics/Education
    #Computers/Technology
    #Accounting/Finance
    
#These industries are also in line with the rest of the world, however, Computers/Technology ranks as number one in the rest of the world.

In [27]:
industry_rotw = rest_of_the_world["Q20"].value_counts(normalize=True) 
industry_rotw = industry_rotw.mul(100)
industry_rotw = industry_rotw.round(decimals=1)
industry_rotw = pd.DataFrame(data=industry_rotw)
industry_rotw.reset_index(inplace=True)
industry_rotw.columns = ["industry","percentage (%)"] 

fig = px.bar(industry_rotw, x="industry", y="percentage (%)", text='percentage (%)', 
             title="Industry of other Kagglers (rest of the world)")
fig.update_traces(texttemplate='%{text:.2s}', textposition='auto', marker_color=colors_rotw) 
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor = "white")
fig.show()

# How much do Nigerian Kagglers earn?

In [28]:
salary_ng = nigeria["Q25"].value_counts(normalize=True) 
salary_ng = salary_ng.mul(100)
salary_ng = salary_ng.round(decimals=1)
salary_ng = pd.DataFrame(data=salary_ng)
salary_ng.reset_index(inplace=True)
salary_ng.columns = ["salary","percentage (%)"] 

salary_order = ["$0-999", "1,000-1,999", "2,000-2,999", "3,000-3,999", "4,000-4,999", "5,000-7,499",
               "7,500-9,999", "10,000-14,999", "15,000-19,999", "20,000-24,999", "25,000-29,999",
                "30,000-39,999", "40,000-49,999", "50,000-59,999", "60,000-69,999", "70,000-79,999",
                "80,000-89,999", "90,000-99,999", "100,000-124,999", "150,000-199,999", "200,000-249,999",
                ">$1,000,000"]

salary_ng["salary"] = pd.Categorical(salary_ng["salary"], categories=salary_order)
salary_ng.sort_values(by="salary", inplace=True)

fig = px.bar(salary_ng, x="salary", y="percentage (%)", text='percentage (%)', 
             title="Annual salary of Nigerian Kagglers")
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside', marker_color=colors_ng, ) 
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor = "white")
fig.show()

 - Considering that annual salary above takes into consideration all the job roles (including students), let's investigate the salary distribution across the three main job titles (Data Analyst, Data Scientist and Software Engineer)

# Annual salary of Nigerian Kagglers that are Data Analysts

In [29]:
nigeria_data_analyst = nigeria[nigeria["Q5"]=="Data Analyst"] #fi;ter Nigerian Kagglers that are Data Analysts

nigeria_data_analyst = nigeria_data_analyst["Q25"].value_counts(normalize=True) 
nigeria_data_analyst = nigeria_data_analyst.mul(100)
nigeria_data_analyst = nigeria_data_analyst.round(decimals=1)
nigeria_data_analyst = pd.DataFrame(data=nigeria_data_analyst)
nigeria_data_analyst.reset_index(inplace=True)
nigeria_data_analyst.columns = ["salary","percentage (%)"] 

salary_order = ["$0-999", "1,000-1,999", "2,000-2,999", "3,000-3,999", "4,000-4,999", "5,000-7,499",
              "7,500-9,999", "10,000-14,999", "15,000-19,999", "20,000-24,999", "25,000-29,999",
               "30,000-39,999", "40,000-49,999", "50,000-59,999", "60,000-69,999", "70,000-79,999",
                "80,000-89,999", "90,000-99,999", "100,000-124,999", "150,000-199,999", "200,000-249,999",
               ">$1,000,000"]

nigeria_data_analyst["salary"] = pd.Categorical(nigeria_data_analyst["salary"], categories=salary_order)
nigeria_data_analyst.sort_values(by="salary", inplace=True)

fig = px.bar(nigeria_data_analyst, x="salary", y="percentage (%)", text='percentage (%)', 
             title="Annual salary of Nigerian Kagglers that are Data Analysts")
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside', marker_color=colors_ng, ) 
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor = "white")
fig.show()

# Annual salary of Nigerian Kagglers that are Data Scientists

In [30]:
nigeria_data_scientist = nigeria[nigeria["Q5"]=="Data Scientist"]


nigeria_data_scientist = nigeria_data_scientist["Q25"].value_counts(normalize=True) 
nigeria_data_scientist = nigeria_data_scientist.mul(100)
nigeria_data_scientist = nigeria_data_scientist.round(decimals=1)
nigeria_data_scientist = pd.DataFrame(data=nigeria_data_scientist)
nigeria_data_scientist.reset_index(inplace=True)
nigeria_data_scientist.columns = ["salary","percentage (%)"] 

salary_order = ["$0-999", "1,000-1,999", "2,000-2,999", "3,000-3,999", "4,000-4,999", "5,000-7,499",
              "7,500-9,999", "10,000-14,999", "15,000-19,999", "20,000-24,999", "25,000-29,999",
                "30,000-39,999", "40,000-49,999", "50,000-59,999", "60,000-69,999", "70,000-79,999",
                "80,000-89,999", "90,000-99,999", "100,000-124,999", "150,000-199,999", "200,000-249,999",
               ">$1,000,000"]

nigeria_data_scientist["salary"] = pd.Categorical(nigeria_data_scientist["salary"], categories=salary_order)
nigeria_data_scientist.sort_values(by="salary", inplace=True)

fig = px.bar(nigeria_data_scientist, x="salary", y="percentage (%)", text='percentage (%)', 
             title="Annual salary of Nigerian Kagglers that are Data Scientists")
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside', marker_color=colors_ng, ) 
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor = "white")
fig.show()

# Annual salary of Nigerian Kagglers that are Data Software Engineers

In [31]:
nigeria_software_engineer = nigeria[nigeria["Q5"]=="Software Engineer"]

nigeria_software_engineer = nigeria_software_engineer["Q25"].value_counts(normalize=True) 
nigeria_software_engineer = nigeria_software_engineer.mul(100)
nigeria_software_engineer = nigeria_software_engineer.round(decimals=1)
nigeria_software_engineer = pd.DataFrame(data=nigeria_software_engineer)
nigeria_software_engineer.reset_index(inplace=True)
nigeria_software_engineer.columns = ["salary","percentage (%)"] 

salary_order = ["$0-999", "1,000-1,999", "2,000-2,999", "3,000-3,999", "4,000-4,999", "5,000-7,499",
               "7,500-9,999", "10,000-14,999", "15,000-19,999", "20,000-24,999", "25,000-29,999",
                "30,000-39,999", "40,000-49,999", "50,000-59,999", "60,000-69,999", "70,000-79,999",
               "80,000-89,999", "90,000-99,999", "100,000-124,999", "150,000-199,999", "200,000-249,999",
                ">$1,000,000"]

nigeria_software_engineer["salary"] = pd.Categorical(nigeria_software_engineer["salary"], categories=salary_order)
nigeria_software_engineer.sort_values(by="salary", inplace=True)

fig = px.bar(nigeria_software_engineer, x="salary", y="percentage (%)", text='percentage (%)', 
             title="Annual salary of Nigerian Kagglers that are Software Engineers")
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside', marker_color=colors_ng, ) 
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor = "white")
fig.show()

In general, the salary distribution for Data Analysts, Data Scientists and Software Engineers (Nigerian Kagglers) are similar and mainly earn less than $3,000 per year. 

A minor point is that there are more Software Engineers from the Nigerian Kaggle community earn more (66 percent) than 1000 USD per year compared to those that earn less (34 percent) than 1,000 USD per year. 

In [32]:
len(nigeria_data_analyst), len(nigeria_data_scientist), len(nigeria_software_engineer)

(11, 15, 10)

In [33]:
#Almost half of Nigerian Kagglers (45%) earn less than $1,000 per year.

#About 60% of Nigerians earn less than $2,000 per year.

In [34]:
salary_rotw = rest_of_the_world["Q25"].value_counts(normalize=True) 
salary_rotw = salary_rotw.mul(100)
salary_rotw = salary_rotw.round(decimals=1)
salary_rotw = pd.DataFrame(data=salary_rotw)
salary_rotw.reset_index(inplace=True)
salary_rotw.columns = ["salary","percentage (%)"] 

salary_order = ["$0-999", "1,000-1,999", "2,000-2,999", "3,000-3,999", "4,000-4,999", "5,000-7,499",
               "7,500-9,999", "10,000-14,999", "15,000-19,999", "20,000-24,999", "25,000-29,999",
                "30,000-39,999", "40,000-49,999", "50,000-59,999", "60,000-69,999", "70,000-79,999",
                "80,000-89,999", "90,000-99,999", "100,000-124,999", "150,000-199,999", "200,000-249,999",
                ">$1,000,000"]

salary_rotw["salary"] = pd.Categorical(salary_rotw["salary"], categories=salary_order)
salary_rotw.sort_values(by="salary", inplace=True)


fig = px.bar(salary_rotw, x="salary", y="percentage (%)", text='percentage (%)', 
             title="Annual salary for the rest of the world")
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside', marker_color=colors_rotw, ) 
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor = "white")
fig.show()

In [35]:
#On the other hand, only about 20% of people from other nationalities earn under $1,000. 

# Programming 

# How long have Nigerian Kagglers been programming?

In [36]:
prog_years_ng = nigeria["Q6"].value_counts(normalize=True)
prog_years_ng = prog_years_ng.mul(100)
prog_years_ng = prog_years_ng.round(decimals=1)
prog_years_ng = pd.DataFrame(data=prog_years_ng)
prog_years_ng.reset_index(inplace=True)
prog_years_ng.columns = ["years", "percentage (%)"]


prog_years_order = ["I have never written code", "< 1 years", "1-3 years", "3-5 years", "5-10 years",
                   "10-20 years", "20+ years"]


prog_years_ng["years"] = pd.Categorical(prog_years_ng["years"], categories=prog_years_order)
prog_years_ng.sort_values(by="years", inplace=True)


fig = px.bar(prog_years_ng, x="years", y="percentage (%)", text='percentage (%)', 
             title="Years of programming for Nigerian Kagglers")
fig.update_traces(texttemplate='%{text:.2s}', textposition='auto', marker_color=colors_ng) 
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor = "white")
fig.update_xaxes(tickangle=45)
fig.show()


In [37]:
#Most Nigerian Kagglers (72%) have less than 3 years of programming experience.

#7% of Nigerian Kagglers have never written code.

#Only about 9% of Nigerian Kagglers have over 5 years of programming experience.

In [38]:
prog_years_rotw = rest_of_the_world["Q6"].value_counts(normalize=True)
prog_years_rotw = prog_years_rotw.mul(100)
prog_years_rotw = prog_years_rotw.round(decimals=1)
prog_years_rotw = pd.DataFrame(data=prog_years_rotw)
prog_years_rotw.reset_index(inplace=True)
prog_years_rotw.columns = ["years", "percentage (%)"]

prog_years_order = ["I have never written code", "< 1 years", "1-3 years", "3-5 years", "5-10 years",
                   "10-20 years", "20+ years"]

prog_years_rotw["years"] = pd.Categorical(prog_years_rotw["years"], categories=prog_years_order)
prog_years_rotw.sort_values(by="years", inplace=True)



fig = px.bar(prog_years_rotw, x="years", y="percentage (%)", text='percentage (%)', 
             title="Years of programming for the rest of the world")
fig.update_traces(texttemplate='%{text:.2s}', textposition='auto', marker_color=colors_rotw, ) 
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor = "white")
fig.update_xaxes(tickangle=45)

fig.show()

# What programming languages do Nigerian Kagglers use?

In [39]:
prog_lang_ng = pd.DataFrame(nigeria.loc[:,list(nigeria.columns[7:20])].count(), columns=["count"])
prog_lang_ng["programming language"] = ['Python', 'R', 'SQL', 'C', 'C++', 'Java', 
                'Javascript', 'Julia', 'Swift','Bash', 'MATLAB', 'None', 'Other']
prog_lang_ng.sort_values(by = "count", inplace = True, ascending = False)

prog_lang_ng["percentage (%)"] = (prog_lang_ng["count"]/
                              prog_lang_ng["count"].sum()) * 100  

prog_lang_ng = prog_lang_ng.round(decimals=1)

fig = go.Figure(go.Treemap(
    labels = prog_lang_ng["programming language"],
    values = prog_lang_ng["percentage (%)"],
    parents = ['']*prog_lang_ng.shape[0], textinfo = "percent root+label+text", marker=dict(
        
        colorscale='greens',
        )
))

fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', 
                  title= "Programming language used regularly by Nigerian Kagglers") 
fig.show()


In [40]:
#The top programming language for Nigerian Kagglers is Python, which is similar to the rest of the world. 

#Most Nigerian Kagglers use Python, SQL and Javascript

#Javascript ranks third compared to C++ which ranks third for the rest of the world. 

In [41]:
prog_lang_rotw = pd.DataFrame(rest_of_the_world.loc[:,list(rest_of_the_world.columns[7:20])].count(), columns=["count"])
prog_lang_rotw["programming language"] = ['Python', 'R', 'SQL', 'C', 'C++', 'Java', 
                'Javascript', 'Julia', 'Swift','Bash', 'MATLAB', 'None', 'Other']
prog_lang_rotw.sort_values(by = "count", inplace = True, ascending = False)

prog_lang_rotw["percentage (%)"] = (prog_lang_rotw["count"]/
                              prog_lang_rotw["count"].sum()) * 100  

prog_lang_rotw = prog_lang_rotw.round(decimals=1)

fig = go.Figure(go.Treemap(
    labels = prog_lang_rotw["programming language"],
    values = prog_lang_rotw["percentage (%)"],
    parents = ['']*prog_lang_rotw.shape[0], textinfo = "percent root+label+text", marker=dict(
        
        colorscale='ylorbr',
        )
))

fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', 
                  title= "Programming language used by the rest of the world") 
fig.show()


# Integrated development environment (IDE) used by Nigerian Kagglers

In [42]:
ide_ng = pd.DataFrame(nigeria.loc[:,list(nigeria.columns[21:34])].count(), columns=["count"])
ide_ng["IDE"] = ['Jupyter', 'Rstudio', 'Visual Studio', 'VScode', 
                     'PyCharm', 'Spyder', 'Notepad++', 'Sublime Text', 'Vim/Emacs','MATLAB', 
                     'Jupyter Notebook', 'None', 'Other']

ide_ng.sort_values(by = "count", inplace = True, ascending = False)

ide_ng["percentage (%)"] = (ide_ng["count"]/
                              ide_ng["count"].sum()) * 100  

ide_ng = ide_ng.round(decimals=1)

fig = px.bar(ide_ng, x="IDE", y="percentage (%)", text='percentage (%)', 
             title="IDES used by Nigerian Kagglers")
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside', marker_color=colors_ng) 
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor = "white")
fig.show()

In [43]:
ide_rotw = pd.DataFrame(rest_of_the_world.loc[:,list(rest_of_the_world.columns[21:34])].count(), columns=["count"])
ide_rotw["IDE"] = ['Jupyter', 'Rstudio', 'Visual Studio', 'VScode', 
                     'PyCharm', 'Spyder', 'Notepad++', 'Sublime Text', 'Vim/Emacs','MATLAB', 
                     'Jupyter Notebook', 'None', 'Other']

ide_rotw.sort_values(by = "count", inplace = True, ascending = False)

ide_rotw["percentage (%)"] = (ide_rotw["count"]/
                              ide_rotw["count"].sum()) * 100  

ide_rotw = ide_rotw.round(decimals=1)

fig = px.bar(ide_rotw, x="IDE", y="percentage (%)", text='percentage (%)', 
             title="IDES used by the rest of the world")
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside', marker_color=colors_rotw) 
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor = "white")
fig.show()

In [44]:
#Jupyter Notebook is the most popular IDE among Nigerian Kagglers and the rest of the world.

#Similarly, the three most popular IDEs among Nigerian Kagglers and the rest of the world are 
#Jupyter Notebook, VScode and PyCharm.

# Machine Learning

# How long have Nigerian Kagglers been using Machine Learning methods?

In [45]:
ml_ng = nigeria["Q15"].value_counts(normalize=True)
ml_ng = ml_ng.mul(100)
ml_ng = ml_ng.round(decimals=1)
ml_ng = pd.DataFrame(data=ml_ng)
ml_ng.reset_index(inplace=True)
ml_ng.columns = ["years", "percentage (%)"]

ml_years_order = ["I do not use machine learning methods", "Under 1 year", "1-2 years", "2-3 years", "3-4 years",
                  "4-5 years","5-10 years", "10-20 years"]

ml_ng["years"] = pd.Categorical(ml_ng["years"], categories=ml_years_order)
ml_ng.sort_values(by="years", inplace=True)



fig = px.bar(ml_ng, x="years", y="percentage (%)", text='percentage (%)', 
             title="Years of using Machine Learning methods (Nigerian Kagglers)")
fig.update_traces(texttemplate='%{text:.2s}', textposition='auto', marker_color=colors_ng) 
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor = "white")
fig.update_xaxes(tickangle=45)

fig.show()

In [46]:
#70% of Nigerian Kagglers have been applying Machine Learning methods for under two years

#20% of Nigerian Kagglers do use Machine Learning methods

In [47]:
ml_rotw = rest_of_the_world["Q15"].value_counts(normalize=True)
ml_rotw = ml_rotw.mul(100)
ml_rotw = ml_rotw.round(decimals=1)
ml_rotw = pd.DataFrame(data=ml_rotw)
ml_rotw.reset_index(inplace=True)
ml_rotw.columns = ["years", "percentage (%)"]

ml_years_order = ["I do not use machine learning methods", "Under 1 year", "1-2 years", "2-3 years", "3-4 years",
                  "4-5 years","5-10 years", "10-20 years"]

ml_rotw["years"] = pd.Categorical(ml_rotw["years"], categories=ml_years_order)
ml_rotw.sort_values(by="years", inplace=True)



fig = px.bar(ml_rotw, x="years", y="percentage (%)", text='percentage (%)', 
             title="Years of using Machine Learning methods (rest of the world)")
fig.update_traces(texttemplate='%{text:.2s}', textposition='auto', marker_color=colors_rotw) 
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor = "white")
fig.update_xaxes(tickangle=45)

fig.show()

# Machine Learning frameworks used by Nigerian Kagglers 

In [48]:
ml_framework_ng = pd.DataFrame(nigeria.loc[:,list(nigeria.columns[72:90])].count(), columns=["count"])
ml_framework_ng["ml_framework"] = ['Scikit-learn', 'TensorFlow', 'Keras', 'PyTorch', 'Fast.ai',
       'MXNet', 'Xgboost', 'LightGBM', 'CatBoost', 'Prophet', "H2O 3", "Caret", "Tidymodels", "JAX",
                                "PyTorch Lightning", "Huggingface", 'None', 'Other']

ml_framework_ng.sort_values(by = "count", inplace = True, ascending = False)

ml_framework_ng["percentage (%)"] = (ml_framework_ng["count"]/
                              ml_framework_ng["count"].sum()) * 100  

ml_framework_ng = ml_framework_ng.round(decimals=1)

fig = px.bar(ml_framework_ng, x="ml_framework", y="percentage (%)", text='percentage (%)', 
             title="Machine Learning frameworks used regularly by Nigerian Kagglers")
fig.update_traces(texttemplate='%{text:.2s}', textposition='auto', marker_color=colors_ng) 
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor = "white")
fig.show()

In [49]:
#Scikit-learn, TensorFlow and Keras are most popular Machine Learning frameworks used regularly by Nigerian Kagglers, this is also the same with the rest of the world.

In [50]:
ml_framework_rotw = pd.DataFrame(rest_of_the_world.loc[:,list(rest_of_the_world.columns[72:90])].count(), 
                                 columns=["count"])
ml_framework_rotw["ml_framework"] = ['Scikit-learn', 'TensorFlow', 'Keras', 'PyTorch', 'Fast.ai',
       'MXNet', 'Xgboost', 'LightGBM', 'CatBoost', 'Prophet', "H2O 3", "Caret", "Tidymodels", "JAX",
                                "PyTorch Lightning", "Huggingface", 'None', 'Other']

ml_framework_rotw.sort_values(by = "count", inplace = True, ascending = False)

ml_framework_rotw["percentage (%)"] = (ml_framework_rotw["count"]/
                              ml_framework_rotw["count"].sum()) * 100  

ml_framework_rotw = ml_framework_rotw.round(decimals=1)

fig = px.bar(ml_framework_rotw, x="ml_framework", y="percentage (%)", text='percentage (%)', 
             title="Machine Learning frameworks used regularly by the rest of the world")
fig.update_traces(texttemplate='%{text:.2s}', textposition='auto', marker_color=colors_rotw) 
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor = "white")
fig.show()

In [51]:
ml_algorithm_ng = pd.DataFrame(nigeria.loc[:,list(nigeria.columns[90:102])].count(), columns=["count"])
ml_algorithm_ng["ml_algorithm"] = ['Linear or Logistic Regression', 'Decision Trees or Random Forests', 
                                 'Gradient Boosting Machines (xgboost, lightgbm, etc)', 
                                 'Bayesian Approaches', 'Evolutionary Approaches',
                                 'Dense Neural Networks (MLPs, etc)', 
                                 'Convolutional Neural Networks', 'Generative Adversarial Networks', 
                                 'Recurrent Neural Networks', 'Transformer Networks (BERT, gpt-3, etc)', 
       'None', 'Other']

ml_algorithm_ng.sort_values(by = "count", inplace = True, ascending = False)

ml_algorithm_ng["percentage (%)"] = (ml_algorithm_ng["count"]/
                              ml_algorithm_ng["count"].sum()) * 100  

ml_algorithm_ng = ml_algorithm_ng.round(decimals=1)

fig = px.bar(ml_algorithm_ng, x="ml_algorithm", y="percentage (%)", text='percentage (%)', 
             title="Machine Learning algorithms used regularly by Nigerian Kagglers")
fig.update_traces(texttemplate='%{text:.2s}', textposition='auto', marker_color=colors_ng) 
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor = "white")
fig.show()

In [52]:
#Linear/Logistic Regression, Decision Trees/Random Forests and Gradient Boosting Machines(XGBoost, lightgbm, etc) are most popular Machine Learning algorithms used regularly by Nigerian Kagglers, this is also the same with the rest of the world. 

In [53]:
ml_algorithm_rotw = pd.DataFrame(rest_of_the_world.loc[:,list(rest_of_the_world.columns[90:102])].count(), 
                                 columns=["count"])
ml_algorithm_rotw["ml_algorithm"] = ['Linear or Logistic Regression', 'Decision Trees or Random Forests', 
                                 'Gradient Boosting Machines (xgboost, lightgbm, etc)', 
                                 'Bayesian Approaches', 'Evolutionary Approaches',
                                 'Dense Neural Networks (MLPs, etc)', 
                                 'Convolutional Neural Networks', 'Generative Adversarial Networks', 
                                 'Recurrent Neural Networks', 'Transformer Networks (BERT, gpt-3, etc)', 
       'None', 'Other']

ml_algorithm_rotw.sort_values(by = "count", inplace = True, ascending = False)

ml_algorithm_rotw["percentage (%)"] = (ml_algorithm_rotw["count"]/
                              ml_algorithm_rotw["count"].sum()) * 100  

ml_algorithm_rotw = ml_algorithm_rotw.round(decimals=1)

fig = px.bar(ml_algorithm_rotw, x="ml_algorithm", y="percentage (%)", text='percentage (%)', 
             title="Machine Learning algorithms used regularly by the rest of the world")
fig.update_traces(texttemplate='%{text:.2s}', textposition='auto', marker_color=colors_rotw) 
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor = "white")
fig.show()

In [54]:
database_ng = pd.DataFrame(nigeria.loc[:,list(nigeria.columns[165:186])].count(), columns=["count"])
database_ng["database"] = ['MySQL', 'PostgreSQL', 'SQLite', 
                                 'Oracle Database', 'MongoDB',
                                 'Snowflake', 'IBM Db2', 'Microsoft SQL Server', 
                                 'Microsoft Azure SQL Database', 'Microsoft Azure Cosmos DB',
                         "Amazon Redshift", "Amazon Aurora", "Amazon RDS", "Amazon DynamoDB",
                         "Google Cloud BigQuery", "Google Cloud SQL", "Google Cloud Firestore",
                         "Google Cloud BigTable", "Google Cloud Spanner",
       'None', 'Other']

database_ng.sort_values(by = "count", inplace = True, ascending = False)

database_ng["percentage (%)"] = (database_ng["count"]/
                              database_ng["count"].sum()) * 100  

database_ng = database_ng.round(decimals=1)

fig = px.bar(database_ng, x="database", y="percentage (%)", text='percentage (%)', 
             title="Database used by Nigerian Kagglers")
fig.update_traces(texttemplate='%{text:.2s}', textposition='auto', marker_color=colors_ng) 
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor = "white")
fig.show()

In [55]:
#The most common databases used by Nigerian Kagglers are:
#MySQL
#PostgreSQL
#Microsoft SQL Server

#This is the same as the rest of the world

In [56]:
database_rotw = pd.DataFrame(rest_of_the_world.loc[:,list(rest_of_the_world.columns[165:186])].count(), columns=["count"])
database_rotw["database"] = ['MySQL', 'PostgreSQL', 'SQLite', 
                                 'Oracle Database', 'MongoDB',
                                 'Snowflake', 'IBM Db2', 'Microsoft SQL Server', 
                                 'Microsoft Azure SQL Database', 'Microsoft Azure Cosmos DB',
                         "Amazon Redshift", "Amazon Aurora", "Amazon RDS", "Amazon DynamoDB",
                         "Google Cloud BigQuery", "Google Cloud SQL", "Google Cloud Firestore",
                         "Google Cloud BigTable", "Google Cloud Spanner",
       'None', 'Other']

database_rotw.sort_values(by = "count", inplace = True, ascending = False)

database_rotw["percentage (%)"] = (database_rotw["count"]/
                              database_rotw["count"].sum()) * 100  

database_rotw = database_rotw.round(decimals=1)

fig = px.bar(database_rotw, x="database", y="percentage (%)", text='percentage (%)', 
             title="Database used by the rest of the world")
fig.update_traces(texttemplate='%{text:.2s}', textposition='auto', marker_color=colors_rotw) 
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor = "white")
fig.show()

# Favourite media source on data science topics

In [57]:
media_source_ng = pd.DataFrame(nigeria.loc[:,list(nigeria.columns[256:268])].count(), columns=["count"])
media_source_ng["media_source"] = ['Twitter', 'Email newsletters', 'Reddit', 'Kaggle', 'Course Forums',
       'YouTube', 'Podcasts', 'Blogs', 'Journal Publications', 'Slack Communities',
       'None', 'Other']

media_source_ng.sort_values(by = "count", inplace = True, ascending = False)

media_source_ng["percentage (%)"] = (media_source_ng["count"]/
                              media_source_ng["count"].sum()) * 100  

media_source_ng = media_source_ng.round(decimals=1)

fig = go.Figure(go.Treemap(
    labels = media_source_ng["media_source"],
    values = media_source_ng["percentage (%)"] ,
    parents = ['']*media_source_ng.shape[0], textinfo = "percent root+label+text", marker=dict(
        
        colorscale='greens',
        )
))

fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', 
                  title= "Favourite media source on data science topics for Nigerian Kagglers") 
fig.show()

In [58]:
#The top three favourite media sources on data science topics by Nigerian Kagglers are:
#Kaggle, YouTube and Blogs

In [59]:
media_source_rotw = pd.DataFrame(rest_of_the_world.loc[:,list(rest_of_the_world.columns[256:268])].count(), 
                                 columns=["count"])
media_source_rotw["media_source"] = ['Twitter', 'Email newsletters', 'Reddit', 'Kaggle', 'Course Forums',
       'YouTube', 'Podcasts', 'Blogs', 'Journal Publications', 'Slack Communities',
       'None', 'Other']

media_source_rotw.sort_values(by = "count", inplace = True, ascending = False)

media_source_rotw["percentage (%)"] = (media_source_rotw["count"]/
                              media_source_rotw["count"].sum()) * 100  

media_source_rotw = media_source_rotw.round(decimals=1)

fig = go.Figure(go.Treemap(
    labels = media_source_rotw["media_source"],
    values = media_source_rotw["percentage (%)"] ,
    parents = ['']*media_source_rotw.shape[0], textinfo = "percent root+label+text", marker=dict(
        
        colorscale='ylorbr',
        )
))

fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', 
                  title= "Favourite media source on data science topics for the rest of the world") 
fig.show()

In [60]:
nigeria.head(3)

Unnamed: 0,Time from Start to Finish (seconds),Q1,Q2,Q3,Q4,Q5,Q6,Q7_Part_1,Q7_Part_2,Q7_Part_3,Q7_Part_4,Q7_Part_5,Q7_Part_6,Q7_Part_7,Q7_Part_8,Q7_Part_9,Q7_Part_10,Q7_Part_11,Q7_Part_12,Q7_OTHER,Q8,Q9_Part_1,Q9_Part_2,Q9_Part_3,Q9_Part_4,Q9_Part_5,Q9_Part_6,Q9_Part_7,Q9_Part_8,Q9_Part_9,Q9_Part_10,Q9_Part_11,Q9_Part_12,Q9_OTHER,Q10_Part_1,Q10_Part_2,Q10_Part_3,Q10_Part_4,Q10_Part_5,Q10_Part_6,Q10_Part_7,Q10_Part_8,Q10_Part_9,Q10_Part_10,Q10_Part_11,Q10_Part_12,Q10_Part_13,Q10_Part_14,Q10_Part_15,Q10_Part_16,Q10_OTHER,Q11,Q12_Part_1,Q12_Part_2,Q12_Part_3,Q12_Part_4,Q12_Part_5,Q12_OTHER,Q13,Q14_Part_1,Q14_Part_2,Q14_Part_3,Q14_Part_4,Q14_Part_5,Q14_Part_6,Q14_Part_7,Q14_Part_8,Q14_Part_9,Q14_Part_10,Q14_Part_11,Q14_OTHER,Q15,Q16_Part_1,Q16_Part_2,Q16_Part_3,Q16_Part_4,Q16_Part_5,Q16_Part_6,Q16_Part_7,Q16_Part_8,Q16_Part_9,Q16_Part_10,Q16_Part_11,Q16_Part_12,Q16_Part_13,Q16_Part_14,Q16_Part_15,Q16_Part_16,Q16_Part_17,Q16_OTHER,Q17_Part_1,Q17_Part_2,Q17_Part_3,Q17_Part_4,Q17_Part_5,Q17_Part_6,Q17_Part_7,Q17_Part_8,Q17_Part_9,Q17_Part_10,Q17_Part_11,Q17_OTHER,Q18_Part_1,Q18_Part_2,Q18_Part_3,Q18_Part_4,Q18_Part_5,Q18_Part_6,Q18_OTHER,Q19_Part_1,Q19_Part_2,Q19_Part_3,Q19_Part_4,Q19_Part_5,Q19_OTHER,Q20,Q21,Q22,Q23,Q24_Part_1,Q24_Part_2,Q24_Part_3,Q24_Part_4,Q24_Part_5,Q24_Part_6,Q24_Part_7,Q24_OTHER,Q25,Q26,Q27_A_Part_1,Q27_A_Part_2,Q27_A_Part_3,Q27_A_Part_4,Q27_A_Part_5,Q27_A_Part_6,Q27_A_Part_7,Q27_A_Part_8,Q27_A_Part_9,Q27_A_Part_10,Q27_A_Part_11,Q27_A_OTHER,Q28,Q29_A_Part_1,Q29_A_Part_2,Q29_A_Part_3,Q29_A_Part_4,Q29_A_OTHER,Q30_A_Part_1,Q30_A_Part_2,Q30_A_Part_3,Q30_A_Part_4,Q30_A_Part_5,Q30_A_Part_6,Q30_A_Part_7,Q30_A_OTHER,Q31_A_Part_1,Q31_A_Part_2,Q31_A_Part_3,Q31_A_Part_4,Q31_A_Part_5,Q31_A_Part_6,Q31_A_Part_7,Q31_A_Part_8,Q31_A_Part_9,Q31_A_OTHER,Q32_A_Part_1,Q32_A_Part_2,Q32_A_Part_3,Q32_A_Part_4,Q32_A_Part_5,Q32_A_Part_6,Q32_A_Part_7,Q32_A_Part_8,Q32_A_Part_9,Q32_A_Part_10,Q32_A_Part_11,Q32_A_Part_12,Q32_A_Part_13,Q32_A_Part_14,Q32_A_Part_15,Q32_A_Part_16,Q32_A_Part_17,Q32_A_Part_18,Q32_A_Part_19,Q32_A_Part_20,Q32_A_OTHER,Q33,Q34_A_Part_1,Q34_A_Part_2,Q34_A_Part_3,Q34_A_Part_4,Q34_A_Part_5,Q34_A_Part_6,Q34_A_Part_7,Q34_A_Part_8,Q34_A_Part_9,Q34_A_Part_10,Q34_A_Part_11,Q34_A_Part_12,Q34_A_Part_13,Q34_A_Part_14,Q34_A_Part_15,Q34_A_Part_16,Q34_A_OTHER,Q35,Q36_A_Part_1,Q36_A_Part_2,Q36_A_Part_3,Q36_A_Part_4,Q36_A_Part_5,Q36_A_Part_6,Q36_A_Part_7,Q36_A_OTHER,Q37_A_Part_1,Q37_A_Part_2,Q37_A_Part_3,Q37_A_Part_4,Q37_A_Part_5,Q37_A_Part_6,Q37_A_Part_7,Q37_A_OTHER,Q38_A_Part_1,Q38_A_Part_2,Q38_A_Part_3,Q38_A_Part_4,Q38_A_Part_5,Q38_A_Part_6,Q38_A_Part_7,Q38_A_Part_8,Q38_A_Part_9,Q38_A_Part_10,Q38_A_Part_11,Q38_A_OTHER,Q39_Part_1,Q39_Part_2,Q39_Part_3,Q39_Part_4,Q39_Part_5,Q39_Part_6,Q39_Part_7,Q39_Part_8,Q39_Part_9,Q39_OTHER,Q40_Part_1,Q40_Part_2,Q40_Part_3,Q40_Part_4,Q40_Part_5,Q40_Part_6,Q40_Part_7,Q40_Part_8,Q40_Part_9,Q40_Part_10,Q40_Part_11,Q40_OTHER,Q41,Q42_Part_1,Q42_Part_2,Q42_Part_3,Q42_Part_4,Q42_Part_5,Q42_Part_6,Q42_Part_7,Q42_Part_8,Q42_Part_9,Q42_Part_10,Q42_Part_11,Q42_OTHER,Q27_B_Part_1,Q27_B_Part_2,Q27_B_Part_3,Q27_B_Part_4,Q27_B_Part_5,Q27_B_Part_6,Q27_B_Part_7,Q27_B_Part_8,Q27_B_Part_9,Q27_B_Part_10,Q27_B_Part_11,Q27_B_OTHER,Q29_B_Part_1,Q29_B_Part_2,Q29_B_Part_3,Q29_B_Part_4,Q29_B_OTHER,Q30_B_Part_1,Q30_B_Part_2,Q30_B_Part_3,Q30_B_Part_4,Q30_B_Part_5,Q30_B_Part_6,Q30_B_Part_7,Q30_B_OTHER,Q31_B_Part_1,Q31_B_Part_2,Q31_B_Part_3,Q31_B_Part_4,Q31_B_Part_5,Q31_B_Part_6,Q31_B_Part_7,Q31_B_Part_8,Q31_B_Part_9,Q31_B_OTHER,Q32_B_Part_1,Q32_B_Part_2,Q32_B_Part_3,Q32_B_Part_4,Q32_B_Part_5,Q32_B_Part_6,Q32_B_Part_7,Q32_B_Part_8,Q32_B_Part_9,Q32_B_Part_10,Q32_B_Part_11,Q32_B_Part_12,Q32_B_Part_13,Q32_B_Part_14,Q32_B_Part_15,Q32_B_Part_16,Q32_B_Part_17,Q32_B_Part_18,Q32_B_Part_19,Q32_B_Part_20,Q32_B_OTHER,Q34_B_Part_1,Q34_B_Part_2,Q34_B_Part_3,Q34_B_Part_4,Q34_B_Part_5,Q34_B_Part_6,Q34_B_Part_7,Q34_B_Part_8,Q34_B_Part_9,Q34_B_Part_10,Q34_B_Part_11,Q34_B_Part_12,Q34_B_Part_13,Q34_B_Part_14,Q34_B_Part_15,Q34_B_Part_16,Q34_B_OTHER,Q36_B_Part_1,Q36_B_Part_2,Q36_B_Part_3,Q36_B_Part_4,Q36_B_Part_5,Q36_B_Part_6,Q36_B_Part_7,Q36_B_OTHER,Q37_B_Part_1,Q37_B_Part_2,Q37_B_Part_3,Q37_B_Part_4,Q37_B_Part_5,Q37_B_Part_6,Q37_B_Part_7,Q37_B_OTHER,Q38_B_Part_1,Q38_B_Part_2,Q38_B_Part_3,Q38_B_Part_4,Q38_B_Part_5,Q38_B_Part_6,Q38_B_Part_7,Q38_B_Part_8,Q38_B_Part_9,Q38_B_Part_10,Q38_B_Part_11,Q38_B_OTHER
14,852,45-49,Man,Nigeria,Master’s degree,Program/Project Manager,5-10 years,Python,,SQL,,,,,,,,,,,Python,,,,,,Spyder,,,,,Jupyter Notebook,,,Kaggle Notebooks,,,,,,,,,,,,,,,,,A laptop,,,,,,,Never,Matplotlib,Seaborn,,,,,,,,,,,1-2 years,Scikit-learn,,,,,,Xgboost,,,,,,,,,,,,Linear or Logistic Regression,Decision Trees or Random Forests,,,,,,,,,,,,,,,,,,,,,,,,Shipping/Transportation,"1000-9,999 employees",10-14,We are exploring ML methods (and may one day p...,Analyze and understand data to influence produ...,Build and/or run the data infrastructure that ...,,,,,,,"2,000-2,999",$100-$999,,,,,Oracle Cloud,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,No / None,,,,,Oracle Database,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,No / None,,,,,,,,,,,,,,,,,,,,No / None,,,,,,,Kaggle,,,,,Coursera,edX,Kaggle Learn Courses,DataCamp,,,Udemy,,,,,,"Local development environments (RStudio, Jupyt...",,,,"Kaggle (notebooks, forums, etc)",,"YouTube (Kaggle YouTube, Cloud AI Adventures, ...",,"Blogs (Towards Data Science, Analytics Vidhya,...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
15,838,22-24,Man,Nigeria,Bachelor’s degree,Other,< 1 years,Python,,,,,,,,,,,,,Python,,,,Visual Studio Code (VSCode),,,,,,,,,,,,,,,,,,,,,,,,,,,A laptop,NVIDIA GPUs,,,,,,Once,,,Plotly / Plotly Express,,,,,,,,,,Under 1 year,,TensorFlow,,,,,,,,,,,,,,,,,Linear or Logistic Regression,,,,,,,,,,,,,,,,,,,,,,,,,Academics/Education,0-49 employees,1-2,No (we do not use ML methods),Analyze and understand data to influence produ...,,,,,,,,$0-999,$0 ($USD),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Coursera,,,,,,,,,,,,"Basic statistical software (Microsoft Excel, G...",,"Email newsletters (Data Elixir, O'Reilly Data ...",,,,,"Podcasts (Chai Time Data Science, O’Reilly Dat...","Blogs (Towards Data Science, Analytics Vidhya,...",,,,,,Microsoft Azure,,,,,,,,,,,,Microsoft Azure Virtual Machines,,,,,,,,,,,,,Azure Machine Learning Studio,,,Databricks,,,,,,MySQL,,,,,,,Microsoft SQL Server,Microsoft Azure SQL Database,,,,,,,,,,,,,,,Google Data Studio,,,,Tableau CRM,,,,,,,Microsoft Azure Synapse,,,,,,"Automated model selection (e.g. auto-sklearn, ...",,,,,,,,,,,Azure Automated Machine Learning,,,,,,,TensorBoard,,,,,,,
111,699,18-21,Man,Nigeria,Bachelor’s degree,Student,3-5 years,Python,,,,,,,,,,,,,Python,"Jupyter (JupyterLab, Jupyter Notebooks, etc)",,,,PyCharm,,,,,,,,,Kaggle Notebooks,,,,,,,,,,,,,,,,,A laptop,,,,,,,Never,Matplotlib,,,,,,,,,,,,Under 1 year,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Amazon Web Services (AWS),,Google Cloud Platform (GCP),,,,,,,,,,,,Google Cloud Compute Engine,,,,,,,,,,,,,,DataRobot,Databricks,,,Rapidminer,,,MySQL,PostgreSQL,,,,,,,,,,,,,,,,,,,,Microsoft Power BI,,Google Data Studio,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [61]:
nigeria_1 = pd.DataFrame(nigeria, columns=["Q5", "Q25"])
nigeria_1.head(3)

Unnamed: 0,Q5,Q25
14,Program/Project Manager,"2,000-2,999"
15,Other,$0-999
111,Student,


In [62]:
salary_ng

Unnamed: 0,salary,percentage (%)
0,$0-999,45.3
1,"1,000-1,999",14.3
2,"2,000-2,999",9.2
6,"3,000-3,999",4.2
5,"4,000-4,999",4.4
3,"5,000-7,499",5.3
7,"7,500-9,999",2.5
4,"10,000-14,999",5.1
8,"15,000-19,999",1.6
9,"20,000-24,999",1.4


In [63]:
nigeria_2 = nigeria_1[~nigeria_1["Q25"].isin(["$0-999", "1,000-1,999", "2,000-2,999",
                                             "3,000-3,999", "4,000-4,999", "5,000-7,499",
                                             "7,500-9,999", "10,000-14,999",  "15,000-19,999",
                                                                                           ])]
nigeria_2.head(3) 

Unnamed: 0,Q5,Q25
111,Student,
196,Research Scientist,
273,Currently not employed,


In [64]:
len(nigeria_2)

304

In [65]:
nigeria_2["Q25"].unique()

array([nan, '80,000-89,999', '60,000-69,999', '150,000-199,999',
       '70,000-79,999', '20,000-24,999', '30,000-39,999', '40,000-49,999',
       '25,000-29,999', '100,000-124,999', '>$1,000,000', '50,000-59,999',
       '90,000-99,999', '200,000-249,999'], dtype=object)

In [66]:
nigeria_2.groupby("Q5").count()["Q25"].sort_values(ascending=False)

Q5
Other                        6
Data Scientist               6
Data Analyst                 6
Research Scientist           5
Software Engineer            4
Machine Learning Engineer    2
Data Engineer                2
Business Analyst             2
Program/Project Manager      1
Product Manager              1
Student                      0
Statistician                 0
Currently not employed       0
Name: Q25, dtype: int64