# Kaggle ML & DS Survey 2020

## Importing necessary Libraries and Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from itertools import combinations
from collections import Counter
import warnings

# Silence all warnings so they do not clutter the rendered notebook.
# NOTE(review): this blanket filter also hides pandas warnings raised by later cells.
warnings.filterwarnings("ignore")
sns.set_style('darkgrid')
# `sns.color_palette("rocket")` only *returns* a palette and its result was discarded,
# so the palette was never applied; `set_palette` makes it the default for later plots.
sns.set_palette("rocket")
%matplotlib inline

# Allow very wide console output before pandas wraps a frame.
pd.set_option('display.width', 2000)
# Show floats with thousands separators and two decimals, padded to 20 characters.
pd.set_option('display.float_format', '{:20,.2f}'.format)
# Never truncate cell text when displaying a frame.
pd.set_option('display.max_colwidth', None)

In [2]:
# Load the raw survey export from the local data directory.
df = pd.read_csv('data/kaggle_survey_2020_responses.csv')

In [3]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,Time from Start to Finish (seconds),Q1,Q2,Q3,Q4,Q5,Q6,Q7_Part_1,Q7_Part_2,Q7_Part_3,...,Q35_B_Part_2,Q35_B_Part_3,Q35_B_Part_4,Q35_B_Part_5,Q35_B_Part_6,Q35_B_Part_7,Q35_B_Part_8,Q35_B_Part_9,Q35_B_Part_10,Q35_B_OTHER
0,Duration (in seconds),What is your age (# years)?,What is your gender? - Selected Choice,In which country do you currently reside?,What is the highest level of formal education that you have attained or plan to attain within the next 2 years?,Select the title most similar to your current role (or most recent title if retired): - Selected Choice,For how many years have you been writing code and/or programming?,What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Python,What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - R,What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - SQL,...,"In the next 2 years, do you hope to become more familiar with any of these tools for managing ML experiments? (Select all that apply) - Selected Choice - Weights & Biases","In the next 2 years, do you hope to become more familiar with any of these tools for managing ML experiments? (Select all that apply) - Selected Choice - Comet.ml","In the next 2 years, do you hope to become more familiar with any of these tools for managing ML experiments? (Select all that apply) - Selected Choice - Sacred + Omniboard","In the next 2 years, do you hope to become more familiar with any of these tools for managing ML experiments? (Select all that apply) - Selected Choice - TensorBoard","In the next 2 years, do you hope to become more familiar with any of these tools for managing ML experiments? (Select all that apply) - Selected Choice - Guild.ai","In the next 2 years, do you hope to become more familiar with any of these tools for managing ML experiments? (Select all that apply) - Selected Choice - Polyaxon","In the next 2 years, do you hope to become more familiar with any of these tools for managing ML experiments? 
(Select all that apply) - Selected Choice - Trains","In the next 2 years, do you hope to become more familiar with any of these tools for managing ML experiments? (Select all that apply) - Selected Choice - Domino Model Monitor","In the next 2 years, do you hope to become more familiar with any of these tools for managing ML experiments? (Select all that apply) - Selected Choice - None","In the next 2 years, do you hope to become more familiar with any of these tools for managing ML experiments? (Select all that apply) - Selected Choice - Other"
1,1838,35-39,Man,Colombia,Doctoral degree,Student,5-10 years,Python,R,SQL,...,,,,TensorBoard,,,,,,
2,289287,30-34,Man,United States of America,Master’s degree,Data Engineer,5-10 years,Python,R,SQL,...,,,,,,,,,,
3,860,35-39,Man,Argentina,Bachelor’s degree,Software Engineer,10-20 years,,,,...,,,,,,,,,,
4,507,30-34,Man,United States of America,Master’s degree,Data Scientist,5-10 years,Python,,SQL,...,,,,,,,,,,


## Data Cleaning and Transformation

### Storing Questions as a separate DataFrame

In [4]:
# Row 0 of the raw frame is kept as the question text for every column.
# Take an explicit copy: the following cells add and drop columns on `questions`,
# which must not operate on a slice of `df`.
questions = df.loc[:0].copy()

questions

Unnamed: 0,Time from Start to Finish (seconds),Q1,Q2,Q3,Q4,Q5,Q6,Q7_Part_1,Q7_Part_2,Q7_Part_3,...,Q35_B_Part_2,Q35_B_Part_3,Q35_B_Part_4,Q35_B_Part_5,Q35_B_Part_6,Q35_B_Part_7,Q35_B_Part_8,Q35_B_Part_9,Q35_B_Part_10,Q35_B_OTHER
0,Duration (in seconds),What is your age (# years)?,What is your gender? - Selected Choice,In which country do you currently reside?,What is the highest level of formal education that you have attained or plan to attain within the next 2 years?,Select the title most similar to your current role (or most recent title if retired): - Selected Choice,For how many years have you been writing code and/or programming?,What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Python,What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - R,What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - SQL,...,"In the next 2 years, do you hope to become more familiar with any of these tools for managing ML experiments? (Select all that apply) - Selected Choice - Weights & Biases","In the next 2 years, do you hope to become more familiar with any of these tools for managing ML experiments? (Select all that apply) - Selected Choice - Comet.ml","In the next 2 years, do you hope to become more familiar with any of these tools for managing ML experiments? (Select all that apply) - Selected Choice - Sacred + Omniboard","In the next 2 years, do you hope to become more familiar with any of these tools for managing ML experiments? (Select all that apply) - Selected Choice - TensorBoard","In the next 2 years, do you hope to become more familiar with any of these tools for managing ML experiments? (Select all that apply) - Selected Choice - Guild.ai","In the next 2 years, do you hope to become more familiar with any of these tools for managing ML experiments? (Select all that apply) - Selected Choice - Polyaxon","In the next 2 years, do you hope to become more familiar with any of these tools for managing ML experiments? 
(Select all that apply) - Selected Choice - Trains","In the next 2 years, do you hope to become more familiar with any of these tools for managing ML experiments? (Select all that apply) - Selected Choice - Domino Model Monitor","In the next 2 years, do you hope to become more familiar with any of these tools for managing ML experiments? (Select all that apply) - Selected Choice - None","In the next 2 years, do you hope to become more familiar with any of these tools for managing ML experiments? (Select all that apply) - Selected Choice - Other"


In [5]:
# Question numbers whose answers are spread over several `Q<n>_Part_<k>` columns.
# Raw strings keep `\d` a regex token instead of an invalid string escape.
multiple_options_questions = {
    int(column.split("_")[0][1:])
    for column in questions.filter(regex=r"Q\d+_Part_\d+").columns
}

# Question numbers that come as `Q<n>_A_Part_<k>` columns.
# Only the `_A` columns are inspected here; later cells also read `Q<n>_B_Part_<k>`
# for the same numbers.
multiple_options_questions_A_B = {
    int(column.split("_")[0][1:])
    for column in questions.filter(regex=r"Q\d+_A_Part_\d+").columns
}

In [6]:
def extract_question(row):
    """Return the bare question text from the first header string in ``row``.

    Headers look like ``"<question> - Selected Choice - <option>"``. The text is
    cut at the first ``" - "`` delimiter rather than at any ``-`` character, so a
    hyphenated word inside the question itself is not truncated.

    Parameters
    ----------
    row : iterable of str
        Header strings belonging to one question; only the first is used.

    Returns
    -------
    str
        The question text with surrounding whitespace removed.
    """
    first_header = list(row)[0]
    return first_header.split(" - ", 1)[0].strip()

def _question_text(prefix):
    """Return the question text shared by every `<prefix>_Part_<k>` column of `questions`."""
    # `rf"..."` keeps `\d` a regex token instead of an invalid string escape.
    return questions.filter(regex=rf"{prefix}_Part_\d+").apply(
        lambda row: extract_question(row.values.astype(str)), axis=1
    )


# One `Q<n>` column per multi-select question.
for i in sorted(multiple_options_questions):
    questions[f"Q{i}"] = _question_text(f"Q{i}")

# One `Q<n>_A` and one `Q<n>_B` column per question that has A/B variants.
for i in sorted(multiple_options_questions_A_B):
    questions[f"Q{i}_A"] = _question_text(f"Q{i}_A")
    questions[f"Q{i}_B"] = _question_text(f"Q{i}_B")

In [7]:
# Drop every per-option column (`Q<n>_Part_<k>`, `Q<n>_A_Part_<k>`, `Q<n>_B_Part_<k>`)
# and every free-text column (`Q<n>_OTHER`, `Q<n>_A_OTHER`, `Q<n>_B_OTHER`).
# One raw-string pattern covers the same six families as the separate drops did,
# and rebinding avoids mutating a possible slice in place.
questions = questions.drop(
    columns=questions.filter(regex=r"Q\d+(?:_[AB])?_(?:Part_\d+|OTHER)").columns
)

In [8]:
# Turn the single-row frame into one row per question id:
# transpose, skip the first transposed row, name the only column, and sort by id.
# The sort is lexicographic, so `Q10` comes before `Q2`.
questions = (
    questions
    .transpose()
    .iloc[1:]
    .set_axis(['Question'], axis=1)
    .sort_index()
)
questions

Unnamed: 0,Question
Q1,What is your age (# years)?
Q10,Which of the following hosted notebook products do you use on a regular basis? (Select all that apply)
Q11,What type of computing platform do you use most often for your data science projects? - Selected Choice
Q12,Which types of specialized hardware do you use on a regular basis? (Select all that apply)
Q13,Approximately how many times have you used a TPU (tensor processing unit)?
Q14,What data visualization libraries or tools do you use on a regular basis? (Select all that apply)
Q15,For how many years have you used machine learning methods?
Q16,Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply)
Q17,Which of the following ML algorithms do you use on a regular basis? (Select all that apply):
Q18,Which categories of computer vision methods do you use on a regular basis? (Select all that apply)


In [9]:
# Remove the first row and renumber the remaining rows from 0.
# `reset_index()` keeps the previous labels in a new `index` column.
df = df.iloc[1:].reset_index()

### Marking all the missing values as `Not Specified`

In [10]:
# Every missing answer becomes the literal label "Not Specified".
df = df.fillna("Not Specified")

### Grouping responses for questions that allow multiple selections

In [11]:
def group(row):
    """Join the selected options of one respondent into a ';'-separated string.

    Entries equal to "Not Specified" are skipped; every other entry is
    stripped of surrounding whitespace. An input with no selected options
    yields an empty string.
    """
    selected = [choice.strip() for choice in row if choice != "Not Specified"]
    return ";".join(selected)

def _joined_choices(prefix):
    """Collapse every `<prefix>_Part_<k>` column of `df` into one ';'-joined Series."""
    # `rf"..."` keeps `\d` a regex token instead of an invalid string escape.
    return df.filter(regex=rf"{prefix}_Part_\d+").apply(
        lambda row: group(row.values.astype(str)), axis=1
    )


responses = pd.DataFrame()
# Question numbers 1-39 are hard-coded here.
for i in range(1, 40):
    # Test membership on the sets directly instead of rebuilding a list each iteration.
    if i in multiple_options_questions:
        responses[f"Q{i}"] = _joined_choices(f"Q{i}")
    elif i in multiple_options_questions_A_B:
        responses[f"Q{i}_A"] = _joined_choices(f"Q{i}_A")
        responses[f"Q{i}_B"] = _joined_choices(f"Q{i}_B")
    else:
        # Single-answer question: copy the column as is.
        responses[f"Q{i}"] = df[f"Q{i}"]

### Splitting the respondents into two categories: `Professional` and `Non Professional`

According to the [Survey Methodology](data/supplementary_data/kaggle_survey_2020_methodology.pdf) provided with the data, a respondent is categorised as `Non Professional` if they
- are a *student*, or
- are *unemployed*, or
- have *never spent money on cloud services*.

Note that the code below additionally treats respondents whose role is `Other` or `Not Specified` as `Non Professional`.

In [12]:
# A respondent is "Non Professional" when their role (Q5) is one of the values below
# or when their Q25 answer is '$0 ($USD)'.
is_non_professional = (
    responses['Q5'].isin(['Student', 'Currently not employed', 'Other', 'Not Specified'])
    | (responses['Q25'] == '$0 ($USD)')
)
non_prof_index = responses[is_non_professional].index

# Label every row in one vectorised pass; the previous per-row
# `index in list(non_prof_index)` lookup was quadratic in the number of respondents.
type_of_job_role = np.where(is_non_professional, "Non Professional", "Professional")
responses['type of Job Role'] = type_of_job_role

In [13]:
# Split the respondents by label. Take explicit copies because the next cell
# drops columns from both frames, which must not operate on slices of `responses`.
non_professional = responses[responses['type of Job Role'] == "Non Professional"].copy()
professional = responses[responses['type of Job Role'] == "Professional"].copy()

Part B of each question is a supplementary version rephrased for non-professional respondents, so each group keeps only the part that applies to it.

In [14]:
# Non-professionals keep only the `_B` variants and professionals only the `_A` variants.
# Raw strings keep `\d` a regex token; rebinding avoids an in-place drop on a slice.
non_professional = non_professional.drop(
    columns=non_professional.filter(regex=r"Q\d+_A").columns
)
professional = professional.drop(
    columns=professional.filter(regex=r"Q\d+_B").columns
)

In [15]:
# `questions` is written with its index; the other frames are written without one.
questions.to_csv('data/questions.csv')

for frame, path in (
    (responses, 'data/responses.csv'),
    (professional, 'data/professional.csv'),
    (non_professional, 'data/non_professional.csv'),
):
    frame.to_csv(path, index=False)