In [1]:
# Import packages
import numpy as np
import pandas as pd

In [2]:
# Load the two yearly survey exports (2021 and 2022)
df1 = pd.read_csv("survey_results_public_2021.csv")
print("2021:", df1.shape)

df2 = pd.read_csv("survey_results_public_2022.csv")
print("2022:", df2.shape)

# Stack both years; a column present in only one year is NaN for the other
df = pd.concat([df1, df2], ignore_index=True)

# Remove salary outliers: keep rows strictly below the upper fence
# Q3 + 1.5 * IQR. A missing salary compares as False, so rows without a
# salary are dropped by this filter as well.
salary = df["ConvertedCompYearly"]
q1 = salary.quantile(0.25)
q3 = salary.quantile(0.75)
upper_fence = q3 + 1.5 * (q3 - q1)
df = df[salary < upper_fence]

print("2021-2022:", df.shape)

df.head(3)

2021: (83439, 48)
2022: (73268, 79)
2021-2022: (79115, 85)


Unnamed: 0,ResponseId,MainBranch,Employment,Country,US_State,UK_Country,EdLevel,Age1stCode,LearnCode,YearsCode,...,Frequency_1,Frequency_2,Frequency_3,TimeSearching,TimeAnswering,Onboarding,ProfessionalTech,TrueFalse_1,TrueFalse_2,TrueFalse_3
0,1,I am a developer by profession,"Independent contractor, freelancer, or self-em...",Slovakia,,,"Secondary school (e.g. American high school, G...",18 - 24 years,Coding Bootcamp;Other online resources (ex: vi...,,...,,,,,,,,,,
9,10,I am a developer by profession,Employed full-time,Sweden,,,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",11 - 17 years,School,7.0,...,,,,,,,,,,
11,12,I am a developer by profession,Employed full-time,Spain,,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",11 - 17 years,"Other online resources (ex: videos, blogs, etc...",12.0,...,,,,,,,,,,


In [3]:
# Select columns
#
# Columns considered but not selected:
#   'ResponseId', 'MiscTechHaveWorkedWith', 'NEWCollabToolsHaveWorkedWith',
#   every '*WantToWorkWith' column (NEWCollabTools, Database, Webframe,
#   ToolsTech, Platform, MiscTech, Language),
#   'Ethnicity', 'DevType', 'Currency', 'CompTotal', 'CompFreq',
#   'Sexuality', 'OrgSize'

# Technologies the respondent has worked with, one column per category
skill_cols = [
    'LanguageHaveWorkedWith',
    'ToolsTechHaveWorkedWith',
    'WebframeHaveWorkedWith',
    'PlatformHaveWorkedWith',
    'DatabaseHaveWorkedWith',
]

# Respondent profile
profile_cols = [
    'Age',
    'Accessibility',
    'EdLevel',
    'Employment',
    'Gender',
    'MentalHealth',
]

# Coding background
experience_cols = [
    'MainBranch',
    'YearsCode',
    'YearsCodePro',
]

# Location and yearly compensation
compensation_cols = [
    'Country',
    'ConvertedCompYearly',
]

keep_col = skill_cols + profile_cols + experience_cols + compensation_cols

In [4]:
# Clean data: keep only the selected columns, then drop every row that has a
# missing value in any column other than the *HaveWorkedWith skill columns

df = df.loc[:, keep_col]
print(df.shape)

required_cols = [name for name in df.columns if not name.endswith("HaveWorkedWith")]
df = df.dropna(subset=required_cols)
print(df.shape)

df.head(3)

(79115, 16)
(73554, 16)


Unnamed: 0,LanguageHaveWorkedWith,ToolsTechHaveWorkedWith,WebframeHaveWorkedWith,PlatformHaveWorkedWith,DatabaseHaveWorkedWith,Age,Accessibility,EdLevel,Employment,Gender,MentalHealth,MainBranch,YearsCode,YearsCodePro,Country,ConvertedCompYearly
9,C++;Python,Git,,,PostgreSQL,25-34 years old,None of the above,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",Employed full-time,Man,None of the above,I am a developer by profession,7,4,Sweden,51552.0
11,Bash/Shell;HTML/CSS;JavaScript;Node.js;SQL;Typ...,Git,Express;React.js;Vue.js,AWS,PostgreSQL,25-34 years old,None of the above,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Employed full-time,Man,None of the above,I am a developer by profession,12,5,Spain,46482.0
12,C;C++;Java;Perl;Ruby,Git,Ruby on Rails,,,25-34 years old,None of the above,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",Employed full-time,Man,Prefer not to say,I am a developer by profession,15,6,Germany,77290.0


In [5]:
# Clean data: reduce the number of distinct responses

def clean_gender(string):
    """Collapse a raw, ';'-separated Gender answer into one of three labels.

    Returns 'Man' when the first listed option is exactly 'Man', otherwise
    'Woman' when the answer contains 'Woman' anywhere, otherwise 'NonBinary'
    (this last bucket also receives every other answer, e.g. 'Prefer not to say').
    """
    first_choice = string.partition(';')[0]
    if first_choice == 'Man':
        return 'Man'
    return 'Woman' if 'Woman' in string else 'NonBinary'
    
def clean_accessibility(string):
    """Collapse a raw Accessibility answer into 'Yes' or 'No'.

    'No' is returned when the answer contains 'None of the above' or
    'Prefer not to say'; any other answer yields 'Yes'.
    """
    negative_markers = ('None of the above', 'Prefer not to say')
    if any(marker in string for marker in negative_markers):
        return 'No'
    return 'Yes'
    
def clean_mentalhealth(string):
    """Collapse a raw MentalHealth answer into 'Yes' or 'No'.

    'No' is returned when the answer contains 'None of the above' or
    'Prefer not to say'; any other answer yields 'Yes'.
    """
    declined_or_none = (
        'None of the above' in string
        or 'Prefer not to say' in string
    )
    return 'No' if declined_or_none else 'Yes'
    
def clean_edlevel(string):
    """Map a raw EdLevel answer onto a coarse education bucket.

    Raw answers listed for this column:
      - 'Associate degree (A.A., A.S., etc.)'
      - 'Bachelor’s degree (B.A., B.S., B.Eng., etc.)'
      - 'Master’s degree (M.A., M.S., M.Eng., MBA, etc.)'
      - 'Other doctoral degree (Ph.D., Ed.D., etc.)'
      - 'Primary/elementary school'
      - 'Professional degree (JD, MD, etc.)'
      - 'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)'
      - 'Some college/university study without earning a degree'
      - 'Something else'

    Returns 'Undergraduate', 'Master', 'PhD' or 'NoHigherEd' for the first
    marker found in the answer, and 'Other' when no marker matches.
    """
    # Checked in order; the first marker contained in the answer wins
    buckets = (
        ('Associate degree (A.A., A.S., etc.)', 'Undergraduate'),
        ('Bachelor’s degree (B.A., B.S., B.Eng., etc.)', 'Undergraduate'),
        ('Master’s degree (M.A., M.S., M.Eng., MBA, etc.)', 'Master'),
        ('Other doctoral degree (Ph.D., Ed.D., etc.)', 'PhD'),
        ('Primary/elementary school', 'NoHigherEd'),
        ('Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)', 'NoHigherEd'),
    )
    for marker, bucket in buckets:
        if marker in string:
            return bucket
    return 'Other'

    
def clean_mainbranch(string):
    """Reduce a raw MainBranch answer to 'Dev' or 'NotDev'.

    Raw answers listed for this column:
      - 'I am a developer by profession'
      - 'I am a student who is learning to code'
      - 'I am learning to code'
      - 'I am not primarily a developer, but I write code sometimes as part of my work'
      - 'I code primarily as a hobby'
      - 'I used to be a developer by profession, but no longer am'
      - 'None of these'

    Only answers containing 'I am a developer by profession' yield 'Dev'.
    """
    return 'Dev' if 'I am a developer by profession' in string else 'NotDev'

    
def clean_employ(string):
    """Encode a raw Employment answer as 1 (mentions full-time) or 0.

    Raw answers listed for this column:
      - 'Employed full-time'
      - 'Employed part-time'
      - 'Employed, full-time'
      - 'Employed, full-time;Employed, part-time'
      - 'Employed, full-time;Independent contractor, freelancer, or self-employed'
      - 'Employed, full-time;Independent contractor, freelancer, or self-employed;Employed, part-time'
      - 'Employed, full-time;Independent contractor, freelancer, or self-employed;Retired'
      - 'Employed, full-time;Retired'
      - 'Employed, part-time'
      - 'Employed, part-time;Retired'
      - 'I prefer not to say'
      - 'Independent contractor, freelancer, or self-employed'
      - 'Independent contractor, freelancer, or self-employed;Employed, part-time'
      - 'Independent contractor, freelancer, or self-employed;Retired'
      - 'Retired'

    Returns the integer 1 when 'full-time' appears anywhere in the answer,
    otherwise the integer 0.
    """
    return int('full-time' in string)
    
# Two-bucket recoding of the raw Age brackets: brackets ending at 34 or
# earlier become '<35', brackets starting at 35 or later become '>35'
age_mapping = {
    **dict.fromkeys(
        ('Under 18 years old', '18-24 years old', '25-34 years old'),
        '<35',
    ),
    **dict.fromkeys(
        ('35-44 years old', '45-54 years old', '55-64 years old', '65 years or older'),
        '>35',
    ),
}

In [6]:
# Clean the kept columns: drop respondents who answered 'Prefer not to say'
# for Age, rename the salary column and collapse every categorical answer
# to its reduced set of labels

age_disclosed = df['Age'].ne('Prefer not to say')

df = (df
      .loc[age_disclosed]
      .rename(columns={'ConvertedCompYearly': 'PreviousSalary'})
      .assign(Gender=lambda df_: df_['Gender'].map(clean_gender),
              Accessibility=lambda df_: df_['Accessibility'].map(clean_accessibility),
              EdLevel=lambda df_: df_['EdLevel'].map(clean_edlevel),
              MainBranch=lambda df_: df_['MainBranch'].map(clean_mainbranch),
              Employment=lambda df_: df_['Employment'].map(clean_employ),
              MentalHealth=lambda df_: df_['MentalHealth'].map(clean_mentalhealth),
              Age=lambda df_: df_['Age'].replace(age_mapping))  # brackets -> '<35' / '>35'
      .reset_index(drop=True)
     )

print(df.shape)

(73462, 16)


In [7]:
# Clean YearsCode and YearsCodePro and make both integer columns:
# the two textual answers are replaced by the numeric strings '50' and '0'
# before the cast

years_text_to_number = {'More than 50 years': '50', 'Less than 1 year': '0'}

df = (df
      .assign(YearsCode=lambda df_: df_['YearsCode'].replace(years_text_to_number),
              YearsCodePro=lambda df_: df_['YearsCodePro'].replace(years_text_to_number))
      .astype({'YearsCode': int, 'YearsCodePro': int})
     )

print(df.shape)

(73462, 16)


In [8]:
# Merge the per-category *HaveWorkedWith columns into a single ';'-separated
# HaveWorkedWith column, then drop the per-category columns

cols = [col for col in df.columns if col.endswith("HaveWorkedWith")]


def _join_skills(values):
    """Join the non-missing entries of one row with ';'.

    Missing entries are skipped instead of being stringified to 'nan' and
    stripped afterwards, so a skill whose name happens to contain 'nan' is
    left intact. Returns '' when every entry is missing.
    """
    return ';'.join(str(value) for value in values if pd.notna(value))


df = (df
      .assign(HaveWorkedWith=lambda df_: [_join_skills(row)
                                          for row in df_[cols].itertuples(index=False, name=None)])
      .drop(columns=cols)
     )

print(df.shape)

(73462, 12)


In [9]:
# all skills: every distinct technology mentioned by at least one respondent
worked_with = df["HaveWorkedWith"]
skills = set(worked_with.str.split(";").explode())
# A respondent without any skill has HaveWorkedWith == "", which splits into a
# single empty token; that token is not a skill, so it is removed from the set
skills.discard("")
print(*sorted(skills))

# Number of skills per respondent: separators + 1, or 0 for an empty string
df['ComputerSkills'] = worked_with.str.count(";").add(1).where(worked_with.ne(""), 0)
df.head(3)

 APL ASP.NET ASP.NET Core  AWS Angular Angular.js Ansible Assembly Bash/Shell Blazor C C# C++ COBOL Cassandra Chef Clojure Cloud Firestore Colocation CouchDB Couchbase Crystal Dart Delphi Deno DigitalOcean Django Docker Drupal DynamoDB Elasticsearch Elixir Erlang Express F# FastAPI Fastify Firebase Firebase Realtime Database Flask Flow Fortran Gatsby Git Go Google Cloud Google Cloud Platform Groovy HTML/CSS Haskell Heroku Homebrew IBM Cloud or Watson IBM DB2 Java JavaScript Julia Kotlin Kubernetes LISP Laravel Linode Lua MATLAB Managed Hosting MariaDB Matlab Microsoft Azure Microsoft SQL Server MongoDB MySQL Neo4j Next.js Node.js Nuxt.js OCaml OVH Objective-C OpenStack Oracle Oracle Cloud Infrastructure PHP Perl Phoenix Play Framework PostgreSQL PowerShell Pulumi Puppet Python R React.js Redis Ruby Ruby on Rails Rust SAS SQL SQLite Scala Solidity Spring Svelte Swift Symfony Terraform TypeScript Unity 3D Unreal Engine VBA VMware Vue.js Xamarin Yarn jQuery npm


Unnamed: 0,Age,Accessibility,EdLevel,Employment,Gender,MentalHealth,MainBranch,YearsCode,YearsCodePro,Country,PreviousSalary,HaveWorkedWith,ComputerSkills
0,<35,No,Master,1,Man,No,Dev,7,4,Sweden,51552.0,C++;Python;Git;PostgreSQL,4
1,<35,No,Undergraduate,1,Man,No,Dev,12,5,Spain,46482.0,Bash/Shell;HTML/CSS;JavaScript;Node.js;SQL;Typ...,12
2,<35,No,Master,1,Man,No,Dev,15,6,Germany,77290.0,C;C++;Java;Perl;Ruby;Git;Ruby on Rails,7


In [10]:
# Skills most often mentioned by respondents with Age == '<35', split by Gender
TOP_N_MEN = 20
TOP_N_WOMEN_NONBINARY = 10

# One 0/1 indicator column per skill
df_skills = df['HaveWorkedWith'].str.get_dummies(";")
under_35 = df["Age"] == "<35"

# Sets are printed sorted so the displayed order is the same on every run
men_counts = df_skills.loc[under_35 & (df["Gender"] == "Man")].sum()
selected_skills_men = set(men_counts.nlargest(TOP_N_MEN).index)
print("men skills:", *sorted(selected_skills_men), end="\n\n")

# NOTE: despite its name, this set is computed over 'Woman' AND 'NonBinary' rows
women_nonbinary_counts = df_skills.loc[under_35 & (df["Gender"].isin(["Woman", "NonBinary"]))].sum()
selected_skills_women = set(women_nonbinary_counts.nlargest(TOP_N_WOMEN_NONBINARY).index)
print("women skills:", *sorted(selected_skills_women), end="\n\n")

# Select skills present in the top 20 of men but not in the top 10 of women/nonbinary
selected_skills = selected_skills_men.difference(selected_skills_women)
print("selected skills:", *sorted(selected_skills), end="\n\n")

men skills: npm MongoDB JavaScript Docker Microsoft SQL Server TypeScript SQLite C# Node.js Bash/Shell HTML/CSS SQL MySQL Git Python PostgreSQL Java AWS jQuery React.js

women skills: MySQL Git Docker Python PostgreSQL HTML/CSS SQL JavaScript AWS React.js

selected skills: npm MongoDB SQLite C# Node.js Java Bash/Shell Microsoft SQL Server TypeScript jQuery



In [11]:
# Create a custom Employed target variable from the total number of selected
# skills: 1 when the respondent has at least 3 of them, 0 otherwise
print(df['Employment'].describe())

# Per-respondent count of selected skills (df_skills shares df's index)
selected_skill_count = df_skills[list(selected_skills)].sum(axis=1)
df = df.assign(Employed=selected_skill_count.ge(3).astype(int))

df.head(3)

count    73462.000000
mean         0.883096
std          0.321308
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: Employment, dtype: float64


Unnamed: 0,Age,Accessibility,EdLevel,Employment,Gender,MentalHealth,MainBranch,YearsCode,YearsCodePro,Country,PreviousSalary,HaveWorkedWith,ComputerSkills,Employed
0,<35,No,Master,1,Man,No,Dev,7,4,Sweden,51552.0,C++;Python;Git;PostgreSQL,4,0
1,<35,No,Undergraduate,1,Man,No,Dev,12,5,Spain,46482.0,Bash/Shell;HTML/CSS;JavaScript;Node.js;SQL;Typ...,12,1
2,<35,No,Master,1,Man,No,Dev,15,6,Germany,77290.0,C;C++;Java;Perl;Ruby;Git;Ruby on Rails,7,0


In [12]:
# Number of respondents in each class of the Employed column
df.Employed.value_counts()

1    39392
0    34070
Name: Employed, dtype: int64

In [13]:
# Three random rows, transposed so that each column is shown on its own line
df.sample(n=3).transpose()

Unnamed: 0,44258,8678,71392
Age,>35,<35,<35
Accessibility,No,No,No
EdLevel,Undergraduate,Undergraduate,Master
Employment,1,1,1
Gender,Man,Woman,Man
MentalHealth,No,Yes,No
MainBranch,Dev,Dev,Dev
YearsCode,40,2,4
YearsCodePro,34,0,3
Country,Japan,United States of America,Greece


In [14]:
# Number of respondents per country, most frequent first
df.Country.value_counts()

United States of America                                14696
Germany                                                  5395
India                                                    5360
United Kingdom of Great Britain and Northern Ireland     4688
Canada                                                   2779
                                                        ...  
Mauritania                                                  1
Burundi                                                     1
Saint Kitts and Nevis                                       1
Monaco                                                      1
Seychelles                                                  1
Name: Country, Length: 172, dtype: int64

In [15]:
# save full dataset (the row index is written as the first, unnamed CSV column)
df.to_csv(path_or_buf='stackoverflow.csv', index=True)

In [16]:
# save partial dataset: same rows, without the columns listed below
dropped_for_export = ['Accessibility', 'Country', 'MentalHealth', 'HaveWorkedWith', 'Employment']
df.drop(columns=dropped_for_export).to_csv('stackoverflow_clean.csv')