In [None]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the 2021 and 2022 survey result files and stack them into one frame.
# Each shape is printed right after its file is read so a failed read is easy to locate.
df1 = pd.read_csv("survey_results_public_2021.csv")
print(df1.shape)
df2 = pd.read_csv("survey_results_public_2022.csv")
print(df2.shape)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat((df1, df2), ignore_index=True)
print(df.shape)
df.head(3)

In [None]:
# Select columns
# Only the names in this list are kept. Survey columns that were considered but are
# currently left out: ResponseId, the seven *HaveWorkedWith and seven *WantToWorkWith
# technology columns (Language, ToolsTech, MiscTech, Webframe, Platform, Database,
# NEWCollabTools), Accessibility, Employment, Ethnicity, Gender, DevType, Currency,
# Country, CompTotal, CompFreq, Sexuality and OrgSize.
keep_col = [
    'Age',
    'EdLevel',
    'MentalHealth',
    'MainBranch',
    'YearsCode',
    'YearsCodePro',
    'ConvertedCompYearly',
]


In [None]:
# Clean data: restrict df to the columns in keep_col, then discard every row
# that has a missing value in any of them. The shape is printed after each step.

df = df[keep_col]
print(df.shape)
df = df.dropna(axis=0, how='any')
print(df.shape)


In [None]:
# Preview the first three rows of the current df.
df.head(3)

In [None]:
# Clean data: reduce the number of distinct responses per column

def clean_gender(string):
    """Reduce a ';'-separated gender response to 'Man', 'Woman' or 'Other'.

    The response is 'Man' only when its first listed option is exactly 'Man'.
    Otherwise any response containing the text 'Woman' becomes 'Woman', and
    everything else becomes 'Other'.
    """
    first_option, _, _ = string.partition(';')
    if first_option == 'Man':
        return 'Man'
    return 'Woman' if 'Woman' in string else 'Other'
    
def clean_accessibility(string):
    """Map an accessibility response to 'Yes' or 'No'.

    Responses containing 'None of the above' or 'Prefer not to say' are
    treated as 'No'; any other response is 'Yes'.
    """
    no_markers = ('None of the above', 'Prefer not to say')
    return 'No' if any(marker in string for marker in no_markers) else 'Yes'
    
def clean_mentalhealth(string):
    """Map a mental-health response to 'Yes' or 'No'.

    'No' is returned when the response contains 'None of the above' or
    'Prefer not to say'; every other response yields 'Yes'.
    """
    nothing_reported = 'None of the above' in string or 'Prefer not to say' in string
    if nothing_reported:
        return 'No'
    return 'Yes'
    
def clean_edlevel(string):
    """Group an education-level response into a coarse category.

    Mapping applied (first match wins, tested by substring):
      'Associate degree (A.A., A.S., etc.)'              -> 'Undergraduate'
      'Bachelor’s degree (B.A., B.S., B.Eng., etc.)'     -> 'Undergraduate'
      'Master’s degree (M.A., M.S., M.Eng., MBA, etc.)'  -> 'Master'
      'Other doctoral degree (Ph.D., Ed.D., etc.)'       -> 'PhD'

    Every other response (e.g. 'Primary/elementary school',
    'Professional degree (JD, MD, etc.)', 'Secondary school (...)',
    'Some college/university study without earning a degree',
    'Something else') falls through to 'Other'.
    """
    # Ordered (survey label, category) pairs; order mirrors the precedence of the checks.
    degree_groups = (
        ('Associate degree (A.A., A.S., etc.)', 'Undergraduate'),
        ('Bachelor’s degree (B.A., B.S., B.Eng., etc.)', 'Undergraduate'),
        ('Master’s degree (M.A., M.S., M.Eng., MBA, etc.)', 'Master'),
        ('Other doctoral degree (Ph.D., Ed.D., etc.)', 'PhD'),
    )
    for survey_label, category in degree_groups:
        if survey_label in string:
            return category
    return 'Other'

    
def clean_mainbranch(string):
    """Classify a MainBranch response as 'Dev' or 'NotDev'.

    Only responses containing 'I am a developer by profession' count as 'Dev'.
    All other known responses map to 'NotDev':
      'I am a student who is learning to code',
      'I am learning to code',
      'I am not primarily a developer, but I write code sometimes as part of my work',
      'I code primarily as a hobby',
      'I used to be a developer by profession, but no longer am',
      'None of these'.
    """
    return 'Dev' if 'I am a developer by profession' in string else 'NotDev'

    
def clean_employ(string):
    """Return 1 when an Employment response mentions 'full-time', otherwise 0.

    The match is a plain substring test, so both spellings seen in the data
    ('Employed full-time' and 'Employed, full-time') and any multi-select
    response that includes one of them (e.g.
    'Employed, full-time;Independent contractor, freelancer, or self-employed')
    yield 1. Responses such as 'Employed part-time', 'Employed, part-time',
    'Independent contractor, freelancer, or self-employed', 'Retired' or
    'I prefer not to say' yield 0.
    """
    return int('full-time' in string)
    
# Collapse the survey's age brackets into two groups: brackets up to
# '25-34 years old' become '<35', brackets from '35-44 years old' upward become '>35'.
age_mapping = {
    **dict.fromkeys(
        ('Under 18 years old', '18-24 years old', '25-34 years old'),
        '<35',
    ),
    **dict.fromkeys(
        ('35-44 years old', '45-54 years old', '55-64 years old', '65 years or older'),
        '>35',
    ),
}

In [None]:
# Clean columns kept

# Collapse the raw survey answers of the retained columns into coarse categories.
# Gender, Accessibility, Employment and Ethnicity are not in keep_col, so no
# cleaner is applied to them here.
df['EdLevel'] = df['EdLevel'].apply(clean_edlevel)
df['MainBranch'] = df['MainBranch'].apply(clean_mainbranch)
df['MentalHealth'] = df['MentalHealth'].apply(clean_mentalhealth)

# Drop rows where age is not filled in and change category names
age_withheld = df.index[df['Age'] == 'Prefer not to say']
df = df.drop(index=age_withheld)
df['Age'] = df['Age'].replace(age_mapping)

# Renumber the rows 0..n-1 after the removals, discarding the old index.
df = df.reset_index(drop=True)

In [None]:
# Create the custom target variable `custom_employment`: 1 when ConvertedCompYearly
# is greater than 40,000 and 0 otherwise. The yearly compensation column is then
# removed, otherwise the classification task will be too easy.

above_threshold = df['ConvertedCompYearly'] > 4e04
df['custom_employment'] = above_threshold.astype(int)
df = df.drop(columns=['ConvertedCompYearly'])

In [None]:
# Show the final shape followed by the first three rows, one per line of output.
print(df.shape, df.head(3), sep='\n')

In [None]:
# Non-null count of every remaining column, split by the value of custom_employment.
target_counts = df.groupby(['custom_employment']).count()
print(target_counts)

In [None]:
# Write the cleaned frame to disk. index=True is the pandas default, spelled out
# here: the row index is written as the first, unnamed column of the file.
df.to_csv('stackoverflow_clean.csv', index=True)