In [None]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns

In [None]:
# Load data, 2 years
# Read each yearly survey export, report its shape, then stack both into one frame.
df1 = pd.read_csv("survey_results_public_2021.csv")
df2 = pd.read_csv("survey_results_public_2022.csv")
print(df1.shape, df2.shape, sep="\n")

# ignore_index=True gives the combined frame a fresh 0..n-1 row index
df = pd.concat((df1, df2), ignore_index=True)
print(df.shape)
df.head(3)

In [None]:
# Select columns
# Columns retained for the analysis, grouped by theme.
keep_col = [
    # Technologies the respondent has worked with
    'LanguageHaveWorkedWith',
    'ToolsTechHaveWorkedWith',
    'WebframeHaveWorkedWith',
    'PlatformHaveWorkedWith',
    'DatabaseHaveWorkedWith',

    # Demographics
    'Age',
    'EdLevel',
    'MentalHealth',

    # Coding background
    'MainBranch',
    'YearsCode',
    'YearsCodePro',

    # Compensation
    'ConvertedCompYearly',
]
# Considered but currently excluded: ResponseId, MiscTechHaveWorkedWith,
# NEWCollabToolsHaveWorkedWith, every *WantToWorkWith column, Accessibility,
# Employment, Ethnicity, Gender, DevType, Currency, Country, CompTotal,
# CompFreq, Sexuality, OrgSize.


In [None]:
# Clean data: keep the selected columns, then drop incomplete rows.
# (The *HaveWorkedWith columns are merged and dropped in a later cell.)

df = df[keep_col]
print(df.shape)

# A row may leave *HaveWorkedWith answers empty, but every other kept column
# must be filled in for the row to survive.
required_cols = [name for name in keep_col if not name.endswith("HaveWorkedWith")]
df = df.dropna(subset=required_cols)
print(df.shape)


In [None]:
# Preview the first three rows after column selection and NaN removal
df.head(n=3)

In [None]:
# Clean data: reduce the number of distinct responses

def clean_gender(string):
    """Collapse a ';'-separated gender answer into 'Man', 'Woman' or 'Other'.

    'Man' is returned only when the first listed option is exactly 'Man'.
    Otherwise any answer containing 'Woman' maps to 'Woman', and everything
    else maps to 'Other'.
    """
    first_choice, _, _ = string.partition(';')
    if first_choice == 'Man':
        return 'Man'
    return 'Woman' if 'Woman' in string else 'Other'
    
def clean_accessibility(string):
    """Reduce an accessibility answer to 'Yes' / 'No'.

    Answers containing 'None of the above' or 'Prefer not to say' count as
    'No'; any other answer counts as 'Yes'.
    """
    negative_markers = ('None of the above', 'Prefer not to say')
    if any(marker in string for marker in negative_markers):
        return 'No'
    return 'Yes'
    
def clean_mentalhealth(string):
    """Reduce a mental-health answer to 'Yes' / 'No'.

    'No' is returned when the answer contains 'None of the above' or
    'Prefer not to say'; every other answer yields 'Yes'.
    """
    declined_or_none = (
        'None of the above' in string
        or 'Prefer not to say' in string
    )
    return 'No' if declined_or_none else 'Yes'
    
def clean_edlevel(string):
    """Bucket an education-level answer into a coarse category.

    Returns 'Undergraduate' for associate and bachelor's degrees, 'Master'
    for master's degrees, 'PhD' for other doctoral degrees, and 'Other' for
    everything else. Known answers that end up in 'Other':
        'Primary/elementary school',
        'Professional degree (JD, MD, etc.)',
        'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)',
        'Some college/university study without earning a degree',
        'Something else'
    """
    # Checked in order; the first answer text found in `string` wins.
    buckets = (
        ('Associate degree (A.A., A.S., etc.)', 'Undergraduate'),
        ('Bachelor’s degree (B.A., B.S., B.Eng., etc.)', 'Undergraduate'),
        ('Master’s degree (M.A., M.S., M.Eng., MBA, etc.)', 'Master'),
        ('Other doctoral degree (Ph.D., Ed.D., etc.)', 'PhD'),
    )
    for answer, bucket in buckets:
        if answer in string:
            return bucket
    return 'Other'

    
def clean_mainbranch(string):
    """Flag whether the respondent is currently a professional developer.

    Returns 'Dev' when the answer contains 'I am a developer by profession'
    and 'NotDev' for every other answer, e.g.:
        'I am a student who is learning to code',
        'I am learning to code',
        'I am not primarily a developer, but I write code sometimes as part of my work',
        'I code primarily as a hobby',
        'I used to be a developer by profession, but no longer am',
        'None of these'
    """
    is_professional = 'I am a developer by profession' in string
    return 'Dev' if is_professional else 'NotDev'

    
def clean_employ(string):
    """Return 1 when the employment answer mentions 'full-time', otherwise 0.

    The answer may combine several ';'-separated statuses (for example
    'Employed, full-time;Retired'); a single 'full-time' mention anywhere is
    enough for 1. Answers such as 'Employed, part-time', 'Retired',
    'I prefer not to say' or
    'Independent contractor, freelancer, or self-employed' give 0.
    """
    return int('full-time' in string)
    
# Survey age bracket -> coarse two-level age label
age_mapping = {
    bracket: label
    for label, brackets in (
        ('<35', ('Under 18 years old',
                 '18-24 years old',
                 '25-34 years old')),
        ('>35', ('35-44 years old',
                 '45-54 years old',
                 '55-64 years old',
                 '65 years or older')),
    )
    for bracket in brackets
}

In [None]:
# Clean columns kept
#
# Gender, Accessibility, Employment and Ethnicity are not part of keep_col, so
# their cleaners are not applied here.
#
# NOTE: this cell is not idempotent. Running it a second time feeds the
# already-reduced labels back into the cleaners (e.g. 'Undergraduate' would
# become 'Other'), so re-run from the data-loading cell instead.

# Drop respondents who declined to give their age. The explicit .copy() makes
# the filtered frame own its data, so the column assignments below write to a
# real frame rather than to a slice of the previous one (which is what
# triggers pandas' SettingWithCopyWarning).
df = df.loc[df['Age'] != 'Prefer not to say'].copy()

# Reduce the free-form survey answers to a few categories each
df['EdLevel'] = df['EdLevel'].apply(clean_edlevel)
df['MainBranch'] = df['MainBranch'].apply(clean_mainbranch)
df['MentalHealth'] = df['MentalHealth'].apply(clean_mentalhealth)

# Rename the remaining age brackets to the coarse '<35' / '>35' labels
df['Age'] = df['Age'].replace(age_mapping)

# Row filtering left gaps in the index; renumber from 0
df = df.reset_index(drop=True)

In [None]:
# clean HaveWorkedWith
#
# Merge the per-category "have worked with" answers into one ';'-separated
# string per respondent, then drop the individual columns.

cols = [col for col in df.columns if col.endswith("HaveWorkedWith")]

# Skip unanswered (NaN) categories while joining. Turning NaN into the text
# 'nan' and stripping it afterwards would also remove those three letters from
# inside any real technology name, and would depend on how Series.str.replace
# interprets its pattern in the installed pandas version.
# A respondent with no answers at all ends up with the empty string ''.
df['HaveWorkedWith'] = df[cols].apply(
    lambda row: ';'.join(str(value) for value in row if pd.notna(value)),
    axis=1,
)
df = df.drop(columns=cols)
print(df.shape)

In [None]:
# Create a custom employment target variable from the yearly compensation,
# then remove the compensation column; otherwise the classification task
# would be too easy.

print(df['ConvertedCompYearly'].describe())
earns_above_threshold = df['ConvertedCompYearly'] > 40_000
df['custom_employment'] = earns_above_threshold.astype(int)
df = df.drop(columns=['ConvertedCompYearly'])

In [None]:
# Class balance of the compensation-based target
df.loc[:, 'custom_employment'].value_counts()

In [None]:
# all skills
# Collect every distinct technology named in HaveWorkedWith.
skills = set()
for entry in df["HaveWorkedWith"]:
    # Respondents who listed nothing have the empty string here, and
    # "".split(";") yields [""]; filter blanks out so that the empty string
    # is not reported as a skill.
    if isinstance(entry, str):
        skills.update(item for item in entry.split(";") if item)
print(*sorted(skills))

In [None]:
# Create a custom employment target variable, using the total number of skills
def _count_skills(entry):
    """Number of ';'-separated items in `entry`; 0 for the empty string."""
    return entry.count(";") + 1 if entry != "" else 0


# The column first holds the raw skill count (so describe() reports on it) ...
df['custom_employment_skills'] = df["HaveWorkedWith"].apply(_count_skills)
print(df['custom_employment_skills'].describe())
# ... and is then overwritten with the 0/1 target: 1 when at least 15 skills are listed
has_many_skills = df['custom_employment_skills'] >= 15
df['custom_employment_skills'] = has_many_skills.astype(int)

In [None]:
# Class balance of the skill-count target
df.loc[:, 'custom_employment_skills'].value_counts()

In [None]:
# Skills most often mentioned by respondents with Age == <35 and MentalHealth == No
# One 0/1 indicator column per distinct skill
df_skills = df['HaveWorkedWith'].str.get_dummies(sep=";")

is_under_35 = df["Age"] == "<35"
reports_no_mental_health = df["MentalHealth"] == "No"

# Mention counts within that subgroup, keeping the ten largest
selected_skills = df_skills.loc[is_under_35 & reports_no_mental_health].sum().nlargest(n=10)
selected_skills

In [None]:
# Create a custom employment target variable, using the total number of selected skills
# Per respondent: how many of the selected skills they listed
selected_skill_flags = df_skills[selected_skills.index]
df['custom_employment_selected_skills'] = selected_skill_flags.sum(axis=1)
print(df['custom_employment_selected_skills'].describe())

# Overwrite the count with the 0/1 target: 1 when at least 6 selected skills are listed
knows_enough_selected = df['custom_employment_selected_skills'] >= 6
df['custom_employment_selected_skills'] = knows_enough_selected.astype(int)

# The raw skill string is dropped once the targets have been derived from it
df = df.drop(columns=['HaveWorkedWith'])

In [None]:
# Class balance of the selected-skills target
df.loc[:, 'custom_employment_selected_skills'].value_counts()

In [None]:
# Show three random rows, transposed so each column reads as a line.
# A fixed random_state keeps the displayed rows the same on every
# Restart & Run All.
df.sample(3, random_state=0).T

In [None]:
# Non-null value count of every other column, per class of the compensation-based target
counts_by_custom_employment = df.groupby(['custom_employment']).count()
print(counts_by_custom_employment)

In [None]:
# Non-null value count of every other column, per class of the skill-count target
counts_by_custom_employment_skills = df.groupby(['custom_employment_skills']).count()
print(counts_by_custom_employment_skills)

In [None]:
# Non-null value count of every other column, per class of the selected-skills target
counts_by_custom_employment_selected_skills = df.groupby(['custom_employment_selected_skills']).count()
print(counts_by_custom_employment_selected_skills)

In [None]:
# Persist the cleaned frame next to the notebook.
# NOTE(review): with to_csv's defaults the row index is written as an unnamed
# first column; pass index=False if downstream readers do not expect it — confirm.
df.to_csv(path_or_buf='stackoverflow_clean.csv')