In [41]:
import os
import re #Regex lib
from datetime import datetime

import matplotlib
import numpy as np
import pandas as pd
from forex_python.converter import CurrencyRates #currencies lib

### Read in the dataframes and create a dict {year: dataframe}


In [42]:
# Build {survey year: DataFrame} for 2017-2020.
# Each frame keeps only the country column and that year's developer-type column
# (2017 names it 'DeveloperType', later years 'DevType'); dropna(thresh=2) then
# keeps only the rows that have at least two non-missing values.
data_frames_dict = {
    survey_year: (
        pd.read_csv('datasets/{}.csv'.format(survey_year), encoding="ISO-8859-1")
        .filter(items=['Country', devtype_column])
        .dropna(thresh=2)
    )
    for survey_year, devtype_column in [
        (2017, 'DeveloperType'),
        (2018, 'DevType'),
        (2019, 'DevType'),
        (2020, 'DevType'),
    ]
}
#preprocess 2021
def check_countryType(country, US_state, UK_country):
    """Return a normalised country name for one survey response.

    Parameters:
        country: value of the response's 'Country' field.
        US_state: value of the 'US_State' field; any string counts as "answered".
        UK_country: value of the 'UK_Country' field; any string counts as "answered".

    Returns:
        'United States of America' if US_state is a string or country is
        'United States'; otherwise 'United Kingdom' if UK_country is a string or
        country is 'United Kingdom of Great Britain and Northern Ireland';
        otherwise country unchanged.
    """
    # (region answer, raw country spelling, canonical name) -- checked in order,
    # so the United States rule wins when both region fields are filled in.
    rules = (
        (US_state, 'United States', 'United States of America'),
        (UK_country, 'United Kingdom of Great Britain and Northern Ireland', 'United Kingdom'),
    )
    for region_answer, raw_name, canonical_name in rules:
        if isinstance(region_answer, str) or country == raw_name:
            return canonical_name
    return country


# 2021 has extra 'US_State' / 'UK_Country' columns: use them to normalise
# 'Country' via check_countryType, then discard them so the frame has the same
# columns as the other years.
df_2021 = (
    pd.read_csv('datasets/2021.csv', encoding="ISO-8859-1")
    .filter(items=['Country', 'US_State', 'UK_Country', 'DevType'])
    .dropna(subset=['DevType'])
    .assign(
        Country=lambda frame: frame.apply(
            lambda row: check_countryType(row['Country'], row['US_State'], row['UK_Country']),
            axis=1,
        )
    )
    .drop(columns=['US_State', 'UK_Country'])
    .dropna(thresh=2)
)
data_frames_dict[2021] = df_2021
df_2021

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Country,DevType
0,Slovakia,"Developer, mobile"
3,Austria,"Developer, front-end"
4,United Kingdom,"Developer, desktop or enterprise applications;..."
8,India,"Developer, front-end"
9,Sweden,Data scientist or machine learning specialist
...,...,...
83434,United States of America,"Developer, back-end"
83435,Benin,"Developer, full-stack"
83436,United States of America,Data scientist or machine learning specialist;...
83437,Canada,"Developer, back-end"


### Obtain developer types


In [43]:
import json
import codecs
# The 2017 frame calls the developer-type column 'DeveloperType'; rename it to
# 'DevType' so every year can be processed through the same column name.
df_2017 = data_frames_dict[2017].rename(columns={'DeveloperType': 'DevType'})
data_frames_dict[2017] = df_2017

# Maps the different spellings of a developer type to one common name.
# Labels that do not appear as a key are left unchanged by get_unique_devetypes.
devTypes_clean_dict = {
    # Executives
    'C-suite executive (CEO, CTO, etc.)': 'Senior executive',
    'Senior Executive (C-Suite, VP, etc.)': 'Senior executive',
    'Senior executive/VP': 'Senior executive',

    # Data roles
    'Data or business analyst': 'Data Scientist/Engineer/Analyst',
    'Data scientist': 'Data Scientist/Engineer/Analyst',
    'Data scientist or machine learning specialist': 'Data Scientist/Engineer/Analyst',
    'Engineer, data': 'Data Scientist/Engineer/Analyst',
    'Machine learning specialist': 'Data Scientist/Engineer/Analyst',

    # Web development
    # 'Back-end developer' added for parity with the 'Front-end developer' and
    # 'Full-stack developer' spellings below, which were already merged into
    # 'Web developer'; it is a harmless no-op if that label never occurs.
    'Back-end developer': 'Web developer',
    'Developer, back-end': 'Web developer',
    'Developer, front-end': 'Web developer',
    'Developer, full-stack': 'Web developer',
    'Front-end developer': 'Web developer',
    'Full-stack developer': 'Web developer',

    # Desktop applications
    'Desktop or enterprise applications developer': 'Desktop applications developer',
    'Developer, desktop or enterprise applications': 'Desktop applications developer',

    # Embedded applications
    # 'Developer, embedded applications or devices' used to map to
    # 'Desktop applications developer', unlike the two other embedded spellings;
    # all three now share the same target.
    'Developer, embedded applications or devices': 'Embedded applications developer',
    'Embedded applications or devices developer': 'Embedded applications developer',
    'Embedded applications/devices developer': 'Embedded applications developer',

    # Games and graphics
    'Developer, game or graphics': 'Game developer',
    'Game or graphics developer': 'Game developer',
    'Graphic designer': 'Graphics Developer',
    'Graphics programming': 'Graphics Developer',

    # Mobile
    'Developer, mobile': 'Mobile developer',

    # Quality assurance
    'Developer, QA or test': 'Quality assurance engineer',
    'QA or test developer': 'Quality assurance engineer',

    # Operations
    'Systems administrator': 'System administrator',
    'Engineer, site reliability': 'Site reliability engineer',

    # Miscellaneous
    'Educator or academic researcher': 'Academic researcher',
    'Other (please specify):': 'Other',
}
# Helper: turn one ';'-separated DevType answer into a list of unique dev types.
def get_unique_devetypes(devtypesEntry):
    """Split a DevType answer and normalise each developer type.

    Parameters:
        devtypesEntry: string holding one or more developer types separated by ';'.

    Returns:
        A list of the distinct developer types in the entry. Each label has its
        surrounding whitespace stripped and is replaced by its value in
        devTypes_clean_dict when it is a key there. The list order is whatever
        the intermediate set yields, so callers should not rely on it.
    """
    normalised = set()
    for raw_label in devtypesEntry.split(';'):
        label = raw_label.strip()
        # Fall back to the label itself when no common name is registered.
        normalised.add(devTypes_clean_dict.get(label, label))
    return list(normalised)

# Keep only the rows that have a DevType entry.
# Series.notna() builds the boolean mask in one vectorised call instead of calling
# pd.isna once per row, and it stays a boolean mask even when the frame is empty
# (an empty apply() result is not boolean and would be read as a column selection).
# Re-assigning values of existing keys while iterating .items() is safe because
# the set of keys does not change.
for year, df in data_frames_dict.items():
    if 'DevType' in df:
        data_frames_dict[year] = df[df.DevType.notna()]
        

# Exploratory pass: collect every (already normalised) developer type that occurs
# in any year. The resulting set was inspected by hand to spot labels that should
# share one name, which is how devTypes_clean_dict was put together.
devTypes = set()
for year, df in data_frames_dict.items():
    if 'DevType' not in df:
        continue
    # Each entry is split on ';', stripped and normalised by get_unique_devetypes.
    devTypes |= {
        dev_type
        for entry in df.DevType
        for dev_type in get_unique_devetypes(entry)
    }

 
# For every year, replace the raw DevType string by the list of unique developer
# types the user reported, and record how many there are in
# 'DevTypes_count_per_user'.
# The stored frames may be row selections of the loaded data, so the new columns
# are attached with assign(), which returns a fresh frame, rather than being
# written into the selection in place (that triggers SettingWithCopyWarning).
# get_unique_devetypes is also evaluated once per row instead of twice.
for year, df in data_frames_dict.items():
    if 'DevType' in df:
        unique_types = df.DevType.apply(get_unique_devetypes)
        data_frames_dict[year] = df.assign(
            DevTypes_count_per_user=unique_types.apply(len),
            DevType=unique_types,
        )
  
# For each year, count how often each developer type was reported per country and
# write the result as {country: {developer type: count}} to
# processed_data/dev_types/<year>-dev_types_count_per_country.json.
dev_type_count_per_year = {}  # not filled in by this loop; kept in case other cells use the name
for year, df in data_frames_dict.items():
    if 'DevType' not in df:
        continue

    # One row per (user, developer type): a user with several types counts once
    # for each of them.
    df = df.explode('DevType')

    # size() counts the rows of every (Country, DevType) group directly, so no
    # helper column of ones is needed and no unrelated column gets summed.
    # Groups come back sorted by Country, then DevType.
    counts = df.groupby(['Country', 'DevType']).size()

    json_dict = {}
    for (country, dev_type), count in counts.items():
        # int() guarantees a plain Python integer for json.dump.
        json_dict.setdefault(country, {})[dev_type] = int(count)

    out_path = "processed_data/dev_types/{}-dev_types_count_per_country.json".format(year)
    # Create the output folder on first use so a fresh checkout can run the cell.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    # newline='' disables newline translation, so the bytes written match what
    # the previous codecs.open-based version produced on every platform.
    with open(out_path, "w", encoding='utf-8', newline='') as outfile:
        json.dump(json_dict, outfile, indent = 4, ensure_ascii=False)


        
