In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import country_converter as coco

# load the world city dataframe for debugging purposes 
# worldcities = pd.read_csv('worldcities.csv', usecols = ['city', 'city_ascii', 'country', 'iso3'])

In [2]:
''' functions and constant to be used to map survey result values '''
def mapStudentStatus(x):
    """Collapse a raw `Student` survey answer into a short status code.

    Parameters
    ----------
    x : str
        Either a raw survey answer ('Yes, part-time', 'Yes, full-time', ...)
        or a code that an earlier run of the cleaning cell already produced
        ('PT', 'FT', 'No', 'NA').

    Returns
    -------
    str
        'PT' for part-time students, 'FT' for full-time students, the input
        itself when it is already one of the cleaned codes, otherwise 'No'.
    """
    if x == 'Yes, part-time':
        return 'PT'
    if x == 'Yes, full-time':
        return 'FT'
    # Already-cleaned codes pass through unchanged. Without this, re-running the
    # cleaning cell would silently turn every 'PT' / 'FT' / 'NA' into 'No'.
    if isinstance(x, str) and x in ('PT', 'FT', 'NA'):
        return x
    return 'No'
def split_or_empty(x):
    """Turn a ';'-separated survey answer into a list of its items.

    Parameters
    ----------
    x : str, list, or missing value
        A ';'-joined string, a missing value (NaN / None), or a list that an
        earlier call of this function already produced.

    Returns
    -------
    list
        The items of the string, `[]` for a missing value, or the input list
        itself when `x` is already a list.
    """
    # A list means the value was split before (e.g. the cell was re-run).
    # pd.isna() on a list returns an array, which cannot be used in `if`,
    # so it has to be handled before the missing-value test.
    if isinstance(x, list):
        return x
    return [] if pd.isna(x) else x.split(';')

# Numeric scores for the satisfaction answers of each survey year.
# The 2018 map has seven answer levels scored 1..7; the 2019 map has only five,
# so its answers are spread over the same 1..7 range (1, 2.5, 4, 5.5, 7) to keep
# the two years on one scale.
SATISFACTION_MAP_2018 = {
    'Extremely satisfied': 7,
    'Moderately satisfied': 6,
    'Slightly satisfied': 5,
    'Neither satisfied nor dissatisfied': 4,
    'Slightly dissatisfied': 3,
    'Moderately dissatisfied': 2,
    'Extremely dissatisfied': 1,
}
SATISFACTION_MAP_2019 = {
    'Very dissatisfied': 1,
    'Slightly dissatisfied': 2.5,
    'Neither satisfied nor dissatisfied': 4,
    'Slightly satisfied': 5.5,
    'Very satisfied': 7,
}

# 2018 user survey data

In [3]:
# Exploratory notes kept for reference: apart from the bare string below,
# every line in this cell is commented out.
# NOTE(review): `survey_result_pub2018` is only assigned in commented-out code in the
# part of the notebook visible here, so these lines cannot be re-enabled as-is.
# plt.hist(survey_result_pub2018[survey_result_pub2018.ConvertedSalary != 'NA']['ConvertedSalary'].map(float))
# plt.show()

''' following shows using the converted salary seems to preserve most of more of the data '''
# Each expression filters out the literal string 'NA'; the number after it is the
# row count that was noted for it.
# print(survey_result_pub2018[survey_result_pub2018.Salary != 'NA'].Salary.size) # 50578
# print(survey_result_pub2018[(survey_result_pub2018.Currency != 'NA') & (survey_result_pub2018.Salary != 'NA')].Currency.size)  # 45150
# print(survey_result_pub2018[survey_result_pub2018.ConvertedSalary != 'NA'].ConvertedSalary.size) # 47702
# print(survey_result_pub2018.ConvertedSalary.size)    # 98855

' following shows using the converted salary seems to preserve most of more of the data '

In [4]:
# Load the subset of 2018 survey columns used by this notebook.
# - Country is a user-filled field, so it may need cleaning.
# - The AssessJob* / AssessBenefits* ranking columns would need clearer titles and
#   contain many NA values.
# - JobContactPriorities and JobEmailPriorities are on page 17 / 18.
survey2018 = (
    pd.read_csv(
        'part_of_data/developer_survey_2018/survey_results_public.csv',
        usecols=[
            'Hobby', 'Country', 'Student', 'YearsCoding', 'YearsCodingProf',
            'JobSatisfaction', 'CareerSatisfaction', 'Age', 'Gender',
            # add the following to include the ranking of criteria used to assess a job
            # 'DevType', 'AssessJob1', 'AssessJob2', 'AssessJob3', 'AssessJob4',
            # 'AssessJob5', 'AssessJob6', 'AssessJob7', 'AssessJob8', 'AssessJob9', 'AssessJob10',
            'ConvertedSalary', 'LanguageWorkedWith', 'LanguageDesireNextYear',
            'DatabaseWorkedWith', 'DatabaseDesireNextYear', 'PlatformWorkedWith',
            'PlatformDesireNextYear', 'FrameworkWorkedWith', 'FrameworkDesireNextYear',
            # interesting fields
            'AIFuture', 'EthicsChoice', 'EthicsResponsible', 'EthicalImplications',
            'StackOverflowRecommend', 'StackOverflowVisit', 'StackOverflowParticipate',
            # interesting fields 2
            'WakeTime', 'HoursComputer', 'HoursOutside', 'SkipMeals', 'Exercise',
            'RaceEthnicity',
        ],
        # Parse the file in one pass so every column gets a single dtype inferred
        # from all of its values. The saved run of this cell ended with a warning,
        # presumably pandas' mixed-type DtypeWarning from chunked parsing.
        low_memory=False,
    )
    # keep only rows that have at least 10 non-missing values
    .dropna(thresh=10)
    # renumber the remaining rows 0..n-1
    .reset_index(drop=True)
)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
''' converting the country into isconverted_countrycountry converter '''
# Convert the country names to ISO3 codes with country_converter; names it
# cannot resolve become the string 'NA'.
# The Series is built on survey2018's own index so the assignment below lines up
# row by row even when that index is not the default 0..n-1 range.
converted_country = pd.Series(
    coco.convert(names=survey2018.Country.tolist(), to='ISO3', not_found='NA'),
    index=survey2018.index,
)
survey2018['Country'] = converted_country



In [6]:
''' 
Hobby : no NA field 
Country : converted to iso3, other countries is mapped to NA
Student : {  No -> No, Full time -> FT, part time -> PT, NaN -> NA  }
Years of coding : just keep as it is --> as there are multiple reason why it is NA (coding as hobby / student)
Satisfaction : Nan is map to mean of the data
'''
# display(survey2018['JobSatisfaction'].head(20))
# display(survey2018[survey2018.JobSatisfaction == 'No']) 


''' cleaning student status '''
# The filled column is assigned back rather than calling fillna(inplace=True) on
# survey2018['Student']: an inplace call on a selected column is a chained
# assignment, which newer pandas versions warn about and which leaves the frame
# unchanged under Copy-on-Write.
survey2018['Student'] = (
    survey2018['Student']
    .map(mapStudentStatus, na_action='ignore')
    .fillna('NA')
)

''' cleaning year of satisfaction (can only run once) '''
# Answers are converted to their numeric score (an unknown answer raises KeyError),
# then missing answers are replaced by the mean score of the same column.
for satisfaction_col in ('JobSatisfaction', 'CareerSatisfaction'):
    mapped_scores = survey2018[satisfaction_col].map(lambda x: SATISFACTION_MAP_2018[x], na_action='ignore')
    survey2018[satisfaction_col] = mapped_scores.fillna(mapped_scores.mean())

display(survey2018.head(5), survey2018.tail(5))

# survey2018[(survey2018.Student.isna()) | (survey2018.JobSatisfaction.isna()) | (survey2018.CareerSatisfaction.isna())]

Unnamed: 0,Hobby,Country,Student,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,ConvertedSalary,LanguageWorkedWith,LanguageDesireNextYear,...,StackOverflowVisit,StackOverflowParticipate,WakeTime,HoursComputer,HoursOutside,SkipMeals,Exercise,Gender,RaceEthnicity,Age
0,Yes,KEN,No,3-5 years,3-5 years,7.0,7.0,,JavaScript;Python;HTML;CSS,JavaScript;Python;HTML;CSS,...,Multiple times per day,I have never participated in Q&A on Stack Over...,Between 5:00 - 6:00 AM,9 - 12 hours,1 - 2 hours,Never,3 - 4 times per week,Male,Black or of African descent,25 - 34 years old
1,Yes,GBR,No,30 or more years,18-20 years,2.0,4.0,70841.0,JavaScript;Python;Bash/Shell,Go;Python,...,A few times per month or weekly,A few times per month or weekly,Between 6:01 - 7:00 AM,5 - 8 hours,30 - 59 minutes,Never,Daily or almost every day,Male,White or of European descent,35 - 44 years old
2,No,USA,No,18-20 years,12-14 years,4.0,3.0,,C#;JavaScript;SQL;TypeScript;HTML;CSS;Bash/Shell,C#;JavaScript;SQL;TypeScript;HTML;CSS;Bash/Shell,...,A few times per week,A few times per month or weekly,Between 6:01 - 7:00 AM,9 - 12 hours,Less than 30 minutes,3 - 4 times per week,I don't typically exercise,Male,White or of European descent,35 - 44 years old
3,Yes,ZAF,PT,6-8 years,0-2 years,5.0,6.0,21426.0,C;C++;Java;Matlab;R;SQL;Bash/Shell,Assembly;C;C++;Matlab;SQL;Bash/Shell,...,Daily or almost daily,Less than once per month or monthly,Before 5:00 AM,Over 12 hours,1 - 2 hours,Never,3 - 4 times per week,Male,White or of European descent,18 - 24 years old
4,Yes,GBR,No,6-8 years,3-5 years,6.0,5.0,41671.0,Java;JavaScript;Python;TypeScript;HTML;CSS,C#;Go;Java;JavaScript;Python;SQL;TypeScript;HT...,...,A few times per month or weekly,Less than once per month or monthly,Between 7:01 - 8:00 AM,9 - 12 hours,30 - 59 minutes,1 - 2 times per week,1 - 2 times per week,Male,White or of European descent,18 - 24 years old


Unnamed: 0,Hobby,Country,Student,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,ConvertedSalary,LanguageWorkedWith,LanguageDesireNextYear,...,StackOverflowVisit,StackOverflowParticipate,WakeTime,HoursComputer,HoursOutside,SkipMeals,Exercise,Gender,RaceEthnicity,Age
79095,No,GBR,No,15-17 years,,5.04686,5.153767,,,,...,Daily or almost daily,,Between 6:01 - 7:00 AM,Over 12 hours,1 - 2 hours,1 - 2 times per week,I don't typically exercise,,,
79096,Yes,IND,,3-5 years,3-5 years,5.04686,5.153767,,,,...,Daily or almost daily,A few times per month or weekly,Between 6:01 - 7:00 AM,Over 12 hours,3 - 4 hours,Never,I don't typically exercise,,,
79097,Yes,ZAF,,3-5 years,0-2 years,5.04686,5.153767,,,,...,A few times per week,I have never participated in Q&A on Stack Over...,Between 5:00 - 6:00 AM,1 - 4 hours,1 - 2 hours,Never,3 - 4 times per week,,,
79098,Yes,HUN,No,3-5 years,0-2 years,5.04686,5.153767,,,,...,Daily or almost daily,A few times per month or weekly,I do not have a set schedule,5 - 8 hours,Less than 30 minutes,Never,I don't typically exercise,,,
79099,Yes,NPL,No,0-2 years,0-2 years,5.04686,5.153767,,,,...,Multiple times per day,A few times per week,I do not have a set schedule,5 - 8 hours,1 - 2 hours,1 - 2 times per week,I don't typically exercise,,,


In [7]:
'''
Age : NaN -> 'NA'
Gender : NaN -> 'NA'
'''
# survey2018[survey2018['Age'].isna()]     # 26632 rows does not provide age
# survey2018[survey2018['Gender'].isna()]    # 26738 rows does not provide gender
# Missing values become the string 'NA'. The result is assigned back to the frame:
# fillna(inplace=True) on an attribute-selected column is a chained assignment
# that newer pandas versions warn about and that does nothing under Copy-on-Write.
survey2018['Age'] = survey2018['Age'].fillna('NA')
survey2018['Gender'] = survey2018['Gender'].fillna('NA')

In [8]:
''' 
Converted Salary : fill na with student -> the converted salary = 0 , else leave as NA --> TBC
'''
# TODO: the imputation rule itself is still to be confirmed (marked TBC in the cell note).
# Only respondents who reported being a student (full- or part-time) get a missing
# salary replaced by 0. Testing `Student != 'No'` would also match 'NA', i.e. it
# would invent a salary of 0 for everyone who did not answer the student question.
is_student = survey2018.Student.isin(['FT', 'PT'])
survey2018.loc[survey2018.ConvertedSalary.isna() & is_student, 'ConvertedSalary'] = 0
# NOTE(review): the row count below was recorded before the condition was narrowed; re-measure.
# survey2018[(survey2018.ConvertedSalary.isna())]  # 29716 rows does not provide salary

In [9]:
'''
LanguageWorkedWith to FrameworkDesireNextYear : replace na with empty list, other split into list of lang / framework
'''
# clean all the fields (can only run once)

# survey2018[survey2018.LanguageWorkedWith.isna()].shape        # 13021 rows does not provide prog languages
# survey2018[survey2018.LanguageDesireNextYear.isna()].shape    # 18000 rows does not provide prog languages
# survey2018[survey2018.DatabaseWorkedWith.isna()].shape        # 24941 rows does not provide db 
# survey2018[survey2018.DatabaseDesireNextYear.isna()].shape    # 33709 rows does not provide db 
# survey2018[survey2018.PlatformWorkedWith.isna()].shape        # 25210 rows does not provide platform 
# survey2018[survey2018.PlatformDesireNextYear.isna()].shape    # 29857 rows does not provide platform 
# survey2018[survey2018.FrameworkWorkedWith.isna()].shape       # 39587 rows does not provide framework 
# survey2018[survey2018.FrameworkDesireNextYear.isna()].shape   # 35990 rows does not provide framework 


''' apply function to clean these fields '''
# The same label slice as before picks the columns, but each one is converted
# with Series.map and assigned back on its own. This avoids DataFrame.applymap
# (deprecated in recent pandas) and avoids writing a frame of lists through .loc.
tech_columns = survey2018.loc[:, 'LanguageWorkedWith':'FrameworkDesireNextYear'].columns
for tech_col in tech_columns:
    survey2018[tech_col] = survey2018[tech_col].map(split_or_empty)

In [10]:
''' mid progress check (only year of coding x2 and converted salary has NA values), 
    i.e. all the following tests should return false '''

# Explicit column lists instead of label slices. A slice such as
# 'CareerSatisfaction':'JobSatisfaction' or 'Age':'Gender' selects no columns
# when the second label comes before the first in the frame, as it does in the
# 2018 data, so the check would print False without inspecting any value.
print(survey2018[['Hobby', 'Country', 'Student']].isnull().values.any(),
      survey2018[['CareerSatisfaction', 'JobSatisfaction']].isnull().values.any(),
      survey2018[['LanguageWorkedWith', 'LanguageDesireNextYear']].isnull().values.any(),
      survey2018[['Age', 'Gender']].isnull().values.any())

False False False False


In [11]:
# Save the partially cleaned 2018 frame as a pickle file in the current directory.
survey2018.to_pickle('./survey2018_half_cleaned.pkl')

In [None]:
''' Other fields left for later.... '''

# 2019 user survey data

In [12]:
# Columns read from the 2019 export, listed under their original 2019 names.
columns_2019 = [
    'Hobbyist', 'Country', 'Student', 'YearsCode', 'YearsCodePro',
    'JobSat', 'CareerSat', 'Age', 'Gender',
    'ConvertedComp', 'LanguageWorkedWith', 'LanguageDesireNextYear',
    'DatabaseWorkedWith', 'DatabaseDesireNextYear', 'PlatformWorkedWith',
    'PlatformDesireNextYear', 'WebFrameWorkedWith', 'WebFrameDesireNextYear',
    'MiscTechWorkedWith', 'MiscTechDesireNextYear',
    # interesting fields
    'SOVisitFreq', 'SOPartFreq',
    # interesting fields 2
    'Ethnicity', 'WorkWeekHrs',
]

# 2019 column name -> name of the corresponding column in the 2018 frame.
renames_2019 = {
    'Hobbyist': 'Hobby',
    'YearsCode': 'YearsCoding',
    'YearsCodePro': 'YearsCodingProf',
    'JobSat': 'JobSatisfaction',
    'CareerSat': 'CareerSatisfaction',
    'ConvertedComp': 'ConvertedSalary',
    'SOVisitFreq': 'StackOverflowVisit',
    'SOPartFreq': 'StackOverflowParticipate',
    'Ethnicity': 'RaceEthnicity',
}

survey2019 = (
    pd.read_csv('part_of_data/developer_survey_2019/survey_results_public.csv', usecols=columns_2019)
    .rename(columns=renames_2019)
    .dropna(thresh=10)        # keep rows with at least 10 non-missing values
    .reset_index(drop=True)   # 88140 rows
)

In [13]:
''' converting the country into isconverted_countrycountry converter '''
# Convert the country names to ISO3 codes with country_converter; names it
# cannot resolve become the string 'NA'.
# The Series is built on survey2019's own index so the assignment below lines up
# row by row even when that index is not the default 0..n-1 range.
converted_country = pd.Series(
    coco.convert(names=survey2019.Country.tolist(), to='ISO3', not_found='NA'),
    index=survey2019.index,
)
survey2019['Country'] = converted_country



In [14]:
''' 
Hobby : no null fields
Country : no null fields (?!
Student : {  No -> No, Full time -> FT, part time -> PT, NaN -> NA  }
Years of coding : just keep as it is --> as there are multiple reason why it is NA (coding as hobby / student)
Satisfaction : Nan is map to mean of the data
'''
# display(survey2018['JobSatisfaction'].head(20))
# display(survey2018[survey2018.JobSatisfaction == 'No']) 

''' cleaning student status '''
# The filled column is assigned back rather than calling fillna(inplace=True) on
# survey2019['Student']: an inplace call on a selected column is a chained
# assignment, which newer pandas versions warn about and which leaves the frame
# unchanged under Copy-on-Write.
survey2019['Student'] = (
    survey2019['Student']
    .map(mapStudentStatus, na_action='ignore')
    .fillna('NA')
)

''' cleaning year of satisfaction (can only run once) '''
# Answers are converted to their numeric score (an unknown answer raises KeyError),
# then missing answers are replaced by the mean score of the same column.
for satisfaction_col in ('JobSatisfaction', 'CareerSatisfaction'):
    mapped_scores = survey2019[satisfaction_col].map(lambda x: SATISFACTION_MAP_2019[x], na_action='ignore')
    survey2019[satisfaction_col] = mapped_scores.fillna(mapped_scores.mean())

display(survey2019.head(5), survey2019.tail(5))

# Rows still missing any of the three cleaned columns; shown as the cell's result.
survey2019[(survey2019.Student.isnull()) | (survey2019.JobSatisfaction.isna()) | (survey2019.CareerSatisfaction.isna())]

Unnamed: 0,Hobby,Country,Student,YearsCoding,YearsCodingProf,CareerSatisfaction,JobSatisfaction,ConvertedSalary,WorkWeekHrs,LanguageWorkedWith,...,PlatformDesireNextYear,WebFrameWorkedWith,WebFrameDesireNextYear,MiscTechWorkedWith,MiscTechDesireNextYear,StackOverflowVisit,StackOverflowParticipate,Age,Gender,RaceEthnicity
0,Yes,GBR,No,4.0,,5.406422,5.028677,,,HTML/CSS;Java;JavaScript;Python,...,Android;Arduino;Windows,Django;Flask,Flask;jQuery,Node.js,Node.js,A few times per month or weekly,,14.0,Man,
1,No,BIH,FT,,,5.406422,5.028677,,,C++;HTML/CSS;Python,...,Windows,Django,Django,,,Daily or almost daily,A few times per month or weekly,19.0,Man,
2,Yes,THA,No,3.0,1,5.5,5.5,8820.0,40.0,HTML/CSS,...,,,Other(s):,,,A few times per week,Less than once per month or monthly,28.0,Man,
3,No,USA,No,3.0,Less than 1 year,7.0,5.5,61000.0,80.0,C;C++;C#;Python;SQL,...,Linux;Windows,,,.NET,.NET,Daily or almost daily,Less than once per month or monthly,22.0,Man,White or of European descent
4,Yes,UKR,No,16.0,9,1.0,2.5,,55.0,C++;HTML/CSS;Java;JavaScript;Python;SQL;VBA,...,Android;Docker;Kubernetes;Linux;Slack,Django;Express;Flask;jQuery;React.js;Spring,Flask;jQuery;React.js;Spring,Cordova;Node.js,Apache Spark;Hadoop;Node.js;React Native,Multiple times per day,A few times per month or weekly,30.0,Man,White or of European descent;Multiracial


Unnamed: 0,Hobby,Country,Student,YearsCoding,YearsCodingProf,CareerSatisfaction,JobSatisfaction,ConvertedSalary,WorkWeekHrs,LanguageWorkedWith,...,PlatformDesireNextYear,WebFrameWorkedWith,WebFrameDesireNextYear,MiscTechWorkedWith,MiscTechDesireNextYear,StackOverflowVisit,StackOverflowParticipate,Age,Gender,RaceEthnicity
88135,Yes,PAK,,1.0,Less than 1 year,5.406422,5.028677,,,HTML/CSS;Java;JavaScript,...,,Other(s):,,Other(s):,,I have never visited Stack Overflow (before to...,,,Man,
88136,No,ESP,No,18.0,15,5.406422,5.028677,,,HTML/CSS;JavaScript;Python,...,Arduino,Django;React.js,Django;React.js,,,A few times per week,I have never participated in Q&A on Stack Over...,40.0,Man,White or of European descent
88137,Yes,USA,No,38.0,38,5.406422,5.028677,,,Bash/Shell/PowerShell;Go;HTML/CSS;JavaScript;W...,...,Linux;Raspberry Pi,React.js,Vue.js,Node.js,Ansible,A few times per month or weekly,I have never participated in Q&A on Stack Over...,,Man,
88138,Yes,CAN,No,,,5.406422,5.028677,,,HTML/CSS;JavaScript;Other(s):,...,Google Cloud Platform;Linux,jQuery,jQuery;Vue.js,Node.js,React Native;Unity 3D;Unreal Engine,A few times per week,I have never participated in Q&A on Stack Over...,,Man,
88139,Yes,ESP,FT,8.0,3,5.406422,5.028677,,,Bash/Shell/PowerShell;HTML/CSS;Java;JavaScript...,...,Android;AWS;Google Cloud Platform;IBM Cloud or...,Django;jQuery;React.js,Django,Unity 3D;Unreal Engine,,Daily or almost daily,A few times per month or weekly,18.0,Man,Hispanic or Latino/Latina;White or of European...


Unnamed: 0,Hobby,Country,Student,YearsCoding,YearsCodingProf,CareerSatisfaction,JobSatisfaction,ConvertedSalary,WorkWeekHrs,LanguageWorkedWith,...,PlatformDesireNextYear,WebFrameWorkedWith,WebFrameDesireNextYear,MiscTechWorkedWith,MiscTechDesireNextYear,StackOverflowVisit,StackOverflowParticipate,Age,Gender,RaceEthnicity


In [15]:
'''
Age : NaN -> 'NA'
Gender : NaN -> 'NA'
'''
# survey2019[survey2019['Age'].isna()]     # 9061 rows does not provide age
# survey2019[survey2019['Gender'].isna()]    # 2997 rows does not provide gender
# Missing values become the string 'NA'. The result is assigned back to the frame:
# fillna(inplace=True) on an attribute-selected column is a chained assignment
# that newer pandas versions warn about and that does nothing under Copy-on-Write.
survey2019['Age'] = survey2019['Age'].fillna('NA')
survey2019['Gender'] = survey2019['Gender'].fillna('NA')

In [16]:
''' 
Converted Salary : fill na with student -> the converted salary = 0 , else leave as NA --> TBC
'''
# TODO: the imputation rule itself is still to be confirmed (marked TBC in the cell note).
# Only respondents who reported being a student (full- or part-time) get a missing
# salary replaced by 0. Testing `Student != 'No'` would also match 'NA', i.e. it
# would invent a salary of 0 for everyone who did not answer the student question.
is_student = survey2019.Student.isin(['FT', 'PT'])
survey2019.loc[survey2019.ConvertedSalary.isna() & is_student, 'ConvertedSalary'] = 0
# NOTE(review): the row count below was recorded before the condition was narrowed; re-measure.
# survey2019[(survey2019.ConvertedSalary.isna())]  # 32323 rows does not provide salary

In [17]:
'''
LanguageWorkedWith to MiscTechDesireNextYear : replace na with empty list, other split into list of lang / framework
'''

# survey2019[survey2019.LanguageWorkedWith.isna()].shape        # 822 rows does not provide prog languages
# survey2019[survey2019.LanguageDesireNextYear.isna()].shape    # 4263 rows does not provide prog languages
# survey2019[survey2019.DatabaseWorkedWith.isna()].shape        # 12180 rows does not provide db 
# survey2019[survey2019.DatabaseDesireNextYear.isna()] .shape   # 19036 rows does not provide db 
# survey2019[survey2019.PlatformWorkedWith.isna()].shape        # 7526 rows does not provide platform 
# survey2019[survey2019.PlatformDesireNextYear.isna()].shape    # 10776 rows does not provide platform 
# survey2019[survey2019.WebFrameWorkedWith.isna()].shape        # 23142 rows does not provide framework 
# survey2019[survey2019.WebFrameDesireNextYear.isna()].shape    # 25220 rows does not provide framework 
# survey2019[survey2019.MiscTechWorkedWith.isna()].shape        # 28578 rows does not provide misc tech
# survey2019[survey2019.MiscTechDesireNextYear.isna()].shape    # 23652 rows does not provide misc tech 

''' apply function to clean these fields (can only run once) '''
# The same label slice as before picks the columns, but each one is converted
# with Series.map and assigned back on its own. This avoids DataFrame.applymap
# (deprecated in recent pandas) and avoids writing a frame of lists through .loc.
tech_columns = survey2019.loc[:, 'LanguageWorkedWith':'MiscTechDesireNextYear'].columns
for tech_col in tech_columns:
    survey2019[tech_col] = survey2019[tech_col].map(split_or_empty)

In [18]:
''' mid progress check (only year of coding x2 and converted salary has NA values), 
    i.e. all the following tests should return false '''

# Explicit column lists instead of label slices: a label slice selects no
# columns when its second label comes before the first in the frame, so the
# slice-based check only works while the column order happens to match.
print(survey2019[['Hobby', 'Country', 'Student']].isnull().values.any(),
      survey2019[['CareerSatisfaction', 'JobSatisfaction']].isnull().values.any(),
      survey2019[['LanguageWorkedWith', 'LanguageDesireNextYear']].isnull().values.any(),
      survey2019[['Age', 'Gender']].isnull().values.any())

False False False False


In [19]:
# Save the partially cleaned 2019 frame as a pickle file in the current directory.
survey2019.to_pickle('./survey2019_half_cleaned.pkl')

# *************** Separator for old code ***************


In [4]:
# ''' cleaning function '''
# # remove NA in the questions that ask the user to rank different choices
# choices_ques = [col for col in survey_result_pub2018 if col[-1].isdigit()]
# priority_Arranging_Ques = [col for col in choices_ques if not any([k in col for k in ['Tools', 'Agree']])]

# def remove_NA_priorityQ(item):
#     return 0 if item == 'NA' else int(item)

# for col in priority_Arranging_Ques:
#     survey_result_pub2018[col] = survey_result_pub2018[col].map(remove_NA_priorityQ)
# survey_result_pub2018[priority_Arranging_Ques]

Unnamed: 0,AssessJob1,AssessJob2,AssessJob3,AssessJob4,AssessJob5,AssessJob6,AssessJob7,AssessJob8,AssessJob9,AssessJob10,...,JobEmailPriorities5,JobEmailPriorities6,JobEmailPriorities7,AdsPriorities1,AdsPriorities2,AdsPriorities3,AdsPriorities4,AdsPriorities5,AdsPriorities6,AdsPriorities7
0,10,7,8,1,2,5,3,4,9,6,...,1,4,3,1,5,4,7,2,6,3
1,1,7,10,8,2,5,4,3,6,9,...,2,6,7,3,5,1,4,6,7,2
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8,5,7,1,2,6,4,3,10,9,...,1,4,5,2,3,4,6,1,7,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98850,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98851,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98852,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98853,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# all(survey_result_pub2018['AdsPriorities2'].map(lambda x: isinstance(x, int))) # true after mapping
# # all(survey_result_pub2018[('AdsPriorities3')].map(lambda x: isinstance(x, str)))  # true in original df

True

In [25]:
# survey_result_pub2018.shape

(98855, 129)

In [3]:
# ''' cleaning function '''
# # remove the rows that the country is NA
# survey_result_pub2018 = survey_result_pub2018[survey_result_pub2018.Country != 'NA']

In [30]:
# survey_result_pub2018.shape

(98443, 129)

In [17]:
# survey_result_pub2018.groupby('Country').agg('count')
# ['Respondent' > 100]

TypeError: '>' not supported between instances of 'str' and 'int'

## Looking at the dataset of 2011

In [17]:
''' testing the loading of data sets ''' 
# survey_result_pub2013 = pd.read_csv('part_of_data/2013 Stack Overflow Survey Responses.csv', keep_default_na = False)
# survey_result_pub2014 = pd.read_csv('part_of_data/2014 Stack Overflow Survey Responses.csv', keep_default_na = False)
# survey_result_pub2015 = pd.read_csv('part_of_data/2015 Stack Overflow Developer Survey Responses.csv', keep_default_na = False)
# survey_result_pub2016 = pd.read_csv('part_of_data/2016 Stack Overflow Survey Results/2016 Stack Overflow Survey Responses.csv', keep_default_na = False)
# survey_result_pub2017 = pd.read_csv('part_of_data/developer_survey_2017/survey_results_public.csv', keep_default_na = False)
# survey_result_pub2018 = pd.read_csv('part_of_data/developer_survey_2018/survey_results_public.csv', keep_default_na = False)
# survey_result_pub2019 = pd.read_csv('part_of_data/developer_survey_2019/survey_results_public.csv', keep_default_na = False)
