# Data Analysis of the Stack Overflow Developer Survey 2017

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
pd.options.display.max_rows = 4000
%matplotlib inline
plt.rcParams.update({'font.size':14})

ModuleNotFoundError: No module named 'pandas'

## Contents

* [Business Understanding](#business)
* [Data Understanding](#understanding)
* [Data Preparation](#preparation)
* [Data Modelling](#modelling)
* [Evaluating Results](#results)

<a id="business"></a>
## Business Understanding



<a id="understanding"></a>
## Data Understanding

In [None]:
# Load the raw survey responses.
survey_df = pd.read_csv('survey_results_public.csv')
# Load the schema here as well: a later cell in this section filters `survey_schema`,
# so it must already exist when the notebook is run top to bottom on a fresh kernel.
survey_schema = pd.read_csv('survey_results_schema.csv')
survey_df.head()

In [None]:
# Dimensions of the loaded survey table as (rows, columns).
survey_df.shape

In [None]:
# Number of non-missing values in each of the two salary columns.
print(survey_df['ExpectedSalary'].notnull().sum())
print(survey_df['Salary'].notnull().sum())

In [None]:
# Schema rows whose 'Column' value mentions "have" or "want" (case-insensitive),
# shown as a raw array.
survey_schema.loc[
    survey_schema['Column'].str.contains('have|want', flags=re.IGNORECASE)
].values

In [None]:
# Share of non-missing 'Overpaid' answers that say somewhat or greatly underpaid.
sum(
    1 if ('Somewhat underpaid' in answer or 'Greatly underpaid' in answer) else 0
    for answer in survey_df['Overpaid'].dropna()
) / len(survey_df['Overpaid'].dropna())

In [None]:
# Share of non-missing 'Overpaid' answers that contain "Neither".
sum(
    1 if 'Neither' in answer else 0
    for answer in survey_df['Overpaid'].dropna()
) / len(survey_df['Overpaid'].dropna())

In [None]:
# Share of non-missing 'Overpaid' answers that say somewhat or greatly overpaid.
sum(
    1 if ('Somewhat overpaid' in answer or 'Greatly overpaid' in answer) else 0
    for answer in survey_df['Overpaid'].dropna()
) / len(survey_df['Overpaid'].dropna())

In [None]:
# Frequency of each distinct answer in the 'HomeRemote' column (missing values are not counted).
survey_df['HomeRemote'].value_counts()

In [None]:
# Among rows whose 'HomeRemote' answer is anything other than 'Never', the share of
# non-missing 'CollaborateRemote' answers that contain "agree" but not "disagree".
# NOTE(review): rows with a missing 'HomeRemote' also pass the != 'Never' filter - confirm that is intended.
sum(
    1 if ('agree' in opinion.lower() and 'disagree' not in opinion.lower()) else 0
    for opinion in survey_df.loc[survey_df['HomeRemote'] != 'Never', 'CollaborateRemote'].dropna()
) / len(survey_df.loc[survey_df['HomeRemote'] != 'Never', 'CollaborateRemote'].dropna())


In [None]:
# Columns whose name contains "Edu". The pattern is a regular expression, so the earlier
# 'Edu*' meant "Ed" followed by zero or more "u" and also matched any column that merely
# contained "Ed".
survey_df.filter(regex='Edu')

In [None]:
# Keep just the parents'-education and salary columns, restricted to rows where both are present.
new_df = survey_df[['HighestEducationParents', 'Salary']].dropna()

<a id="preparation"></a>
## Data Preparation

In [None]:
def return_df(column_name, top_n=5, data=None):
    '''
        Doc String
        Description: Counts how often each option occurs in a column whose cells hold ';'-separated
                        values and returns the most frequent options. Options are compared after
                        stripping surrounding whitespace and lower-casing.
        Input: column_name - String - name of the column to summarise
               top_n - Integer - maximum number of rows to return (default 5)
               data - Dataframe - frame to read the column from; when omitted the notebook-level
                        survey_df is used
        Output: Dataframe - columns 'tech' and 'Count', sorted by 'Count' in descending order,
                        with at most top_n rows
    '''
    source = survey_df if data is None else data
    counts = {}
    for cell in source[column_name]:
        # Missing answers may appear as NaN, None or pd.NA depending on dtype. Testing for str
        # skips all of them; an identity comparison against np.nan only recognises one.
        if not isinstance(cell, str):
            continue
        for option in cell.split(';'):
            key = option.strip().lower()
            if key:  # ignore empty pieces left by a stray or trailing ';'
                counts[key] = counts.get(key, 0) + 1
    df = pd.DataFrame({'tech': list(counts.keys()), 'Count': list(counts.values())})
    # A stable sort keeps first-seen order among equal counts, so repeated calls agree.
    df = df.sort_values(by='Count', ascending=False, kind='mergesort')
    return df.iloc[:top_n, :]
# Try the helper on the 'HaveWorkedFramework' column and display the result.
return_df('HaveWorkedFramework')

In [None]:
education_type_count = {}
def get_count(education_list):
    '''
        Doc String
        Description: Adds the education types from one answer to the running tally kept in the
                        education_type_count dictionary. Keys are the types with surrounding
                        whitespace stripped. Anything that is not a list or tuple (for example the
                        NaN left by a missing answer) is ignored, and so is a list that contains
                        any non-string entry.
        Input: education_list - list of Strings, or a missing-value marker
        Output: None - education_type_count is updated in place
    '''
    # Type checks replace the former comparison against np.NaN: that alias was removed in
    # NumPy 2.0, so referencing it raises AttributeError there.
    if not isinstance(education_list, (list, tuple)):
        return
    if any(not isinstance(education, str) for education in education_list):
        return
    for education in education_list:
        key = education.strip()
        education_type_count[key] = education_type_count.get(key, 0) + 1

<a id="modelling"></a>
## Data Modelling

In [None]:
# Start from an empty tally so that re-running this cell does not double-count.
education_type_count.clear()
# get_count works purely by updating education_type_count; the Series returned by apply is unused.
survey_df['EducationTypes'].str.split(";").apply(get_count)
education_type_count

<a id="results"></a>
## Evaluating Results

# Load the survey schema and display it.
survey_schema = pd.read_csv('survey_results_schema.csv')
survey_schema

### What is the distribution of profession among the respondents?

In [None]:
# Count the 'Respondent' entries in each 'Professional' group and draw them as bars.
ax = (
    survey_df
    .groupby('Professional')['Respondent']
    .count()
    .plot.bar(figsize=(19, 7), width=0.25, title="Distribution of profession among the respondents")
)
ax.set_xlabel("Profession category")
ax.set_ylabel("Number of respondents")

**TODO:** Describe what the graph above shows.

### How many people were comfortable with telling us their salary and salary expectations?

# Developer related questions

### How many developers love solving problems?

In [None]:
# Bar chart of how often each 'ProblemSolving' answer was given.
ax = (
    survey_df['ProblemSolving']
    .value_counts()
    .plot.bar(figsize=(19, 7), title='Do developers love solving problems?')
)
ax.set_xlabel('Category of developer opinion')
ax.set_ylabel('Number of respondents')

### What is the distribution for level of satisfaction in career?

In [None]:
# Bar chart of 'CareerSatisfaction' answer counts, ordered by rating rather than by frequency.
ax = (
    survey_df['CareerSatisfaction']
    .value_counts()
    .sort_index()
    .plot.bar(figsize=(19, 7), title='Career Satisfaction')
)
ax.set_xlabel('Rating')
ax.set_ylabel('Number of respondents')
ax.grid(True)

### What is the distribution for level of satisfaction in job?

In [None]:
# Bar chart of 'JobSatisfaction' answer counts, ordered by rating rather than by frequency.
ax = (
    survey_df['JobSatisfaction']
    .value_counts()
    .sort_index()
    .plot.bar(figsize=(19, 7), title='Job Satisfaction')
)
ax.set_xlabel('Rating')
ax.set_ylabel('Number of respondents')
ax.grid(True)

### What is the distribution for level of satisfaction in career and jobs? Are people more satisfied with their career or job?


In [None]:
# Tally each rating for both questions, ordered by rating value.
sorted_df_job = survey_df['JobSatisfaction'].value_counts().sort_index()
sorted_df_career = survey_df['CareerSatisfaction'].value_counts().sort_index()

plt.figure(figsize=(19, 7))

# The career bars are shifted right by one bar width so they sit beside the job bars.
ax1 = plt.bar(sorted_df_job.index, sorted_df_job.values,
              width=0.25, label='Job Satisfaction')
ax2 = plt.bar(sorted_df_career.index + 0.25, sorted_df_career.values,
              width=0.25, label='Career Satisfaction')

plt.legend(handles=[ax1, ax2])
plt.ylabel('Counts')
plt.xlabel('Rating')
plt.title('Job Satisfaction vs Career Satisfaction')

### Of all the developers who gave an opinion on whether they are overpaid or underpaid:
* 56.46% think they are underpaid
* 7.3% think they are overpaid
* 36.13% think they are neither overpaid nor underpaid

In [None]:
# Bar chart of how often each 'Overpaid' answer was given.
ax = (
    survey_df['Overpaid']
    .value_counts()
    .plot.bar(figsize=(19, 7), width=0.25,
              title="Respondent's opinion about overpaid or underpaid at a job")
)
ax.set_xlabel('Category in overpaid')
ax.set_ylabel('Number of respondents')
ax.grid(True)

### Of all the developers who have worked remotely before, 69.67% feel that communication is difficult during remote work

### Which languages, frameworks, databases and platforms have developers worked with before, and which do they want to work with?

In [None]:
# 4x2 grid: one row per topic, "have worked with" on the left and "want to work with" on the
# right, all sharing one y-axis so bar heights are comparable across panels.
fig, ax = plt.subplots(4, 2, figsize=(22, 30), sharey=True)


def _short_platform_label(name):
    '''Shorten a platform name for use as a tick label: keep only the first word of names
    containing "desktop", then collapse any name containing "aws" to just "aws".'''
    if 'desktop' in name:
        name = name.split(' ')[0]
    return 'aws' if 'aws' in name else name


panel_columns = [
    ('HaveWorkedLanguage', 'WantWorkLanguage'),
    ('HaveWorkedFramework', 'WantWorkFramework'),
    ('HaveWorkedDatabase', 'WantWorkDatabase'),
    ('HaveWorkedPlatform', 'WantWorkPlatform'),
]

for row, column_pair in enumerate(panel_columns):
    for col, column_name in enumerate(column_pair):
        # return_df scans the whole column, so call it once per panel and reuse the result
        # for both the labels and the bar heights.
        top = return_df(column_name)
        labels = top['tech']
        if column_name.endswith('Platform'):
            labels = labels.apply(_short_platform_label)
        ax[row, col].bar(labels, top['Count'], width=0.25, label=column_name)
        ax[row, col].legend()
        ax[row, col].grid(True)

plt.show()

### How did respondents become developers?

In [None]:
fig, ax = plt.subplots(figsize = (19,7))
# Materialise both dict views once so the labels and the counts are plain lists in matching
# order (the values were previously handed to barh as a raw dict view).
learning_modes = list(education_type_count.keys())
respondent_counts = list(education_type_count.values())
ax.barh(learning_modes, respondent_counts)
ax.set_title('Different ways developers used to learn')
ax.set_xlabel('Number of respondents')
ax.set_ylabel('Mode of learning')
# Write each count just past the end of its bar.
for i, v in enumerate(respondent_counts):
    ax.text(v+4, i-0.05, str(v))

### How does a parent's education influence the success of the respondent?

In [None]:
# Mean of the numeric columns of new_df for each 'HighestEducationParents' group, as horizontal bars.
ax = new_df.groupby(by='HighestEducationParents').mean().plot(kind='barh', figsize=(19,7))
ax.set_title("Effect of parent's education on children")
# The bar length is a group mean of 'Salary', not a respondent count, so label the axis accordingly.
ax.set_xlabel('Mean salary')
ax.set_ylabel("Level of parents' education")