In [110]:
import pandas as pd
from config import config
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import os

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
    
plt.rcParams['figure.figsize'] = [10, 10]

In [111]:
# create cache directory
os.makedirs('cache', exist_ok=True)

This notebook assumes the data has already been downloaded and is placed where it should be (run `python3 fetch_data.py`).

Business Understanding

* Is the salary correlated with some programming language, framework or platform?
* How much influence does the country the developer is working in have?
* Are we able to predict salary based on the Stack Overflow data?

In [112]:
config[-1]

{'year': '2020',
 'url': 'https://drive.google.com/uc?id=1dfGerWeWkcyQ9GX9x20rdSGj7WtEpzBB&export=download',
 'local_path': 'data/2020.zip',
 'unpack_path': 'data/unpack/2020.zip',
 'data_path': 'data/unpack/2020.zip/survey_results_public.csv',
 'json_path': '2020.json',
 'leave_columns': ['Respondent', 'CompTotal', 'ConvertedComp']}

In [113]:
df_raw = pd.read_csv(config[-1]['data_path'], dtype=str)

The salary is encoded here as `ConvertedComp` - it is in USD.

In [114]:
len(df_raw)

64461

In [115]:
df_raw.columns

Index(['Respondent', 'MainBranch', 'Hobbyist', 'Age', 'Age1stCode', 'CompFreq',
       'CompTotal', 'ConvertedComp', 'Country', 'CurrencyDesc',
       'CurrencySymbol', 'DatabaseDesireNextYear', 'DatabaseWorkedWith',
       'DevType', 'EdLevel', 'Employment', 'Ethnicity', 'Gender', 'JobFactors',
       'JobSat', 'JobSeek', 'LanguageDesireNextYear', 'LanguageWorkedWith',
       'MiscTechDesireNextYear', 'MiscTechWorkedWith',
       'NEWCollabToolsDesireNextYear', 'NEWCollabToolsWorkedWith', 'NEWDevOps',
       'NEWDevOpsImpt', 'NEWEdImpt', 'NEWJobHunt', 'NEWJobHuntResearch',
       'NEWLearn', 'NEWOffTopic', 'NEWOnboardGood', 'NEWOtherComms',
       'NEWOvertime', 'NEWPurchaseResearch', 'NEWPurpleLink', 'NEWSOSites',
       'NEWStuck', 'OpSys', 'OrgSize', 'PlatformDesireNextYear',
       'PlatformWorkedWith', 'PurchaseWhat', 'Sexuality', 'SOAccount',
       'SOComm', 'SOPartFreq', 'SOVisitFreq', 'SurveyEase', 'SurveyLength',
       'Trans', 'UndergradMajor', 'WebframeDesireNextYear',
  

The Stack Overflow data contains a lot of columns (61 in total). Most of the columns hold strings drawn from a set of possible answers; only a few contain floating point values. In total, we have 64461 responses, of which about 53.9% (34756) contain an answer about the respondent's current job salary. That is a lower share than for job satisfaction, which we analysed at the beginning of this year (70%).

In [116]:
# keep only the respondents that reported a (converted) salary
df = df_raw.dropna(subset=['ConvertedComp'])

In [117]:
float(len(df)) / len(df_raw)

0.5391787282232667

In [118]:
# del df_raw

In [119]:
len(df)

34756

In [120]:
df['CompTotal'].describe()

count     34756
unique     2997
top       1e+05
freq        767
Name: CompTotal, dtype: object

In [121]:
set(df['CompFreq'])

{'Monthly', 'Weekly', 'Yearly'}

We have a total of 2997 unique `CompTotal` values. However, these values can refer to 'Monthly', 'Weekly' or 'Yearly' compensation (see `CompFreq`) - that needs to be considered when doing the data preparation.

# Section 3: Data Preparation

## Selection

All rows that don't have a salary defined were removed from the dataset.

## Construct

I created new columns out of the existing ones in order to extract the features we want to analyse. This was done for all columns, resulting in a very wide table. Run `preprocessing.py` in the project directory.

In [122]:
df_raw.columns

Index(['Respondent', 'MainBranch', 'Hobbyist', 'Age', 'Age1stCode', 'CompFreq',
       'CompTotal', 'ConvertedComp', 'Country', 'CurrencyDesc',
       'CurrencySymbol', 'DatabaseDesireNextYear', 'DatabaseWorkedWith',
       'DevType', 'EdLevel', 'Employment', 'Ethnicity', 'Gender', 'JobFactors',
       'JobSat', 'JobSeek', 'LanguageDesireNextYear', 'LanguageWorkedWith',
       'MiscTechDesireNextYear', 'MiscTechWorkedWith',
       'NEWCollabToolsDesireNextYear', 'NEWCollabToolsWorkedWith', 'NEWDevOps',
       'NEWDevOpsImpt', 'NEWEdImpt', 'NEWJobHunt', 'NEWJobHuntResearch',
       'NEWLearn', 'NEWOffTopic', 'NEWOnboardGood', 'NEWOtherComms',
       'NEWOvertime', 'NEWPurchaseResearch', 'NEWPurpleLink', 'NEWSOSites',
       'NEWStuck', 'OpSys', 'OrgSize', 'PlatformDesireNextYear',
       'PlatformWorkedWith', 'PurchaseWhat', 'Sexuality', 'SOAccount',
       'SOComm', 'SOPartFreq', 'SOVisitFreq', 'SurveyEase', 'SurveyLength',
       'Trans', 'UndergradMajor', 'WebframeDesireNextYear',
  

In [123]:


def load(column:str,  year:int) -> pd.DataFrame:
    '''
    Load the precalculated features of one survey column from the cache.

    The features are read from ``cache/<year>_<column>.json`` (JSON lines format).
    Nothing is calculated here: the cache file has to exist already
    (presumably written by ``preprocessing.py`` - TODO confirm).

    :param column: (str) The survey column whose features should be loaded
    :param year: (int) The year of the data, used as part of the cache key
    :returns: pd.DataFrame, the cached features of the column
    :raises FileNotFoundError: if there is no cache file for this column and year
    '''

    cache_key = f"{year}_{column}.json"
    cache_path = os.path.join('cache', cache_key)
    if not os.path.exists(cache_path):
        # Fail loudly here: returning None would only surface later as an
        # unrelated AttributeError in the caller.
        raise FileNotFoundError(
            f"No cached features found at '{cache_path}' "
            f"(column={column!r}, year={year!r}); create the cache first."
        )
    return pd.read_json(cache_path, lines=True)


def plot_distribution(column:str, year:int, sort_by:str='values') -> go.Figure:
    '''
    Plots the per-feature sums of a cached survey column as a horizontal bar chart.

    The cached features of the column are loaded with ``load``, every feature
    column is summed up and the sums are shown as one bar per feature.

    :param column: (str) The survey column with the data to plot
    :param year: (int) The year of the data, used to find the cached features
    :param sort_by: (str) 'values' orders the bars by their sum (ascending),
        'index' orders them by feature name (descending)
    :returns: go.Figure, ready to use plotly figure
    :raises ValueError: if sort_by is neither 'values' nor 'index'
    '''

    # Validate up front; otherwise an unknown value would leave the sorted
    # frame undefined and fail with an UnboundLocalError further down.
    if sort_by not in ('values', 'index'):
        raise ValueError(f"sort_by must be 'values' or 'index', got {sort_by!r}")

    df_cached = load(column, year)
    df_sum = df_cached.sum().to_frame()
    if sort_by == 'values':
        df_sorted = df_sum.sort_values(by=0, ascending=True)
    else:
        df_sorted = df_sum.sort_index(ascending=False)

    # Use only the part after the last underscore of each feature name as label
    labels = [name.split('_')[-1] for name in df_sorted.index]
    totals = df_sorted.values.flatten()

    fig = go.Figure(data=[go.Bar(x=totals, y=labels, text=labels, orientation='h')])
    # A title keeps the many per-column figures distinguishable when skimming
    fig.update_layout(title_text=f"{column} ({year})")
    return fig


In [124]:
plot_distribution('MainBranch', 2020).show()


In [125]:
plot_distribution('Age', 2020, 'index').show()

In [126]:
plot_distribution('Age1stCode', 2020, 'index').show()

Age can be converted into ranges of age.

In [127]:
plot_distribution('CompFreq', 2020).show()

There are 19107 yearly, 14680 monthly and 969 weekly salary entries.

In [128]:
plot_distribution('Country', 2020, 'index').show()

In [129]:
plot_distribution('DatabaseDesireNextYear', 2020).show()

In [130]:
plot_distribution('DatabaseWorkedWith', 2020).show()

In [131]:
plot_distribution('EdLevel', 2020).show()

In [132]:
plot_distribution( 'Employment', 2020).show()

In [133]:
plot_distribution('Ethnicity', 2020).show()

In [134]:
plot_distribution('Gender', 2020).show()

In [135]:
plot_distribution('JobFactors', 2020).show()

In [136]:
plot_distribution('JobSat', 2020).show()

In [137]:
plot_distribution('JobSeek', 2020).show()

In [138]:
plot_distribution('LanguageDesireNextYear', 2020).show()

In [139]:
plot_distribution('LanguageWorkedWith', 2020).show()

In [140]:
plot_distribution('MiscTechDesireNextYear', 2020).show()

In [141]:
plot_distribution('MiscTechWorkedWith', 2020).show()

In [142]:
plot_distribution('NEWCollabToolsDesireNextYear', 2020).show()

In [143]:
plot_distribution('NEWCollabToolsWorkedWith', 2020).show()

In [144]:
plot_distribution('NEWDevOps', 2020).show()

In [145]:
plot_distribution('NEWDevOpsImpt', 2020).show()

In [146]:
plot_distribution('NEWEdImpt', 2020).show()

In [147]:
plot_distribution('NEWJobHunt', 2020).show()

In [148]:
plot_distribution('NEWJobHuntResearch', 2020).show()

In [149]:
plot_distribution('NEWLearn', 2020).show()

In [150]:
plot_distribution('NEWOffTopic', 2020).show()

In [151]:
plot_distribution('NEWOnboardGood', 2020).show()

In [152]:
plot_distribution('NEWOtherComms', 2020).show()

In [153]:
plot_distribution('NEWOvertime', 2020).show()

In [154]:
plot_distribution('NEWPurchaseResearch', 2020).show()

In [155]:
plot_distribution('NEWPurpleLink', 2020).show()

In [156]:
plot_distribution('NEWSOSites', 2020).show()

In [157]:
plot_distribution('NEWStuck', 2020).show()

In [158]:
plot_distribution('OpSys', 2020).show()

In [159]:
plot_distribution('OrgSize', 2020).show()

In [160]:
plot_distribution('PlatformDesireNextYear', 2020).show()

In [161]:
plot_distribution('PlatformWorkedWith', 2020).show()

In [162]:
plot_distribution('PurchaseWhat', 2020).show()

In [163]:
plot_distribution('Sexuality', 2020).show()

In [164]:
plot_distribution('SOAccount', 2020).show()

In [165]:
plot_distribution('SOComm', 2020).show()

In [166]:
plot_distribution('SOPartFreq', 2020).show()

In [167]:
plot_distribution('SOVisitFreq', 2020).show()

In [168]:
plot_distribution('SurveyEase', 2020).show()

In [169]:
plot_distribution('SurveyLength', 2020).show()

In [170]:
plot_distribution('Trans', 2020).show()

In [171]:
plot_distribution('UndergradMajor', 2020).show()

In [172]:
plot_distribution('WebframeDesireNextYear', 2020).show()

In [173]:
plot_distribution('WebframeWorkedWith', 2020).show()

In [174]:
plot_distribution('WelcomeChange', 2020).show()

In [178]:
plot_distribution('WorkWeekHrs', 2020, 'index').show()

In [176]:
plot_distribution('YearsCode', 2020, 'index').show()

In [177]:
plot_distribution('YearsCodePro', 2020).show()

* Group Age, Age1stCode, YearsCode and YearsCodePro into similar ranges
* Group WorkWeekHrs into similar ranges
* Group Country into regions