### Exploratory Analysis of Stack Overflow Survey Data

#### Broadly speaking, we are interested in answering the following question: Who are the respondents?

In particular, we will look at the following information about respondents:
* Country of residence
* Gender
* Age
* Preferred current technology
* Technology they are excited about

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
import seaborn as sns
import os
import pprint
import column_rename_dicts as crd
%matplotlib inline
pd.options.display.max_seq_items = 500

In [9]:
# Regex pattern needed in data cleaning.
# It matches a whole string of the form <month><sep><day><sep><year> where
#   - month is 1-12 and day is 1-31, both written without a leading zero,
#   - each separator is one of '-', ' ', '/' or '.',
#   - year is a four-digit number from 1900 to 2099.
date_pattern = r"^([1-9]|1[012])[- /.]([1-9]|[12][0-9]|3[01])[- /.](19|20)\d\d$"

In [15]:
# Load the 2017 survey results; the path is relative to the current working directory.
df_2017 = pd.read_csv('./so-survey-2017/survey_results_public.csv')

In [24]:
# Preview the first rows of the four developer-type columns.
df_2017[['DeveloperType','WebDeveloperType','MobileDeveloperType','NonDeveloperType']].head()
# pd.__version__

Unnamed: 0,DeveloperType,WebDeveloperType,MobileDeveloperType,NonDeveloperType
0,,,,
1,,,,
2,Other,,,
3,,,,Data scientist
4,Mobile developer; Graphics programming; Deskto...,,,


In [23]:
# Number of rows for each CompanyType value.
df_2017.groupby(['CompanyType']).size()

CompanyType
Government agency or public school/university                             2451
I don't know                                                              3233
I prefer not to answer                                                    1816
Non-profit/non-governmental organization or private school/university     1225
Pre-series A startup                                                      1288
Privately-held limited company, not in startup mode                      16709
Publicly-traded corporation                                               5871
Sole proprietorship or partnership, not in startup mode                   2831
Something else                                                             342
State-owned company                                                        670
Venture-funded startup                                                    2387
dtype: int64

The first few rows of the 2013 and 2014 data indicate that there are date values in the following variables that do not make sense, given the expected variable type:
* 'YearsProgram'
* 'CompanySize'
* 'NumDevsAtCompany'
* 'SizeOfTeam'

Because every one of these values is a 2013 date, they look like a date stamp that was recorded in place of a missing response. Since it is not clear what the intended values were, we will recode them as missing.

In [18]:
def recode_null_not_null_as_0_1(df, cols_not_to_recode):
    '''
    Replace every column outside cols_not_to_recode with a 0/1 indicator.

    A recoded cell is 1 where the original value was present and 0 where it
    was missing. No check is made on what a recoded column contained, so it
    is up to the caller to exclude columns whose values matter (the recoding
    is meant for columns whose name already carries the relevant information).

    INPUT:
    df - pandas dataframe to recode
    cols_not_to_recode - list of column names to carry over unchanged

    OUTPUT:
    a new dataframe holding the unchanged columns first, followed by the
    0/1 indicator columns
    '''
    # Everything that is not explicitly protected becomes a presence flag.
    indicator_part = df.drop(columns=cols_not_to_recode).notnull().astype('int')
    untouched_part = df[cols_not_to_recode]
    return pd.concat([untouched_part, indicator_part], axis=1)

In [28]:
# Print row/column counts and missing-value statistics for the 2017 data.
df_missingness_stats(df_2017, '2017')

Number of Rows in 2017 data: 51392
Number of Columns in 2017 data: 154
Number of Columns in 2017 data with no missing values: 7
Number of Columns in 2017 data with > 75% missing values: 14
Number of Columns in 2017 data with all missing values: 0
Columns in 2017 data with no missing values: {'Country', 'ProgramHobby', 'Respondent', 'FormalEducation', 'University', 'Professional', 'EmploymentStatus'}
Columns in 2017 data with > 75% missing values: {'ExCoderBelonged', 'ExCoderWillNotCode', 'ExCoder10Years', 'ExpectedSalary', 'NonDeveloperType', 'ExCoderBalance', 'TimeAfterBootcamp', 'YearsCodedJobPast', 'ExCoderNotForMe', 'ExCoderSkills', 'ExCoderReturn', 'MobileDeveloperType', 'ExCoderActive', 'WebDeveloperType'}


In [33]:
# Number of rows for each value of the Professional column.
df_2017['Professional'].value_counts()

Professional developer                                  36131
Student                                                  8224
Professional non-developer who sometimes writes code     5140
Used to be a professional developer                       983
None of these                                             914
Name: Professional, dtype: int64

In [34]:
# Number of rows for each value of the ProgramHobby column.
df_2017['ProgramHobby'].value_counts()

Yes, I program as a hobby                    24801
Yes, both                                    13756
No                                            9787
Yes, I contribute to open source projects     3048
Name: ProgramHobby, dtype: int64

In [36]:
# Number of rows for each value of the University column.
df_2017['University'].value_counts()

No                     37543
Yes, full-time          9369
Yes, part-time          3352
I prefer not to say     1128
Name: University, dtype: int64

In [26]:
def df_missingness_stats(df, year):
    '''
    Print a summary of how much data is missing in a survey dataframe.

    INPUT:
    df - pandas dataframe to summarise (it is not modified)
    year - string used to label the output, e.g. '2017'

    OUTPUT:
    None. Prints the number of rows and columns; the number of columns with
    no missing values, with more than 75% missing values and with only
    missing values; and the names of the columns in the first two groups.
    '''
    # Keep the label in a local variable: assigning it to df.name would
    # overwrite a column called 'name' whenever the dataframe has one.
    label = year + ' data'

    # Compute the missingness summaries once and reuse them in every message.
    null_mask = df.isnull()
    share_missing = null_mask.sum() / len(df)
    complete_cols = set(df.columns[~null_mask.any()])
    mostly_missing_cols = set(df.columns[share_missing > .75])
    all_missing_cols = set(df.columns[share_missing == 1])

    print('Number of Rows in {}: {}'.format(label, df.shape[0]))
    print('Number of Columns in {}: {}'.format(label, df.shape[1]))
    print('Number of Columns in {} with no missing values: {}'.format(label, len(complete_cols)))
    print('Number of Columns in {} with > 75% missing values: {}'.format(label, len(mostly_missing_cols)))
    print('Number of Columns in {} with all missing values: {}'.format(label, len(all_missing_cols)))
    print('Columns in {} with no missing values: {}'.format(label, complete_cols))
    print('Columns in {} with > 75% missing values: {}'.format(label, mostly_missing_cols))

In [19]:
def create_dummy_df(df, cols_to_dummy, dummy_na=True):
    '''
    INPUT:
    df - pandas dataframe with categorical variables you want to dummy
    cols_to_dummy - list of strings that are associated with names of the categorical columns
    dummy_na - Bool holding whether you want to dummy NA vals of categorical columns or not

    OUTPUT:
    df - a new dataframe that has the following characteristics:
            1. contains all columns that were not listed in cols_to_dummy
            2. removes all the original columns in cols_to_dummy
            3. dummy columns for each of the categorical columns in cols_to_dummy,
               except for the first level of each column, which is dropped
            4. if dummy_na is True - it also contains dummy columns for the NaN values
            5. Use a prefix of the column name with an underscore (_) for separating

    Names in cols_to_dummy that are not columns of df are skipped.
    '''
    for col in cols_to_dummy:
        # Only a missing column is skipped on purpose; any other failure is a
        # real problem and is allowed to propagate instead of being hidden.
        try:
            column = df[col]
        except KeyError:
            continue

        dummies = pd.get_dummies(column,
                                 prefix=col,
                                 prefix_sep='_',
                                 drop_first=True,
                                 dummy_na=dummy_na)
        df = pd.concat([df.drop(col, axis=1), dummies], axis=1)

    return df

In [20]:
#creating labelEncoder
def process_y_var_split_data(df, yvar):
    '''
    Split a survey dataframe into features and a label-encoded target.

    INPUT:
    df - pandas dataframe holding the target column (it is not modified)
    yvar - name of the string-valued target column

    OUTPUT:
    X - dataframe of the rows that have a target value, without the yvar column
    y - array of integer class labels produced by LabelEncoder

    Also prints the class counts of the cleaned target.
    '''
    # Build the cleaned target as a separate Series so that the caller's
    # dataframe is left untouched.
    target = df[yvar].str.replace('<', 'less than ')
    target = target.str.replace('>', 'greater than ')

    # Rows without a response carry no label and are dropped. Testing for
    # nulls directly avoids comparing against the string 'nan'.
    has_target = target.notnull()
    target = target[has_target].astype('str')

    print(target.value_counts())
    le = preprocessing.LabelEncoder()
    X = df[has_target].drop(columns=[yvar])
    y = le.fit_transform(target)

    return X, y


In [None]:
# Split the 2013 data into features (X_2013) and label-encoded TotalCompensation classes (y_2013).
# NOTE(review): df_2013 is not created in any cell visible here - confirm it is loaded before this runs.
X_2013, y_2013 = process_y_var_split_data(df_2013, 'TotalCompensation')

In [43]:
# Split the 2014 data into features (X_2014) and label-encoded TotalCompensation classes (y_2014).
# NOTE(review): df_2014 is not created in any cell visible here - confirm it is loaded before this runs.
X_2014, y_2014 = process_y_var_split_data(df_2014, 'TotalCompensation')

Student / Unemployed     1147
Rather not say           1130
less than $20,000         970
$40,000 - $60,000         794
$20,000 - $40,000         746
$60,000 - $80,000         697
$80,000 - $100,000        587
$100,000 - $120,000       386
greater than $140,000     244
$120,000 - $140,000       199
Name: TotalCompensation, dtype: int64


In [24]:
# Split the 2015 data into features (X_2015) and label-encoded Compensation classes (y_2015).
# NOTE(review): df_2015 is not created in any cell visible here - confirm it is loaded before this runs.
X_2015, y_2015 = process_y_var_split_data(df_2015, 'Compensation')

Less than $20,000      4000
$20,000 - $40,000      2732
Rather not say         2628
$40,000 - $60,000      2429
$60,000 - $80,000      2007
Unemployed             1996
$80,000 - $100,000     1394
$100,000 - $120,000     991
$120,000 - $140,000     562
More than $160,000      462
$140,000 - $160,000     280
Name: Compensation, dtype: int64


In [72]:
# Split dataset into training set and test set
def compare_classifiers(X, y):
    '''
    Fit four classifiers on one train/test split and print their accuracy.

    INPUT:
    X - feature matrix accepted by the scikit-learn estimators
    y - target labels aligned with X

    OUTPUT:
    None. Prints the test-set accuracy of Gaussian naive Bayes, logistic
    regression, random forest and AdaBoost, in that order.
    '''
    # 70% training and 30% test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # Each entry pairs the printed label with the estimator class; the models
    # are built, trained and reported in this order, all with default settings.
    contenders = [
        ("GNB Accuracy:", GaussianNB),
        ("Logistic Regression Accuracy:", LogisticRegression),
        ("Random Forest Accuracy:", RandomForestClassifier),
        ("AdaBoost Accuracy:", AdaBoostClassifier),
    ]

    # Train every model and keep its test-set predictions before anything is
    # printed, so no partial report appears if a later model fails.
    predictions = []
    for label, make_model in contenders:
        model = make_model()
        model.fit(X_train, y_train)
        predictions.append((label, model.predict(X_test)))

    for label, y_pred in predictions:
        print(label, metrics.accuracy_score(y_test, y_pred))


GNB Accuracy: 0.14353163361661944
Logistic Regression Accuracy: 0.32483474976392823
Random Forest Accuracy: 0.2554296506137866
AdaBoost Accuracy: 0.279508970727101


In [25]:
# Split dataset into training set and test set
def compare_classifiers(X, y):
    '''
    Fit four classifiers on one train/test split and print their accuracy.

    INPUT:
    X - feature matrix accepted by the scikit-learn estimators
    y - target labels aligned with X

    OUTPUT:
    None. Prints the test-set accuracy of Gaussian naive Bayes, logistic
    regression, random forest and AdaBoost, in that order.
    '''
    # 70% training and 30% test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # Each entry pairs the printed label with the estimator class; the models
    # are built, trained and reported in this order, all with default settings.
    contenders = [
        ("GNB Accuracy:", GaussianNB),
        ("Logistic Regression Accuracy:", LogisticRegression),
        ("Random Forest Accuracy:", RandomForestClassifier),
        ("AdaBoost Accuracy:", AdaBoostClassifier),
    ]

    # Train every model and keep its test-set predictions before anything is
    # printed, so no partial report appears if a later model fails.
    predictions = []
    for label, make_model in contenders:
        model = make_model()
        model.fit(X_train, y_train)
        predictions.append((label, model.predict(X_test)))

    for label, y_pred in predictions:
        print(label, metrics.accuracy_score(y_test, y_pred))

In [45]:
# Print the test accuracy of the four classifiers on the 2014 split.
compare_classifiers(X_2014, y_2014)

GNB Accuracy: 0.1067632850241546
Logistic Regression Accuracy: 0.39806763285024155
Random Forest Accuracy: 0.3565217391304348
AdaBoost Accuracy: 0.3782608695652174


In [26]:
# Print the test accuracy of the four classifiers on the 2015 split.
compare_classifiers(X_2015, y_2015)

GNB Accuracy: 0.03866552609067579
Logistic Regression Accuracy: 0.38751069289991447
Random Forest Accuracy: 0.28160821214713433
AdaBoost Accuracy: 0.3286569717707442


In [8]:
# pp.pprint(crd.column_name_map_2014)

In [9]:
# pp.pprint(list(df_2014.columns))

In [10]:
def plot_value_counts(df, col, plot_title):
    '''
    Print the value counts of a column and plot them as proportions.

    INPUT:
    df - pandas dataframe holding the column
    col - name of the column to summarise
    plot_title - string used as the title of the bar chart

    OUTPUT:
    None. Prints the counts and draws a bar chart of the proportions.
    '''
    status_vals = df[col].value_counts()
    print(status_vals)
    # Bar heights are each category's count divided by the number of rows in
    # df, so rows with a missing value still count toward the denominator.
    (status_vals / df.shape[0]).plot(kind="bar")
    # Use the caller's title; it was previously ignored in favour of a fixed string.
    plt.title(plot_title)

In [None]:
def get_description(column_name, schema=schema):
    '''
    Look up the survey question behind a column name.

    INPUT - column_name - string - the name of the column you would like to know about
            schema - pandas dataframe with the schema of the developers survey;
                     it is read through its 'Column' and 'Question' columns
    OUTPUT -
            desc - string - the description of the column, taken from the
                   first 'Question' entry whose 'Column' equals column_name
    '''
    questions = schema['Question']
    is_requested = schema['Column'] == column_name
    return questions[is_requested].values[0]

In [None]:
# Labels handed to t.total_count by clean_and_plot when tallying the CousinEducation column.
# Presumably these are the individual answer options that can appear in that
# column - verify against the survey schema.
possible_vals = ["Take online courses", "Buy books and work through the exercises", 
                 "None of these", "Part-time/evening courses", "Return to college",
                 "Contribute to open source", "Conferences/meet-ups", "Bootcamp",
                 "Get a job as a QA tester", "Participate in online coding competitions",
                 "Master's degree", "Participate in hackathons", "Other"]

def clean_and_plot(df, title='Method of Educating Suggested', plot=True):
    '''
    INPUT 
        df - a dataframe holding the CousinEducation column
        title - string the title of your plot
        plot - bool providing whether or not you want a plot back
        
    OUTPUT
        props_study_df - a dataframe indexed by method, holding each method's
        share of the column totals of the table returned by t.total_count
        Displays a bar chart of those shares when plot is True.
    '''
    # Name the two columns explicitly rather than renaming the defaults of
    # value_counts().reset_index(): those default names differ between pandas
    # versions, which would leave the table without a 'method' column.
    study = (df['CousinEducation'].value_counts()
             .rename_axis('method')
             .reset_index(name='count'))
    study_df = t.total_count(study, 'method', 'count', possible_vals)

    study_df.set_index('method', inplace=True)
    # Compute the proportions once; the same table is plotted and returned.
    props_study_df = study_df / study_df.sum()
    if plot:
        props_study_df.plot(kind='bar', legend=None)
        plt.title(title)
        plt.show()
    return props_study_df
    
# Compute (and, by default, plot) the CousinEducation proportions.
# NOTE(review): `df` is not defined in any cell visible here - confirm which dataframe is meant.
props_df = clean_and_plot(df)