### Exploratory Analysis of Stack Overflow Survey Data

#### Broadly speaking, we are interested in answering the following question: Who are the respondents?

In particular, we will look at the following information about respondents:
* Country of residence
* Gender
* Age
* Preferred current technology
* Technology they are excited about

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
import seaborn as sns
import os
import pprint
import column_rename_dicts as crd
%matplotlib inline
pd.options.display.max_seq_items = 500

  from numpy.core.umath_tests import inner1d


In [3]:
#regex patterns needed in data cleaning
date_pattern = r"^([1-9]|1[012])[- /.]([1-9]|[12][0-9]|3[01])[- /.](19|20)\d\d$"

In [4]:
#Load Data
# df_2011 = pd.read_csv("2013 Stack Overflow Survey Responses.csv")
# df_2012 = pd.read_csv('\\data\\2012\\2012 Stack Overflow Survey Results.csv')
df_2013 = pd.read_csv('.\\data\\2013\\2013 Stack Overflow Survey Responses.csv', dtype=str)
schema_2013 = pd.read_csv('.\\data\\2013\\2013_Schema.csv')
df_2014 = pd.read_csv('.\\data\\2014\\2014 Stack Overflow Survey Responses.csv')
df_2015 = pd.read_csv('.\\data\\2015\\2015 Stack Overflow Developer Survey Responses.csv', skiprows=1,dtype=str)
df_2016 = pd.read_csv('.\\data\\2016\\2016 Stack Overflow Survey Results\\2016 Stack Overflow Survey Responses.csv')
df_2017 = pd.read_csv('.\\data\\2017\\survey_results_public.csv')
df_2018 = pd.read_csv('.\\data\\2018\\survey_results_public.csv', dtype=str)
df_2019 = pd.read_csv('.\\data\\2019\\survey_results_public.csv')

In [5]:
# df_2017[['DeveloperType','WebDeveloperType','MobileDeveloperType','NonDeveloperType']].head()
# df_2015['Occupation'].value_counts()
# df_2017.groupby(['CompanyType']).size()
# print(count)
# pd.__version__

The first few rows of the 2013 and 2014 data indicate that there are date values in the following variables that do not make sense, given the expected variable type:
* 'YearsProgram'
* 'CompanySize'
* 'NumDevsAtCompany'
* 'SizeOfTeam'

The fact that they are all 2013 dates suggests that they may be some sort of date stamp in lieu of a non-response. Since it is not clear what they ought to be, we will recode them as missing.

In [6]:
def basic_cleaning_13_14_data(df, col_rename_map, col_to_modify, col_to_drop):
    df=df.rename(columns=col_rename_map).drop([0]).reset_index(drop=True)
    df.drop(columns=col_to_drop, inplace = True)
    for col in col_to_modify:
        df[col] = df[col].str.replace(date_pattern, 'NaN')

    return df

In [6]:
col_to_modify13 = ['YearsProgram', 'CompanySize', 'NumDevsAtCompany', 'SizeOfTeam']
col_to_drop13 = ['StckOvrflwAdsRemembered']
df_2013 = basic_cleaning_13_14_data(df_2013, crd.column_rename_map_2013, col_to_modify13, col_to_drop13)
df_2013.head()

Unnamed: 0,Country,USState,Age,YearsProgram,Industry,CompanySize,Occupation,NumDevsAtCompany,SizeOfTeam,InteractWithSysAdm,...,StckOvrflwAdsEntertaining,StckOvrflwAdsInformative,StckOvrflwAdsClickThruIntrstingAds,StckOvrflwAdsPurchasedProduct,StckOvrflwAdsUseAdBlocker,StckOvrflwReputation,StckOvrflwUseReadOthrQs,StckOvrflwUseAskQs,StckOvrflwUseAnsQs,StckOvrflwUseOther
0,United Kingdom,,35-39,,Finance / Banking,101-999,Enterprise Level Services,100.0,,System Administrators,...,Neutral,Neutral,Neutral,Neutral,Neutral,Don't have an account,Read other people's questions to solve my prob...,,,
1,United States of America,Oregon,25-29,,Retail,101-999,Back-End Web Developer,,,System Administrators,...,Agree,Neutral,Agree,Disagree,Neutral,1,Read other people's questions to solve my prob...,Ask questions to solve problems,Answer questions I know the answer to,
2,United States of America,Wisconsin,51-60,11.0,Software Products,26-100,Enterprise Level Services,,Just me!,System Administrators,...,Neutral,Neutral,Strongly Disagree,Strongly Disagree,Strongly Disagree,Don't have an account,Read other people's questions to solve my prob...,,,
3,Germany,,,,,,,,,,...,,,,,,,,,,
4,United States of America,Idaho,35-39,11.0,Consulting,,,,,,...,,,,,,,,,,


In [7]:
df_2013['Occupation'].value_counts()

Full-Stack Web Developer                 2387
Desktop Software Developer                929
Back-End Web Developer                    768
Student                                   708
Mobile Application Developer              631
Other                                     555
Manager of Developers or Team Leader      535
Enterprise Level Services                 432
Front-End Web Developer                   420
IT Staff / System Administrator           224
Embedded Application Developer            210
Executive (VP of Eng, CTO, CIO, etc.)     195
I don't work in tech                      160
Database Administrator                     64
Name: Occupation, dtype: int64

In [8]:
col_to_modify14 = ['YearsProgram', 'NumDevsAtCompany']
col_to_drop14 = ['StckOvrflwAdsRemembered']
df_2014 = basic_cleaning_13_14_data(df_2014, crd.column_rename_map_2014, col_to_modify14, col_to_drop14)
df_2014.head()

Unnamed: 0,Country,CountryOther,USState,Age,Gender,YearsProgram,Occupation,TotalCompensation,Industry,NumDevsAtCompany,...,AwareOfApptivate,ParticipatedInApptivate,StckOvrflwReputation,StckOvrflwUseReadOthrQs,StckOvrflwUseAskQs,StckOvrflwUseAnsQs,StckOvrflwUseJobSearch,StckOvrflwUseBuildOnlineRep,StckOvrflwUseOther,FreqFindQSolnOnStckOvrflw
0,India,,,30-34,Female,,Back-End Web Developer,"$20,000 - $40,000",Finance / Banking,100.0,...,No,No,500,Read other people's questions to solve my prob...,Ask questions to solve problems,,,,,Almost Always
1,Thailand,,,20-24,Male,<2,Back-End Web Developer,Student / Unemployed,Healthcare,,...,,,Don't have an account,Read other people's questions to solve my prob...,,,,,,
2,Iran,,,25-29,Male,,Desktop Software Developer,"<$20,000",Not Currently Employed,,...,No,No,1,Read other people's questions to solve my prob...,,,,,,Almost Always
3,Ukraine,,,< 20,Male,<2,Student,Student / Unemployed,Student,,...,,,50,Read other people's questions to solve my prob...,Ask questions to solve problems,,,,,Almost Always
4,India,,,25-29,Male,,Full-Stack Web Developer,Rather not say,Manufacturing,,...,,,,,,,,,,


In [9]:
df_2014['Occupation'].value_counts()

Full-Stack Web Developer                 1966
Student                                  1024
Desktop Software Developer                721
Back-End Web Developer                    711
Mobile Application Developer              579
Other                                     528
Front-End Web Developer                   384
Manager of Developers or Team Leader      347
Enterprise Level Services                 263
I don't work in tech                      193
Embedded Application Developer            185
IT Staff / System Administrator           157
DevOps                                    123
Executive (VP of Eng, CTO, CIO, etc.)     104
Database Administrator                     61
Name: Occupation, dtype: int64

In [6]:
df_2015=df_2015.rename(columns=crd.column_rename_map_2015)

In [7]:
def df_missingness_stats(df, year):
    df.name = year + ' data'
    print('Number of Rows in {}: {}'.format(df.name, df.shape[0]))
    print('Number of Columns in {}: {}'.format(df.name, df.shape[1]))
    print('Number of Columns in {} with no missing values: {}'.format(df.name, len(set(df.columns[~df.isnull().any()]))))
    print('Number of Columns in {} with > 75% missing values: {}'.format(df.name, 
                                                                         len(set(df.columns[df.isnull().sum()/len(df) > .75]))))
    print('Number of Columns in {} with all missing values: {}'.format(df.name, 
                                                                         len(set(df.columns[df.isnull().sum()/len(df) == 1]))))
    print('Columns in {} with no missing values: {}'.format(df.name, 
                                                            set(df.columns[~df.isnull().any()])))
    print('Columns in {} with > 75% missing values: {}'.format(df.name, 
                                                               set(df.columns[df.isnull().sum()/len(df) > .75])))

In [44]:
df_missingness_stats(df_2013, '2013')
# print(df_2013.name)

Number of Rows in 2013 data: 9742
Number of Columns in 2013 data: 127
Number of Columns in 2013 data with no missing values: 0
Number of Columns in 2013 data with > 75% missing values: 56
Number of Columns in 2013 data with all missing values: 2
Columns in 2013 data with no missing values: set()
Columns in 2013 data with > 75% missing values: {'ProgLangUsedPHP', 'ProgLangUsedJQuery', 'ProgLangUsedOther', 'TechOwnediPad', 'TechOwnedAndrTablet', 'TechOwnedWii', 'ProgLangUsedObjective-C', 'ProdPurchTypeServers', 'ProgLangUsedC', 'CompRevSourceOther', 'TechExcitedAbtDart', 'CompRevSourceMobilApps', 'InteractWithConsultants', 'InteractWithHR', 'ProdPurchTypeConsultants', 'TechExcitedAbtMongoDB', 'TechExcitedAbtPhoneGap', 'TechOwnedOther', 'InteractWithFinance', 'ProgLangUsedPython', 'AppSuppAndroidTablet', 'ProdPurchTypeOther', 'TechExcitedAbtC++11', 'TechOwnedXbox', 'TechExcitedAbtF#', 'TechExcitedAbtCoffeeScript', 'TechExcitedAbtTypeScript', 'TechOwnedKindleFire', 'ProdPurchRoleNone', 'Pr

In [34]:
df_missingness_stats(df_2014, '2014')

Number of Rows in 2014 data: 7643
Number of Columns in 2014 data: 119
Number of Columns in 2014 data with no missing values: 94
Number of Columns in 2014 data with > 75% missing values: 2
Number of Columns in 2014 data with all missing values: 1
Columns in 2014 data with no missing values: {'TechOwnedAndroid', 'ProdPurchTypeSoftware', 'JobOppCntctPrefPhone', 'StckOvrflwUseAskQs', 'ProdPurchRoleBudget', 'SpendTimeOnInternet', 'ProgLangUsedPHP', 'ProgLangUsedRuby', 'ProdPurchTypeConsultants', 'ProgLangUsedOther (please specify)', 'TechExcitedAbtHaskell', 'TechOwnedPS3', 'ProdPurchRoleNone', 'TechOwnedWinTablet', 'ProdPurchTypeHardware', 'StckOvrflwAdsEntertaining', 'JobOppCntctPrefStckOvrfw', 'StckOvrflwAdsUseAdBlocker', 'SpendTimeOnMeetings', 'AppSuppAndroidPhone', 'AppSuppiPad', 'TechOwnedWiiU', 'JobOppRespToDescCompCultr', 'ProdPurchRoleInfluence', 'JobOppRespToDescBenefts', 'JobOppRespToPersMsg', 'TechOwnediPhone', 'StckOvrflwAdsPurchasedProduct', 'ProgLangUsedNode.js', 'JobOppCntctP

In [8]:
df_missingness_stats(df_2015, '2015')

Number of Rows in 2015 data: 26086
Number of Columns in 2015 data: 222
Number of Columns in 2015 data with no missing values: 0
Number of Columns in 2015 data with > 75% missing values: 157
Number of Columns in 2015 data with all missing values: 0
Columns in 2015 data with no missing values: set()
Columns in 2015 data with > 75% missing values: {'AssessJobTech', 'CurrLangTechMatlab', 'CurrLangTechHaskell', 'JobSearchAnnoyFindQualfJob', 'WhyAnswerQSelfPromo', 'ImprvmntToIntrvwFewerPuzzles', 'JobSearchAnnoyNoResp', 'CurrLangTechRust', 'FutureLangTechArduino', 'WhyStckOvrflwCareersJobSelect', 'CurrLangTechLAMP', 'TrainingEdBootCamp', 'FutureLangTechClojure', 'SourceCntrlUsedCVS', 'CurrLangTechCoffeeScript', 'CurrLangTechVisualBasic', 'CurrLangTechC', 'CurrLangTechArduino', 'CurrLangTechGo', 'CurrLangTechScala', 'AppealingMsgBenefits', 'FutureLangTechObjective-C', 'FutureLangTechAndroid', 'FutureLangTechGo', 'CurrLangTechWindowsPhone', 'MostUrgJobInfoCompName', 'FutureLangTechC', 'FutureLa

In [9]:
def recode_null_not_null_as_0_1(df, cols_not_to_recode):
    '''
    This function will split the data frame into columns that can easily be
    recoded as 0/1, and those that cannot. More specifically, if a column
    has a single non-NaN string value, and the relevant information is already contained
    in the column name, then it will convert that column to a 0/1 dummy
    '''
    
    # Split the dataframe into columns that will be operated on, and those that won't
    df_recode = df.drop(columns = cols_not_to_recode)
    df = df[cols_not_to_recode]
    df_recode = df_recode.notnull().astype('int')
    df = pd.concat([df, df_recode], axis=1)
    return df

In [45]:
cols_not_to_recode_2013 = ['Country', 'USState', 'Age', 'YearsProgram', 'Industry', 'CompanySize',
                        'Occupation', 'NumDevsAtCompany','SizeOfTeam', 'OutsideExpenseBudget',
                        'DesktopOS', 'ChangedJobsInLastYear', 'CareerJobSatisfaction', 'TotalCompensation',
                        'AmtSpentOnTechLastYr', 'StckOvrflwReputation']
df_2013 = recode_null_not_null_as_0_1(df_2013, cols_not_to_recode_2013)

In [27]:
cols_not_to_recode_2014 = ['Country', 'CountryOther', 'USState', 'Age', 'Gender', 'YearsProgram', 
                           'Industry', 'Occupation', 'NumDevsAtCompany', 'OutsideExpenseBudget', 
                           'DesktopOS', 'ChangedJobsInLastYear', 'TotalCompensation', 
                           'StckOvrflwReputation', 'WorkRemotely', 'EnjoyWorkRemotely', 
                           'WhereWorkRemotely', 'HowFindOutCurrJob', 'HowFindOutCurrJobOth', 
                           'CurrLookingNewJob', 'FreqOfCntctByRecruitrs', 'FreqOfJobBoardVisits', 
                           'AwareOfStckOvrflw2.0', 'HaveStckOvrflw2.0Profile', 'AwareOfApptivate', 
                           'ParticipatedInApptivate']
df_2014 = recode_null_not_null_as_0_1(df_2014, cols_not_to_recode_2014)

In [10]:
cols_not_to_recode_2015 = ['Country', 'Age', 'Gender', 'TabsOrSpaces', 'YearsProgram',
                           'Occupation', 'DesktopOS', 'DesktopOSWriteIn', 'Compensation',
                           'EmploymentStatus', 'Industry', 'JobSatisfaction',
                           'PurchasingPower', 'RemoteStatus', 'ChangedJobsInLastYear', 'CurrLookingNewJob',
                           'ImpOfRemote', 'FreqOfCntctByRecruitrs', 'NumCaffBevPerDay', 
                           'NumHoursPrgrmHobbyPerWeek', 'HowFreqAtStackOvrflw', 'PreferredTextEditor', 
                           'PreferredTextEditorWriteIn', 'PreferredIDETheme', 'PreferredSourceControl',
                           'PreferredSourceControlWriteIn', 'HowFreqStckOvrflwHelpful']
df_2015 = recode_null_not_null_as_0_1(df_2015, cols_not_to_recode_2015)

In [13]:
df_2015.drop(columns=['CompensationMidPt'], inplace=True) 

In [47]:
df_2013.head()

Unnamed: 0,Country,USState,Age,YearsProgram,Industry,CompanySize,Occupation,NumDevsAtCompany,SizeOfTeam,OutsideExpenseBudget,...,StckOvrflwAdsRelevant,StckOvrflwAdsEntertaining,StckOvrflwAdsInformative,StckOvrflwAdsClickThruIntrstingAds,StckOvrflwAdsPurchasedProduct,StckOvrflwAdsUseAdBlocker,StckOvrflwUseReadOthrQs,StckOvrflwUseAskQs,StckOvrflwUseAnsQs,StckOvrflwUseOther
0,United Kingdom,,35-39,,Finance / Banking,101-999,Enterprise Level Services,100.0,,,...,1,1,1,1,1,1,1,0,0,0
1,United States of America,Oregon,25-29,,Retail,101-999,Back-End Web Developer,,,"<$1,000",...,1,1,1,1,1,1,1,1,1,0
2,United States of America,Wisconsin,51-60,11.0,Software Products,26-100,Enterprise Level Services,,Just me!,Don't know,...,1,1,1,1,1,1,1,0,0,0
3,Germany,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
4,United States of America,Idaho,35-39,11.0,Consulting,,,,,,...,0,0,0,0,0,0,0,0,0,0


In [14]:
def create_dummy_df(df, cols_to_dummy, dummy_na=True):
    '''
    INPUT:
    df - pandas dataframe with categorical variables you want to dummy
    cat_cols - list of strings that are associated with names of the categorical columns
    dummy_na - Bool holding whether you want to dummy NA vals of categorical columns or not
    
    OUTPUT:
    df - a new dataframe that has the following characteristics:
            1. contains all columns that were not specified as categorical
            2. removes all the original columns in cat_cols
            3. dummy columns for each of the categorical columns in cat_cols
            4. if dummy_na is True - it also contains dummy columns for the NaN values
            5. Use a prefix of the column name with an underscore (_) for separating 
    '''
    for col in cols_to_dummy:
        try:
            df = pd.concat([df.drop(col, axis=1), pd.get_dummies(df[col], 
                                                                 prefix=col, 
                                                                 prefix_sep='_', 
                                                                 drop_first=True,
                                                                 dummy_na=dummy_na
                                                                )], axis=1)
        except:
            continue

    return df

In [49]:
cols_to_dummy_2013 = cols_not_to_recode_2013.copy()
cols_to_dummy_2013.remove('TotalCompensation')
df_2013 = create_dummy_df(df_2013, cols_to_dummy_2013, dummy_na=True)

In [50]:
print(df_2013.shape)

(9742, 273)


In [37]:
cols_to_dummy_2014 = cols_not_to_recode_2014.copy()
cols_to_dummy_2014.remove('TotalCompensation')
df_2014 = create_dummy_df(df_2014, cols_to_dummy_2014, dummy_na=True)

In [38]:
print(print(df_2014.shape))

(7643, 640)
None


In [18]:
cols_to_dummy_2015 = cols_not_to_recode_2015.copy()
cols_to_dummy_2015 = [x for x in cols_to_dummy_2015 if x not in ['Compensation', 'CompensationMidPt']]
df_2015 = create_dummy_df(df_2015, cols_to_dummy_2015, dummy_na=True)

In [19]:
print(print(df_2015.shape))

(26086, 1975)
None


In [20]:
print(df_2015['Compensation'].value_counts())

Less than $20,000      4000
$20,000 - $40,000      2732
Rather not say         2628
$40,000 - $60,000      2429
$60,000 - $80,000      2007
Unemployed             1996
$80,000 - $100,000     1394
$100,000 - $120,000     991
$120,000 - $140,000     562
More than $160,000      462
$140,000 - $160,000     280
Name: Compensation, dtype: int64


In [23]:
#creating labelEncoder
def process_y_var_split_data(df, yvar):
    df[yvar] = df[yvar].str.replace('<', 'less than ')
    df[yvar] = df[yvar].str.replace('>', 'greater than ')
    df[yvar] = df[yvar].astype('str')
    df = df[df[yvar]!='nan']
    
    print(df[yvar].value_counts())
    le = preprocessing.LabelEncoder()
    X = df.drop(columns=[yvar])
    y = le.fit_transform(df[yvar])
    
    return X, y 


In [None]:
X_2013, y_2013 = process_y_var_split_data(df_2013, 'TotalCompensation')

In [43]:
X_2014, y_2014 = process_y_var_split_data(df_2014, 'TotalCompensation')

Student / Unemployed     1147
Rather not say           1130
less than $20,000         970
$40,000 - $60,000         794
$20,000 - $40,000         746
$60,000 - $80,000         697
$80,000 - $100,000        587
$100,000 - $120,000       386
greater than $140,000     244
$120,000 - $140,000       199
Name: TotalCompensation, dtype: int64


In [24]:
X_2015, y_2015 = process_y_var_split_data(df_2015, 'Compensation')

Less than $20,000      4000
$20,000 - $40,000      2732
Rather not say         2628
$40,000 - $60,000      2429
$60,000 - $80,000      2007
Unemployed             1996
$80,000 - $100,000     1394
$100,000 - $120,000     991
$120,000 - $140,000     562
More than $160,000      462
$140,000 - $160,000     280
Name: Compensation, dtype: int64


In [72]:
# Split dataset into training set and test set
def compare_classifiers(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42) # 70% training and 30% test


    #Create a Gaussian Classifier
    gnb = GaussianNB()

    #Train the model using the training sets
    gnb.fit(X_train, y_train)

    #Predict the response for test dataset
    y_pred_gnb = gnb.predict(X_test)

    # Logistic Regression Classifier
    logreg = LogisticRegression()

    #Train the model using the training sets
    logreg.fit(X_train, y_train)

    #Predict the response for test dataset
    y_pred_lr = logreg.predict(X_test)

    # Random Forest Classifier
    rf = RandomForestClassifier()

    #Train the model using the training sets
    rf.fit(X_train, y_train)

    #Predict the response for test dataset
    y_pred_rf = rf.predict(X_test)

    # Adaboost Classifier
    ada = AdaBoostClassifier()

    #Train the model using the training sets
    ada.fit(X_train, y_train)

    #Predict the response for test dataset
    y_pred_ada = ada.predict(X_test)

    print("GNB Accuracy:",metrics.accuracy_score(y_test, y_pred_gnb))

    print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, y_pred_lr))

    print("Random Forest Accuracy:",metrics.accuracy_score(y_test, y_pred_rf))

    print("AdaBoost Accuracy:",metrics.accuracy_score(y_test, y_pred_ada))


GNB Accuracy: 0.14353163361661944
Logistic Regression Accuracy: 0.32483474976392823
Random Forest Accuracy: 0.2554296506137866
AdaBoost Accuracy: 0.279508970727101


In [25]:
# Split dataset into training set and test set
def compare_classifiers(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42) # 70% training and 30% test

    #Create a Gaussian Classifier
    gnb = GaussianNB()

    #Train the model using the training sets
    gnb.fit(X_train, y_train)

    #Predict the response for test dataset
    y_pred_gnb = gnb.predict(X_test)

    # Logistic Regression Classifier
    logreg = LogisticRegression()

    #Train the model using the training sets
    logreg.fit(X_train, y_train)

    #Predict the response for test dataset
    y_pred_lr = logreg.predict(X_test)

    # Random Forest Classifier
    rf = RandomForestClassifier()

    #Train the model using the training sets
    rf.fit(X_train, y_train)

    #Predict the response for test dataset
    y_pred_rf = rf.predict(X_test)

    # Adaboost Classifier
    ada = AdaBoostClassifier()

    #Train the model using the training sets
    ada.fit(X_train, y_train)

    #Predict the response for test dataset
    y_pred_ada = ada.predict(X_test)

    print("GNB Accuracy:",metrics.accuracy_score(y_test, y_pred_gnb))

    print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, y_pred_lr))

    print("Random Forest Accuracy:",metrics.accuracy_score(y_test, y_pred_rf))

    print("AdaBoost Accuracy:",metrics.accuracy_score(y_test, y_pred_ada))

In [45]:
compare_classifiers(X_2014, y_2014)

GNB Accuracy: 0.1067632850241546
Logistic Regression Accuracy: 0.39806763285024155
Random Forest Accuracy: 0.3565217391304348
AdaBoost Accuracy: 0.3782608695652174


In [26]:
compare_classifiers(X_2015, y_2015)

GNB Accuracy: 0.03866552609067579
Logistic Regression Accuracy: 0.38751069289991447
Random Forest Accuracy: 0.28160821214713433
AdaBoost Accuracy: 0.3286569717707442


In [8]:
# pp.pprint(crd.column_name_map_2014)

In [9]:
# pp.pprint(list(df_2014.columns))

In [10]:
def plot_value_counts(df, col, plot_title):
    status_vals = df[col].value_counts() 
    print(status_vals)
    # The below is a bar chart of the proportion of observations in each category of df[col]
    (status_vals/df.shape[0]).plot(kind="bar");
    plt.title("What kind of developer are you?");

In [None]:
def get_description(column_name, schema=schema):
    '''
    INPUT - schema - pandas dataframe with the schema of the developers survey
            column_name - string - the name of the column you would like to know about
    OUTPUT - 
            desc - string - the description of the column
    '''
    desc = schema['Question'][schema['Column']==column_name].values[0]
    return desc

In [None]:
possible_vals = ["Take online courses", "Buy books and work through the exercises", 
                 "None of these", "Part-time/evening courses", "Return to college",
                 "Contribute to open source", "Conferences/meet-ups", "Bootcamp",
                 "Get a job as a QA tester", "Participate in online coding competitions",
                 "Master's degree", "Participate in hackathons", "Other"]

def clean_and_plot(df, title='Method of Educating Suggested', plot=True):
    '''
    INPUT 
        df - a dataframe holding the CousinEducation column
        title - string the title of your plot
        axis - axis object
        plot - bool providing whether or not you want a plot back
        
    OUTPUT
        study_df - a dataframe with the count of how many individuals
        Displays a plot of pretty things related to the CousinEducation column.
    '''
    study = df['CousinEducation'].value_counts().reset_index()
    study.rename(columns={'index': 'method', 'CousinEducation': 'count'}, inplace=True)
    study_df = t.total_count(study, 'method', 'count', possible_vals)

    study_df.set_index('method', inplace=True)
    if plot:
        (study_df/study_df.sum()).plot(kind='bar', legend=None);
        plt.title(title);
        plt.show()
    props_study_df = study_df/study_df.sum()
    return props_study_df
    
props_df = clean_and_plot(df)