In [None]:
import itertools
import re
import pandas as pd
pd.options.display.max_columns = 300
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
%matplotlib inline

In [None]:
# Load the survey export keyed by respondent id and keep only the rows
# that actually report a salary (every later cell models 'Salary').
df_raw = (
    pd.read_csv('survey_results_public.csv', index_col='Respondent')
    .loc[lambda frame: frame['Salary'].notnull()]
)

In [None]:
# Sort every column of df_raw into exactly one of three buckets:
#   numeric      - columns stored as float64
#   multi_select - columns where at least one value is a string containing ';'
#   other        - everything else
multi_select = []
numeric = []
other = []

for col in df_raw.columns:
    series = df_raw[col]

    if series.dtype == np.float64:
        bucket = numeric
    elif any(type(entry) is str and ';' in entry for entry in series.unique()):
        # any() stops at the first matching value, like the explicit break did.
        bucket = multi_select
    else:
        bucket = other

    bucket.append(col)

In [None]:
def clean(df, discrete_cols=None, multi_select_cols=None, numeric=None):
    """Build a model-ready frame from the selected columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. It is not modified.
    discrete_cols : list of str, optional
        Columns to keep and one-hot encode with ``pd.get_dummies``. A selected
        column whose name starts with ``'Years'`` is instead converted to a
        number: the first integer in the text plus 0.5 (``'3 to 4 years'``
        becomes 3.5, ``'Less than a year'`` becomes 0.5, missing stays NaN).
    multi_select_cols : list of str, optional
        Columns whose cells hold several answers separated by ``'; '``; each
        distinct answer becomes its own indicator column.
    numeric : list of str, optional
        Columns to keep as they are.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df``; contains only columns derived from the selection.
    """
    discrete_cols = list(discrete_cols or [])
    multi_select_cols = list(multi_select_cols or [])
    numeric = list(numeric or [])

    # Copy so the assignments below never write into (a view of) the caller's frame.
    result = df[discrete_cols + numeric].copy()

    for col in result.columns:
        if col.startswith('Years'):
            years_as_string = df[col] \
                .fillna('') \
                .replace('Less than a year', '0 to 1 years') \
                .str.extract(r'(\d+) ', expand=False)
            result[col] = pd.to_numeric(years_as_string) + 0.5

    # Encode the *selected* columns only. Skipped when nothing was selected so
    # that a multi-select-only call does not ask get_dummies to encode an
    # empty frame.
    if result.shape[1] > 0:
        result = pd.get_dummies(result)

    indicator_frames = [
        df[multi_select_col].str.get_dummies(sep='; ')
        for multi_select_col in multi_select_cols
    ]
    return pd.concat([result] + indicator_frames, axis=1)

In [None]:
def do_regression(df, discrete_cols=None, multi_select_cols=None,
                  random_state=None):
    """Fit a linear model of 'Salary' and report its error on a held-out 10%.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; must contain a 'Salary' column.
    discrete_cols : list of str, optional
        Passed to ``clean`` as the discrete predictor columns.
    multi_select_cols : list of str, optional
        Passed to ``clean`` as the multi-select predictor columns.
    random_state : int, optional
        Seed for the train/test split. The default ``None`` leaves the split
        unseeded, so results vary from run to run.

    Returns
    -------
    dict
        Maps each feature column name to its fitted coefficient.

    Side effects: shows a histogram of the test-set prediction errors and
    prints the test-set R^2.
    """
    features = clean(df, list(discrete_cols or []), list(multi_select_cols or []))

    # Take the target straight from the input frame instead of relying on
    # clean() to pass 'Salary' through; drop any copy clean() did return so
    # the target can never end up among the predictors.
    features = features.drop(columns='Salary', errors='ignore')
    data = features.assign(Salary=df['Salary'])

    # LinearRegression rejects NaN input, so rows with any missing value
    # (e.g. an unanswered year-range question) are left out of the fit.
    data = data.dropna()

    train = data.sample(frac=0.9, random_state=random_state)
    test = data.drop(train.index)

    train_features = train.drop(columns='Salary')
    test_features = test.drop(columns='Salary')

    reg = linear_model.LinearRegression()
    reg.fit(train_features, train['Salary'])
    coeffs = dict(zip(train_features.columns, reg.coef_))

    predictions = reg.predict(test_features)
    errors = predictions - test['Salary'].values
    plt.hist(errors, bins=20)
    plt.xlabel('Predicted minus actual Salary')
    plt.ylabel('Test rows')
    plt.title('Prediction error on held-out rows')
    plt.show()

    print('R^2 = {}'.format(reg.score(test_features, test['Salary'])))
    return coeffs

In [None]:
# NOTE(review): this call used to pass `other + convert_to_numeric`, but
# `convert_to_numeric` is not defined in any visible cell, so a fresh
# top-to-bottom run stopped here with a NameError. The classification loop
# files every non-float64 column without ';' under `other`, so the text-valued
# year-range columns it presumably listed are already included -- confirm.
cleaned_df = clean(df_raw,
                   discrete_cols=other,
                   multi_select_cols=multi_select,
                   numeric=numeric)

In [None]:
# Persist the cleaned frame (index included) next to the notebook.
cleaned_df.to_csv(path_or_buf='cleaned_data.csv')

In [None]:
# Regression 1: CompanySize and YearsCodedJob passed as discrete columns.
result = do_regression(df_raw, discrete_cols=['CompanySize', 'YearsCodedJob'])

In [None]:
# Regression 2: same discrete columns, plus DeveloperType as a multi-select column.
regression_kwargs = {
    'discrete_cols': ['CompanySize', 'YearsCodedJob'],
    'multi_select_cols': ['DeveloperType'],
}
result = do_regression(df_raw, **regression_kwargs)

In [None]:
# Regression 3: DeveloperType alone, passed as a multi-select column.
# The bare `result` on the last line displays the returned coefficient dict.
result = do_regression(df_raw, multi_select_cols=['DeveloperType'])
result