In [16]:
import json
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import namedtuple
from sklearn import linear_model
%matplotlib inline

In [17]:
# Load the lookup that keep_columns() uses to translate an original column
# name into the cleaned column names derived from it.
with open('column_mapping.json', 'r') as f:
    column_mapping = json.load(f)

# Load the cleaned dataset that every regression below is fitted on.
df = pd.read_csv('cleaned_data.csv')

In [18]:
# Outcome of one regression() call: the R^2 score on the held-out rows and a
# dict mapping each feature column name to its fitted coefficient.
RegressionResult = namedtuple(
    'RegressionResult',
    ('r2', 'coefficients'),
)

In [19]:
def regression(df, plot=True, random_state=None):
    """Fit a linear regression predicting ``Salary`` and score it on held-out rows.

    A random 90% of ``df`` is used for training and the remaining rows for
    testing. Every column other than ``Salary`` is used as a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Salary`` column plus the feature columns. The test
        rows are selected by dropping the training index labels, so the index
        is assumed to be unique.
    plot : bool, default True
        When true, show a 20-bin histogram of the test-set prediction errors
        (predicted minus actual).
    random_state : int or numpy.random.RandomState, optional
        Forwarded to ``DataFrame.sample`` so the train/test split (and hence
        the returned R^2) can be reproduced. ``None`` keeps the previous
        behaviour of a fresh random split on every call.

    Returns
    -------
    RegressionResult
        ``r2`` is the model's score on the test rows; ``coefficients`` maps
        each feature column name to its fitted coefficient.

    Raises
    ------
    ValueError
        If the split leaves either the training or the test set empty.

    Notes
    -----
    NOTE(review): the ``Country`` coefficients printed further down in this
    notebook are all around -9e14, which suggests the indicator columns are
    perfectly collinear with the intercept -- confirm. If so, R^2 is still
    meaningful but individual coefficients should not be interpreted.
    """
    train = df.sample(frac=0.9, random_state=random_state)
    # Everything that was not sampled for training becomes the test set.
    test = df.drop(train.index)
    if train.empty or test.empty:
        raise ValueError(
            'regression() needs enough rows for a non-empty 90/10 split; '
            'got {} row(s)'.format(len(df))
        )

    train_without_salary = train.drop(columns='Salary')
    test_without_salary = test.drop(columns='Salary')

    reg = linear_model.LinearRegression()
    reg.fit(train_without_salary, train['Salary'])
    coeffs = dict(zip(train_without_salary.columns, reg.coef_))

    if plot:
        # Only predict when the errors are actually going to be displayed.
        errors = reg.predict(test_without_salary) - test['Salary'].values
        plt.hist(errors, bins=20)
        plt.xlabel('Prediction error (predicted - actual Salary)')
        plt.ylabel('Number of test rows')
        plt.show()

    r2 = reg.score(test_without_salary, test['Salary'])
    return RegressionResult(r2=r2, coefficients=coeffs)

In [20]:
def keep_columns(df, original_columns_to_keep):
    """Return the cleaned columns for the given original columns, plus ``Salary``.

    Each name in ``original_columns_to_keep`` is looked up in the module-level
    ``column_mapping`` and the cleaned column names found there are collected
    in order. ``Salary`` is appended, the frame is restricted to those columns,
    and rows with a missing value in any of them are dropped.
    """
    selected = [
        cleaned_col
        for original_col in original_columns_to_keep
        for cleaned_col in column_mapping[original_col]
    ]
    selected.append('Salary')
    return df[selected].dropna()

In [21]:
# Fit one single-input regression per original column and keep its result,
# keyed by the original column name. The salary columns themselves are
# skipped; they are not used as predictors here.
result_by_col = {}
for original_column in column_mapping:
    if original_column not in ('Salary', 'ExpectedSalary'):
        result = regression(keep_columns(df, [original_column]), plot=False)
        result_by_col[original_column] = result

In [22]:
# Rank the original column names from highest to lowest held-out R^2.
most_correlated_inputs = sorted(
    result_by_col,
    key=lambda column: result_by_col[column].r2,
    reverse=True,
)

In [23]:
# Print every original column with the R^2 of its single-input regression,
# in the order given by most_correlated_inputs.
for input_var in most_correlated_inputs:
    print('{}: {}'.format(input_var, result_by_col[input_var].r2))

Currency: 0.44569751244824457
YearsCodedJob: 0.24977704185572847
YearsProgram: 0.19123542414138583
CompanyType: 0.11224916191403868
JobProfile: 0.07518587247353681
HaveWorkedLanguage: 0.0730223241803859
Race: 0.06497506745476433
MetricAssess: 0.06296275904218662
WantWorkLanguage: 0.06227833085174783
IDE: 0.06163160189840011
ImportantBenefits: 0.06147648296770836
University: 0.05833510350850557
CompanySize: 0.05801901843695101
SelfTaughtTypes: 0.05169805856574783
ImportantHiringPMExp: 0.04760251126573867
MajorUndergrad: 0.04672887962437666
ImportantHiringGettingThingsDone: 0.041301157816398766
WantWorkPlatform: 0.03773952705096717
HomeRemote: 0.03699792079708564
EducationTypes: 0.035416935455256815
Overpaid: 0.03512343695006015
HaveWorkedPlatform: 0.034756973483951126
StackOverflowCopiedCode: 0.03472122733248906
CousinEducation: 0.033207908956645715
Methodology: 0.027853179455248678
HaveWorkedDatabase: 0.027852791636309496
Gender: 0.026219860617068602
ShipIt: 0.0258320031772149
SurveyLo

In [24]:
# Display the fitted coefficient of each cleaned Country column.
# NOTE(review): the values shown below are all around -9e14 and differ only in
# their last digits; this looks like perfectly collinear indicator columns
# plus an intercept -- confirm before interpreting any single coefficient.
result_by_col['Country'].coefficients

{'Country_Afghanistan': -898181810978285.8,
 'Country_Albania': -898181810945652.0,
 'Country_Anguilla': -898181810887111.2,
 'Country_Argentina': -898181810961546.0,
 'Country_Armenia': -898181810970327.4,
 'Country_Australia': -898181810919035.1,
 'Country_Austria': -898181810941169.9,
 'Country_Azerbaidjan': -898181810983609.5,
 'Country_Bahrain': -898181810912111.4,
 'Country_Bangladesh': -898181810961289.6,
 'Country_Barbados': -898181810957111.5,
 'Country_Belarus': -898181810967078.8,
 'Country_Belgium': -898181810949288.4,
 'Country_Bermuda': -898181810837111.0,
 'Country_Bolivia': -898181810974042.2,
 'Country_Bosnia-Herzegovina': -898181810971019.4,
 'Country_Botswana': -898181810967911.0,
 'Country_Brazil': -898181810966475.0,
 'Country_Bulgaria': -898181810965347.1,
 'Country_Cambodia': -898181810986611.2,
 'Country_Canada': -898181810930304.0,
 'Country_Cayman Islands': -898181810929794.0,
 'Country_Chile': -898181810972127.6,
 'Country_China': -898181810951456.9,
 'Countr