In [1]:
# Standard-library os module, used in this notebook to inspect and change the working directory.
import os
# Display the directory the kernel is currently running in.
os.getcwd()

'C:\\Users\\ManishaSpatialInsigh'

In [2]:
# Switch to the project root so the relative 'data/...' paths used further down resolve.
# The location can be overridden through the SALARY_PROJECT_DIR environment variable;
# the original hard-coded path remains the default so existing runs are unaffected.
project_dir = os.environ.get('SALARY_PROJECT_DIR', 'D:\\testproject')
os.chdir(project_dir)
os.getcwd()

'D:\\testproject'

In [3]:
# List the contents of the current working directory (IPython shell-listing shortcut).
ls

 Volume in drive D is D drive
 Volume Serial Number is 084B-96ED

 Directory of D:\testproject

01/13/2020  04:47 PM    <DIR>          .
01/13/2020  04:47 PM    <DIR>          ..
12/26/2019  08:18 PM    <DIR>          data
12/26/2019  08:16 PM                34 README.md
12/27/2019  10:02 AM            21,999 salaryprediction .ipynb
01/13/2020  04:47 PM           477,869 salarypredictionEDA.ipynb
12/27/2019  10:11 AM            21,989 salarypredictionupdated.ipynb
               4 File(s)        521,891 bytes
               3 Dir(s)  232,547,061,760 bytes free


# Import all packages

In [14]:


import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt



# Define functions 



In [None]:
def upload_file(file):
    '''Read the CSV found at ``file`` and hand it back as a pandas DataFrame.'''
    loaded_frame = pd.read_csv(file)
    return loaded_frame

def final_data(df1, df2, key=None, left_index=False, right_index=False):
    '''Inner-join df1 with df2 so that only rows matched on both sides remain.

    ``key`` names the join column(s); ``left_index`` / ``right_index`` switch the
    corresponding side to join on its index. All three are forwarded to pd.merge.
    '''
    merge_options = {
        'how': 'inner',
        'on': key,
        'left_index': left_index,
        'right_index': right_index,
    }
    return pd.merge(df1, df2, **merge_options)

def clean_data(raw_df):
    '''Return raw_df without repeated jobIds and without non-positive salaries.

    Rows are first de-duplicated on the jobId column; of what remains, only rows
    whose salary is strictly greater than zero are kept.
    '''
    deduplicated = raw_df.drop_duplicates(subset='jobId')
    has_positive_salary = deduplicated['salary'] > 0
    return deduplicated[has_positive_salary]

def one_hot_code_df(df, cat_vars=None, num_vars=None):
    '''One-hot encode the categorical columns and append the numeric columns.

    Parameters
    ----------
    df : DataFrame holding every column named in cat_vars and num_vars.
    cat_vars : column names to expand with pd.get_dummies; None or an empty
        list means there is nothing to encode.
    num_vars : column names to coerce with pd.to_numeric; None or an empty
        list means there are no numeric columns.

    Returns
    -------
    DataFrame with the dummy columns first and the numeric columns after them.
    When neither group is requested the result has df's index and no columns.
    '''
    pieces = []
    # The None defaults used to be passed straight to df[...] and raised KeyError,
    # so each group is only built when it was actually requested.
    if cat_vars is not None and len(cat_vars) > 0:
        pieces.append(pd.get_dummies(df[cat_vars]))
    if num_vars is not None and len(num_vars) > 0:
        pieces.append(df[num_vars].apply(pd.to_numeric))
    if not pieces:
        return pd.DataFrame(index=df.index)
    return pd.concat(pieces, axis=1)

def gettarget_df(df, target):
    '''Look up ``target`` in df and return the selected column(s).'''
    target_values = df[target]
    return target_values

def train_model(model, feature_df, target_df, num_procs, mean_mse, cv_std, cv=2):
    '''Cross-validate model and record its MSE statistics in the supplied dicts.

    Parameters
    ----------
    model : estimator handed to cross_val_score; also used as the dict key.
    feature_df : feature matrix.
    target_df : target values.
    num_procs : forwarded to cross_val_score as n_jobs.
    mean_mse : dict updated in place with the mean MSE across folds.
    cv_std : dict updated in place with the standard deviation of the fold scores.
    cv : cross-validation setting forwarded to cross_val_score (default 2, the
        value that was previously hard-coded).

    Returns
    -------
    None; results are delivered through mean_mse and cv_std.
    '''
    neg_mse = cross_val_score(model, feature_df, target_df, cv=cv, n_jobs=num_procs,
                              scoring='neg_mean_squared_error')
    # The scorer yields negated MSE, so flip the sign to report a positive error.
    mean_mse[model] = -1.0 * np.mean(neg_mse)
    # Standard deviation is the same for the negated and the positive scores.
    cv_std[model] = np.std(neg_mse)
    
    
def print_result(model, mean_mse, cv_std):
    '''Print the model, then its entries from the mean_mse and cv_std dicts.'''
    print('\nModel:\n', model)
    labelled_stats = (
        ('Average MSE:\n', mean_mse),
        ('Standard deviation during CV:\n', cv_std),
    )
    for heading, stats in labelled_stats:
        print(heading, stats[model])
    
def save_results(model, mean_mse, predictions, feature_importances):
    '''Write the model description, its MSE, feature importances and predictions to disk.

    Files are created in the current working directory:
      model.txt               -- str(model) followed by a line holding mean_mse
      feature_importances.csv -- feature_importances written with its to_csv method
      predictions.csv         -- predictions written with np.savetxt
    '''
    with open('model.txt', 'w') as file:
        file.write(str(model))
        # mean_mse used to be accepted but never written, so the model summary
        # named in the docstring was missing from the saved output.
        file.write('\nMean MSE: ' + str(mean_mse) + '\n')
    feature_importances.to_csv('feature_importances.csv')
    np.savetxt('predictions.csv', predictions, delimiter=',')

    
    
if __name__ == '__main__':
    # End-to-end run: load and clean the data, cross-validate four regressors,
    # refit the one with the lowest MSE, predict on the test set and save the results.

    #define inputs
    train_data = 'data/train_features.csv'
    target_data = 'data/train_salaries.csv'
    test_data = 'data/test_features.csv'

    #define variables
    categorical_vars = ['companyId', 'jobType', 'degree', 'major', 'industry']
    numeric_vars = ['yearsExperience', 'milesFromMetropolis']
    target_var = 'salary'

    #fixed seed so the shuffle and the tree ensembles give the same result on every run
    seed = 42

    #load data
    print("Loading data")
    feature_df = upload_file(train_data)
    target_df = upload_file(target_data)
    test_df = upload_file(test_data)

    #consolidate training data
    raw_train_df = final_data(feature_df, target_df, key='jobId')

    #clean, shuffle, and reindex training data -- shuffling may improve cross-validation accuracy
    clean_train_df = shuffle(clean_data(raw_train_df), random_state=seed).reset_index()

    #encode categorical data and get final feature dfs
    print("Encoding data")
    feature_df = one_hot_code_df(clean_train_df, cat_vars=categorical_vars, num_vars=numeric_vars)
    test_df = one_hot_code_df(test_df, cat_vars=categorical_vars, num_vars=numeric_vars)

    #train and test are encoded separately, so a category seen in only one of them would
    #give the two frames different columns; force test into the training column layout
    #(categories missing from test become all-zero columns, unseen ones are dropped)
    test_df = test_df.reindex(columns=feature_df.columns, fill_value=0)

    #get target df
    target_df = gettarget_df(clean_train_df, target_var)

    #initialize model list and dicts
    models = []
    mean_mse = {}
    cv_std = {}
    res = {}

    #define number of processes to run in parallel
    num_procs = 2

    #shared model paramaters
    verbose_lvl = 5

    #create models -- hyperparameter tuning already done by hand for each model
    lr = LinearRegression()
    lr_std_pca = make_pipeline(StandardScaler(), PCA(), LinearRegression())
    rf = RandomForestRegressor(n_estimators=60, n_jobs=num_procs, max_depth=25, min_samples_split=60,
                               max_features=30, random_state=seed, verbose=verbose_lvl)
    #loss is left at its default (least squares): the old 'ls' spelling is rejected by
    #newer scikit-learn releases, while the default means the same thing in every release
    gbm = GradientBoostingRegressor(n_estimators=40, max_depth=5, random_state=seed, verbose=verbose_lvl)

    models.extend([lr, lr_std_pca, rf, gbm])

    #parallel cross-validate models, using MSE as evaluation metric, and print summaries
    print("Beginning cross validation")
    for model in models:
        train_model(model, feature_df, target_df, num_procs, mean_mse, cv_std)
        print_result(model, mean_mse, cv_std)

    #choose model with lowest mse
    model = min(mean_mse, key=mean_mse.get)
    print('\nPredictions calculated using model with lowest MSE:')
    print(model)

    #train model on entire dataset
    model.fit(feature_df, target_df)

    #create predictions based on test data
    predictions = model.predict(test_df)

    #store feature importances
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
    else:
        #linear models don't have feature_importances_
        importances = [0]*len(feature_df.columns)

    #rank features by importance and index the table by feature name
    feature_importances = pd.DataFrame({'feature':feature_df.columns, 'importance':importances})
    feature_importances = feature_importances.sort_values(by='importance', ascending=False)
    feature_importances = feature_importances.set_index('feature', drop=True)

    #create plot of the 25 most important features
    feature_importances[0:25].plot.bar(figsize=(20,10))
    plt.show()

    #save results
    save_results(model, mean_mse[model], predictions, feature_importances)

    





Loading data
Encoding data
Beginning cross validation

Model:
 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
Average MSE:
 384.44857972971863
Standard deviation during CV:
 0.27707210323725917

Model:
 Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('pca',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('linearregression',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=False))],
         verbose=False)
Average MSE:
 384.4470721757263
Standard deviation during CV:
 0.2776247252882058

Model:
 RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=25,
                      max_features=30, max_leaf_nodes=None,
             