# Salary Predictions Based on Job Descriptions

# Part 1-Problem Description

The task is to examine a set of job postings with salaries and then predict salaries for a new set of job postings.

In [1]:
# Basic data analysis libraries
import pandas as pd
import sklearn as sk
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Personal Information
__author__ = "Emrul Hasan"
__email__ = "emrul.phy@gmail.com"

## Part 2 - Functions to load and clean the data

In [2]:
# Load data
def load_data(df):
    return pd.read_csv(df)

#Merging data
def full_data(df1,df2,key=None ):
    return pd.merge(df1, df2,on=key)

# remove lower outliers and drop the job id
def clean_outliers(df):
    df=df[df.salary>8.5]
    return df

# drop the jobId column
def drop_col(df,col):
    return df.drop(columns=[col])

# Encode categorical data: label encoding for ordinal data and One hot encoding for nominal data
from sklearn.preprocessing import LabelEncoder

def encode_cat(df,x1,x2):
    for col in x1:
        df[col]=LabelEncoder().fit_transform(df[col])
    
    clean_data=pd.get_dummies(df, columns=x2)
    
    return clean_data.drop(columns=['jobId'])

# Part 3-Defining the data and variables

In [3]:
# load data
train_data=load_data('train_features.csv')
test_data=load_data('test_features.csv')
salary_data=load_data('train_salaries.csv')

# Merging data
train=full_data(train_data, salary_data,key='jobId')

# Define nominal and ordinal categorical Variables
var1=['jobType','degree'] # ordinal data
var2=['major','industry'] # Nominal data

# Numerical Columns
num_col=['yearsExperience', 'milesFromMetropolis']

# Remove the outliers and drop the jobId column
train_df=clean_outliers(train)

#test=clean_outliers(test_data)

# encode the data
clean_data=encode_cat(train_df,var1,var2)
test_clean=encode_cat(test_data,var1,var2)

feature_df=drop_col(clean_data,'companyId')
feature_df=drop_col(feature_df,'salary')
test_df=drop_col(test_clean,'companyId')

target_df=clean_data['salary']



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


# Part 4- Model Building

In [4]:
#  Import libraries for model building
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

ML_model=[
    LinearRegression(),
    RandomForestRegressor(),
    GradientBoostingRegressor()
]

def model_fit(models,feature,target):
    
    MSE=[]
    CV_STD=[]
    model_name=[]
    for model in models:

        neg_mse = cross_val_score(model, feature, target, cv=5, scoring='neg_mean_squared_error')
        mean_mse= -1.0*np.mean(neg_mse)
        MSE.append(mean_mse)
        cv_std= np.std(neg_mse) 
        CV_STD.append(cv_std)
        model_name.append(model.__class__.__name__)
    
    result=pd.DataFrame({'Model':model_name,'CV_std': CV_STD, 'MSE':MSE})     
    return result

# Parameter Tunning and choosing the best model

RFReg=RandomForestRegressor(n_estimators=150, max_depth=25, min_samples_split=60)
GBReg=GradientBoostingRegressor(n_estimators=150, max_depth=5)

Tuned_MLmodel=[RFReg,GBReg]

In [5]:
model_fit(ML_model,feature_df,target_df)

Unnamed: 0,Model,CV_std,MSE
0,LinearRegression,2.172739,719.373351
1,RandomForestRegressor,0.709746,468.239159
2,GradientBoostingRegressor,1.715171,377.494054


In [6]:
result_table=model_fit(Tuned_MLmodel, feature_df,target_df)
result_table.sort_values(by='MSE',ascending=True)

Unnamed: 0,Model,CV_std,MSE
1,GradientBoostingRegressor,1.257249,357.062027
0,RandomForestRegressor,1.469785,367.223049


In [7]:
min(result_table['MSE'])

357.0620265006031

# Part 6: Predictions and feature important

In [8]:
#Choose model with lowest mse
model = min(mean_mse, key=mean_mse.get)
print('\nPredictions calculated using model with lowest MSE:')
print(model)

#train model on entire dataset
model.fit(feature_df, target_df)

#create predictions based on test data
predictions = model.predict(test_df)

NameError: name 'mean_mse' is not defined