## Linear Regression

In [1]:
import pandas as pd
import statsmodels.api as sm
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the cleaned salary survey and reduce it to the columns used for modelling.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere as written. Consider a path relative to the repository root.
DATA_PATH = '/Users/calebreed/Documents/GitHub/DATA-4950-Capstone/DATA-4950-Capstone/data/external/salaries_clean.csv'

# Columns removed before modelling, grouped by the reason they are removed.
UNNECESSARY_COLS = ["index", "salary_id", "location_latitude", "location_longitude", "comments", "submitted_at"]
# job_title_rank, location_state, and location_country have too many missing values
TOO_MANY_MISSING_COLS = ["job_title_rank", "location_state", "location_country"]
# free-text columns with too many unique values to encode
TOO_MANY_UNIQUE_COLS = ["employer_name", "job_title", "location_name"]
OTHER_DROPPED_COLS = [
    "employer_experience_years",
    "stock_value_bonus",  # stored as a string, so it is not used as a numeric predictor
]

df = pd.read_csv(DATA_PATH).drop(
    columns=UNNECESSARY_COLS + TOO_MANY_MISSING_COLS + TOO_MANY_UNIQUE_COLS + OTHER_DROPPED_COLS
)

# Fill missing experience and base pay with the column mean.
# NOTE(review): annual_base_pay is the regression target further down; imputing it
# fabricates labels -- consider dropping rows with a missing target instead.
for mean_filled_col in ("total_experience_years", "annual_base_pay"):
    df[mean_filled_col] = df[mean_filled_col].fillna(df[mean_filled_col].mean())

# Forward-fill the bonus columns. `.ffill()` replaces `fillna(method='ffill')`,
# which is deprecated in recent pandas. A missing value in the very first row
# would stay NaN, because there is nothing above it to carry forward.
for forward_filled_col in ("signing_bonus", "annual_bonus"):
    df[forward_filled_col] = df[forward_filled_col].ffill()

df.head()

Unnamed: 0,job_title_category,total_experience_years,annual_base_pay,signing_bonus,annual_bonus
0,Engineering,13.0,125000.0,5000.0,0.0
1,Software,15.0,65000.0,5000.0,5000.0
2,Software,4.0,86000.0,5000.0,6000.0
3,Other,4.0,105000.0,5000.0,8500.0
4,Software,4.0,110000.0,5000.0,7000.0


In [3]:
# Creates one 0/1 indicator column per job category.
# dtype=int keeps the indicators numeric on every pandas version: pandas >= 2.0
# defaults to bool, and a frame mixing bool and float columns is converted to an
# object array that statsmodels OLS refuses to fit.
# NOTE(review): all category levels are kept. Together with the intercept added
# before the first OLS fit this is perfectly collinear (the dummy-variable trap),
# which matches the ~1e21 condition number in the first summary. Using
# drop_first=True would fix it, but the later cells drop dummy columns by name
# and would have to be updated at the same time.
dummies = pd.get_dummies(df['job_title_category'], prefix='job_category', dtype=int)
dummies.head()

Unnamed: 0,job_category_Applied Science,job_category_Data,job_category_Engineering,job_category_Management,job_category_Operations,job_category_Other,job_category_Software,job_category_Web
0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,1,0
3,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,1,0


In [4]:
# Appends the indicator columns on the right of the frame, then removes the
# categorical column they were derived from.
df = pd.concat([df, dummies], axis=1).drop(columns=['job_title_category'])
df.head()

Unnamed: 0,total_experience_years,annual_base_pay,signing_bonus,annual_bonus,job_category_Applied Science,job_category_Data,job_category_Engineering,job_category_Management,job_category_Operations,job_category_Other,job_category_Software,job_category_Web
0,13.0,125000.0,5000.0,0.0,0,0,1,0,0,0,0,0
1,15.0,65000.0,5000.0,5000.0,0,0,0,0,0,0,1,0
2,4.0,86000.0,5000.0,6000.0,0,0,0,0,0,0,1,0
3,4.0,105000.0,5000.0,8500.0,0,0,0,0,0,1,0,0
4,4.0,110000.0,5000.0,7000.0,0,0,0,0,0,0,1,0


In [5]:
# Separates the response (y) from the predictors (X).
y = df['annual_base_pay']
X = df.drop(columns=['annual_base_pay'])

# Holds out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [6]:
# Builds the initial model: an intercept plus every predictor in the training split.
X_reg = sm.add_constant(X_train)

reg1 = sm.OLS(y_train, X_reg).fit()
pred = reg1.predict(X_reg)  # in-sample predictions on the training rows

reg1.summary()

0,1,2,3
Dep. Variable:,annual_base_pay,R-squared:,0.08
Model:,OLS,Adj. R-squared:,0.072
Method:,Least Squares,F-statistic:,9.952
Date:,"Tue, 04 Apr 2023",Prob (F-statistic):,3.82e-16
Time:,18:28:24,Log-Likelihood:,-19414.0
No. Observations:,1158,AIC:,38850.0
Df Residuals:,1147,BIC:,38910.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.593e+05,3.99e+05,0.901,0.368,-4.23e+05,1.14e+06
total_experience_years,-4.025e+04,2.6e+04,-1.550,0.121,-9.12e+04,1.07e+04
signing_bonus,-4.6148,5.082,-0.908,0.364,-14.585,5.356
annual_bonus,19.2912,2.058,9.373,0.000,15.253,23.329
job_category_Applied Science,6.919e+04,2.08e+06,0.033,0.973,-4.01e+06,4.15e+06
job_category_Data,-1.765e+05,6.77e+05,-0.261,0.794,-1.5e+06,1.15e+06
job_category_Engineering,-7.388e+04,4.97e+05,-0.149,0.882,-1.05e+06,9.01e+05
job_category_Management,-2.686e+05,5.77e+05,-0.465,0.642,-1.4e+06,8.64e+05
job_category_Operations,9.43e+04,1.71e+06,0.055,0.956,-3.26e+06,3.45e+06

0,1,2,3
Omnibus:,3188.18,Durbin-Watson:,1.985
Prob(Omnibus):,0.0,Jarque-Bera (JB):,56716532.419
Skew:,32.399,Prob(JB):,0.0
Kurtosis:,1085.254,Cond. No.,1.28e+21


In [7]:
# Backward elimination: remove one predictor, refit, inspect the summary.
# X_reg already carries the 'const' column from the initial fit, so the
# intercept does not need to be added again after dropping a column.
# Note: this cell edits X_reg in place, so running it a second time raises a
# KeyError because the column is already gone.
X_reg = X_reg.drop(columns=['job_category_Applied Science'])

reg1 = sm.OLS(y_train, X_reg).fit()
pred = reg1.predict(X_reg)

reg1.summary()

0,1,2,3
Dep. Variable:,annual_base_pay,R-squared:,0.08
Model:,OLS,Adj. R-squared:,0.072
Method:,Least Squares,F-statistic:,9.952
Date:,"Tue, 04 Apr 2023",Prob (F-statistic):,3.82e-16
Time:,18:28:24,Log-Likelihood:,-19414.0
No. Observations:,1158,AIC:,38850.0
Df Residuals:,1147,BIC:,38910.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.285e+05,2.33e+06,0.184,0.854,-4.15e+06,5.01e+06
total_experience_years,-4.025e+04,2.6e+04,-1.550,0.121,-9.12e+04,1.07e+04
signing_bonus,-4.6148,5.082,-0.908,0.364,-14.585,5.356
annual_bonus,19.2912,2.058,9.373,0.000,15.253,23.329
job_category_Data,-2.456e+05,2.41e+06,-0.102,0.919,-4.98e+06,4.49e+06
job_category_Engineering,-1.431e+05,2.36e+06,-0.061,0.952,-4.76e+06,4.48e+06
job_category_Management,-3.377e+05,2.38e+06,-0.142,0.887,-5e+06,4.33e+06
job_category_Operations,2.511e+04,3e+06,0.008,0.993,-5.86e+06,5.91e+06
job_category_Other,8.178e+05,2.35e+06,0.348,0.728,-3.79e+06,5.42e+06

0,1,2,3
Omnibus:,3188.18,Durbin-Watson:,1.985
Prob(Omnibus):,0.0,Jarque-Bera (JB):,56716532.419
Skew:,32.399,Prob(JB):,0.0
Kurtosis:,1085.254,Cond. No.,3290000.0


In [8]:
# Backward elimination step: drop the Engineering indicator and refit.
# The existing 'const' column is kept, so no constant is re-added.
X_reg = X_reg.drop(columns=['job_category_Engineering'])

reg1 = sm.OLS(y_train, X_reg).fit()
pred = reg1.predict(X_reg)

reg1.summary()

0,1,2,3
Dep. Variable:,annual_base_pay,R-squared:,0.08
Model:,OLS,Adj. R-squared:,0.073
Method:,Least Squares,F-statistic:,11.07
Date:,"Tue, 04 Apr 2023",Prob (F-statistic):,1.11e-16
Time:,18:28:24,Log-Likelihood:,-19414.0
No. Observations:,1158,AIC:,38850.0
Df Residuals:,1148,BIC:,38900.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.893e+05,4.35e+05,0.665,0.506,-5.65e+05,1.14e+06
total_experience_years,-4.023e+04,2.6e+04,-1.550,0.121,-9.12e+04,1.07e+04
signing_bonus,-4.6184,5.079,-0.909,0.363,-14.584,5.347
annual_bonus,19.2906,2.057,9.377,0.000,15.254,23.327
job_category_Data,-1.065e+05,7.59e+05,-0.140,0.888,-1.6e+06,1.38e+06
job_category_Management,-1.986e+05,6.39e+05,-0.311,0.756,-1.45e+06,1.05e+06
job_category_Operations,1.642e+05,1.93e+06,0.085,0.932,-3.63e+06,3.96e+06
job_category_Other,9.57e+05,5.14e+05,1.860,0.063,-5.25e+04,1.97e+06
job_category_Software,-4.047e+04,4.31e+05,-0.094,0.925,-8.85e+05,8.04e+05

0,1,2,3
Omnibus:,3188.176,Durbin-Watson:,1.985
Prob(Omnibus):,0.0,Jarque-Bera (JB):,56716075.024
Skew:,32.399,Prob(JB):,0.0
Kurtosis:,1085.249,Cond. No.,967000.0


In [9]:
# Backward elimination step: drop the Web indicator and refit.
# The existing 'const' column is kept, so no constant is re-added.
X_reg = X_reg.drop(columns=['job_category_Web'])

reg1 = sm.OLS(y_train, X_reg).fit()
pred = reg1.predict(X_reg)

reg1.summary()

0,1,2,3
Dep. Variable:,annual_base_pay,R-squared:,0.08
Model:,OLS,Adj. R-squared:,0.073
Method:,Least Squares,F-statistic:,12.46
Date:,"Tue, 04 Apr 2023",Prob (F-statistic):,3.01e-17
Time:,18:28:24,Log-Likelihood:,-19414.0
No. Observations:,1158,AIC:,38850.0
Df Residuals:,1149,BIC:,38890.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.918e+05,3.76e+05,0.775,0.438,-4.47e+05,1.03e+06
total_experience_years,-4.025e+04,2.59e+04,-1.552,0.121,-9.11e+04,1.06e+04
signing_bonus,-4.6208,5.073,-0.911,0.363,-14.573,5.332
annual_bonus,19.2903,2.056,9.382,0.000,15.256,23.324
job_category_Data,-1.089e+05,7.28e+05,-0.150,0.881,-1.54e+06,1.32e+06
job_category_Management,-2.01e+05,6.04e+05,-0.333,0.739,-1.39e+06,9.84e+05
job_category_Operations,1.618e+05,1.92e+06,0.084,0.933,-3.61e+06,3.93e+06
job_category_Other,9.546e+05,4.7e+05,2.032,0.042,3.29e+04,1.88e+06
job_category_Software,-4.287e+04,3.76e+05,-0.114,0.909,-7.8e+05,6.94e+05

0,1,2,3
Omnibus:,3188.174,Durbin-Watson:,1.985
Prob(Omnibus):,0.0,Jarque-Bera (JB):,56715750.313
Skew:,32.399,Prob(JB):,0.0
Kurtosis:,1085.246,Cond. No.,958000.0


In [10]:
# Backward elimination step: drop the Operations indicator and refit.
# The existing 'const' column is kept, so no constant is re-added.
X_reg = X_reg.drop(columns=['job_category_Operations'])

reg1 = sm.OLS(y_train, X_reg).fit()
pred = reg1.predict(X_reg)

reg1.summary()

0,1,2,3
Dep. Variable:,annual_base_pay,R-squared:,0.08
Model:,OLS,Adj. R-squared:,0.074
Method:,Least Squares,F-statistic:,14.25
Date:,"Tue, 04 Apr 2023",Prob (F-statistic):,7.64e-18
Time:,18:28:24,Log-Likelihood:,-19414.0
No. Observations:,1158,AIC:,38840.0
Df Residuals:,1150,BIC:,38880.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.959e+05,3.73e+05,0.793,0.428,-4.36e+05,1.03e+06
total_experience_years,-4.018e+04,2.59e+04,-1.551,0.121,-9.1e+04,1.07e+04
signing_bonus,-4.6223,5.070,-0.912,0.362,-14.571,5.326
annual_bonus,19.2895,2.055,9.386,0.000,15.257,23.322
job_category_Data,-1.133e+05,7.26e+05,-0.156,0.876,-1.54e+06,1.31e+06
job_category_Management,-2.057e+05,6.01e+05,-0.342,0.732,-1.39e+06,9.74e+05
job_category_Other,9.5e+05,4.66e+05,2.037,0.042,3.5e+04,1.86e+06
job_category_Software,-4.733e+04,3.72e+05,-0.127,0.899,-7.77e+05,6.82e+05

0,1,2,3
Omnibus:,3188.175,Durbin-Watson:,1.985
Prob(Omnibus):,0.0,Jarque-Bera (JB):,56715883.102
Skew:,32.399,Prob(JB):,0.0
Kurtosis:,1085.248,Cond. No.,426000.0


In [11]:
# Backward elimination step: drop the Software indicator and refit.
# The existing 'const' column is kept, so no constant is re-added.
X_reg = X_reg.drop(columns=['job_category_Software'])

reg1 = sm.OLS(y_train, X_reg).fit()
pred = reg1.predict(X_reg)

reg1.summary()

0,1,2,3
Dep. Variable:,annual_base_pay,R-squared:,0.08
Model:,OLS,Adj. R-squared:,0.075
Method:,Least Squares,F-statistic:,16.64
Date:,"Tue, 04 Apr 2023",Prob (F-statistic):,1.79e-18
Time:,18:28:24,Log-Likelihood:,-19414.0
No. Observations:,1158,AIC:,38840.0
Df Residuals:,1151,BIC:,38880.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.589e+05,2.35e+05,1.104,0.270,-2.01e+05,7.19e+05
total_experience_years,-3.985e+04,2.58e+04,-1.546,0.122,-9.04e+04,1.07e+04
signing_bonus,-4.6708,5.054,-0.924,0.356,-14.587,5.245
annual_bonus,19.2864,2.054,9.389,0.000,15.256,23.317
job_category_Data,-7.787e+04,6.7e+05,-0.116,0.908,-1.39e+06,1.24e+06
job_category_Management,-1.714e+05,5.37e+05,-0.319,0.750,-1.23e+06,8.83e+05
job_category_Other,9.845e+05,3.79e+05,2.598,0.010,2.41e+05,1.73e+06

0,1,2,3
Omnibus:,3188.206,Durbin-Watson:,1.985
Prob(Omnibus):,0.0,Jarque-Bera (JB):,56719642.213
Skew:,32.4,Prob(JB):,0.0
Kurtosis:,1085.284,Cond. No.,337000.0


In [12]:
# Backward elimination step: drop the Data indicator and refit.
# The existing 'const' column is kept, so no constant is re-added.
X_reg = X_reg.drop(columns=['job_category_Data'])

reg1 = sm.OLS(y_train, X_reg).fit()
pred = reg1.predict(X_reg)

reg1.summary()

0,1,2,3
Dep. Variable:,annual_base_pay,R-squared:,0.08
Model:,OLS,Adj. R-squared:,0.076
Method:,Least Squares,F-statistic:,19.98
Date:,"Tue, 04 Apr 2023",Prob (F-statistic):,3.8e-19
Time:,18:28:24,Log-Likelihood:,-19414.0
No. Observations:,1158,AIC:,38840.0
Df Residuals:,1152,BIC:,38870.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.532e+05,2.29e+05,1.104,0.270,-1.97e+05,7.03e+05
total_experience_years,-3.969e+04,2.57e+04,-1.543,0.123,-9.02e+04,1.08e+04
signing_bonus,-4.6434,5.046,-0.920,0.358,-14.544,5.258
annual_bonus,19.2871,2.053,9.393,0.000,15.259,23.316
job_category_Management,-1.674e+05,5.36e+05,-0.312,0.755,-1.22e+06,8.84e+05
job_category_Other,9.889e+05,3.77e+05,2.623,0.009,2.49e+05,1.73e+06

0,1,2,3
Omnibus:,3188.217,Durbin-Watson:,1.985
Prob(Omnibus):,0.0,Jarque-Bera (JB):,56721005.55
Skew:,32.4,Prob(JB):,0.0
Kurtosis:,1085.297,Cond. No.,270000.0


In [13]:
# Backward elimination step: drop the Management indicator and refit.
# The existing 'const' column is kept, so no constant is re-added.
X_reg = X_reg.drop(columns=['job_category_Management'])

reg1 = sm.OLS(y_train, X_reg).fit()
pred = reg1.predict(X_reg)

reg1.summary()

0,1,2,3
Dep. Variable:,annual_base_pay,R-squared:,0.08
Model:,OLS,Adj. R-squared:,0.077
Method:,Least Squares,F-statistic:,24.97
Date:,"Tue, 04 Apr 2023",Prob (F-statistic):,7.41e-20
Time:,18:28:24,Log-Likelihood:,-19414.0
No. Observations:,1158,AIC:,38840.0
Df Residuals:,1153,BIC:,38860.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.468e+05,2.28e+05,1.081,0.280,-2.01e+05,6.95e+05
total_experience_years,-4.09e+04,2.54e+04,-1.609,0.108,-9.08e+04,8978.221
signing_bonus,-4.6206,5.044,-0.916,0.360,-14.517,5.276
annual_bonus,19.2513,2.049,9.394,0.000,15.231,23.272
job_category_Other,1.006e+06,3.73e+05,2.696,0.007,2.74e+05,1.74e+06

0,1,2,3
Omnibus:,3187.805,Durbin-Watson:,1.985
Prob(Omnibus):,0.0,Jarque-Bera (JB):,56672102.586
Skew:,32.389,Prob(JB):,0.0
Kurtosis:,1084.83,Cond. No.,188000.0


In [14]:
# Backward elimination step: drop signing_bonus and refit.
# The existing 'const' column is kept, so no constant is re-added.
X_reg = X_reg.drop(columns=['signing_bonus'])

reg1 = sm.OLS(y_train, X_reg).fit()
pred = reg1.predict(X_reg)

reg1.summary()

0,1,2,3
Dep. Variable:,annual_base_pay,R-squared:,0.079
Model:,OLS,Adj. R-squared:,0.077
Method:,Least Squares,F-statistic:,33.02
Date:,"Tue, 04 Apr 2023",Prob (F-statistic):,1.77e-20
Time:,18:28:24,Log-Likelihood:,-19415.0
No. Observations:,1158,AIC:,38840.0
Df Residuals:,1154,BIC:,38860.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.027e+05,2.23e+05,0.908,0.364,-2.35e+05,6.4e+05
total_experience_years,-4.02e+04,2.54e+04,-1.582,0.114,-9.01e+04,9646.440
annual_bonus,19.1078,2.043,9.352,0.000,15.099,23.116
job_category_Other,1.027e+06,3.72e+05,2.760,0.006,2.97e+05,1.76e+06

0,1,2,3
Omnibus:,3186.969,Durbin-Watson:,1.984
Prob(Omnibus):,0.0,Jarque-Bera (JB):,56573089.41
Skew:,32.367,Prob(JB):,0.0
Kurtosis:,1083.883,Cond. No.,187000.0


In [15]:
# Backward elimination step: drop total_experience_years and refit.
# The existing 'const' column is kept, so no constant is re-added.
X_reg = X_reg.drop(columns=['total_experience_years'])

reg1 = sm.OLS(y_train, X_reg).fit()
pred = reg1.predict(X_reg)

reg1.summary()

0,1,2,3
Dep. Variable:,annual_base_pay,R-squared:,0.077
Model:,OLS,Adj. R-squared:,0.075
Method:,Least Squares,F-statistic:,48.22
Date:,"Tue, 04 Apr 2023",Prob (F-statistic):,7.73e-21
Time:,18:28:24,Log-Likelihood:,-19416.0
No. Observations:,1158,AIC:,38840.0
Df Residuals:,1155,BIC:,38850.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-5.877e+04,1.5e+05,-0.392,0.695,-3.53e+05,2.36e+05
annual_bonus,19.0377,2.044,9.314,0.000,15.027,23.048
job_category_Other,9.694e+05,3.71e+05,2.615,0.009,2.42e+05,1.7e+06

0,1,2,3
Omnibus:,3190.584,Durbin-Watson:,1.982
Prob(Omnibus):,0.0,Jarque-Bera (JB):,57018555.34
Skew:,32.462,Prob(JB):,0.0
Kurtosis:,1088.134,Cond. No.,187000.0


In [16]:
# Cross-check with a different package (scikit-learn).
# linear_model, mean_squared_error and r2_score are imported in the notebook's
# import cell at the top rather than here.
# Note: this fits on X_train with *all* predictors, not the reduced set left
# after backward elimination, and it scores on the held-out test split.

# Create the linear regression object and train it on the training set
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

# Make predictions using the testing set
y_pred = regr.predict(X_test)

print(f'MSE: {mean_squared_error(y_test, y_pred):.2f}')
print(f'R^2: {r2_score(y_test, y_pred):.10f}')

MSE: 202786762978519136.00
R^2: -0.0099214390
