# Salary Prediction Linear Regression

In [1]:
#Import Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler , MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score , mean_squared_error

In [2]:
# Load the cleaned salary table from disk and display it.

salaries_csv = 'Datasets/all_salaries_clean.csv'
salaries_data = pd.read_csv(salaries_csv)
salaries_data

Unnamed: 0.1,Unnamed: 0,Year,Company,City,State,Title,Area,Total Annual Compensation,Years of Experience,Years at Company,Base Salary,Stock Grant Value,Bonus,Masters Degree,Bachelors Degree,Doctorate Degree,Highschool,Some College
0,0,2017,Oracle,Redwood City,CA,Product Manager,,127000,1.5,1.5,107000,20000.0,10000.0,0,0,0,0,0
1,1,2017,eBay,San Francisco,CA,Software Engineer,,100000,5.0,3.0,0,0.0,0.0,0,0,0,0,0
2,2,2017,Amazon,Seattle,WA,Product Manager,,310000,8.0,0.0,155000,0.0,0.0,0,0,0,0,0
3,3,2017,Apple,Sunnyvale,CA,Software Engineering Manager,,372000,7.0,5.0,157000,180000.0,35000.0,0,0,0,0,0
4,4,2017,Microsoft,Mountain View,CA,Software Engineer,,157000,5.0,3.0,0,0.0,0.0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52834,62637,2018,Google,Seattle,WA,Software Engineer,Distributed Systems (Back-End),327000,10.0,1.0,155000,150000.0,22000.0,0,0,0,0,0
52835,62638,2018,Microsoft,Redmond,WA,Software Engineer,Full Stack,237000,2.0,2.0,146900,73200.0,16000.0,0,0,0,0,0
52836,62639,2018,MSFT,Seattle,WA,Software Engineer,Full Stack,220000,14.0,12.0,157000,25000.0,20000.0,0,0,0,0,0
52837,62640,2018,Salesforce,San Francisco,CA,Software Engineer,iOS,280000,8.0,4.0,194688,57000.0,29000.0,0,0,0,0,0


In [3]:
# Count missing entries per column (isna is the canonical name for isnull).
salaries_data.isna().sum()

Unnamed: 0                     0
Year                           0
Company                        0
City                           0
State                          0
Title                          0
Area                         816
Total Annual Compensation      0
Years of Experience            0
Years at Company               0
Base Salary                    0
Stock Grant Value              0
Bonus                          0
Masters Degree                 0
Bachelors Degree               0
Doctorate Degree               0
Highschool                     0
Some College                   0
dtype: int64

In [7]:
# Print column dtypes and non-null counts for the loaded frame.
# NOTE(review): the saved output under this cell reports 62642 rows and 29
# lowercase-named columns, while every other cell shows 52839 rows with
# title-cased columns, and its execution count (7) is out of sequence.
# It appears to come from a run against a different frame — re-run to refresh.
salaries_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62642 entries, 0 to 62641
Data columns (total 29 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   timestamp                62642 non-null  object 
 1   company                  62637 non-null  object 
 2   level                    62523 non-null  object 
 3   title                    62642 non-null  object 
 4   totalyearlycompensation  62642 non-null  int64  
 5   location                 62642 non-null  object 
 6   yearsofexperience        62642 non-null  float64
 7   yearsatcompany           62642 non-null  float64
 8   tag                      61788 non-null  object 
 9   basesalary               62642 non-null  float64
 10  stockgrantvalue          62642 non-null  float64
 11  bonus                    62642 non-null  float64
 12  gender                   43102 non-null  object 
 13  otherdetails             40137 non-null  object 
 14  cityid                

In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
salaries_data.describe()

Unnamed: 0.1,Unnamed: 0,Year,Total Annual Compensation,Years of Experience,Years at Company,Base Salary,Stock Grant Value,Bonus,Masters Degree,Bachelors Degree,Doctorate Degree,Highschool,Some College
count,52839.0,52839.0,52839.0,52839.0,52839.0,52839.0,52839.0,52839.0,52839.0,52839.0,52839.0,52839.0,52839.0
mean,30154.050152,2020.005167,235398.5,7.242433,2.754126,147588.8,57008.89,20977.710474,0.244157,0.179337,0.030924,0.003918,0.005583
std,18077.267597,0.921413,137657.7,5.955109,3.340259,57409.06,86397.62,27987.338795,0.42959,0.383638,0.173114,0.062468,0.074511
min,0.0,2017.0,11000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,14325.5,2019.0,153000.0,3.0,0.0,120000.0,0.0,2000.0,0.0,0.0,0.0,0.0,0.0
50%,29631.0,2020.0,200000.0,6.0,2.0,147000.0,29000.0,15000.0,0.0,0.0,0.0,0.0,0.0
75%,45571.5,2021.0,280000.0,10.0,4.0,175000.0,75000.0,29000.0,0.0,0.0,0.0,0.0,0.0
max,62641.0,2021.0,4980000.0,69.0,69.0,1659870.0,2800000.0,1000000.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# Re-read the CSV keeping only the columns used for modelling: the compensation
# target, the two years-based features, and the five education 0/1 columns.
model_columns = [
    'Total Annual Compensation',
    'Years at Company',
    'Years of Experience',
    'Masters Degree',
    'Bachelors Degree',
    'Doctorate Degree',
    'Highschool',
    'Some College',
]
ds = pd.read_csv("Datasets/all_salaries_clean.csv", usecols=model_columns)
ds

Unnamed: 0,Total Annual Compensation,Years of Experience,Years at Company,Masters Degree,Bachelors Degree,Doctorate Degree,Highschool,Some College
0,127000,1.5,1.5,0,0,0,0,0
1,100000,5.0,3.0,0,0,0,0,0
2,310000,8.0,0.0,0,0,0,0,0
3,372000,7.0,5.0,0,0,0,0,0
4,157000,5.0,3.0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
52834,327000,10.0,1.0,0,0,0,0,0
52835,237000,2.0,2.0,0,0,0,0,0
52836,220000,14.0,12.0,0,0,0,0,0
52837,280000,8.0,4.0,0,0,0,0,0


In [8]:
# Separate the regression target from the predictor columns.
target_col = 'Total Annual Compensation'
y = ds[target_col]
X = ds.drop(columns=[target_col])

In [9]:
# Sanity check: the feature matrix and the target must have the same number of rows.
X.shape , y.shape

((52839, 7), (52839,))

In [10]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)
# Show the shape of each partition, in the same order as they were unpacked.
tuple(part.shape for part in (X_train, X_test, Y_train, Y_test))

((42271, 7), (10568, 7), (42271,), (10568,))

In [11]:
# Fit a linear regression on the training split.
# fit() is left as the last expression, so the cell displays the estimator it returns.
lr = LinearRegression()
lr.fit(X_train, Y_train)

LinearRegression()

In [12]:
# Predict total annual compensation for the held-out rows and display the array.
pred = lr.predict(X_test)
pred

array([508067.35909628, 245029.87054413, 288704.14390261, ...,
       203457.77460108, 185978.48251328, 175630.963964  ])

In [13]:
# Display the actual held-out targets for a visual comparison with the predictions.
Y_test

52198    1716000
48650     411000
21461     333000
7089      410000
10980     140000
          ...   
40824     242000
23808     140000
43202     152000
39068     239000
31894     210000
Name: Total Annual Compensation, Length: 10568, dtype: int64

In [14]:
# Residuals as actual minus predicted: positive values mean the model under-predicted.
diff = Y_test - pred

In [15]:
# Put actual values, predictions and residuals side by side, one row per test sample.
# Stacking the three 1-D inputs as columns gives a single float matrix with a
# fresh 0..n-1 index (the original test-row labels are not carried over).
comparison_values = np.column_stack([Y_test, pred, diff])
pd.DataFrame(comparison_values, columns=['Actual', 'Predicted', 'Difference'])

Unnamed: 0,Actual,Predicted,Difference
0,1716000.0,508067.359096,1.207933e+06
1,411000.0,245029.870544,1.659701e+05
2,333000.0,288704.143903,4.429586e+04
3,410000.0,221484.259844,1.885157e+05
4,140000.0,179858.633920,-3.985863e+04
...,...,...,...
10563,242000.0,243810.184647,-1.810185e+03
10564,140000.0,224492.243903,-8.449224e+04
10565,152000.0,203457.774601,-5.145777e+04
10566,239000.0,185978.482513,5.302152e+04


In [18]:
# Score the fitted model on the held-out split (R^2 for a regressor).
lr.score(X_test , Y_test)

0.22222032534605973

In [19]:
# Root-mean-squared error on the held-out split, in the same units as the target.
# The `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the square root is taken explicitly;
# this gives the same value on every scikit-learn version.
np.sqrt(mean_squared_error(Y_test , pred))

124042.05187670092

In [20]:
# Coefficient of determination computed from the stored predictions.
r2_score(Y_test , pred)

0.22222032534605973