# Regression model for the Glass Door Data set

In [47]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

#### Load the dataset

In [49]:
ds = pd.read_csv('salary_data_cleaned.csv')

# Check for missing values
print(ds.isnull().sum())
ds.head()

Job Title            0
Salary Estimate      0
Job Description      0
Rating               0
Company Name         0
Location             0
Headquarters         0
Size                 0
Founded              0
Type of ownership    0
Industry             0
Sector               0
Revenue              0
Competitors          0
hourly               0
employer_provided    0
min_salary           0
max_salary           0
avg_salary           0
company_txt          0
job_state            0
same_state           0
age                  0
python_yn            0
R_yn                 0
spark                0
aws                  0
excel                0
dtype: int64


Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,...,avg_salary,company_txt,job_state,same_state,age,python_yn,R_yn,spark,aws,excel
0,Data Scientist,$53K-$91K (Glassdoor est.),"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,Tecolote Research\n3.8,"Albuquerque, NM","Goleta, CA",501 to 1000 employees,1973,Company - Private,...,72.0,Tecolote Research\n,NM,0,47,1,0,0,0,1
1,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),What You Will Do:\n\nI. General Summary\n\nThe...,3.4,University of Maryland Medical System\n3.4,"Linthicum, MD","Baltimore, MD",10000+ employees,1984,Other Organization,...,87.5,University of Maryland Medical System\n,MD,0,36,1,0,0,0,0
2,Data Scientist,$80K-$90K (Glassdoor est.),"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4\n4.8,"Clearwater, FL","Clearwater, FL",501 to 1000 employees,2010,Company - Private,...,85.0,KnowBe4\n,FL,1,10,1,0,1,0,1
3,Data Scientist,$56K-$97K (Glassdoor est.),*Organization and Job ID**\nJob ID: 310709\n\n...,3.8,PNNL\n3.8,"Richland, WA","Richland, WA",1001 to 5000 employees,1965,Government,...,76.5,PNNL\n,WA,1,55,1,0,0,0,0
4,Data Scientist,$86K-$143K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,...,114.5,Affinity Solutions\n,NY,1,22,1,0,0,0,1


In [50]:
# Get the shape of the data set
shapeInfo = ds.shape
columnInfo = ds.dtypes.reset_index()
columnInfo.columns = ['Column Name', 'Data Type']

print(shapeInfo)
print("Column information for the data set:")
print(columnInfo)

(742, 28)
Column information for the data set:
          Column Name Data Type
0           Job Title    object
1     Salary Estimate    object
2     Job Description    object
3              Rating   float64
4        Company Name    object
5            Location    object
6        Headquarters    object
7                Size    object
8             Founded     int64
9   Type of ownership    object
10           Industry    object
11             Sector    object
12            Revenue    object
13        Competitors    object
14             hourly     int64
15  employer_provided     int64
16         min_salary     int64
17         max_salary     int64
18         avg_salary   float64
19        company_txt    object
20          job_state    object
21         same_state     int64
22                age     int64
23          python_yn     int64
24               R_yn     int64
25              spark     int64
26                aws     int64
27              excel     int64


In [51]:
# Look at the info for the dataset
dsInfo = ds.info()

print("Info for the data set: ")
print(dsInfo)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 742 entries, 0 to 741
Data columns (total 28 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Job Title          742 non-null    object 
 1   Salary Estimate    742 non-null    object 
 2   Job Description    742 non-null    object 
 3   Rating             742 non-null    float64
 4   Company Name       742 non-null    object 
 5   Location           742 non-null    object 
 6   Headquarters       742 non-null    object 
 7   Size               742 non-null    object 
 8   Founded            742 non-null    int64  
 9   Type of ownership  742 non-null    object 
 10  Industry           742 non-null    object 
 11  Sector             742 non-null    object 
 12  Revenue            742 non-null    object 
 13  Competitors        742 non-null    object 
 14  hourly             742 non-null    int64  
 15  employer_provided  742 non-null    int64  
 16  min_salary         742 non

In [52]:
# A couple of different company metrics we can look at
avgCompRate = ds['Rating'].mean()
uniqueCompSize = ds['Size'].unique()
compFoundingYear = ds['Founded'].value_counts()
topIndustries = ds['Industry'].value_counts().head(5)
avgCompSal = ds['avg_salary'].mean()

# Display the information
print("Avgerae company rating: ", avgCompRate)

print("Different company sizes: ")
print(uniqueCompSize)

print("Company founding years: ")
print(compFoundingYear)

print("Top 5 industries: ")
print(topIndustries)

print("Average salary", avgCompSal)

Avgerae company rating:  3.6188679245283017
Different company sizes: 
['501 to 1000 employees' '10000+ employees' '1001 to 5000 employees'
 '51 to 200 employees' '201 to 500 employees' '5001 to 10000 employees'
 '1 to 50 employees' 'Unknown' '-1']
Company founding years: 
Founded
-1       50
 2010    32
 2008    31
 1996    27
 2006    24
         ..
 1860     1
 1942     1
 1878     1
 1971     1
 1889     1
Name: count, Length: 102, dtype: int64
Top 5 industries: 
Industry
Biotech & Pharmaceuticals           112
Insurance Carriers                   63
Computer Hardware & Software         59
IT Services                          50
Health Care Services & Hospitals     49
Name: count, dtype: int64
Average salary 100.62601078167116


In [61]:
# Predict average salary based on other columns
X = ds[['Rating', 'Founded', 'hourly', 'employer_provided', 'min_salary', 'max_salary', 'same_state', 'age']]
y = ds['avg_salary']

# data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
print('training set size:', X_train.shape[0])
print('test set size:', X_test.shape[0])

training set size: 519
test set size: 223


In [62]:
from sklearn.linear_model import LinearRegression as LR
from sklearn.metrics import mean_squared_error, r2_score

# We will train a regression model

slr = LR()
slr.fit(X_train, y_train)

y_test_pred = slr.predict(X_test)

mse = mean_squared_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)


Mean Squared Error: 3.7767688647989286e-28
R-squared: 1.0
