In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold

from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
data = pd.read_csv('higher_ed_sal.csv')

In [3]:
data.head()

Unnamed: 0,Name,School,Job Description,Department,Earnings,Year
0,Don Potter,University of Akron,Assistant Lecturer,Social Work,2472.0,2019
1,Emily Potter,The Ohio State University,Administrative Assistant 3,Arts and Sciences | Chemistry and Biochemistry...,48538.02,2022
2,Carol Jean Potter,The Ohio State University,Associate Professor-Clinical,Pediatrics,22722.8,2013
3,Kim Potter,The Ohio State University,"Manager 4, Compliance",Legal Affairs | Compliance,170143.44,2022
4,Graham Potter,Miami University,Building and Grounds Assistant,"Assoc VP Housing,Dining,Rec,Bus Svc",3075.2,2012


In [4]:
data.shape

(934348, 6)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 934348 entries, 0 to 934347
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Name             934348 non-null  object 
 1   School           934348 non-null  object 
 2   Job Description  907680 non-null  object 
 3   Department       873896 non-null  object 
 4   Earnings         924673 non-null  float64
 5   Year             934348 non-null  int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 42.8+ MB


In [6]:
null_values_per_column = data.isnull().sum() * 100 / len(data)

In [7]:
print(null_values_per_column)

Name               0.000000
School             0.000000
Job Description    2.854183
Department         6.469966
Earnings           1.035481
Year               0.000000
dtype: float64


In [8]:
data_cleaned = data.dropna()

In [9]:
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 848591 entries, 0 to 934347
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Name             848591 non-null  object 
 1   School           848591 non-null  object 
 2   Job Description  848591 non-null  object 
 3   Department       848591 non-null  object 
 4   Earnings         848591 non-null  float64
 5   Year             848591 non-null  int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 45.3+ MB


In [10]:
data_cleaned.describe(include='all')

Unnamed: 0,Name,School,Job Description,Department,Earnings,Year
count,848591,848591,848591,848591,848591.0,848591.0
unique,246341,13,33832,9215,,
top,Michael Smith,The Ohio State University,Professor,University Hospitals,,
freq,64,423931,25419,45236,,
mean,,,,,54664.75,2017.091136
std,,,,,62740.9,3.219944
min,,,,,0.02,2011.0
25%,,,,,19508.78,2014.0
50%,,,,,44006.88,2017.0
75%,,,,,71011.13,2020.0


In [11]:
# Dropping Name as we don't want to predict values based on Name
data_cleaned = data_cleaned.drop('Name',axis=1)

In [12]:
# Shuffle data to avoid bias, improve generalization, and fair evaluation
# Shuffle 100% of data without replacement and reset index afterwards
# drop=True to avoid creating new columns
data_cleaned = data_cleaned.sample(frac=1.0).reset_index(drop=True)

In [13]:
X = data_cleaned.drop('Earnings',axis=1)
y = data_cleaned['Earnings']

In [14]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 848591 entries, 0 to 848590
Data columns (total 4 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   School           848591 non-null  object
 1   Job Description  848591 non-null  object
 2   Department       848591 non-null  object
 3   Year             848591 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 25.9+ MB


In [15]:
# Use K Fold validation instead of traditioinal train-test split
# Train-test split we might get test set that isn't representative of train set
# K Fold: every training sample has a chance to be part of the test set

def build_pipeline(regressor):
    # regressor is model
    
    # transform categorical features into numeric
    # pipeline built to create more robust, efficient, and reusable machine learning workflows
    nominal_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]) 
    # handle_unknown = 'ignore' for circumstances when unknown value shows up to avoid throwing error

    # tell model which features are categorical in dataframe
    cat_cols = ['School', 'Job Description', 'Department']
    preprocessor = ColumnTransformer(transformers=[
        ('nominal', nominal_transformer, cat_cols)], remainder='passthrough')
    # remainder='passthrough' to avoid dropping columns that aren't listed in cat_cols
    
    # create model
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('scaler', StandardScaler(with_mean=False)), # scales data, so that each column has the same range of values 
        ('regressor', regressor)
    ])

    return model

In [16]:
models = {
    'Linear Regression (Ridge)': build_pipeline(Ridge()), # fits a linear model to the data, adding a regularization term to prevent overfitting
    'Decision Tree': build_pipeline(DecisionTreeRegressor()), # creates a tree-like model of decisions and their possible consequences
    # 'Neural Network': build_pipeline(MLPRegressor()), # a complex model inspired by the human brain, composed of interconnected layers of artificial neurons
    # 'Random Forest': build_pipeline(RandomForestRegressor()), # builds decision trees in parallel
    # 'GradientBoostingRegressor': build_pipeline(GradientBoostingRegressor()) # builds decision trees in series (one after another)
    }

In [17]:
# divides data set into K sections, standard is 5 or 10
# you get a variaty of test sets using K Fold
# larger K, the more data the model will have to train on, the less data the model will have to test on
# high K is more computational expensive

def evaluate_model(model, X, y):
    kf = KFold(n_splits=5)
    rmses = []
    rs2s = []
    # split x into 5 sections, provides indexes for each split
    # iteration 1: test index = 1st/5 of data, train index = 4/5
    # iteration 2: test index = 2nd/5 of data, train index = 3rd/5
    for train_idx, test_idx in kf.split(X): 
        # Fit model
        # X.iloc[train_idx, :] - train_idx=rows, : all cols
        # y.iloc[train_idx] - no need for cols, since it's one dimensional series
        model.fit(X.iloc[train_idx, :], y.iloc[train_idx])

        # Make predictions
        pred = model.predict(X.iloc[test_idx, :])

        # Calculate Root Mean Square Error (RMSE)
        rmse = np.sqrt(np.mean((y.iloc[test_idx]-pred)**2))
        rmses.append(rmse)

        # Calculate R2
        rs2 = 1 - (np.sum((y.iloc[test_idx]-pred)**2)/np.sum((y.iloc[test_idx]-y.iloc[test_idx].mean())**2)) 
        rs2s.append(rs2)

    # Return average RMSE and R2
    return np.mean(rmses), np.mean(rs2s)


In [18]:
for name, model in models.items():
    print(name + ' RMSE: {:.2F}'.format(evaluate_model(model, X, y)[0]))

Linear Regression (Ridge) RMSE: 37371.18
Decision Tree RMSE: 35762.50


In [19]:
for name, model in models.items():
    print(name + ' R2: {:.5F}'.format(evaluate_model(model, X, y)[1]))

Linear Regression (Ridge) R2: 0.64492
Decision Tree R2: 0.67571


The Decision Tree has a lower RMSE, indicating that its predictions are, on average, closer to the actual earnings values.

The Decision Tree also has a higher R², suggesting that it explains a larger proportion of the variance in earnings compared to the Ridge Regression model.

Therefore, the decision tree model has the best performance.

In [20]:
new_employees = pd.DataFrame({
    'School': ['Miami University', 'Miami University'],
    'Job Description': ['Professor','Professor'],
    'Department': ['Pediatrics','Social Work'],
    'Year': [2024, 2024]
})

In [23]:
# Make prediction on new dataset
predict_earnings = models['Decision Tree'].predict(new_employees)

In [25]:
new_employees['Predicted Earnings'] = predict_earnings

In [26]:
new_employees.head()

Unnamed: 0,School,Job Description,Department,Year,Predicted Earnings
0,Miami University,Professor,Pediatrics,2024,215563.32
1,Miami University,Professor,Social Work,2024,99886.175
