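The aim of this notebook is to compare two simple ML models, Linear Regression and Decision Tree Regression. Both are fitted to the binary `target` column and compared by their root-mean-squared error (RMSE) on the training set and on a held-out test set.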

# Quick Look at the Data Structure

In [1]:
import numpy as np 
import pandas as pd 
# Load the raw dataset from the Kaggle input directory and preview the
# first rows.
csv_path = "/kaggle/input/datascience-job-data/data_science_job.csv"
job_data = pd.read_csv(csv_path)
job_data.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,20.0,,,36.0,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15.0,50-99,Pvt Ltd,47.0,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5.0,,,83.0,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,0.0,,Pvt Ltd,52.0,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,20.0,50-99,Funded Startup,8.0,0.0


In [2]:
# Column names, dtypes and non-null counts of the raw frame; a non-null count
# below the number of rows means that column has missing values.
job_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             19158 non-null  int64  
 1   city                    19158 non-null  object 
 2   city_development_index  18679 non-null  float64
 3   gender                  14650 non-null  object 
 4   relevent_experience     19158 non-null  object 
 5   enrolled_university     18772 non-null  object 
 6   education_level         18698 non-null  object 
 7   major_discipline        16345 non-null  object 
 8   experience              19093 non-null  float64
 9   company_size            13220 non-null  object 
 10  company_type            13018 non-null  object 
 11  training_hours          18392 non-null  float64
 12  target                  19158 non-null  float64
dtypes: float64(4), int64(1), object(8)
memory usage: 1.9+ MB


In [3]:
# Fraction of missing values per column, largest first.
job_data.isna().sum().sort_values(ascending=False).div(len(job_data))

company_type              0.320493
company_size              0.309949
gender                    0.235306
major_discipline          0.146832
training_hours            0.039983
city_development_index    0.025003
education_level           0.024011
enrolled_university       0.020148
experience                0.003393
enrollee_id               0.000000
city                      0.000000
relevent_experience       0.000000
target                    0.000000
dtype: float64

# Data Preparation

In [4]:
# These three columns have the highest share of missing values (roughly
# 24-32% each), so they are removed entirely instead of dropping that many rows.
job_data = job_data.drop(columns=["company_type", "company_size", "gender"])

In [5]:
# The remaining columns with gaps are each missing in under 15% of rows, so
# the incomplete rows are discarded instead of being imputed.
na_columns = [
    "major_discipline",
    "training_hours",
    "city_development_index",
    "education_level",
    "enrolled_university",
    "experience",
]
job_data = job_data.dropna(axis=0, how="any", subset=na_columns)

In [6]:
# Sanity check after cleaning: every column should now report a missing
# fraction of 0.
job_data.isna().sum().sort_values(ascending=False).div(len(job_data))

enrollee_id               0.0
city                      0.0
city_development_index    0.0
relevent_experience       0.0
enrolled_university       0.0
education_level           0.0
major_discipline          0.0
experience                0.0
training_hours            0.0
target                    0.0
dtype: float64

In [7]:
# List the distinct values of this text column (and how often each occurs)
# before encoding it.
job_data.loc[:, "relevent_experience"].value_counts()

relevent_experience
Has relevent experience    11717
No relevent experience      3312
Name: count, dtype: int64

In [8]:
# Encode the text column as 1 ("Has relevent experience") / 0 (anything else)
# using a vectorised comparison.
# Gotcha: run this once per fresh load. After the strings have been replaced,
# re-running the cell would compare numbers to the string and set every row to 0.
job_data["relevent_experience"] = (
    job_data["relevent_experience"].eq("Has relevent experience").astype("int64")
)

In [9]:
# Both columns only hold 0/1 values at this point, so store them as booleans.
bool_col = ["relevent_experience", "target"]
job_data = job_data.astype(dict.fromkeys(bool_col, "bool"))

In [10]:
# Re-inspect the cleaned frame: the two flag columns should now be bool and
# every column should have the same non-null count as the number of rows.
job_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15029 entries, 0 to 19155
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             15029 non-null  int64  
 1   city                    15029 non-null  object 
 2   city_development_index  15029 non-null  float64
 3   relevent_experience     15029 non-null  bool   
 4   enrolled_university     15029 non-null  object 
 5   education_level         15029 non-null  object 
 6   major_discipline        15029 non-null  object 
 7   experience              15029 non-null  float64
 8   training_hours          15029 non-null  float64
 9   target                  15029 non-null  bool   
dtypes: bool(2), float64(3), int64(1), object(4)
memory usage: 1.1+ MB


# Discovering Data

In [11]:
# Correlation is only computed on the non-text columns, so set the `object`
# columns aside first.
object_col = job_data.select_dtypes(include=["object"]).columns
job_num = job_data.drop(columns=object_col)
corr_matrix = job_num.corr()

In [12]:
# Correlation of every numeric column with the target, most positive first.
corr_matrix.loc[:, "target"].sort_values(ascending=False)

target                    1.000000
enrollee_id               0.046335
training_hours           -0.025512
relevent_experience      -0.168891
experience               -0.209390
city_development_index   -0.336429
Name: target, dtype: float64

# Create a Test Set and Transform the Data

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned rows for the final evaluation; the fixed seed
# keeps the split identical between runs.
train_set, test_set = train_test_split(
    job_data,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Separate the training split into the label column and the feature columns.
# Note: from here on `job_data` refers to the training features only.
job_labels = train_set["target"].copy()
job_data = train_set.drop(columns=["target"])

In [15]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Split the feature columns by dtype: every non-`object` column goes through
# the numeric branch (this includes the boolean `relevent_experience` flag and
# `enrollee_id`), every `object` column goes through the categorical branch.
# NOTE(review): `enrollee_id` looks like a row identifier rather than a
# predictive feature; consider excluding it from `num_attribs`.
num_attribs = job_data.select_dtypes(exclude=["object"]).columns
cat_attribs = job_data.select_dtypes(include=["object"]).columns

pipeline = ColumnTransformer(
    [
        # Numeric branch: standardise each column.
        ("num", Pipeline([("std", StandardScaler())]), num_attribs),
        # Categorical branch: one-hot encode. handle_unknown="ignore" makes
        # transform() encode a category that was not seen during fit (e.g. a
        # city that only occurs in the test split) as all zeros instead of
        # raising an error. The encoding of the training data is unaffected.
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs),
    ]
)

# Fit the transformer on the training features only and encode them.
job_data_prepared = pipeline.fit_transform(job_data)

# Model Training

## Linear Regression

In [16]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares fit on the prepared training features; the boolean
# target is what the model regresses on.
lin_reg = LinearRegression()
lin_reg.fit(X=job_data_prepared, y=job_labels)

In [17]:
# Spot-check: predictions for the first five training rows. They go through
# the already-fitted transformer (transform only, no refit).
some_data = job_data.head(5)
some_labels = job_labels.head(5)
some_data_prepared = pipeline.transform(some_data)
linear_sample_predictions = lin_reg.predict(some_data_prepared)
print("Predictions: ", linear_sample_predictions)

Predictions:  [0.22554823 0.62614302 0.15805625 0.15649843 0.10434165]


In [18]:
# True labels for the same five rows, for comparison with the predictions.
print("Labels: ", some_labels.tolist())

Labels:  [True, True, False, False, False]


In [19]:
from sklearn.metrics import mean_squared_error
# RMSE of the linear model on the data it was trained on (not a
# generalisation estimate).
job_predictions = lin_reg.predict(job_data_prepared)
lin_mse = mean_squared_error(y_true=job_labels, y_pred=job_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.39184047937441757

## Decision Tree Regression

In [20]:
from sklearn.tree import DecisionTreeRegressor

# Fully grown regression tree (no depth or leaf-size limits are set).
# random_state is fixed because scikit-learn permutes the candidate features
# at every split; without a seed the fitted tree, and every score derived from
# it, can differ from run to run. 42 matches the seed used for the train/test
# split.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(job_data_prepared, job_labels)

In [21]:
# Same five-row spot-check as for the linear model, now with the tree.
some_data, some_labels = job_data.iloc[:5], job_labels.iloc[:5]
some_data_prepared = pipeline.transform(some_data)
tree_sample_predictions = tree_reg.predict(some_data_prepared)
print("Predictions: ", tree_sample_predictions)

Predictions:  [1. 1. 0. 0. 0.]


In [22]:
# True labels for the five sampled rows.
print("Labels: ", [*some_labels])

Labels:  [True, True, False, False, False]


In [23]:
from sklearn.metrics import mean_squared_error

# RMSE of the tree on the data it was trained on. A value at or near 0 means
# the tree has memorised the training labels (this model overfits), so this
# number says little about how it performs on unseen data.
job_predictions = tree_reg.predict(job_data_prepared)
tree_mse = mean_squared_error(y_true=job_labels, y_pred=job_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

# Evaluating on the Test Set

In [24]:
# Split the held-out set into labels and features, then encode the features
# with the transformer fitted on the training data (transform only, no refit).
labels = test_set["target"].copy()
data = test_set.drop(columns=["target"])
data_prepared = pipeline.transform(data)

# Decision Tree Model
# Note: `tree_mse` / `tree_rmse` are reused here, so the training-set values
# computed earlier are replaced by the test-set values.
job_predictions_tree = tree_reg.predict(data_prepared)
tree_mse = mean_squared_error(y_true=labels, y_pred=job_predictions_tree)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.5686416212459071

In [25]:
# Linear Regression Model
# Test-set RMSE for the linear model. `lin_mse` / `lin_rmse` are reused, so
# the training-set values computed earlier are replaced.
job_predictions_lin = lin_reg.predict(data_prepared)
lin_mse = mean_squared_error(y_true=labels, y_pred=job_predictions_lin)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.40670718163148517