In [1]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Location of the HR dataset. Set the HR_DATASET_PATH environment variable to
# point at another copy of the file; when it is unset, the original local path
# is used so existing runs keep working.
DATA_PATH = os.environ.get(
    "HR_DATASET_PATH",
    r"C:\Users\panku\Downloads\Aryan\jupyter notebook\final_hr\hrdataset.csv",
)

# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved index — confirm, and consider index_col=0 if so.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Gender,Business,Dependancies,Calls,Type,Billing,Rating,Age,Salary,Base_pay,Bonus,Unit_Price,Volume,openingbalance,closingbalance,low,Unit_Sales,Total_Sales,Months,Education
0,Female,0,No,Yes,Month-to-month,No,Yes,18,5089.0,2035.6,254.45,3.77,21226600,3.75,3.76,3.65,18.25,18.8,0,High School or less
1,Female,0,No,Yes,Month-to-month,No,Yes,19,5698.12,2279.248,284.906,3.74,10462800,3.85,3.68,3.65,18.4,18.85,0,High School or less
2,Male,0,No,Yes,Month-to-month,Yes,No,22,5896.65,2358.66,294.8325,3.89,18761000,4.23,4.29,3.72,18.7,18.9,0,High School or less
3,Female,1,No,Yes,Month-to-month,Yes,Yes,21,6125.12,2450.048,306.256,4.35,66130600,4.26,4.31,3.83,18.75,19.0,0,High School or less
4,Male,0,No,Yes,Month-to-month,Yes,Yes,23,6245.0,2498.0,312.25,4.34,26868200,4.79,4.41,4.08,18.8,19.05,1,High School or less


In [3]:
# Columns fed to the model and the column it is trained to predict.
# NOTE(review): in the rows shown by df.head(), Bonus is exactly 5% of Salary,
# so Bonus may leak the target — confirm against the full dataset and consider
# dropping it from FEATURES (together with the numeric column list below).
FEATURES = [
    "Age",
    "Bonus",
    "Months",
    "Education",
]
TARGET = "Salary"

# Split the frame into the design matrix and the target series.
X, y = df[FEATURES], df[TARGET]

In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split


In [5]:
# Column roles for preprocessing. The numeric list is derived from FEATURES
# (everything that is not categorical) so the two definitions cannot drift
# apart when the feature set changes.
cat_features = ["Education"]
num_features = [col for col in FEATURES if col not in cat_features]

# Fail fast if a categorical column is not part of the modelling frame.
missing = [col for col in cat_features if col not in FEATURES]
assert not missing, f"categorical columns not in FEATURES: {missing}"

In [6]:
# Numeric branch: fill missing values with the column median, then standardise.
num_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(num_steps)


In [10]:
# Categorical branch: fill gaps with the most frequent category, then one-hot
# encode into a dense array.
# handle_unknown="ignore" is required for deployment: categories that were not
# seen during fit are encoded without raising an error.
cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", cat_encoder),
])


In [11]:
# Route each group of columns through its own preprocessing branch.
branches = [
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features),
]
preprocessor = ColumnTransformer(branches)


In [12]:
# Random forest with 200 trees; the fixed seed keeps results repeatable and
# n_jobs=-1 asks scikit-learn to use all available cores.
rf_params = {"n_estimators": 200, "random_state": 42, "n_jobs": -1}
model = RandomForestRegressor(**rf_params)


In [13]:
# Chain preprocessing and the regressor so raw feature frames can be passed
# straight to fit/predict.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model),
])


In [14]:
# Fit the preprocessing steps and the forest together on the training split.
pipeline.fit(X_train, y=y_train)


0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [16]:
# Score the held-out split and report R2, MAE and RMSE.
# NOTE(review): a near-perfect R2 on held-out data often signals target
# leakage — check whether any feature is derived from Salary.
y_pred = pipeline.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

for label, value in (
    ("R2 Score:", r2_score(y_test, y_pred)),
    ("MAE:", mean_absolute_error(y_test, y_pred)),
    ("RMSE:", rmse),
):
    print(label, value)


R2 Score: 0.999867232668284
MAE: 32.414039679526205
RMSE: 303.3997473198642


In [17]:
# Smoke test: score one hand-built row whose Education value is meant to be
# absent from the training data, to check that prediction does not raise.
unseen_sample = pd.DataFrame({
    "Age": [35],
    "Bonus": [7000],
    "Months": [60],
    "Education": ["SomeNewEducationValue"],  # unseen value
})
pipeline.predict(unseen_sample)


array([139913.2501515])

In [18]:
# Persist the fitted pipeline, wrapped in a dict under the "model" key, to the
# current working directory.
artifact = {"model": pipeline}
joblib.dump(artifact, "salary_prediction_model.pkl")

['salary_prediction_model.pkl']

In [None]:
# (empty cell: stray keystrokes removed so Restart & Run All does not hit a SyntaxError)