# Let's Make a Machine Learning Model

In [1]:
# Step 1: Import important libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Step 2: Load the dataset
data = pd.read_csv('MFGEmployees.csv')
print("Data Loaded Successfully!")
data  # Show first 5 rows

Data Loaded Successfully!


Unnamed: 0,EmployeeNumber,Surname,GivenName,Gender,City,JobTitle,DepartmentName,StoreLocation,Division,Age,LengthService,AbsentHours,BusinessUnit
0,1,Gutierrez,Molly,F,Burnaby,Baker,Bakery,Burnaby,Stores,32.028816,6.018478,36.577306,Stores
1,2,Hardwick,Stephen,M,Courtenay,Baker,Bakery,Nanaimo,Stores,40.320902,5.532445,30.165072,Stores
2,3,Delgado,Chester,M,Richmond,Baker,Bakery,Richmond,Stores,48.822047,4.389973,83.807798,Stores
3,4,Simon,Irene,F,Victoria,Baker,Bakery,Victoria,Stores,44.599357,3.081736,70.020165,Stores
4,5,Delvalle,Edward,M,New Westminster,Baker,Bakery,New Westminster,Stores,35.697876,3.619091,0.000000,Stores
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8335,8336,Salter,Charles,M,Vancouver,Dairy Person,Dairy,Vancouver,Stores,46.192782,5.174722,112.023389,Stores
8336,91,Velez,Colleen,F,Good Hope Lake,Baker,Bakery,Terrace,Stores,37.742424,3.569518,49.846412,Stores
8337,8324,Vesely,Shaun,M,Nelson,Cashier,Customer Service,Nelson,Stores,13.190294,7.667758,0.000000,Stores
8338,8315,Floyd,Aimee,F,Burnaby,Dairy Person,Dairy,Burnaby,Stores,37.525723,2.111874,52.114955,Stores


In [3]:
data.shape

(8340, 13)

In [4]:
data.dtypes

EmployeeNumber      int64
Surname            object
GivenName          object
Gender             object
City               object
JobTitle           object
DepartmentName     object
StoreLocation      object
Division           object
Age               float64
LengthService     float64
AbsentHours       float64
BusinessUnit       object
dtype: object

In [5]:
# Step 3: Clean the data
# Remove rows with missing 'AbsentHours'
data = data.dropna(subset=['AbsentHours'])
# Remove duplicate rows
data = data.drop_duplicates()

print(f"Data shape after cleaning: {data.shape}")

Data shape after cleaning: (8335, 13)


In [6]:
# Step 4: Select features and target
# X -> input features, y -> target variable
X = data.drop(['EmployeeNumber', 'Surname', 'GivenName', 'AbsentHours'], axis=1)
y = data['AbsentHours']

In [7]:
# Step 5: Convert text (categorical) data into numbers
# Label Encoding for all object (string) columns
label_encoders = {}
for col in X.select_dtypes(include='object').columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le  # Save encoder for future use
print("Label Encoding done!")


Label Encoding done!


### 📚 What is Label Encoding?
#### Label Encoding converts text (categories) into numbers.

#### Example: 'Red' → 0, 'Green' → 1, 'Blue' → 2.

### Feature Engineering

In [8]:
from sklearn.preprocessing import LabelEncoder

colors = ['Red', 'Green', 'Blue', 'Green', 'Red', 'Blue']

# 1. Create the encoder
encoder = LabelEncoder()

# 2. Fit and transform the data
encoded_colors = encoder.fit_transform(colors)

# 3. Print the result
print(encoded_colors)


[2 1 0 1 2 0]


In [9]:
X.head()

Unnamed: 0,Gender,City,JobTitle,DepartmentName,StoreLocation,Division,Age,LengthService,BusinessUnit
0,0,28,4,4,4,5,32.028816,6.018478,1
1,1,51,4,4,17,5,40.320902,5.532445,1
2,1,179,4,4,28,5,48.822047,4.389973,1
3,0,226,4,4,36,5,44.599357,3.081736,1
4,1,143,4,4,20,5,35.697876,3.619091,1


<img src="train_test_split.PNG" style="width:850px;height:350px">

In [10]:
# Step 6: Split data into training and testing sets
# 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training Set Size: {X_train.shape}")
print(f"Testing Set Size: {X_test.shape}")

Training Set Size: (6668, 9)
Testing Set Size: (1667, 9)


In [11]:
# Step 7: Build the Machine Learning model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# print("Model Training Complete!")

In [12]:
# Step 8: Make predictions
y_pred = model.predict(X_test)

#### The R² score (pronounced "R-squared") is a key metric used to evaluate the performance of a regression model in machine learning.

#### MSE measures the average of the squares of the errors between the actual and predicted values.

In [13]:
# Step 9: Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R2 Score: {r2:.2f}")


Mean Squared Error: 642.89
R2 Score: 0.72


In [14]:
import joblib

# Save model to file
joblib.dump(model, 'hranalyticsmodel.pkl')

['hranalyticsmodel.pkl']

# What's Next? 🤔
## Connecting to Flask Server🤩
# But why?🤔
## For Developing stunning Machine Learning web app 🥳