#### Predicts Salary Of Employee Model

In [1]:
import pandas as pd

In [3]:
path = r"https://raw.githubusercontent.com/sindhura-nk/Datasets/refs/heads/main/Salary_dataset.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0.1,Unnamed: 0,YearsExperience,Salary
0,0,1.2,39344.0
1,1,1.4,46206.0
2,2,1.6,37732.0
3,3,2.1,43526.0
4,4,2.3,39892.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       30 non-null     int64  
 1   YearsExperience  30 non-null     float64
 2   Salary           30 non-null     float64
dtypes: float64(2), int64(1)
memory usage: 852.0 bytes


## Data Cleaning

In [5]:
df.isna().sum()

Unnamed: 0         0
YearsExperience    0
Salary             0
dtype: int64

In [6]:
df.duplicated().sum()

np.int64(0)

## Separate X and Y Features
    X: Independent Feature => YearsExperience
    Y: Dependent Feature => Salary

In [7]:
X = df[['YearsExperience']]
Y =  df[['Salary']]

In [8]:
X.head()

Unnamed: 0,YearsExperience
0,1.2
1,1.4
2,1.6
3,2.1
4,2.3


In [9]:
Y.head()

Unnamed: 0,Salary
0,39344.0
1,46206.0
2,37732.0
3,43526.0
4,39892.0


### There are no missing values nor any duplicated rows in dataset

In [10]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

In [11]:
pipe = make_pipeline(SimpleImputer(strategy='median')).set_output(transform='pandas')

In [12]:
pipe

In [13]:
X_pre = pipe.fit_transform(X)
X_pre.head()

Unnamed: 0,YearsExperience
0,1.2
1,1.4
2,1.6
3,2.1
4,2.3


## Model Building : Linear Regreesion
    Equation of Line: y = B0 + B1.X
    B0 => y intercept
    B1 => Slope of the line

    ProfitPredicted = B0 + B1*RND

In [14]:
from sklearn.linear_model import LinearRegression

In [15]:
model = LinearRegression()
model.fit(X,Y)

In [16]:
model.intercept_

array([24848.20396652])

In [17]:
model.coef_

array([[9449.96232146]])

In [None]:
SalaryPredicted = 24848.20 + 9449.96*YearsExperience

In [18]:
Y.head()

Unnamed: 0,Salary
0,39344.0
1,46206.0
2,37732.0
3,43526.0
4,39892.0


In [19]:
Ypreds = model.predict(X)
Ypreds[0:5]

array([[36188.15875227],
       [38078.15121656],
       [39968.14368085],
       [44693.12484158],
       [46583.11730587]])

## Evaluation Metrics
    Mean Squared Error
    Mean Absolute Error
    RMSE
    R2 squared- R2 score

In [20]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

In [22]:
# Mean Squared Error
MSE = mean_squared_error(Y,Ypreds)
# RMSE  - Root Mean Squared Error
RMSE = MSE**(1/2)
# MAE
MAE = mean_absolute_error(Y,Ypreds)
# R2 Score
r2 = r2_score(Y,Ypreds)

print(f"Mean_Squared_Error: {MSE:.2f}")
print(f"Root_Mean_Squared_Error: {RMSE:.2f}")
print(f"Mean_Absolute_Error: {MAE:.2f}")
print(f"R2 Squared: {r2*100:.2f}%")

Mean_Squared_Error: 31270951.72
Root_Mean_Squared_Error: 5592.04
Mean_Absolute_Error: 4644.20
R2 Squared: 95.70%


## R2 Squared values is grater than 80%, we can consider this model for final model building

In [23]:
Y.head()

Unnamed: 0,Salary
0,39344.0
1,46206.0
2,37732.0
3,43526.0
4,39892.0


In [28]:
salary_sample = [50210,56000,42830,47000,41780]

In [29]:
preds = []
for x in salary_sample:
    pred = model.predict([[x]]).round(2)
    preds.append(pred)



In [30]:
salary_sample

[50210, 56000, 42830, 47000, 41780]

In [31]:
print(preds)

[array([[4.74507456e+08]]), array([[5.29222738e+08]]), array([[4.04766734e+08]]), array([[4.44173077e+08]]), array([[3.94844274e+08]])]
