# Predicting Salary Using Multivariate Linear Regression

In [93]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
import math
from word2number import w2n

### Load the dataset

In [94]:
df = pd.read_csv("hiring.csv")

### Display the initial dataframe

In [95]:
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


### Renaming columns for easy data manipulation

In [96]:
df = df.rename(columns={
    'test_score(out of 10)': 'test_score', 
    'interview_score(out of 10)': 'interview_score', 
    'salary($)': 'salary'
})

# Data Preparation

## Handling Missing Values

### Calculate the median of test_score and fill missing values

In [97]:
median_test_score = math.floor(df.test_score.median())
print(f"Median Test Score: {median_test_score}")

Median Test Score: 8


# Fill missing test_score values with the calculated median

In [98]:
df.test_score = df.test_score.fillna(median_test_score)
df

Unnamed: 0,experience,test_score,interview_score,salary
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,8.0,7,72000
7,eleven,7.0,8,80000


### Fill missing experience values with zero since no data is available

### Function to convert experience strings to numbers

In [99]:
def convert_experience(val):
    try:
        # Try to convert spelled-out numbers to integers
        return w2n.word_to_num(val)
    except:
        # If the conversion fails, return NaN
        return np.nan

# Apply the conversion function to the 'experience' column

In [100]:
df['experience'] = df['experience'].apply(convert_experience)
# for i in range(len(df)):
#     if isinstance(df.loc[i, 'experience'], str):
#         df.loc[i, 'experience'] = convert_experience(df.loc[i, 'experience'])

### Display the DataFrame after handling missing values

In [109]:
# df.experience.fillna(0, inplace=True) or to avoid warnings
df['experience'] = df['experience'].fillna(0) 

### Display the dataframe after handling missing values

In [110]:
print("DataFrame after filling missing values:")
df

DataFrame after filling missing values:


Unnamed: 0,experience,test_score,interview_score,salary
0,0.0,8.0,9,50000
1,0.0,8.0,6,45000
2,5.0,6.0,7,60000
3,2.0,10.0,10,65000
4,7.0,9.0,6,70000
5,3.0,7.0,10,62000
6,10.0,8.0,7,72000
7,11.0,7.0,8,80000


# Train the Multivariate Linear Regression Model

### Feature Selection and fitting the model

In [111]:
reg = linear_model.LinearRegression()
X = df[["experience", "test_score", "interview_score"]]
y = df["salary"]
reg.fit(X, y)

### Display coefficients and intercept

In [112]:
print("Coefficients:", reg.coef_)
print("Intercept:", reg.intercept_)

Coefficients: [2812.95487627 1845.70596798 2205.24017467]
Intercept: 17737.263464337688


# Model Equation

### The model equation can be represented as:
### y = m1 * experience	 + m2 * test_score + m3 * interview_score + b
### where:
### m1, m2, m3 = coefficients
### b = intercept


# Predicting Salary

### Example Prediction
### Predict the salary of a person with experience = 5 years, test_score = 8, and interview_score = 9

In [125]:
predicted_salary = reg.predict([[5, 8, 9]])[0]  # Accessing the first element from the prediction
print(f"Predicted Salary for a person with 5 years of experience, a test score of 8, and an interview score of 9: ${predicted_salary:.2f}")

# doing this We're passing a list (not a DataFrame) to predict, which lacks the column names (experience, test_score, and interview_score). Hence, scikit-learn raises this warning.

Predicted Salary for a person with 5 years of experience, a test score of 8, and an interview score of 9: $66414.85




### Create a DataFrame for the input data and then predict using the DataFrame

In [127]:
input_data = pd.DataFrame([[5, 8, 9]], columns=["experience", "test_score", "interview_score"])

predicted_salary = reg.predict(input_data)[0]
print(f"Predicted Salary for a person with 5 years of experience, a test score of 8, and an interview score of 9: ${predicted_salary:.2f}")

Predicted Salary for a person with 5 years of experience, a test score of 8, and an interview score of 9: $66414.85


# Verifying the prediction using manual calculation

In [114]:
# Verifying the prediction using manual calculation
manual_prediction = (reg.coef_[0] * 5) + (reg.coef_[1] * 8) + (reg.coef_[2] * 9) + reg.intercept_
print(f"Manual Calculation of Predicted Salary: ${manual_prediction:.2f}")

Manual Calculation of Predicted Salary: $66414.85


### Creating a DataFrame to store predictions

In [115]:
predicted_outcomes = pd.DataFrame(columns=["experience", "test_score", "interview_score", "predicted_salary"])

### List of inputs for predictions

In [116]:
inputs = [
    [5, 8, 9],
    [3, 7, 6],
    [10, 9, 8],
    [0, 5, 5],
    [8, 6, 7]
]

### Loop to make predictions and store in the DataFrame

In [130]:
# Loop to make predictions and store in the DataFrame
for experience, test_score, interview_score in inputs:
    input_data = pd.DataFrame([[experience, test_score, interview_score]], columns=["experience", "test_score", "interview_score"])
    predicted_salary = reg.predict(input_data)[0]

    # Create a temporary DataFrame for the new prediction
    temp_df = pd.DataFrame({
        "experience": [experience],
        "test_score": [test_score],
        "interview_score": [interview_score],
        "predicted_salary": [predicted_salary]
    })
    
    # Use pd.concat to combine the new DataFrame with the existing one
    predicted_outcomes = pd.concat([predicted_outcomes, temp_df], ignore_index=True)

In [131]:
predicted_outcomes

Unnamed: 0,experience,test_score,interview_score,predicted_salary
0,5,8,9,66414.847162
1,3,7,6,52327.510917
2,10,9,8,80120.087336
3,0,5,5,37991.994178
4,8,6,7,66751.819505
5,5,8,9,66414.847162
6,3,7,6,52327.510917
7,10,9,8,80120.087336
8,0,5,5,37991.994178
9,8,6,7,66751.819505


In [122]:
predicted_outcomes.to_csv("predicted_outcomes.csv", index=False)

In [123]:
print("Predicted outcomes saved to 'predicted_outcomes.csv'.")

Predicted outcomes saved to 'predicted_outcomes.csv'.


# The model has been trained and is capable of predicting salaries based on experience, test scores, and interview scores.