# Salary Prediction with Linear Regression on Our Own Dataset


In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error

In [2]:
# Read the salary data from the CSV file that sits next to this notebook
dataset_path = "realistic_salary_dataset.csv"
df = pd.read_csv(dataset_path)

In [3]:
# Preview the first five rows of the raw data
df.head(n=5)

Unnamed: 0,City,Education,Industry,YearsExperience,Certification,Salary
0,San Francisco,High School,Healthcare,18,AWS,53753
1,Austin,Masters,Retail,29,,77380
2,Chicago,PhD,Technology,5,,65670
3,Austin,Bachelors,Finance,1,Scrum Master,33650
4,Austin,High School,Education,11,PMP,47668


In [4]:
# Salary is the prediction target; every other column is a candidate feature
y = df['Salary']
x = df.drop('Salary', axis=1)

In [5]:
# Encode categorical features using LabelEncoder.
# Each column gets its own fitted encoder, kept in `encoders`, so the integer
# codes can be mapped back to their labels later (encoders[col].classes_) or
# reapplied to new rows. A single shared encoder is refit on every column and
# therefore only remembers the mapping of the last one.
# NOTE(review): the codes follow sorted label order and the linear model below
# treats them as magnitudes; consider one-hot encoding for columns without a
# natural order — TODO confirm against the modelling goal.
categorical_cols = [col for col in x.columns if x[col].dtype == 'object']
encoders = {}
for col in categorical_cols:  # snapshot: new columns are added to x inside the loop
    le = LabelEncoder()  # fresh encoder per column
    x[col + '_Encoded'] = le.fit_transform(x[col])
    encoders[col] = le

In [6]:
# Keep only the non-object columns: the numeric features and the *_Encoded ones
x = x.select_dtypes(exclude=['object'])

In [8]:
# Display the feature matrix that will be passed to the model
x

Unnamed: 0,YearsExperience,City_Encoded,Education_Encoded,Industry_Encoded,Certification_Encoded
0,18,4,1,2,0
1,29,0,2,3,4
2,5,1,3,4,4
3,1,0,0,1,3
4,11,0,1,0,2
...,...,...,...,...,...
463,27,2,0,2,4
464,5,2,1,1,3
465,14,2,2,3,0
466,1,2,1,0,4


In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
splits = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = splits

In [10]:
# Fit a linear regression on the training split
model = LinearRegression()
model.fit(X=x_train, y=y_train)

In [11]:
# Predict salaries for the held-out rows
y_pred = model.predict(X=x_test)

In [12]:
# Score the predictions against the true held-out salaries
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

In [13]:
# Report both metrics rounded to two decimals, one per line
print(
    f"R² Score: {r2:.2f}",
    f"Mean Absolute Error: {mae:.2f}",
    sep="\n",
)

R² Score: 0.83
Mean Absolute Error: 5652.39


In [15]:
# Show the fitted parameters; coefficients are in the column order of x_train
print(
    f'Coefficients: {model.coef_}',
    f"Intercept: {model.intercept_}",
    sep="\n",
)

Coefficients: [1492.78932122  411.31127513 7077.01153006  742.06614526  -53.47219398]
Intercept: 27471.247862209282
