In [3]:
import pandas as pd

# Load the dataset
file_path = r"D:\files\employee.csv"
employee_data = pd.read_csv(file_path)

# Display the first few rows of the dataset and its overall info
employee_data.head(), employee_data.info(), employee_data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   id                             4277 non-null   int64  
 1   timestamp                      4277 non-null   object 
 2   country                        4277 non-null   object 
 3   employment_status              4277 non-null   object 
 4   job_title                      4277 non-null   object 
 5   job_years                      4277 non-null   float64
 6   is_manager                     4277 non-null   object 
 7   hours_per_week                 4260 non-null   float64
 8   telecommute_days_per_week      4266 non-null   float64
 9   education                      4277 non-null   object 
 10  is_education_computer_related  4261 non-null   object 
 11  certifications                 4277 non-null   object 
 12  salary                         4277 non-null   f

(   id            timestamp        country employment_status  job_title  \
 0   1  12/11/2018 10:52:26       Slovenia         Full time  Developer   
 1   2    1/5/2017 16:57:50  United States         Full time        DBA   
 2   3   12/18/2017 8:13:15         Sweden         Full time        DBA   
 3   4   12/27/2018 4:56:52  United States         Full time        DBA   
 4   5  12/11/2018 14:07:58  United States         Full time  Developer   
 
    job_years is_manager  hours_per_week  telecommute_days_per_week  \
 0    4.78393        Yes            40.0                        0.0   
 1    5.00000         No            40.0                        5.0   
 2    1.00000         No            40.0                        0.0   
 3    1.00000         No            40.0                        2.0   
 4    3.00000         No            40.0                        2.0   
 
              education is_education_computer_related certifications  \
 0  Bachelors (4 years)                         

In [5]:
from sklearn.model_selection import train_test_split

# Splitting the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((3421, 101), (856, 101))

In [6]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Re-implementing imputation and one-hot encoding with correct imports
imputer = SimpleImputer(strategy='mean')
employee_data['hours_per_week'] = imputer.fit_transform(employee_data[['hours_per_week']])
employee_data['telecommute_days_per_week'] = imputer.fit_transform(employee_data[['telecommute_days_per_week']])

# Assuming 'is_education_computer_related' can be filled with the mode
mode_value = employee_data['is_education_computer_related'].mode()[0]
employee_data['is_education_computer_related'].fillna(mode_value, inplace=True)

# Define columns to be one-hot encoded
categorical_cols = ['country', 'employment_status', 'job_title', 'is_manager', 'education', 
                    'is_education_computer_related', 'certifications']

# Applying OneHotEncoder
encoder = OneHotEncoder(drop='first', sparse=False)  # Dropping first to avoid dummy variable trap
ct = ColumnTransformer(transformers=[('encoder', encoder, categorical_cols)], remainder='passthrough')

# Preparing the features and target variable
X = employee_data.drop(['id', 'timestamp', 'salary'], axis=1)
y = employee_data['salary']

# Fit and transform the feature data
X_encoded = ct.fit_transform(X)

# Splitting the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape




((3421, 101), (856, 101))

In [7]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Linear Regression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
linear_predictions = linear_model.predict(X_test)
linear_mae = mean_absolute_error(y_test, linear_predictions)
linear_mse = mean_squared_error(y_test, linear_predictions)

# Ridge Regression
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)
ridge_predictions = ridge_model.predict(X_test)
ridge_mae = mean_absolute_error(y_test, ridge_predictions)
ridge_mse = mean_squared_error(y_test, ridge_predictions)

# Lasso Regression
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train, y_train)
lasso_predictions = lasso_model.predict(X_test)
lasso_mae = mean_absolute_error(y_test, lasso_predictions)
lasso_mse = mean_squared_error(y_test, lasso_predictions)

linear_mae, linear_mse, ridge_mae, ridge_mse, lasso_mae, lasso_mse


(6108345683095.598,
 2.2814640509345217e+28,
 867.2317673689096,
 1341579.4476108407,
 868.524102815066,
 1344177.1025922527)