In [1]:
pip install scikit-surprise

Collecting scikit-surprise
  Using cached scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [5]:
df = pd.read_csv('linkdin_job_posting.csv')

# Display the first few rows to understand the structu
df.head(2)

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,min_salary,formatted_work_type,applies,original_listed_time,remote_allowed,job_posting_url,application_url,application_type,expiry,closed_time,formatted_experience_level,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,20.0,HOURLY,"Princeton, NJ",2774458.0,20.0,,17.0,Full-time,2.0,1713400000000.0,,https://www.linkedin.com/jobs/view/921716/?trk...,,ComplexOnsiteApply,1715990000000.0,,,Requirements: \n\nWe are seeking a College or ...,1713400000000.0,,0,FULL_TIME,USD,BASE_SALARY,38480.0,8540.0,34021.0
1,1829192,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...",50.0,HOURLY,"Fort Collins, CO",,1.0,,30.0,Full-time,,1712860000000.0,,https://www.linkedin.com/jobs/view/1829192/?tr...,,ComplexOnsiteApply,1715450000000.0,,,,1712860000000.0,,0,FULL_TIME,USD,BASE_SALARY,83200.0,80521.0,8069.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123849 entries, 0 to 123848
Data columns (total 31 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   job_id                      123849 non-null  int64  
 1   company_name                122130 non-null  object 
 2   title                       123849 non-null  object 
 3   description                 123842 non-null  object 
 4   max_salary                  29793 non-null   float64
 5   pay_period                  36073 non-null   object 
 6   location                    123849 non-null  object 
 7   company_id                  122132 non-null  float64
 8   views                       122160 non-null  float64
 9   med_salary                  6280 non-null    float64
 10  min_salary                  29793 non-null   float64
 11  formatted_work_type         123849 non-null  object 
 12  applies                     23320 non-null   float64
 13  original_liste

In [6]:
# Selecting relevant columns for the recommendation task
relevant_columns = ['job_id', 'views', 'applies']

# Dropping rows with missing values in the relevant columns
cleaned_data = df[relevant_columns].dropna()

# Display the cleaned dataset to confirm
cleaned_data.head()

Unnamed: 0,job_id,views,applies
0,921716,20.0,2.0
5,91700727,9.0,4.0
6,103254301,7.0,1.0
10,9615617,4.0,1.0
18,111513530,10.0,1.0


In [7]:
cleaned_data.shape

(23319, 3)

In [8]:
# For simplicity, we'll simulate 1000 unique users
np.random.seed(42)  # For reproducibility
num_users = 1000
user_ids = np.random.randint(1, num_users + 1, size=len(cleaned_data))

In [9]:
# Add the simulated User_ID to the dataset
cleaned_data['User_ID'] = user_ids

In [10]:
#Fill missing values in 'applies' or 'views' with 0 (representing no interaction)
cleaned_data['applies'].fillna(0, inplace=True)
cleaned_data['views'].fillna(0, inplace=True)

In [11]:
#Decide which interaction to use (we'll use 'applies' for this example)
cleaned_data['interaction'] = cleaned_data['applies']

In [12]:
# Drop rows where interaction data is missing or 0 (optional step based on model needs)
cleaned_data = cleaned_data[cleaned_data['interaction'] > 0]
cleaned_data.head()

Unnamed: 0,job_id,views,applies,User_ID,interaction
0,921716,20.0,2.0,103,2.0
5,91700727,9.0,4.0,436,4.0
6,103254301,7.0,1.0,861,1.0
10,9615617,4.0,1.0,271,1.0
18,111513530,10.0,1.0,107,1.0


In [13]:
# Final dataset ready for SVD training
cleaned_data = cleaned_data[['User_ID', 'job_id', 'interaction']]

cleaned_data.head()

Unnamed: 0,User_ID,job_id,interaction
0,103,921716,2.0
5,436,91700727,4.0
6,861,103254301,1.0
10,271,9615617,1.0
18,107,111513530,1.0


In [14]:
# Save the cleaned dataset (optional, if you want to save the preprocessed data)
cleaned_data.to_csv('cleaned_job_posting_for_svd.csv', index=False)

**Model Training**

In [16]:
# Check the range of 'applies' to determine the correct rating scale
print(cleaned_data['interaction'].min(), cleaned_data['interaction'].max())


1.0 967.0


In [17]:
# Step 1: Prepare the data for SVD in Surprise format
reader = Reader(rating_scale=(1, 967))  # Assuming the rating scale ranges from 1 to 10
data = Dataset.load_from_df(cleaned_data[['User_ID', 'job_id', 'interaction']], reader)

In [18]:
# Step 2: Split the dataset into training and testing sets
trainset, testset = train_test_split(data, test_size=0.2)

In [19]:
# Step 3: Train the SVD model
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fc42fbca110>

In [20]:
# Step 4: Evaluate the model on the test set using RMSE (optional)
predictions = model.test(testset)
accuracy.rmse(predictions)

RMSE: 117.3682


117.36822323038629

In [21]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# Extract the true ratings and predicted ratings
true_ratings = [pred.r_ui for pred in predictions]  # Actual ratings
predicted_ratings = [pred.est for pred in predictions]  # Predicted ratings

# Mean Absolute Error (MAE)
mae = mean_absolute_error(true_ratings, predicted_ratings)

# Mean Squared Error (MSE)
mse = mean_squared_error(true_ratings, predicted_ratings)

# R-squared (R²)
r2 = r2_score(true_ratings, predicted_ratings)

# RMSE
rmse = accuracy.rmse(predictions)

# Print the evaluation metrics
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R²: {r2}")

RMSE: 117.3682
RMSE: 117.36822323038629
MAE: 25.05255164164069
MSE: 13775.299824257789
R²: -19.147708113014954


I will run hyperparameter optimization

In [22]:
pip install optuna

Collecting optuna
  Using cached optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Using cached alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Using cached colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Using cached Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Using cached optuna-4.0.0-py3-none-any.whl (362 kB)
Using cached alembic-1.13.3-py3-none-any.whl (233 kB)
Using cached colorlog-6.8.2-py3-none-any.whl (11 kB)
Using cached Mako-1.3.5-py3-none-any.whl (78 kB)
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.5 alembic-1.13.3 colorlog-6.8.2 optuna-4.0.0
Note: you may need to restart the kernel to use updated packages.


In [23]:
import optuna
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate
from sklearn.metrics import mean_squared_error
import pandas as pd

# Prepare the data for Surprise
reader = Reader(rating_scale=(1, 967))  # Assuming the rating scale ranges from 1 to 10
data = Dataset.load_from_df(cleaned_data[['User_ID', 'job_id', 'interaction']], reader)

# Split the data into trainset and testset (this will be done within the cross-validation)
trainset = data.build_full_trainset()

# Objective function for Optuna
def objective(trial):
    # Define hyperparameters to be tuned
    param = {
        'n_factors': trial.suggest_int('n_factors', 10, 200),  # Number of latent factors
        'n_epochs': trial.suggest_int('n_epochs', 5, 50),  # Number of epochs
        'lr_all': trial.suggest_float('lr_all', 0.001, 0.05),  # Learning rate for SGD
        'reg_all': trial.suggest_float('reg_all', 0.001, 0.1)  # Regularization term
    }

    # Create the SVD model with the suggested hyperparameters
    svd_model = SVD(**param)

    # Perform cross-validation and return the RMSE score
    results = cross_validate(svd_model, data, measures=['rmse'], cv=5, verbose=False)
    
    # Return the average RMSE score (minimizing this)
    return np.mean(results['test_rmse'])

# Enable the default logger of Optuna
optuna.logging.enable_default_handler()

# Set the logging level (for more detailed information)
optuna.logging.set_verbosity(optuna.logging.INFO)

# Create an Optuna study for minimizing the RMSE (Root Mean Squared Error)
study = optuna.create_study(direction='minimize')

# Perform optimization
study.optimize(objective, n_trials=100)

# Best hyperparameters
print('Best trial:', study.best_trial.params)

# Best RMSE value
print('Best RMSE:', study.best_value)



[I 2024-10-20 07:17:02,028] A new study created in memory with name: no-name-33f208b4-22a7-4ae6-9896-88ad5ef3810f
[I 2024-10-20 07:17:06,076] Trial 0 finished with value: 312.0858908433214 and parameters: {'n_factors': 142, 'n_epochs': 28, 'lr_all': 0.01122651588698382, 'reg_all': 0.01953423416808653}. Best is trial 0 with value: 312.0858908433214.
[I 2024-10-20 07:17:11,046] Trial 1 finished with value: 228.46501163405637 and parameters: {'n_factors': 156, 'n_epochs': 35, 'lr_all': 0.008371834070993178, 'reg_all': 0.020798468192352763}. Best is trial 1 with value: 228.46501163405637.
[I 2024-10-20 07:17:14,316] Trial 2 finished with value: 214.5314116446863 and parameters: {'n_factors': 187, 'n_epochs': 17, 'lr_all': 0.007880954295257325, 'reg_all': 0.031148124683189823}. Best is trial 2 with value: 214.5314116446863.
[I 2024-10-20 07:17:16,690] Trial 3 finished with value: 486.2443812041637 and parameters: {'n_factors': 73, 'n_epochs': 21, 'lr_all': 0.0228702126126431, 'reg_all': 0.0

Best trial: {'n_factors': 113, 'n_epochs': 7, 'lr_all': 0.0010498508786987025, 'reg_all': 0.06194429830638341}
Best RMSE: 28.83923168750865


In [24]:
best_params = study.best_trial.params
print("Best hyperparameters: ", best_params)

Best hyperparameters:  {'n_factors': 113, 'n_epochs': 7, 'lr_all': 0.0010498508786987025, 'reg_all': 0.06194429830638341}


In [25]:
# retrain the model with the tuned hyperparameters

# Step 3: Train the SVD model
model_tune = SVD(**best_params)
model_tune.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fc42446e800>

In [26]:
# Step 4: Evaluate the model on the test set using RMSE (optional)
predictions = model_tune.test(testset)
accuracy.rmse(predictions)

RMSE: 23.1720


23.1720372513467

In [27]:
# Extract the true ratings and predicted ratings
true_ratings = [pred.r_ui for pred in predictions]  # Actual ratings
predicted_ratings = [pred.est for pred in predictions]  # Predicted ratings

# Mean Absolute Error (MAE)
mae = mean_absolute_error(true_ratings, predicted_ratings)

# Mean Squared Error (MSE)
mse = mean_squared_error(true_ratings, predicted_ratings)

# R-squared (R²)
r2 = r2_score(true_ratings, predicted_ratings)

# RMSE
rmse = accuracy.rmse(predictions)

# Print the evaluation metrics
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R²: {r2}")

RMSE: 23.1720
RMSE: 23.1720372513467
MAE: 10.994374400387773
MSE: 536.9433103777992
R²: 0.21466848426213658


The hyperparameter tuning improved the model

In [28]:
# Step 5: Save the trained model using joblib for later use (e.g., deployment)

import joblib
model_filename = 'svd_model_job.joblib'
joblib.dump(model_tune, model_filename)
print(f"Model saved as {model_filename}")

Model saved as svd_model_job.joblib
