## Regression

Model purpose: given a job description, predict a score for the posting. In this notebook the score is the posting's number of views (the `views` column).

### Download and loading of dataset

In [2]:
import kagglehub
import os
import pandas as pd
# Location of the dataset files.
# Prefer the already-downloaded local copy (version 13 of the dataset). When that
# directory does not exist (e.g. on another machine), download the dataset with
# kagglehub instead of failing on a machine-specific absolute path.
path = "/home/leon/.cache/kagglehub/datasets/arshkon/linkedin-job-postings/versions/13"
if not os.path.isdir(path):
    # NOTE: this fetches the latest published version, which may differ from version 13.
    path = kagglehub.dataset_download("arshkon/linkedin-job-postings")

print(f"Path to dataset files: {path}")
print(f"List of files in the dataset: {os.listdir(path)}")





**Load the postings table (rows with NaN values are dropped in the cells that follow)**

In [3]:
# Build the CSV location with os.path.join rather than manual string concatenation,
# so the result is valid whether or not `path` ends with a separator.
postings_path = os.path.join(path, "postings.csv")
postings_df = pd.read_csv(postings_path)

In [4]:
import matplotlib.pyplot as plt
import numpy as np

# Salary analysis: keep only the postings that carry a normalized salary.
# dropna returns a new frame, so postings_df itself is left untouched.
max_sal_df = postings_df.dropna(subset=["normalized_salary"])
print(f"Nb of single posting normalized salary: {max_sal_df['normalized_salary'].unique()}")

# Count how many postings share each distinct normalized salary value.
unique, counts = np.unique(max_sal_df["normalized_salary"], return_counts=True)
print(f"Unique normalized salaries: {len(unique)}")
print(f"Nb rows: {len(max_sal_df)}")

# Stem plot of the salary distribution (log-scaled counts, x axis limited to 1M).
plt.stem(unique, counts)
plt.xlim(0, 1000000)
plt.xlabel("Normalized Salary")
plt.yscale("log")
plt.ylabel("Nb of Postings")
# The plotted quantity is the normalized salary, not the views.
plt.title("Histogram of Normalized Salary")
plt.show()

# Show job posting with max salary (last expression of the cell, so it is displayed)
postings_df.sort_values(by='normalized_salary', ascending=False).head(1)








In [4]:
# Work on an independent copy restricted to the columns of interest, keeping only
# the postings that have both a description and a view count.
useful_cols = ["job_id", "company_name", "title", "description", "views", "skills_desc"]
views_df = (
    postings_df[useful_cols]
    .dropna(subset=["description", "views"])
    .copy()
)

**Clean the descriptions: lowercase the text and strip URLs, e-mail addresses, long digit sequences and unwanted characters such as emojis**

In [5]:
import re

# Basic cleaning
def clean_text(text):
    """Return a normalized version of a raw description string.

    The text is lowercased, then the following are removed, in this order:
    tokens starting with ``http``/``www`` (URLs), tokens containing ``@``
    (e-mail addresses), runs of 10 or more digits, and every character that
    is not an ASCII letter, a digit, whitespace or one of ``. , ! ?``.
    Finally, runs of whitespace are collapsed to a single space and the
    result is stripped.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # (pattern, replacement) pairs, applied sequentially; the order matters because
    # URLs and e-mail addresses must be removed before their punctuation is stripped.
    substitutions = (
        (r"http\S+|www\S+|https\S+", ''),
        (r'\S+@\S+', ''),
        (r'\d{10,}', ''),
        (r'[^a-zA-Z0-9\s.,!?]', ''),
        (r'\s+', ' '),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Clean every description and store the result back into the column
# (apply returns a new Series; it does not modify the frame by itself).
views_df['description'] = views_df['description'].apply(clean_text)

### Data analysis (views column)

In [11]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import PowerTransformer


# Quick overview of the dataset
print(f"Number of rows: {views_df.shape[0]}")
print(f"Number of columns: {views_df.shape[1]}")

# Basic statistics on the views column
print(views_df["views"].describe())
# Single quotes are used inside the f-string: reusing the enclosing double quotes
# is a SyntaxError on Python versions older than 3.12.
print(f"Number of unique values in the 'views' column: {views_df['views'].nunique()}")
for view_count in (1, 2, 100):
    print(f"Number of rows with {view_count} views:", len(views_df[views_df["views"] == view_count]))

# Distribution of the views after a log1p transformation
test_views = np.log1p(views_df["views"])
unique, counts = np.unique(test_views, return_counts=True)

plt.stem(unique, counts)
plt.xlim(0, 10)
plt.xlabel("Views")
plt.ylabel("Frequency")
plt.title("Stem plot for Views (log1p transformation)")
plt.show()

# Distribution of the views after a Box-Cox transformation.
# Box-Cox is only defined for strictly positive values; PowerTransformer raises a
# ValueError if the column contains zeros or negative numbers.
pt = PowerTransformer(method='box-cox')
data_bc = pt.fit_transform(np.array(views_df["views"]).reshape(-1, 1))

unique, counts = np.unique(data_bc, return_counts=True)
plt.stem(unique, counts)
plt.xlabel("Views")
plt.ylabel("Frequency")
plt.title("Stem plot of Views (Box-Cox transformation)")
plt.show()







### Transform data

In [6]:

# Store the view counts as pandas' nullable integer dtype; errors="raise" makes an
# impossible conversion fail loudly instead of being ignored.
# NOTE(review): rows with NaN views were already dropped when views_df was built, so
# a plain NumPy "int64" may be sufficient here — confirm before changing.
views_df = views_df.astype({"views": "Int64"}, errors="raise")

### Test Models

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, PoissonRegressor, GammaRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, PowerTransformer

# Baseline: TF-IDF features + linear regression on a Box-Cox transformed target.
X_train, X_test, y_train, y_test = train_test_split(views_df["description"], views_df["views"], test_size=0.2, random_state=42)

# Fit the Box-Cox transform on the training target only, then reuse it for the test
# target. The explicit float conversion yields a numeric array whether the column is
# a NumPy float/int dtype or pandas' nullable Int64 (without missing values).
pt = PowerTransformer(method='box-cox')

y_train_bc = pt.fit_transform(y_train.to_numpy(dtype="float64").reshape(-1, 1)).flatten()
y_test_bc = pt.transform(y_test.to_numpy(dtype="float64").reshape(-1, 1)).flatten()

# Vocabulary and IDF weights are learned on the training descriptions only.
tfidf = TfidfVectorizer(max_features=2000, min_df=2, max_df=0.90, ngram_range=(1, 3), stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

linear_reg_model = LinearRegression()
linear_reg_model.fit(X_train_tfidf, y_train_bc)

# Predict in the transformed space, then map the predictions back to view counts.
# NOTE(review): the inverse Box-Cox can yield NaN for predictions outside the
# transform's valid range, which would make the metric below fail — confirm on real data.
y_pred_bc = linear_reg_model.predict(X_test_tfidf)
y_pred_original = pt.inverse_transform(y_pred_bc.reshape(-1, 1)).flatten()

# mean_squared_error returns the MSE (not its square root), so name it accordingly
# and report the RMSE separately.
mse = mean_squared_error(y_test, y_pred_original)
print(f"MSE: {mse:.2f}")
print(f"RMSE: {np.sqrt(mse):.2f}")

# Optional: Check first few predictions vs actual
results = pd.DataFrame({
    "Actual": y_test,
    "Predicted": y_pred_original
})



In [None]:
# Test with 

In [None]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor, LinearRegression, Ridge
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import PowerTransformer

from sklearn.preprocessing import FunctionTransformer


def _to_dense(features):
    """Convert a sparse feature matrix to a dense NumPy array.

    HistGradientBoostingRegressor requires dense input, whereas TfidfVectorizer
    produces a SciPy sparse matrix.
    """
    return features.toarray()


# Cap the target at its 95th percentile to limit the influence of extreme view counts.
# NOTE(review): the cap is computed on the whole column and is also applied to the
# test targets, so the metrics below are measured against capped values.
views_cap = views_df["views"].quantile(0.95)
filtered_views = np.minimum(views_df["views"].to_numpy(dtype="float64"), views_cap)

# Split data (X = text, y = capped views)
X_train, X_test, y_train, y_test = train_test_split(
    views_df["description"].to_numpy(),
    filtered_views,
    test_size=0.2,
    random_state=42
)

print("dtype X_train:", type(X_train))
print("dtype views:", type(filtered_views))

# Text preprocessing: raw description -> sparse TF-IDF features
text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=5000,
        stop_words='english',
        ngram_range=(1, 3),
        min_df=3,
        max_df=0.8
    )),
])

# Regressor; random_state makes the run reproducible.
model_pipeline = Pipeline([
    ('regressor', HistGradientBoostingRegressor(
        max_iter=200,
        learning_rate=0.05,
        max_depth=10,
        random_state=42
    ))
])

# Full pipeline: raw text -> TF-IDF -> dense array -> regressor.
# It is fitted on the raw descriptions: feeding it already-vectorized arrays would
# make the TF-IDF step fail, since it expects strings.
full_pipeline = Pipeline([
    ('text_preprocessing', text_pipeline),
    ('to_dense', FunctionTransformer(_to_dense)),
    ('regressor', model_pipeline)
])

# Train
full_pipeline.fit(X_train, y_train)

# Predict (no target transformation is involved: predictions are on the capped views scale)
y_pred = full_pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {np.sqrt(mse):.2f}")

# Results
results = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})
print(results.head(10))




In [8]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor, LinearRegression, Ridge
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import PowerTransformer

# Same experiment with the target capped at its 99th percentile.
# The quantile is computed once instead of three times.
views_cap = views_df["views"].quantile(0.99)
test = np.where(views_df["views"] > views_cap, views_cap, views_df["views"])
print(test)

# Split data (X = text, y = capped views)
X_train, X_test, y_train, y_test = train_test_split(
    views_df["description"],
    test,
    test_size=0.2,
    random_state=42
)

print("dtype X_train:", type(X_train.values))
print("dtype views:", type(views_df["views"]))

# Reuse the TF-IDF and regressor steps of full_pipeline (defined in an earlier cell),
# but densify the features explicitly between the two: TfidfVectorizer outputs a
# sparse matrix and HistGradientBoostingRegressor requires dense input.
text_step = full_pipeline.named_steps['text_preprocessing']
regressor_step = full_pipeline.named_steps['regressor']

X_train_dense = text_step.fit_transform(X_train).toarray()
X_test_dense = text_step.transform(X_test).toarray()

# Train
regressor_step.fit(X_train_dense, y_train)

# Predict (no target transformation is involved: predictions are on the capped views scale)
y_pred = regressor_step.predict(X_test_dense)

mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {np.sqrt(mse):.2f}")

# Results
results = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})
print(results.head(10))






In [25]:
# Side-by-side table of the true and predicted targets; print the first 50 rows.
comparison = {"Actual": y_test, "Predicted": y_pred}
results = pd.DataFrame(comparison)
print(results.head(50))

# Idea (not implemented): overlay two stem plots, actual vs. predicted views,
# with a log-scaled y axis, to visualise the prediction error.




In [11]:
# Round the linear-regression predictions (already mapped back to the original
# scale by the Box-Cox inverse transform) to whole view counts.
# NOTE(review): y_test is whatever the most recently executed split produced; in
# notebook order that is the capped target of the gradient-boosting cells, not the
# raw views the linear model was evaluated on — confirm which one is intended.
rounded_predictions = np.round(y_pred_original)
y_pred_int = rounded_predictions.astype(int)

# Optional: compare the first predictions with the actual values
results = pd.DataFrame({"Actual": y_test, "Predicted": y_pred_int})

print(results.head(50))



In [None]:
# Other models tests

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, PoissonRegressor, GammaRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Gradient boosting with a Poisson loss, trained directly on the raw view counts.
X_train, X_test, y_train, y_test = train_test_split(views_df["description"], views_df["views"], test_size=0.2, random_state=42)

# TF-IDF features. HistGradientBoostingRegressor requires dense input, so the sparse
# matrices returned by the vectorizer are converted with .toarray().
tfidf = TfidfVectorizer(max_features=5000, min_df=2, max_df=0.90, ngram_range=(1, 3), stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test).toarray()

model = HistGradientBoostingRegressor(
    loss="poisson",  # Poisson loss for counts
    max_iter=200,
    learning_rate=0.1,
    random_state=42  # reproducible runs
)
model.fit(X_train_tfidf, y_train)

# Views are counts, so round the predictions to integers before scoring.
y_pred = model.predict(X_test_tfidf)
y_pred_int = np.round(y_pred).astype(int)

# mean_squared_error returns the MSE (not its square root), so name it accordingly
# and report the RMSE separately.
mse = mean_squared_error(y_test, y_pred_int)
print(f"MSE: {mse:.2f}")
print(f"RMSE: {np.sqrt(mse):.2f}")

# Optional: Check first few predictions vs actual
results = pd.DataFrame({
    "Actual": y_test,
    "Predicted": y_pred_int
})