# Baseline by predicting the average

### Getting the data

#### Train/test data

In [1]:
!mkdir data
!mkdir data/history
!wget --no-check-certificate 'https://drive.usercontent.google.com/download?id=1iqg1FIPfbrZWlung6gZqve1MeQWc0Je4&export=download&authuser=1&confirm=t' -O './data/dataset.csv'

--2024-12-17 22:58:55--  https://drive.usercontent.google.com/download?id=1iqg1FIPfbrZWlung6gZqve1MeQWc0Je4&export=download&authuser=1&confirm=t
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 74.125.197.132, 2607:f8b0:400e:c03::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|74.125.197.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 218678075 (209M) [application/octet-stream]
Saving to: './data/dataset.csv'


2024-12-17 22:58:58 (196 MB/s) - './data/dataset.csv' saved [218678075/218678075]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22224 entries, 0 to 22223
Data columns (total 13 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   title                               22224 non-null  object 
 1   location                            22224 non-null  object 
 2   company                            

In [2]:
import pandas as pd

# Load the dataset from the path the download cell writes to
# ('./data/dataset.csv'), so the notebook works on a fresh Restart & Run All.
df = pd.read_csv('./data/dataset.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22224 entries, 0 to 22223
Data columns (total 15 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   title                                 22224 non-null  object 
 1   company                               22224 non-null  object 
 2   location                              22224 non-null  object 
 3   skills                                14384 non-null  object 
 4   source                                22224 non-null  object 
 5   description_no_numbers_with_skills    22224 non-null  object 
 6   experience_from                       22224 non-null  float64
 7   experience_to_adjusted_10             22224 non-null  float64
 8   description_size                      22224 non-null  int64  
 9   description                           22224 non-null  object 
 10  description_no_numbers                22224 non-null  object 
 11  description_no_

### Service functions

In [3]:
import numpy as np


def set_seed(seed: int) -> None:
    """Seed NumPy's global random number generator for reproducibility.

    Args:
        seed: Integer seed forwarded to ``np.random.seed``.
    """
    np.random.seed(seed=seed)

### Training-eval loop with experiments

##### Define text feature/target columns

In [4]:
# Regression target column.
target_col = 'log_salary_from'

# First text feature, plus the name of its prompt-prefixed variant.
text_col_1 = 'description_no_numbers'
text_col_1_with_prompt = f'{text_col_1}_with_prompt'

# Second text feature: the merged title/company/location/skills/source column.
text_col_2 = 'title_company_location_skills_source'

##### Create merged title/company/location/skills/source feature

In [5]:
# Replace missing skills with an explicit placeholder so every row renders
# a value in the template below.
df['skills'] = df['skills'].fillna('Не указаны')

# Layout of the merged text feature: one labelled line per source column.
title_company_location_skills_feature_template = """
Позиция: {position}
Компания: {company}
Место: {location}
Навыки: {skills}
Источник: {source}
"""


def _render_merged_feature(row):
    """Fill the merged-feature template with the fields of a single row."""
    fields = {
        'position': row['title'],
        'company': row['company'],
        'location': row['location'],
        'skills': row['skills'],
        'source': row['source'],
    }
    return title_company_location_skills_feature_template.format(**fields)


df['title_company_location_skills_source'] = df.apply(_render_merged_feature, axis=1)

##### Add a prompt to feature 1

In [6]:
# Prompt prepended to feature 1, intended for regression on the [MASK] token
# embedding (may not be used in this notebook).
prompt = (
    "[CLS] Далее указано описание вакансии. "
    "Судя по описанию, зарплата на этой позиции составляет [MASK].[SEP]"
)

# Store the prompt-prefixed text as a separate column next to the original.
df[text_col_1_with_prompt] = prompt + df[text_col_1]

#### Training code

##### Experiment 0: Prediction by average

In [7]:
import pickle
import warnings
from sklearn.model_selection import train_test_split
import time
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


def _constant_prediction_metrics(y_true, constant, prefix):
    """Score a constant prediction against the true targets.

    Args:
        y_true: True target values.
        constant: The single value predicted for every sample.
        prefix: Prefix for the returned metric keys (e.g. 'train' or 'test').

    Returns:
        Dict with the keys ``{prefix}_r2``, ``{prefix}_mae`` and
        ``{prefix}_rmse``, in that order.
    """
    y_const = np.full(len(y_true), constant)
    return {
        f'{prefix}_r2': r2_score(y_true, y_const),
        f'{prefix}_mae': mean_absolute_error(y_true, y_const),
        # RMSE is computed as sqrt(MSE): the `squared=False` keyword of
        # mean_squared_error was deprecated in scikit-learn 1.4 and removed
        # in 1.6, so this form works across versions.
        f'{prefix}_rmse': np.sqrt(mean_squared_error(y_true, y_const)),
    }


experiment_name = 'average'
print(experiment_name.upper())
print('='*100)
print()

# Suppress all FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

seeds = [42, 78687, 123123]
combined_history = {}

# Dataset
# Prepare data
X = df[[text_col_1, text_col_1_with_prompt, text_col_2]]
y = df[[target_col,]]

start_time = time.time()

for seed in seeds:
    print()
    print(f'Starting for seed {seed}...')
    print('-' * 100)
    print()

    set_seed(seed)

    # Split train-test. The feature frames are not used by this baseline;
    # X is still passed through so the split call itself stays unchanged.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    y_train_reg = y_train[target_col]
    y_test_reg = y_test[target_col]

    # The baseline predicts the train-set mean for every sample, on both the
    # train and the test split.
    train_mean = y_train_reg.mean()
    combined_history[seed] = {
        **_constant_prediction_metrics(y_train_reg, train_mean, 'train'),
        **_constant_prediction_metrics(y_test_reg, train_mean, 'test'),
    }


# save the history as pickle
with open(f'./data/history/baseline_{experiment_name}.pickle', 'wb') as handle:
    pickle.dump(combined_history, handle, protocol=pickle.HIGHEST_PROTOCOL)

# print average results
# NOTE(review): the "95% CI" below is the 2.5th-97.5th percentile range of the
# test metric over len(seeds) runs, not a statistical confidence interval.
print('Average results:')
for metric in ['r2', 'mae', 'rmse']:
    test_metric = [combined_history[seed][f'test_{metric}'] for seed in seeds]
    print(f'{metric.upper()}: mean = {np.mean(test_metric):.4f}, 95% CI = [{np.percentile(test_metric, 2.5):.4f}, {np.percentile(test_metric, 97.5):.4f}]')

end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")

AVERAGE


Starting for seed 42...
----------------------------------------------------------------------------------------------------


Starting for seed 78687...
----------------------------------------------------------------------------------------------------


Starting for seed 123123...
----------------------------------------------------------------------------------------------------

Average results:
R2: mean = -0.0001, 95% CI = [-0.0003, -0.0000]
MAE: mean = 0.5130, 95% CI = [0.5122, 0.5143]
RMSE: mean = 0.6286, 95% CI = [0.6280, 0.6290]
Execution time: 0.04 seconds
