In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error,mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
import catboost as cbt
from catboost import Pool, cv
import lightgbm as lgb
from lightgbm import LGBMRegressor
import numpy as np
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from xgboost.sklearn import XGBRegressor

In [None]:
# Load the raw export and discard exact duplicate rows.
df = pd.read_csv('bundles_858.csv', sep=',')
df = df.drop_duplicates()

# Keep only the rows whose `name` is present.
has_name = df['name'].notnull()
df = df[has_name]
df

In [None]:
# Show every column when a frame is displayed.
pd.options.display.max_columns = None

# Columns in which every value equals 0 carry no information.
zero_cols = df.columns[(df == 0).all()]

# Reassign instead of mutating in place: `df` is the result of a boolean
# filter, and in-place edits on such a frame are discouraged (they can raise
# SettingWithCopyWarning on some pandas versions).
df = df.drop(columns=zero_cols)

# A row is usable only if at least one of the two salary bounds is known.
df = df.dropna(subset=['salary_from_rub', 'salary_to_rub'], how='all')
df.head()

In [7]:
# Disabled experiment: restrict both salary bounds to the 14,000-300,000 range.
# The whole cell is commented out, so it has no effect when the notebook runs.
#df = df[(df['salary_from_rub'] <= 300000) & (df['salary_from_rub'] >= 14000)]
#df = df[(df['salary_to_rub'] <= 300000) & (df['salary_to_rub'] >= 14000)]
#
#df.head()

In [None]:
# Single salary estimate per vacancy: the midpoint of the advertised range,
# or the one known bound when only one of the two is filled in.
# `mean(axis=1)` skips NaN, so both cases are covered in one vectorised step.
# Unlike the former row-wise `dropna().iloc[0]` fallback, a row with both
# bounds missing yields NaN instead of raising IndexError.
df['salary'] = df[['salary_from_rub', 'salary_to_rub']].mean(axis=1)
df.head()

In [None]:
# Summary statistics of the salary estimate before trimming.
# Round (rather than truncate) and use the nullable Int64 dtype so the output
# matches the post-trimming summary and a NaN statistic does not raise.
df['salary'].describe().round().astype('Int64')

In [None]:
# Box plot of the salary estimate before the quantile trimming.
y = df['salary']
fig, ax = plt.subplots(figsize=(6, 8))
sns.boxplot(y=y, orient='v', ax=ax)

ax.set_title('Распределение зарплат', pad=20)
ax.set_ylim(0, 300000)  # values above the upper limit are not drawn
ax.set_ylabel('Зарплата, руб.')

print("Медианная зарплата:", y.quantile(0.5))

plt.show()

In [None]:
# Trim both tails: keep salaries strictly between the 3% and 97% quantiles.
q_min, q_max = df['salary'].quantile([0.03, 0.97])
inside_range = df['salary'].gt(q_min) & df['salary'].lt(q_max)
df = df[inside_range]

df['salary'].describe().round().astype('Int64')

In [None]:
# Same box plot as before, drawn on the trimmed data.
y = df['salary']
fig, ax = plt.subplots(figsize=(6, 8))
sns.boxplot(y=y, orient='v', ax=ax)

ax.set_title('Распределение зарплат', pad=20)
ax.set_ylim(top=80000)  # only the upper limit is fixed; the lower one stays automatic
ax.set_ylabel('Зарплата, руб.')

print("Медианная зарплата:", y.quantile(0.5))

plt.show()

In [None]:
# The skill columns are the ones located between `id` and `source_index`.
# NOTE(review): assumed to be per-vacancy skill indicators — confirm.
first_skill = df.columns.get_loc('id') + 1
end = df.columns.get_loc('source_index')

# Column sums give a total per skill; keep the 25 largest.
counts = df.iloc[:, first_skill:end].sum()
top = counts.sort_values(ascending=False).head(25)

fig, ax = plt.subplots(1, 1, figsize=(7, 7))
fig.tight_layout(w_pad=5)
sns.barplot(
    x=top.values,
    y=top.index,
    palette=sns.color_palette('magma', 25),
    alpha=0.7,
    ax=ax,
)
ax.set_title(label='Топ навыков', loc='left', size=14, pad=15)
ax.set_xlabel('Количество вакансий')
ax.set_ylabel('')

In [None]:
# Average salary and number of vacancies for every experience level.
exp = df.groupby('experience_id')['salary'].agg(mean='mean', count='count')

# Display labels for the experience levels.
# NOTE(review): the plots pair these labels with the groups by position
# (groups come out sorted by `experience_id`), so the order here must match
# the id encoding — confirm.
exp_dic = [
    'Нет опыта',
    'От 1 года до 3 лет',
    'От 3 до 6 лет',
    'Более 6 лет',
]

In [None]:
# Share of vacancies per experience level.
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(exp['count'], labels=exp_dic, autopct='%1.1f%%')
ax.set_title('Распределение вакансий в зависимости от опыта работы')
plt.show()

In [None]:
# Switch seaborn's colour cycle (affects this and all later plots).
sns.set_palette("pastel")

# Jittered scatter of salary against experience level with a fitted
# regression line; a low alpha keeps dense regions readable.
fig, ax = plt.subplots(figsize=(11, 8), dpi=100)
ax.set_title('Распределение зарплаты в зависимости от опыта работы', y=1.02, fontsize=16)
sns.regplot(
    data=df,
    x='experience_id',
    y='salary',
    color="C0",
    x_jitter=.2,
    scatter_kws={'alpha': 0.08},
    ax=ax,
)
ax.set_xticks(list(range(4)))
ax.set_xticklabels(exp_dic, rotation='vertical');

In [None]:
# Vacancies whose experience_id equals 3.0.
# NOTE(review): the variable is called `noexp`, yet `exp_dic` lists the
# "no experience" label first — confirm which id encodes "no experience".
noexp = df[df['experience_id'] == 3.0]

# Names of the skill columns (those between `id` and `source_index`).
skill_start = noexp.columns.get_loc('id') + 1
skill_stop = noexp.columns.get_loc('source_index')
skills = list(noexp.columns[skill_start:skill_stop])

In [None]:

# Features: the skill columns between `id` and `source_index`; target: salary.
skill_start = noexp.columns.get_loc('id') + 1
skill_stop = noexp.columns.get_loc('source_index')
X = noexp.iloc[:, skill_start:skill_stop]
y = noexp['salary']

# A fixed seed makes the split, and every metric computed from it, reproducible.
# NOTE(review): the following cell rebuilds X / y and the split from the full
# `df`, so under Restart & Run All this subset split is overwritten before any
# model is trained on it.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Skill columns are the ones located between `id` and `source_index`.
skill_start = df.columns.get_loc('id') + 1
skill_stop = df.columns.get_loc('source_index')
skills = df.columns[skill_start:skill_stop].tolist()

# Features: all skill columns of the full data set; target: salary.
X = df.iloc[:, skill_start:skill_stop]
y = df['salary']

# A fixed seed makes the split, and every metric computed from it, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Grid search over learning rate and tree depth for XGBoost.
xgb1 = XGBRegressor()
parameters = {
    # Threads per fit (`n_jobs` replaces the deprecated `nthread` alias);
    # with hyper-threading XGBoost may become slower.
    'n_jobs': [4],
    # 'reg:squarederror' is the current name of the deprecated 'reg:linear'.
    'objective': ['reg:squarederror'],
    'learning_rate': [.03, 0.05, .07],  # the so-called `eta` value
    'max_depth': [5, 6, 7],
    'min_child_weight': [4],
    'colsample_bytree': [0.7],
    'n_estimators': [500],
}

xgb_grid = GridSearchCV(
    xgb1,
    parameters,
    cv=2,
    n_jobs=5,
    verbose=1000,
)

xgb_grid.fit(X_train, y_train)

# No `scoring` is given, so best_score_ is the estimator's default score.
print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

In [None]:
# Final XGBoost model with hand-picked hyper-parameters.
# `eval_metric` and `early_stopping_rounds` are set on the estimator: passing
# them to `fit()` is deprecated and rejected by recent XGBoost releases.
# Only one thread setting is kept (the original passed both n_jobs=-1 and
# nthread=4, which contradict each other).
# XGBRegressor is already imported in the notebook's import cell.
model_xgb = XGBRegressor(
    colsample_bytree=0.7,
    min_child_weight=4,
    learning_rate=0.03,
    max_depth=5,
    n_estimators=500,
    n_jobs=4,
    eval_metric=['rmse', 'mae', 'mape'],
    early_stopping_rounds=50,
)

# Both sets are tracked so learning curves can be drawn afterwards:
# validation_0 is the training data, validation_1 the test data.
# NOTE(review): early stopping watches the test set, so metrics reported on
# that same set later are optimistic — consider a separate validation split.
model_xgb.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=1000,
)

In [None]:
# Display the raw feature-importance array of the fitted XGBoost model.
model_xgb.feature_importances_

In [None]:
# Create the figure first and hand its axes to XGBoost. Previously the figure
# was created after plotting, so the requested size was never applied and an
# extra empty figure was shown.
fig, ax = plt.subplots(figsize=(16, 12))
xgb.plot_importance(model_xgb, max_num_features=25, ax=ax)
plt.show()

In [None]:
# Table of skills with their XGBoost importance scaled by 100.
# zip() pairs names and scores by position and stops at the shorter sequence.
xgb_scores = model_xgb.feature_importances_ * 100
df_xgb = pd.DataFrame(list(zip(skills, xgb_scores)), columns=['features', 'values'])
df_xgb = df_xgb.sort_values(by='values', ascending=False)

fig, ax = plt.subplots(1, 1, figsize=(7, 8))
sns.barplot(data=df_xgb.head(25), x='values', y='features', palette=sns.color_palette('magma', 25),
            orient='h', alpha=0.7, ax=ax)
# The bars are relative feature importances, not a salary bonus in roubles,
# so the title and the axis label say so (they used to claim "bonus, thousand
# RUB" and "no experience", although the model is fitted on the X_train in scope).
ax.set_title(label='Важность технологий для модели XGBoost', loc='left', size=14, pad=15)
ax.set_xlabel('Важность признака, %')
ax.set_ylabel('')
plt.show()

In [None]:
# Hold-out predictions of the XGBoost model.
y_pred = model_xgb.predict(X_test)

# MAPE, RMSE and MAE on the test set.
# RMSE is taken as the square root of the MSE: the `squared=False` shortcut of
# mean_squared_error is deprecated and missing from recent scikit-learn.
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print("MAPE:", mape)
print("RMSE:", rmse)
print("MAE:", mae)

In [None]:
# Display the evaluation history recorded during fitting; the plotting cells
# below read it by eval-set key ('validation_0', 'validation_1') and metric name.
model_xgb.evals_result()

In [None]:
# MAE learning curves of the XGBoost model.
xgb_history = model_xgb.evals_result()
train_metrics = xgb_history['validation_0']['mae']
valid_metrics = xgb_history['validation_1']['mae']

# One point per boosting round, counted from 1.
epochs = range(1, len(train_metrics) + 1)
fig, ax = plt.subplots()
ax.plot(epochs, train_metrics, label='train')
ax.plot(epochs, valid_metrics, label='test')
ax.set_xlabel('Epochs')
ax.set_ylabel('MAE')
ax.set_title('MAE')
ax.legend()

In [None]:
# MAPE learning curves of the XGBoost model.
xgb_history = model_xgb.evals_result()
train_metrics = xgb_history['validation_0']['mape']
valid_metrics = xgb_history['validation_1']['mape']

# One point per boosting round, counted from 1.
epochs = range(1, len(train_metrics) + 1)
fig, ax = plt.subplots()
for curve, curve_label in ((train_metrics, 'train'), (valid_metrics, 'test')):
    ax.plot(epochs, curve, label=curve_label)
ax.set_xlabel('Epochs')
ax.set_ylabel('MAPE')
ax.set_title('MAPE')
ax.legend()

In [None]:
# RMSE learning curves of the XGBoost model.
xgb_history = model_xgb.evals_result()
train_metrics = xgb_history['validation_0']['rmse']
valid_metrics = xgb_history['validation_1']['rmse']

# One point per boosting round, counted from 1.
n_rounds = len(train_metrics)
epochs = range(1, n_rounds + 1)

fig, ax = plt.subplots()
ax.plot(epochs, train_metrics, label='train')
ax.plot(epochs, valid_metrics, label='test')
ax.set_xlabel('Epochs')
ax.set_ylabel('RMSE')
ax.set_title('RMSE')
ax.legend()
plt.show()

In [None]:
# Print the XGBoost importance table as plain text.
print(df_xgb)

In [None]:
# CatBoost pools for the final fit in a later cell (the grid search below
# works on the raw frames).
pools = {
    'train': cbt.Pool(data=X_train, label=y_train),
    'test': cbt.Pool(data=X_test, label=y_test),
}

# Search space: 3 * 3 * 3 * 5 = 135 combinations, each fitted on 2 folds.
parameters = {
    'depth': [2, 5, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [100, 1000, 5000],
    'l2_leaf_reg': [3, 1, 5, 10, 100],
}

# verbose=False: otherwise CatBoost prints a line per boosting iteration for
# every one of the fits above and buries the notebook in log output.
model = CatBoostRegressor(eval_metric='MAE', verbose=False)

grid = GridSearchCV(estimator=model, param_grid=parameters, cv=2)

grid.fit(X_train, y_train)


In [None]:
# Best parameter combination found by the CatBoost grid search.
# NOTE(review): the model cell below hard-codes its hyper-parameters instead
# of reading `best_param` — confirm they are meant to match.
best_param = grid.best_params_
best_param

In [None]:
# Final CatBoost model with hand-picked hyper-parameters.
cat_params = {
    'iterations': 5000,
    'depth': 2,
    'l2_leaf_reg': 5,
    'learning_rate': 0.05,
    'eval_metric': 'MAE',
}
model = CatBoostRegressor(**cat_params)

# Fit on the train pool while monitoring the test pool; training stops after
# 50 rounds without improvement and the best iteration is kept.
fit_options = {
    'eval_set': pools['test'],
    'use_best_model': True,
    'plot': True,
    'verbose': 1000,
    'early_stopping_rounds': 50,
}
model.fit(pools['train'], **fit_options)
model.get_feature_importance(prettified=True)

In [None]:
# Hold-out predictions of the CatBoost model.
y_pred = model.predict(X_test)

# MAPE, RMSE and MAE on the test set.
# RMSE is taken as the square root of the MSE: the `squared=False` shortcut of
# mean_squared_error is deprecated and missing from recent scikit-learn.
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print("MAPE:", mape)
print("RMSE:", rmse)
print("MAE:", mae)

In [None]:
# Table of skills with their CatBoost feature importance.
# zip() pairs names and scores by position and stops at the shorter sequence.
model_cat = pd.DataFrame(list(zip(skills, model.feature_importances_)), columns=['features', 'values'])
model_cat = model_cat.sort_values(by='values', ascending=False)

fig, ax = plt.subplots(1, 1, figsize=(7, 8))
sns.barplot(data=model_cat.head(25), x='values', y='features', palette=sns.color_palette('magma', 25),
            orient='h', alpha=0.7, ax=ax)
# The bars are feature importances, not a salary bonus in roubles, so the
# title and the axis label say so (they used to claim "bonus, thousand RUB"
# and "no experience", although the model is fitted on the pools in scope).
ax.set_title(label='Важность технологий для модели CatBoost', loc='left', size=14, pad=15)
ax.set_xlabel('Важность признака, %')
ax.set_ylabel('')
plt.show()

In [None]:
# MAE learning curves of the CatBoost model.
cat_history = model.get_evals_result()
train_metrics = cat_history['learn']['MAE']
valid_metrics = cat_history['validation']['MAE']

# One point per boosting iteration, counted from 1.
epochs = range(1, len(train_metrics) + 1)
fig, ax = plt.subplots()
ax.plot(epochs, train_metrics, label='train')
ax.plot(epochs, valid_metrics, label='test')
ax.set_xlabel('Epochs')
ax.set_ylabel('MAE')
ax.set_title('MAE')
ax.legend()

In [None]:
# RMSE learning curves of the CatBoost model.
cat_history = model.get_evals_result()
train_metrics = cat_history['learn']['RMSE']
valid_metrics = cat_history['validation']['RMSE']

# One point per boosting iteration, counted from 1.
n_rounds = len(train_metrics)
epochs = range(1, n_rounds + 1)

fig, ax = plt.subplots()
for curve, curve_label in ((train_metrics, 'train'), (valid_metrics, 'test')):
    ax.plot(epochs, curve, label=curve_label)
ax.set_xlabel('Epochs')
ax.set_ylabel('RMSE')
ax.set_title('RMSE')
ax.legend()
plt.show()

In [None]:
# Column names with every comma removed.
# NOTE(review): presumably needed because LightGBM rejects some special
# characters in feature names — confirm.
newcolumns = df.columns.str.replace(',', '', regex=False).to_list()

# Copy the frame with a fresh 0..n-1 index and the cleaned names.
# Rebuilding it from `df.values` would upcast every column of a mixed-type
# frame to `object`; this keeps the original dtypes.
df_new = df.reset_index(drop=True)
df_new.columns = newcolumns
df_new.head()

In [None]:
# Cast the skill columns (between `id` and `source_index`) and the target
# to integers.
skill_start = df_new.columns.get_loc('id') + 1
skill_stop = df_new.columns.get_loc('source_index')
columns_to_convert = df_new.columns[skill_start:skill_stop]

skill_values = df_new[columns_to_convert]
df_new[columns_to_convert] = skill_values.astype(int)
df_new['salary'] = df_new['salary'].astype(int)  # drops any fractional part
print(df_new.dtypes)

In [None]:
# Features / target taken from the frame with cleaned column names.
skill_start = df_new.columns.get_loc('id') + 1
skill_stop = df_new.columns.get_loc('source_index')
X = df_new.iloc[:, skill_start:skill_stop]
y = df_new['salary']

# A fixed seed makes the split, and the search result, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# `metric` is deliberately not part of the grid: GridSearchCV ranks candidates
# by the estimator's own score, so varying LightGBM's evaluation metric only
# tripled the number of fits without changing any score ('r2' is not a
# LightGBM metric name either).
parameters = {
    'num_iterations': [500, 2000, 5000],
    'learning_rate': [0.05, 0.005],
    'num_leaves': [7, 15, 31],
    'max_depth': [10, 15, 25],
    'min_data_in_leaf': [15, 25],
}
model_lgb = LGBMRegressor()

gsearch_lgb = GridSearchCV(model_lgb, param_grid=parameters, verbose=100)
gsearch_lgb.fit(X_train, y_train)

print(gsearch_lgb.best_params_)

In [None]:
# Features / target taken from the frame with cleaned column names.
skill_start = df_new.columns.get_loc('id') + 1
skill_stop = df_new.columns.get_loc('source_index')
X = df_new.iloc[:, skill_start:skill_stop]
y = df_new['salary']

# A fixed seed makes the split, and every metric computed from it, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

params = {
    'objective': 'regression',
    'metric': ['l1', 'rmse'],
    'num_leaves': 15,
    'max_depth': 15,
    'num_iterations': 5000,
    'min_data_in_leaf': 15,
    'learning_rate': 0.05,
    'early_stopping': 40,
}

# Filled by the callback: {'valid': {...}, 'train': {...}} with one list of
# values per metric.
evals_result = {}

# Train the booster. The history is collected through the record_evaluation
# callback: the `evals_result=` keyword of lgb.train was removed in newer
# LightGBM releases.
# NOTE(review): early stopping watches the test set, so metrics reported on
# that same set later are optimistic — consider a separate validation split.
model_lgb = lgb.train(
    params,
    train_data,
    valid_sets=[test_data, train_data],
    valid_names=['valid', 'train'],
    callbacks=[lgb.record_evaluation(evals_result)],
)


In [None]:
# Predict on the test data.
y_pred = model_lgb.predict(X_test)

# MAE, RMSE and MAPE on the test set.
# RMSE is taken as the square root of the MSE: the `squared=False` shortcut of
# mean_squared_error is deprecated and missing from recent scikit-learn.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = mean_absolute_percentage_error(y_test, y_pred)

print("MAE:", mae)
print("RMSE:", rmse)
print("MAPE:", mape)

In [None]:
# LightGBM importance scores (with the default importance type: how often a
# feature is used in a split), scaled down by 1000 for readability.
features = [value / 1000 for value in model_lgb.feature_importance().tolist()]

# zip() pairs names and scores by position and stops at the shorter sequence.
df_lgb = pd.DataFrame(list(zip(skills, features)), columns=['features', 'values'])
df_lgb = df_lgb.sort_values(by='values', ascending=False)

fig, ax = plt.subplots(1, 1, figsize=(7, 8))
sns.barplot(data=df_lgb.head(25), x='values', y='features', palette=sns.color_palette('magma', 25),
            orient='h', alpha=0.7, ax=ax)
# The bars are split counts in thousands, not a salary bonus in roubles, so
# the title and the axis label say so (they used to claim "bonus, thousand RUB").
ax.set_title(label='Важность технологий для модели LightGBM', loc='left', size=14, pad=15)
ax.set_xlabel('Число разбиений, тыс.')
ax.set_ylabel('')
plt.show()

In [None]:
# MAE ('l1') learning curves of the LightGBM booster.
train_metrics = evals_result['train']['l1']
valid_metrics = evals_result['valid']['l1']

# One point per boosting round, counted from 0.
n_rounds = len(train_metrics)
epochs = np.arange(n_rounds)

fig, ax = plt.subplots()
ax.plot(epochs, train_metrics, label='train')
ax.plot(epochs, valid_metrics, label='test')
ax.set_xlabel('Epochs')
ax.set_ylabel('Metric Value')
ax.set_title('MAE')
ax.legend()
plt.show()

In [None]:
# RMSE learning curves of the LightGBM booster.
train_metrics = evals_result['train']['rmse']
valid_metrics = evals_result['valid']['rmse']

# One point per boosting round, counted from 0.
epochs = np.arange(len(train_metrics))

fig, ax = plt.subplots()
for curve, curve_label in ((train_metrics, 'train'), (valid_metrics, 'test')):
    ax.plot(epochs, curve, label=curve_label)
ax.set_xlabel('Epochs')
ax.set_ylabel('Metric Value')
ax.set_title('RMSE')
ax.legend()
plt.show()