In [10]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

## Формируем датасет с параметрами:
- proj_cnt: количество проектов
- education_lvl: уровень образования
- employee_grade: грейд сотрудника
- employee_age: возраст сотрудника
- employee_salary: оклад сотрудника. Целевая переменная для предсказания

In [36]:
def _get_test_sample():
    n_samples = 100

    proj_cnt = np.random.choice(9, n_samples) + 1
    education_lvl = np.random.choice(2, n_samples) + 1
    employee_grade = np.random.choice(10, n_samples) + 1
    employee_age = np.random.choice(30, n_samples) + 18

    employee_salary = education_lvl * proj_cnt * employee_grade * 1000

    data = pd.DataFrame({'proj_cnt': proj_cnt, 'education_lvl': education_lvl, 'employee_grade': employee_grade, 'employee_age': employee_age, 'employee_salary': employee_salary})
    return data

In [37]:
# Seed NumPy's global random state (which _get_test_sample draws from) so that
# Restart & Run All reproduces the same sample and the same fitted numbers.
np.random.seed(42)
d = _get_test_sample()
d.head()

Unnamed: 0,proj_cnt,education_lvl,employee_grade,employee_age,employee_salary
0,8,1,3,21,24000
1,5,1,7,40,35000
2,6,1,4,41,24000
3,2,1,1,36,2000
4,7,2,8,27,112000


## Обучаем модель, используя все переменные

In [39]:
def _get_model(vl, y=None):
    """Fit a linear regression on ``vl`` and print its weights, bias and error.

    Parameters
    ----------
    vl : feature table
        Passed unchanged to ``LinearRegression.fit`` and ``predict``.
    y : target values, optional
        Regression target. When omitted, the notebook-level
        ``d['employee_salary']`` column is used (the original behaviour).

    Returns
    -------
    None
        Results are printed: coefficients, intercept, the mean absolute error
        on the same rows the model was fitted on, and the mean of the target
        for scale.
    """
    if y is None:
        # Backward-compatible fallback to the notebook's global dataset.
        y = d['employee_salary']

    reg = LinearRegression().fit(vl, y)

    print('Weights: {}'.format(reg.coef_))
    print('Bias: {}'.format(reg.intercept_))

    # The error is measured on the training rows themselves (no hold-out split).
    pred_values = reg.predict(vl)
    # Documented argument order is (y_true, y_pred).
    print('Error: {}'.format(mean_absolute_error(y, pred_values)))
    print('Mean_value:', np.mean(y))

In [40]:
# Baseline fit on every raw feature column, employee_age included.
_get_model(
    d[[
        'proj_cnt',
        'education_lvl',
        'employee_grade',
        'employee_age',
    ]]
)

Weights: [10283.42050367 30774.68569859  8033.69502303   -37.36172516]
Bias: -97519.90782276602
Error: 14661.309018547438
Mean_value: 50520.0


## Сокращаем количество используемых переменных

In [41]:
# Same fit with employee_age left out of the feature set.
_get_model(
    d[[
        'proj_cnt',
        'education_lvl',
        'employee_grade',
    ]]
)

Weights: [10288.30117598 30739.04401923  8023.01217857]
Bias: -98682.52729453589
Error: 14684.550333763313
Mean_value: 50520.0


## Добавляем признак «опыт» (`experience`) — произведение количества проектов, уровня образования и грейда

In [42]:
# Engineered interaction feature: the product of the three columns.
d['experience'] = d['proj_cnt'] * d['education_lvl'] * d['employee_grade']
d.head()

Unnamed: 0,proj_cnt,education_lvl,employee_grade,employee_age,employee_salary,experience
0,8,1,3,21,24000,24
1,5,1,7,40,35000,35
2,6,1,4,41,24000,24
3,2,1,1,36,2000,2
4,7,2,8,27,112000,112


In [44]:
# Fit on the single engineered feature only (kept as a one-column frame).
_get_model(d.loc[:, ['experience']])

Weights: [1000.]
Bias: 0.0
Error: 0.0
Mean_value: 50520.0
