In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [3]:
# Build a synthetic sample in which price depends only on length * width.
n_samples = 500
# Fixed seed so that Restart Kernel -> Run All reproduces the same sample.
rng = np.random.default_rng(42)

# The upper bound of Generator.integers is exclusive.
age_owner = rng.integers(21, 111, n_samples)  # 21..110, does not enter the price formula
length = rng.integers(15, 135, n_samples)     # 15..134
width = rng.integers(10, 90, n_samples)       # 10..89

price = length * width * 100

data = pd.DataFrame({'age_owner': age_owner, 'length': length, 'width': width, 'price': price})
data.head(5)

Unnamed: 0,age_owner,length,width,price
0,62,128,68,870400
1,65,41,30,123000
2,89,82,67,549400
3,72,40,60,240000
4,49,121,45,544500


In [4]:
# Fit on all three raw features, including the owner's age.
X = data[['age_owner', 'length', 'width']]
y = data['price']
reg = LinearRegression().fit(X, y)
print('Weights: {}'.format(reg.coef_))
print('Bias: {}'.format(reg.intercept_))

# Predict on the same feature frame instead of re-selecting the columns;
# mean_absolute_error is documented as (y_true, y_pred).
pred_values = reg.predict(X)
print('Error: {}'.format(mean_absolute_error(y, pred_values)))

Weights: [ -49.80693623 5067.32969807 7390.51172433]
Bias: -369766.74743760034
Error: 54836.1029896329


In [5]:
# Fit on the two features the price is built from (owner's age dropped).
X = data[['length', 'width']]
y = data['price']
reg = LinearRegression().fit(X, y)
print('Weights: {}'.format(reg.coef_))
print('Bias: {}'.format(reg.intercept_))

# Predict on the same feature frame instead of re-selecting the columns;
# mean_absolute_error is documented as (y_true, y_pred).
pred_values = reg.predict(X)
print('Error: {}'.format(mean_absolute_error(y, pred_values)))

Weights: [5067.64649917 7394.30972973]
Bias: -373206.74817026086
Error: 54849.36655323327


In [6]:
# Create a new feature: the product of length and width.
data['mult'] = data['length'].mul(data['width'])
data.head(5)

Unnamed: 0,age_owner,length,width,price,mult
0,62,128,68,870400,8704
1,65,41,30,123000,1230
2,89,82,67,549400,5494
3,72,40,60,240000,2400
4,49,121,45,544500,5445


In [7]:
# Fit on the engineered product feature alone.
X = data[['mult']]
y = data['price']
reg = LinearRegression().fit(X, y)
print('Weights: {}'.format(reg.coef_))
print('Bias: {}'.format(reg.intercept_))

# Predict on the same feature frame instead of re-selecting the column;
# mean_absolute_error is documented as (y_true, y_pred).
pred_values = reg.predict(X)
print('Error: {}'.format(mean_absolute_error(y, pred_values)))

Weights: [100.]
Bias: 2.3283064365386963e-10
Error: 1.2068630894646048e-10


## Задание
## Способ 1

In [8]:
# Build a dataset: number of visits, year of birth and day of birth.
n_samples = 500
# Fixed seed so that Restart Kernel -> Run All reproduces the same sample.
rng = np.random.default_rng(42)

population = range(1946, 2019)  # birth years 1946..2018
number_of_visits = rng.integers(100, 10100, n_samples)  # 100..10099
year_of_birth = rng.choice(population, n_samples)
# NOTE(review): values span 3..30, as in the original `choice(28) + 3`;
# confirm whether a day of month starting at 1 was intended.
birth_number = rng.integers(3, 31, n_samples)

df = pd.DataFrame({'visits': number_of_visits, 'year_of_birth': year_of_birth, 'birth_number': birth_number})
df.head(5)

Unnamed: 0,visits,year_of_birth,birth_number
0,6564,1987,14
1,3798,1969,29
2,7973,2017,7
3,6478,2017,11
4,8638,1969,20


In [9]:
# Digit sum of the birth year, folded once more when it exceeds 10.
def calculate_sum(row):
    """Return the digit sum of ``row.year_of_birth``, folded once if above 10.

    Parameters
    ----------
    row : object with a ``year_of_birth`` attribute
        Its string form must consist of decimal digits only.

    Returns
    -------
    int
        The sum of the digits; if that sum is greater than 10, the sum of
        its tens digit and its ones digit instead.
    """
    total = sum(int(digit) for digit in str(row.year_of_birth))
    # NOTE(review): the threshold is strictly "> 10", so a digit sum of
    # exactly 10 (e.g. 2017) is returned as 10 rather than folded to 1,
    # and the fold is applied only once (1999 -> 28 -> 10). Confirm intent.
    if total <= 10:
        return int(total)
    tens, ones = divmod(total, 10)
    return int(tens + ones)

In [10]:
# Store the per-row result of calculate_sum in a separate column.
df['sum_year'] = df.apply(calculate_sum, axis='columns')
df.head()

Unnamed: 0,visits,year_of_birth,birth_number,sum_year
0,6564,1987,14,7
1,3798,1969,29,7
2,7973,2017,7,10
3,6478,2017,11,10
4,8638,1969,20,7


In [11]:
# Keep all three features.
X = df[['visits', 'year_of_birth', 'birth_number']]
y = df['sum_year']
reg = LinearRegression().fit(X, y)
print('Weights: {}'.format(reg.coef_))
print('Bias: {}'.format(reg.intercept_))

# Predict on the same feature frame instead of re-selecting the columns;
# mean_absolute_error is documented as (y_true, y_pred).
pred_values = reg.predict(X)
print('Error: {}'.format(mean_absolute_error(y, pred_values)))

Weights: [ 6.08251632e-06  7.22159975e-03 -2.01753198e-02]
Bias: -8.134328681263632
Error: 2.1756158099217218


In [12]:
# Keep two features: year of birth and day of birth.
X = df[['year_of_birth', 'birth_number']]
y = df['sum_year']
reg = LinearRegression().fit(X, y)
print('Weights: {}'.format(reg.coef_))
print('Bias: {}'.format(reg.intercept_))

# Predict on the same feature frame instead of re-selecting the columns;
# mean_absolute_error is documented as (y_true, y_pred).
pred_values = reg.predict(X)
print('Error: {}'.format(mean_absolute_error(y, pred_values)))

Weights: [ 0.00715602 -0.01998621]
Bias: -7.976380032180041
Error: 2.1748455193644967


In [13]:
# Keep only the day of birth (the column the target was NOT computed from).
X = df[['birth_number']]
y = df['sum_year']
reg = LinearRegression().fit(X, y)
print('Weights: {}'.format(reg.coef_))
print('Bias: {}'.format(reg.intercept_))

# Predict on the same feature frame instead of re-selecting the column;
# mean_absolute_error is documented as (y_true, y_pred).
pred_values = reg.predict(X)
print('Error: {}'.format(mean_absolute_error(y, pred_values)))

Weights: [-0.02087922]
Bias: 6.212548856578599
Error: 2.1797315275198694


In [14]:
# Keep only the year of birth (the column the target was computed from).
X = df[['year_of_birth']]
y = df['sum_year']
reg = LinearRegression().fit(X, y)
print('Weights: {}'.format(reg.coef_))
print('Bias: {}'.format(reg.intercept_))

# Predict on the same feature frame instead of re-selecting the column;
# mean_absolute_error is documented as (y_true, y_pred).
pred_values = reg.predict(X)
print('Error: {}'.format(mean_absolute_error(y, pred_values)))

Weights: [0.00752031]
Bias: -9.027765874105668
Error: 2.1820235523504823


## Вывод
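Для такой зависимости линейная регрессия не может дать точное предсказание: при любом наборе признаков ошибка остаётся практически одинаковой (около 2.18). Сумма цифр года — нелинейная функция года, поэтому линейная модель её не восстанавливает, но хотелось попробовать.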

## Способ 2

In [15]:
# Multiply the birth day by the birth year and add 888.
def calculate_sum(row):
    """Return ``row.year_of_birth * row.birth_number + 888``.

    Redefines the digit-sum ``calculate_sum`` from earlier in the notebook;
    after this cell runs, the name refers to this version.
    """
    offset = 888
    product = row.year_of_birth * row.birth_number
    return product + offset

In [16]:
# Overwrite the target column with the per-row result of calculate_sum.
df['sum_year'] = df.apply(calculate_sum, axis='columns')
df.head()

Unnamed: 0,visits,year_of_birth,birth_number,sum_year
0,6564,1987,14,28706
1,3798,1969,29,57989
2,7973,2017,7,15007
3,6478,2017,11,23075
4,8638,1969,20,40268


In [17]:
# Keep all three raw features.
X = df[['visits', 'year_of_birth', 'birth_number']]
y = df['sum_year']
reg = LinearRegression().fit(X, y)
print('Weights: {}'.format(reg.coef_))
print('Bias: {}'.format(reg.intercept_))

# Predict on the same feature frame instead of re-selecting the columns;
# mean_absolute_error is documented as (y_true, y_pred).
pred_values = reg.predict(X)
print('Error: {}'.format(mean_absolute_error(y, pred_values)))

Weights: [-2.59812633e-03  1.61856542e+01  1.98223371e+03]
Bias: -31191.14506073566
Error: 131.19416990597983


Отклонение значительное

In [18]:
# Keep two features: year of birth and day of birth.
X = df[['year_of_birth', 'birth_number']]
y = df['sum_year']
reg = LinearRegression().fit(X, y)
print('Weights: {}'.format(reg.coef_))
print('Bias: {}'.format(reg.intercept_))

# Predict on the same feature frame instead of re-selecting the columns;
# mean_absolute_error is documented as (y_true, y_pred).
pred_values = reg.predict(X)
print('Error: {}'.format(mean_absolute_error(y, pred_values)))

Weights: [  16.21366817 1982.15292914]
Bias: -31258.612293384773
Error: 131.11935607050523


Отклонение осталось почти такое же.
Добавим новый столбец

In [19]:
# Engineered feature: year of birth multiplied by day of birth.
df['product'] = df['year_of_birth'].mul(df['birth_number'])
df.head()

Unnamed: 0,visits,year_of_birth,birth_number,sum_year,product
0,6564,1987,14,28706,27818
1,3798,1969,29,57989,57101
2,7973,2017,7,15007,14119
3,6478,2017,11,23075,22187
4,8638,1969,20,40268,39380


In [20]:
# Keep three features: the engineered product plus the two raw columns.
X = df[['product', 'year_of_birth', 'birth_number']]
y = df['sum_year']
reg = LinearRegression().fit(X, y)
print('Weights: {}'.format(reg.coef_))
print('Bias: {}'.format(reg.intercept_))

# Predict on the same feature frame instead of re-selecting the columns;
# mean_absolute_error is documented as (y_true, y_pred).
pred_values = reg.predict(X)
print('Error: {}'.format(mean_absolute_error(y, pred_values)))

Weights: [ 1.00000000e+00 -4.20767783e-15  2.00528249e-13]
Bias: 887.9999999999854
Error: 1.1816155165433884e-11


Ошибка упала практически до нуля (порядка 1e-11): признак product полностью объясняет целевую переменную.

In [21]:
# Keep a single feature: product.
X = df[['product']]
y = df['sum_year']
reg = LinearRegression().fit(X, y)
print('Weights: {}'.format(reg.coef_))
print('Bias: {}'.format(reg.intercept_))

# Predict on the same feature frame instead of re-selecting the column;
# mean_absolute_error is documented as (y_true, y_pred).
pred_values = reg.predict(X)
print('Error: {}'.format(mean_absolute_error(y, pred_values)))

Weights: [1.]
Bias: 887.9999999999891
Error: 7.161361281760037e-12


Ошибка осталась практически нулевой, хотя в X оставили только один признак — product.

In [22]:
# Shift the engineered feature by a constant 500.
df['product'] = df['year_of_birth'].mul(df['birth_number']).add(500)
df.head()

Unnamed: 0,visits,year_of_birth,birth_number,sum_year,product
0,6564,1987,14,28706,28318
1,3798,1969,29,57989,57601
2,7973,2017,7,15007,14619
3,6478,2017,11,23075,22687
4,8638,1969,20,40268,39880


In [23]:
# Keep a single feature: product.
X = df[['product']]
y = df['sum_year']
reg = LinearRegression().fit(X, y)
print('Weights: {}'.format(reg.coef_))
print('Bias: {}'.format(reg.intercept_))

# Predict on the same feature frame instead of re-selecting the column;
# mean_absolute_error is documented as (y_true, y_pred).
pred_values = reg.predict(X)
print('Error: {}'.format(mean_absolute_error(y, pred_values)))

Weights: [1.]
Bias: 388.0
Error: 0.0


In [24]:
# Shift the engineered feature by a constant 888.
df['product'] = df['year_of_birth'].mul(df['birth_number']).add(888)
df.head()

Unnamed: 0,visits,year_of_birth,birth_number,sum_year,product
0,6564,1987,14,28706,28706
1,3798,1969,29,57989,57989
2,7973,2017,7,15007,15007
3,6478,2017,11,23075,23075
4,8638,1969,20,40268,40268


Если прибавить к признаку 888, он совпадёт с целевой переменной: сдвиг (bias) модели станет равен 0, а ошибка останется нулевой.

In [26]:
# Keep a single feature: product.
X = df[['product']]
y = df['sum_year']
reg = LinearRegression().fit(X, y)
print('Weights: {}'.format(reg.coef_))
print('Bias: {}'.format(reg.intercept_))

# Predict on the same feature frame instead of re-selecting the column;
# mean_absolute_error is documented as (y_true, y_pred).
pred_values = reg.predict(X)
print('Error: {}'.format(mean_absolute_error(y, pred_values)))

Weights: [1.]
Bias: 0.0
Error: 0.0


In [27]:
# Shift the engineered feature by a constant 895.
df['product'] = df['year_of_birth'].mul(df['birth_number']).add(895)
df.head()

Unnamed: 0,visits,year_of_birth,birth_number,sum_year,product
0,6564,1987,14,28706,28713
1,3798,1969,29,57989,57996
2,7973,2017,7,15007,15014
3,6478,2017,11,23075,23082
4,8638,1969,20,40268,40275


А если прибавить ещё 7 (то есть 895 вместо 888), то сдвиг (bias) изменится ровно на 7 и станет равным −7, а ошибка останется нулевой.

In [29]:
# Keep a single feature: product.
X = df[['product']]
y = df['sum_year']
reg = LinearRegression().fit(X, y)
print('Weights: {}'.format(reg.coef_))
print('Bias: {}'.format(reg.intercept_))

# Predict on the same feature frame instead of re-selecting the column;
# mean_absolute_error is documented as (y_true, y_pred).
pred_values = reg.predict(X)
print('Error: {}'.format(mean_absolute_error(y, pred_values)))

Weights: [1.]
Bias: -7.0
Error: 0.0


## Задание 2
Попробуем другой датасет: возраст сотрудника, оклад, стаж и надбавка за год стажа. Вычислим итоговую зарплату.

In [32]:
# Build a dataset: employee age, years of experience, bonus per year of
# experience, base salary, and the resulting salary.
n_samples = 500
# Fixed seed so that Restart Kernel -> Run All reproduces the same sample.
rng = np.random.default_rng(42)

population = range(0, 15)  # years of experience 0..14
age = rng.choice(range(18, 65), n_samples)
experience = rng.choice(population, n_samples)
premium = rng.choice(range(1000, 5000, 500), n_samples)
salary = rng.choice(range(50000, 150000, 5000), n_samples)
# The target is non-linear in the raw features: it contains the product
# experience * premium.
res_salary = experience * premium + salary

df = pd.DataFrame({'age': age, 'experience': experience, 'premium': premium, 'salary':salary,'res_salary': res_salary})
df.head(5)

Unnamed: 0,age,experience,premium,salary,res_salary
0,27,14,4000,130000,186000
1,24,4,4000,110000,126000
2,22,2,2000,135000,139000
3,50,12,3500,85000,127000
4,34,12,4000,110000,158000


In [33]:
# Keep all features.
X = df[['age', 'experience', 'premium', 'salary']]
y = df['res_salary']
reg = LinearRegression().fit(X, y)
print('Weights: {}'.format(reg.coef_))
print('Bias: {}'.format(reg.intercept_))

# Predict on the same feature frame instead of re-selecting the columns;
# mean_absolute_error is documented as (y_true, y_pred).
pred_values = reg.predict(X)
print('Error: {}'.format(mean_absolute_error(y, pred_values)))

Weights: [2.37622865e+01 2.72574170e+03 7.12188714e+00 9.92675870e-01]
Bias: -19513.842044734483
Error: 3685.3426169392164


In [34]:
# Keep only the features the target depends on.
X = df[['experience', 'premium', 'salary']]
y = df['res_salary']
reg = LinearRegression().fit(X, y)
print('Weights: {}'.format(reg.coef_))
print('Bias: {}'.format(reg.intercept_))

# Predict on the same feature frame instead of re-selecting the columns;
# mean_absolute_error is documented as (y_true, y_pred).
pred_values = reg.predict(X)
print('Error: {}'.format(mean_absolute_error(y, pred_values)))

Weights: [2.72690926e+03 7.11567948e+00 9.92374601e-01]
Bias: -18493.28490348498
Error: 3678.9110643341587


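Вывод: LinearRegression подбирает коэффициенты линейной зависимости (формулы) между признаками и целевой переменной. Если зависимость содержит произведение признаков, точно восстановить её удаётся только тогда, когда это произведение добавлено как отдельный признак.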