In [32]:
# Common imports
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Seed NumPy's global RNG so anything drawing from np.random is stable across runs.
# Note: the train_test_split calls visible below pass their own random_state=42.
np.random.seed(42)

# To plot pretty figures
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
%matplotlib inline

# Notebook-wide plot defaults: larger axis/tick label fonts and a wide 10 x 5 figure.
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'figure.figsize': (10, 5),
})

from sklearn.linear_model import LinearRegression
import math
from sklearn.preprocessing import StandardScaler, normalize, MinMaxScaler
from sklearn.model_selection import train_test_split

# Рекламный бюджет

In [5]:
# Load the advertising data and preview it. The preview shows an extra
# 'Unnamed: 0' column holding 1-based row numbers; it is dropped in the next cell.
# (Passing usecols=[1, 2, 3, 4] here would skip it at load time instead, but the
# drop cell below expects the column to be present.)
adv_df = pd.read_csv('Advertising.csv')
adv_df.head()

Unnamed: 0.1,Unnamed: 0,TV,radio,newspaper,sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [6]:
# Remove the 'Unnamed: 0' column (row numbers carried over from the CSV).
# errors='ignore' makes this cell idempotent: re-running it after the column is
# already gone is a no-op instead of raising KeyError.
adv_df = adv_df.drop(columns='Unnamed: 0', errors='ignore')

In [7]:
# Print the frame summary: row count, column names, non-null counts and dtypes.
adv_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
TV           200 non-null float64
radio        200 non-null float64
newspaper    200 non-null float64
sales        200 non-null float64
dtypes: float64(4)
memory usage: 6.3 KB


In [69]:
# Non-linear transforms of the TV budget, computed with vectorized NumPy ufuncs
# on the whole column instead of a per-row Python lambda through .apply().
# NOTE: unlike math.log / math.pow, the ufuncs do not raise on out-of-domain
# values (TV <= 0 for the log, TV < 0 for the power); they emit a RuntimeWarning
# and produce -inf / NaN instead.
adv_df['log_tv'] = np.log2(adv_df['TV'])        # base-2 logarithm of TV
adv_df['pow_tv'] = np.power(adv_df['TV'], 0.4)  # TV raised to the power 0.4

In [70]:
# Preview the first rows to confirm the new log_tv and pow_tv columns were added.
adv_df.head()

Unnamed: 0,TV,radio,newspaper,sales,log_tv,pow_tv
0,230.1,37.8,69.2,22.1,7.846117,8.805756
1,44.5,39.3,45.1,10.4,5.475733,4.563983
2,17.2,45.9,69.3,9.3,4.104337,3.120408
3,151.5,41.3,58.5,18.5,7.243174,7.450151
4,180.8,10.8,58.4,12.9,7.498251,7.996121


In [91]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split
# reproducible. The full frame is split (target column included), and the model
# cells below pick their feature columns from df_train / df_test.
df_train, df_test, y_train, y_test = train_test_split(
    adv_df,
    adv_df['sales'],
    test_size=0.20,
    random_state=42,
)

In [92]:
# Model: sales ~ log_tv + radio, fitted on the training rows of the current split.
feature_cols = ['log_tv', 'radio']
X_train = df_train[feature_cols].to_numpy()
X_test = df_test[feature_cols].to_numpy()

lm = LinearRegression()
lm.fit(X_train, df_train['sales'])
print('R^2: (on train)', lm.score(X_train, df_train['sales']))
print('R^2: (on test)', lm.score(X_test, df_test['sales']))

R^2: (on train) 0.9011051213818659
R^2: (on test) 0.9235657691840907


In [93]:
# Model: sales ~ TV + radio (raw TV budget), fitted on the training rows of the current split.
feature_cols = ['TV', 'radio']
X_train = df_train[feature_cols].to_numpy()
X_test = df_test[feature_cols].to_numpy()

lm = LinearRegression()
lm.fit(X_train, df_train['sales'])
print('R^2: (on train)', lm.score(X_train, df_train['sales']))
print('R^2: (on test)', lm.score(X_test, df_test['sales']))

R^2: (on train) 0.8955982149747163
R^2: (on test) 0.9005833101920357


In [94]:
# Model: sales ~ TV + radio + newspaper, fitted on the training rows of the current split.
feature_cols = ['TV', 'radio', 'newspaper']
X_train = df_train[feature_cols].to_numpy()
X_test = df_test[feature_cols].to_numpy()

lm = LinearRegression()
lm.fit(X_train, df_train['sales'])
print('R^2: (on train)', lm.score(X_train, df_train['sales']))
print('R^2: (on test)', lm.score(X_test, df_test['sales']))

R^2: (on train) 0.8957008271017818
R^2: (on test) 0.899438024100912


In [95]:
# Re-split with 30% of the rows held out for testing (same random_state as before).
# This rebinds df_train / df_test, so the model cells below use the new split.
df_train, df_test, y_train, y_test = train_test_split(
    adv_df,
    adv_df['sales'],
    test_size=0.30,
    random_state=42,
)

In [96]:
# Model: sales ~ log_tv + radio, fitted on the training rows of the current split.
feature_cols = ['log_tv', 'radio']
X_train = df_train[feature_cols].to_numpy()
X_test = df_test[feature_cols].to_numpy()

lm = LinearRegression()
lm.fit(X_train, df_train['sales'])
print('R^2: (on train)', lm.score(X_train, df_train['sales']))
print('R^2: (on test)', lm.score(X_test, df_test['sales']))

R^2: (on train) 0.8993482542237392
R^2: (on test) 0.9143970569382007


In [97]:
# Model: sales ~ TV + radio (raw TV budget), fitted on the training rows of the current split.
feature_cols = ['TV', 'radio']
X_train = df_train[feature_cols].to_numpy()
X_test = df_test[feature_cols].to_numpy()

lm = LinearRegression()
lm.fit(X_train, df_train['sales'])
print('R^2: (on train)', lm.score(X_train, df_train['sales']))
print('R^2: (on test)', lm.score(X_test, df_test['sales']))

R^2: (on train) 0.9048377867980043
R^2: (on test) 0.8656253548947075


In [98]:
# Model: sales ~ TV + radio + newspaper, fitted on the training rows of the current split.
feature_cols = ['TV', 'radio', 'newspaper']
X_train = df_train[feature_cols].to_numpy()
X_test = df_test[feature_cols].to_numpy()

lm = LinearRegression()
lm.fit(X_train, df_train['sales'])
print('R^2: (on train)', lm.score(X_train, df_train['sales']))
print('R^2: (on test)', lm.score(X_test, df_test['sales']))

R^2: (on train) 0.9055159502227753
R^2: (on test) 0.8609466508230368


In [99]:
# Re-split with half of the rows held out for testing (same random_state as before).
# This rebinds df_train / df_test, so the model cells below use the new split.
df_train, df_test, y_train, y_test = train_test_split(
    adv_df,
    adv_df['sales'],
    test_size=0.50,
    random_state=42,
)

In [100]:
# Model: sales ~ log_tv + radio, fitted on the training rows of the current split.
feature_cols = ['log_tv', 'radio']
X_train = df_train[feature_cols].to_numpy()
X_test = df_test[feature_cols].to_numpy()

lm = LinearRegression()
lm.fit(X_train, df_train['sales'])
print('R^2: (on train)', lm.score(X_train, df_train['sales']))
print('R^2: (on test)', lm.score(X_test, df_test['sales']))

R^2: (on train) 0.8997594187070235
R^2: (on test) 0.9017771480682202


In [101]:
# Model: sales ~ TV + radio (raw TV budget), fitted on the training rows of the current split.
feature_cols = ['TV', 'radio']
X_train = df_train[feature_cols].to_numpy()
X_test = df_test[feature_cols].to_numpy()

lm = LinearRegression()
lm.fit(X_train, df_train['sales'])
print('R^2: (on train)', lm.score(X_train, df_train['sales']))
print('R^2: (on test)', lm.score(X_test, df_test['sales']))

R^2: (on train) 0.9020506014720118
R^2: (on test) 0.8826436017134701


In [102]:
# Model: sales ~ TV + radio + newspaper, fitted on the training rows of the current split.
feature_cols = ['TV', 'radio', 'newspaper']
X_train = df_train[feature_cols].to_numpy()
X_test = df_test[feature_cols].to_numpy()

lm = LinearRegression()
lm.fit(X_train, df_train['sales'])
print('R^2: (on train)', lm.score(X_train, df_train['sales']))
print('R^2: (on test)', lm.score(X_test, df_test['sales']))

R^2: (on train) 0.9042613648908894
R^2: (on test) 0.8721004816045136
