In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.linear_model import LinearRegression

In [35]:
# Sample data: one (year, price) record per row, listed in ascending year order.
df = pd.DataFrame(
    [
        (1985, 11800), (1990, 6600), (1991, 21800), (2001, 4400), (2004, 13000),
        (2007, 14600), (2007, 14700), (2008, 15600), (2008, 13200), (2010, 17400),
        (2013, 19200), (2014, 8200), (2015, 21500), (2017, 27300), (2018, 24700),
    ],
    columns=['year', 'price'],
)
# Sanity check: 15 rows, 2 columns.
df.shape

(15, 2)

In [36]:
# Column means inside three consecutive five-row bins (rows 0-4, 5-9, 10-14),
# rounded to whole numbers. The rows are already ordered by year, so each
# positional slice is one bin.
print('Bin 1:\n', df.iloc[:5].mean().round())
print('Bin 2:\n', df.iloc[5:10].mean().round())
print('Bin 3:\n', df.iloc[10:].mean().round())

Bin 1:
 year      1994.0
price    11520.0
dtype: float64
Bin 2:
 year      2008.0
price    15100.0
dtype: float64
Bin 3:
 year      2015.0
price    20180.0
dtype: float64


In [37]:
# Sample standard deviation of each column inside the same three five-row bins
# (rows 0-4, 5-9, 10-14), rounded to one decimal place.
print('Bin 1:\n', df.iloc[:5].std().round(1))
print('Bin 2:\n', df.iloc[5:10].std().round(1))
print('Bin 3:\n', df.iloc[10:].std().round(1))

Bin 1:
 year        8.0
price    6761.1
dtype: float64
Bin 2:
 year        1.2
price    1546.0
dtype: float64
Bin 3:
 year        2.1
price    7371.4
dtype: float64


In [38]:
# Per-bin sample standard deviation (rounded to 1 decimal), broadcast onto every
# row of its bin. Bins are consecutive runs of 5 rows, matching the slices
# df[:5], df[5:10], df[10:] used for the printed statistics.
#
# The values are computed from `df` instead of being copied by hand from the
# printed output, so this table cannot drift out of sync with the data.
df_bin_std = (
    df[['year', 'price']]
    .groupby(np.arange(len(df)) // 5)   # row position -> bin id 0, 1, 2
    .transform('std')                   # sample std (ddof=1), aligned to the original rows
    .round(1)
)

In [39]:
# Two standard deviations per bin (every value in the table doubled).
df_bin_std.mul(2)

Unnamed: 0,year,price
0,16.0,13522.2
1,16.0,13522.2
2,16.0,13522.2
3,16.0,13522.2
4,16.0,13522.2
5,2.4,3092.0
6,2.4,3092.0
7,2.4,3092.0
8,2.4,3092.0
9,2.4,3092.0


In [40]:
# Smoothing by bin means: every row is replaced by the (rounded) mean of its
# five-row bin. Bins are consecutive runs of 5 rows, matching the slices
# df[:5], df[5:10], df[10:] used for the printed statistics.
#
# The means are computed from `df` instead of being transcribed from the printed
# output, so the smoothed table always reflects the current data.
df_bin = (
    df[['year', 'price']]
    .groupby(np.arange(len(df)) // 5)   # row position -> bin id 0, 1, 2
    .transform('mean')                  # bin mean aligned to the original rows
    .round()
    .astype({'year': 'int64'})          # keep `year` integral; `price` stays float
)

In [41]:
# Absolute deviation of every observation from its bin mean (element-wise).
df_err = df.sub(df_bin).abs()
df_err

Unnamed: 0,year,price
0,9,280.0
1,4,4920.0
2,3,10280.0
3,7,7120.0
4,10,1480.0
5,1,500.0
6,1,400.0
7,0,500.0
8,0,1900.0
9,2,2300.0


In [43]:
# Simple linear regression of price on year. Both X and y are passed as
# one-column DataFrames, so coef_ and predictions come back as 2-D arrays.
model = LinearRegression()
model.fit(df[['year']], df[['price']])

# Extrapolate the fitted line to the year 2022.
model.predict(pd.DataFrame({'year': [2022]}))

array([[20651.0308321]])

In [44]:
# Fitted slope: predicted change in price for a one-year increase in `year`.
model.coef_

array([[313.08042348]])

In [45]:
# Fitted intercept: the line's value at year 0 (far outside the observed years).
model.intercept_

array([-612397.58543834])

In [47]:
# Manual check of the 2022 prediction using the slope and intercept rounded to
# one decimal place.
# NOTE: the rounding is not harmless here. The slope's rounding error is
# multiplied by 2022, so this evaluates to 20690.6 while model.predict gave
# about 20651.03. Use model.coef_ / model.intercept_ directly for an exact match.
2022 * 313.1 -612397.6

20690.600000000093

In [64]:
# Rebuild the sample from scratch, now with an `engine` column in which three
# entries are missing (NaN).
df = pd.DataFrame({
    'year': [
        1985, 1990, 1991, 2001, 2004,
        2007, 2007, 2008, 2008, 2010,
        2013, 2014, 2015, 2017, 2018,
    ],
    'price': [
        11800, 6600, 21800, 4400, 13000,
        14600, 14700, 15600, 13200, 17400,
        19200, 8200, 21500, 27300, 24700,
    ],
    'engine': [
        3., np.nan, 2.5, np.nan, 2.,
        3., 2.5, 2., 3., 2.,
        2., np.nan, 2., 3., 2.,
    ],
})
df

Unnamed: 0,year,price,engine
0,1985,11800,3.0
1,1990,6600,
2,1991,21800,2.5
3,2001,4400,
4,2004,13000,2.0
5,2007,14600,3.0
6,2007,14700,2.5
7,2008,15600,2.0
8,2008,13200,3.0
9,2010,17400,2.0


In [68]:
# Row positions at which `engine` is missing.
idx_na = np.flatnonzero(df['engine'].isna().to_numpy())
idx_na

array([ 1,  3, 11], dtype=int64)

In [69]:
# Most frequent engine value; a candidate for filling the missing entries.
# mode() returns a Series because several values can tie for most frequent.
df['engine'].mode()

0    2.0
Name: engine, dtype: float64

In [70]:
# Median of the non-missing engine values (NaN is skipped by default).
df['engine'].median()

2.25

In [73]:
# Same mode check as in cell In [69], repeated right before the imputation step.
df['engine'].mode()

0    2.0
Name: engine, dtype: float64

In [75]:
# Impute missing engine values with the column's most frequent value.
# The mode is computed here rather than hard-coded as 2.0, so the imputation
# stays correct if the data changes. mode() returns its values in sorted order,
# so .iloc[0] deterministically picks the smallest one when several tie.
df['engine'] = df['engine'].fillna(df['engine'].mode().iloc[0])
df

Unnamed: 0,year,price,engine
0,1985,11800,3.0
1,1990,6600,2.0
2,1991,21800,2.5
3,2001,4400,2.0
4,2004,13000,2.0
5,2007,14600,3.0
6,2007,14700,2.5
7,2008,15600,2.0
8,2008,13200,3.0
9,2010,17400,2.0


In [80]:
# Mean price of the rows whose engine value was originally missing.
# idx_na holds row *positions* (from np.where), so select with .iloc; plain
# [] on a Series looks integers up as index *labels*, which only coincides
# with positions while the frame keeps its default RangeIndex.
df['price'].iloc[idx_na].mean()

6400.0

In [85]:
# Complement of idx_na: positions of the rows whose engine value was present.
idx = np.arange(len(df))
idx_not_na = idx[~np.isin(idx, idx_na)]
idx_not_na

array([ 0,  2,  4,  5,  6,  7,  8,  9, 10, 12, 13, 14])

In [86]:
# Mean price of the rows whose engine value was originally present.
# idx_not_na holds row *positions*, so select with .iloc; plain [] on a Series
# looks integers up as index *labels*, which only coincides with positions
# while the frame keeps its default RangeIndex.
df['price'].iloc[idx_not_na].mean()

17900.0

In [87]:
# Overall mean price, for comparison with the two subgroup means
# (engine missing vs. engine present).
df['price'].mean()

15600.0

In [90]:
# Overall sample standard deviation of price (pandas default ddof=1), for
# comparison with the two subgroup standard deviations.
df['price'].std()

6541.734151911534

In [88]:
# Sample standard deviation of price among rows whose engine value was
# originally missing. idx_na holds row *positions*, so select with .iloc;
# plain [] on a Series looks integers up as index *labels*, which only
# coincides with positions while the frame keeps its default RangeIndex.
df['price'].iloc[idx_na].std()

1907.8784028338912

In [89]:
# Sample standard deviation of price among rows whose engine value was
# originally present. idx_not_na holds row *positions*, so select with .iloc;
# plain [] on a Series looks integers up as index *labels*, which only
# coincides with positions while the frame keeps its default RangeIndex.
df['price'].iloc[idx_not_na].std()

4994.906496531333

In [91]:
# Rank (Spearman) correlation between engine and price, computed on the
# imputed engine column.
df[['engine', 'price']].corr(method='spearman')

Unnamed: 0,engine,price
engine,1.0,0.112212
price,0.112212,1.0


In [94]:
# Pairwise linear (Pearson) correlation between all columns; Pearson is the
# pandas default, spelled out here for contrast with the Spearman matrix.
df.corr(method='pearson')

Unnamed: 0,year,price,engine
year,1.0,0.484659,-0.209051
price,0.484659,1.0,0.17354
engine,-0.209051,0.17354,1.0


In [95]:
# Pairwise rank (Spearman) correlation between all columns.
df.corr(method='spearman')

Unnamed: 0,year,price,engine
year,1.0,0.622541,-0.215629
price,0.622541,1.0,0.112212
engine,-0.215629,0.112212,1.0


In [2]:
# Toy sample for the hand-computed least-squares example: eight paired
# observations, first row is the predictor X, second row is the response y.
X, y = np.array([
    [8, 3, 2, 10, 11, 3, 6, 5],
    [4, 12, 1, 12, 9, 4, 9, 6],
])

In [7]:
# The four sums needed by the closed-form least-squares formulas:
# sum(x), sum(y), sum(x^2), sum(x*y).
print(X.sum(), y.sum(), np.dot(X, X), np.dot(X, y))

48 57 368 385


In [11]:
# scikit-learn expects a 2-D feature matrix: view X as a single column (n, 1).
X[:, np.newaxis]

array([[ 8],
       [ 3],
       [ 2],
       [10],
       [11],
       [ 3],
       [ 6],
       [ 5]])

In [12]:
# Fit y = a*X + b with scikit-learn and show the slope array and the intercept.
model = LinearRegression()
model.fit(X.reshape(-1, 1), y)
print(model.coef_, model.intercept_)

[0.5375] 3.900000000000001


In [13]:
# Closed-form simple linear regression, to cross-check the scikit-learn fit:
#   a = (n*sum(xy) - sum(x)*sum(y)) / (n*sum(x^2) - sum(x)^2)
#   b = (sum(y) - a*sum(x)) / n
n = X.size
a = (n * (X @ y) - X.sum() * y.sum()) / (n * (X @ X) - X.sum() ** 2)
b = (y.sum() - a * X.sum()) / n
print(a, b)

0.5375 3.9000000000000004


In [14]:
# Model prediction for a single new observation, X = 9 (one row, one feature).
model.predict(np.array([[9]]))

array([8.7375])

In [15]:
# Evaluate the fitted line by hand at x = 9: intercept 3.90 plus slope 0.5375 times x.
x = 9
3.90 + 0.5375 * x

8.737499999999999