# linear regression in python

- searches for relationships among variables
- e.g. dependent variable (DV, y): salary -> independent variables (IV, x): experience, education level, role, city of employment
- assumption : salary depends on IVs
- DVs are continuous
- Does gender impact salary ?
- Does electricity depend on outdoor temp, time of day, no of residents ?
- y = f(x) = b0 + b1*x1 + b2*x2 + ...
- R2 = coefficient of determination: the proportion of the variation in y that can be explained by its dependence on x; it ranges from 0 to 1, and a larger R2 means better explainability

In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

In [5]:
# Sample data: six observations of one predictor (x) and one response (y).
# x has to be 2-D (one row per sample, one column per feature), so a second
# axis is added to the flat vector; y stays 1-D.
x = np.array([5, 15, 25, 35, 45, 55])[:, np.newaxis]
y = np.array([5, 20, 14, 32, 22, 38])
x, y, x.shape, y.shape

(array([[ 5],
        [15],
        [25],
        [35],
        [45],
        [55]]),
 array([ 5, 20, 14, 32, 22, 38]),
 (6, 1),
 (6,))

In [11]:
# Fit ordinary least squares to the sample data. fit_intercept=True asks the
# estimator to learn the intercept b0 as well as the slope; the fitted
# estimator returned by fit() is kept as `model`.
model = LinearRegression(fit_intercept=True).fit(x, y)
model

In [12]:
# R^2 (coefficient of determination) evaluated on the same x, y the model
# was fitted on, i.e. an in-sample measure of fit.
r2 = model.score(x,y)
print(r2)

0.715875613747954


In [13]:
# Estimated intercept b0: the predicted y at x = 0.
print('Intercept ', model.intercept_)

Intercept  5.633333333333333


In [14]:
# Estimated slope coefficients, one entry per feature column of x.
print('Coefficients ', model.coef_)

Coefficients  [0.54]


In [16]:
# In-sample predictions for the training inputs, printed above the observed
# responses so the two can be compared element by element.
print('Predict ', model.predict(x))
print('Acutal Values ', y)

Predict  [ 8.33333333 13.73333333 19.13333333 24.53333333 29.93333333 35.33333333]
Acutal Values  [ 5 20 14 32 22 38]


In [17]:
# Unseen inputs 0..4 arranged as a single-feature column for prediction.
x_new = np.arange(5)[:, np.newaxis]
x_new

array([[0],
       [1],
       [2],
       [3],
       [4]])

In [37]:
# Apply the fitted line (intercept + coefficient * x) to the new inputs.
y_new = model.predict(x_new)
y_new

array([5.63333333, 6.17333333, 6.71333333, 7.25333333, 7.79333333])

In [38]:
# Multiple Linear Regression
# Eight observations with two predictors each. x is a plain tuple of
# [x1, x2] rows at this point and y is the matching list of responses.
x = (
    [0, 1],
    [5, 1],
    [15, 2],
    [25, 5],
    [35, 11],
    [45, 15],
    [55, 34],
    [60, 35],
)
y = [4, 5, 20, 14, 32, 22, 38, 43]
x, y

(([0, 1], [5, 1], [15, 2], [25, 5], [35, 11], [45, 15], [55, 34], [60, 35]),
 [4, 5, 20, 14, 32, 22, 38, 43])

In [39]:
# Turn the nested Python sequences into NumPy arrays: x becomes a
# (n_samples, n_features) matrix and y a 1-D vector.
x = np.array(x)
y = np.array(y)
x, y, x.shape, y.shape

(array([[ 0,  1],
        [ 5,  1],
        [15,  2],
        [25,  5],
        [35, 11],
        [45, 15],
        [55, 34],
        [60, 35]]),
 array([ 4,  5, 20, 14, 32, 22, 38, 43]),
 (8, 2),
 (8,))

In [40]:
# Fit the two-feature model; LinearRegression is used with its default settings.
model2 = LinearRegression().fit(x,y)
model2

In [42]:
# In-sample R^2 (this rebinds r2 from the single-feature model), followed by
# the intercept and the coefficients, one per feature column of x.
r2 = model2.score(x,y)
print(r2, model2.intercept_, '\t', model2.coef_)

0.8615939258756776 5.52257927519819 	 [0.44706965 0.25502548]


In [43]:
# In-sample predictions for the eight training rows.
model2.predict(x)

array([ 5.77760476,  8.012953  , 12.73867497, 17.9744479 , 23.97529728,
       29.4660957 , 38.78227633, 41.27265006])

In [44]:
# Five new observations built from 0..9, two feature values per row.
x_new = np.reshape(np.arange(10), (-1, 2))
x_new

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7],
       [8, 9]])

In [45]:
# Predictions of the two-feature model for the new rows.
y_new = model2.predict(x_new)
y_new

array([ 5.77760476,  7.18179502,  8.58598528,  9.99017554, 11.3943658 ])

# Linear Regression with statsmodels
- use when you need more detailed output (standard errors, p-values, confidence intervals)

In [19]:
import statsmodels.api as sm

In [46]:
# Prepend a column of ones to x. sm.OLS does not include an intercept unless
# the design matrix carries a constant column, so this is what lets it
# estimate b0.
# NOTE: this rebinds x, which now has three columns instead of two.
x = sm.add_constant(x)
x

array([[ 1.,  0.,  1.],
       [ 1.,  5.,  1.],
       [ 1., 15.,  2.],
       [ 1., 25.,  5.],
       [ 1., 35., 11.],
       [ 1., 45., 15.],
       [ 1., 55., 34.],
       [ 1., 60., 35.]])

In [47]:
# Build the OLS model. statsmodels takes the response first (y, then x),
# the reverse of scikit-learn's fit(x, y). Nothing is estimated yet; that
# happens in the separate fit() call.
model3 = sm.OLS(y, x)
model3

<statsmodels.regression.linear_model.OLS at 0x28d483450>

In [48]:
# Run the estimation; the returned results object holds the coefficients and
# the fit statistics queried below.
results = model3.fit()
results

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x28d4c2890>

In [50]:
# Full report: overall fit statistics, a coefficient table (std err, t,
# p-value, 95% confidence interval) and residual diagnostics.
results.summary()



0,1,2,3
Dep. Variable:,y,R-squared:,0.862
Model:,OLS,Adj. R-squared:,0.806
Method:,Least Squares,F-statistic:,15.56
Date:,"Mon, 04 Mar 2024",Prob (F-statistic):,0.00713
Time:,14:36:42,Log-Likelihood:,-24.316
No. Observations:,8,AIC:,54.63
Df Residuals:,5,BIC:,54.87
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.5226,4.431,1.246,0.268,-5.867,16.912
x1,0.4471,0.285,1.567,0.178,-0.286,1.180
x2,0.2550,0.453,0.563,0.598,-0.910,1.420

0,1,2,3
Omnibus:,0.561,Durbin-Watson:,3.268
Prob(Omnibus):,0.755,Jarque-Bera (JB):,0.534
Skew:,0.38,Prob(JB):,0.766
Kurtosis:,1.987,Cond. No.,80.1


In [53]:
# R^2 of the statsmodels fit; the same figure as scikit-learn's score() for the two-feature model.
print('R2 ', results.rsquared)

R2  0.8615939258756776


In [54]:
# Adjusted R^2: R^2 corrected for the number of predictors relative to the sample size.
print('R2 Adjusted ', results.rsquared_adj)

R2 Adjusted  0.8062314962259487


In [52]:
# Estimated parameters in the column order of x: [const, x1, x2].
print('Coeff ', results.params)

Coeff  [5.52257928 0.44706965 0.25502548]


In [56]:
# Model predictions for the rows the model was fitted on.
print('Fitted values ', results.fittedvalues)

Fitted values  [ 5.77760476  8.012953   12.73867497 17.9744479  23.97529728 29.4660957
 38.78227633 41.27265006]


In [57]:
# predict() on the training design matrix (constant column included)
# reproduces the fitted values.
print('Predicted Values ', results.predict(x))

Predicted Values  [ 5.77760476  8.012953   12.73867497 17.9744479  23.97529728 29.4660957
 38.78227633 41.27265006]


In [58]:
# New inputs must have the same layout as the training matrix, so the
# constant column is added to the five two-feature rows as well.
x_new = sm.add_constant(np.arange(10).reshape((-1, 2)))
x_new

array([[1., 0., 1.],
       [1., 2., 3.],
       [1., 4., 5.],
       [1., 6., 7.],
       [1., 8., 9.]])

In [60]:
# Out-of-sample predictions from the statsmodels fit for the new rows.
y_new = results.predict(x_new)
y_new

array([ 5.77760476,  7.18179502,  8.58598528,  9.99017554, 11.3943658 ])