# Multiple Linear Regression

In [1]:
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the dataset from 'veriler.csv' (path is relative to the working directory)
# and preview the first rows as the cell output.
data = pd.read_csv('veriler.csv')
data.head()

Unnamed: 0,ulke,boy,kilo,yas,cinsiyet
0,tr,130,30,10,e
1,tr,125,36,11,e
2,tr,135,34,10,k
3,tr,133,30,9,k
4,tr,129,38,12,e


In [3]:
# Count missing values per column (isna is the canonical name of isnull).
data.isna().sum()

ulke        0
boy         0
kilo        0
yas         0
cinsiyet    0
dtype: int64

## Data Preprocessing, Encoding Categorical Data 

In [4]:
from sklearn import preprocessing

# Country ("ulke"): map the labels to integer codes, shape them into a single
# column, then expand that column into one indicator column per code.
le = preprocessing.LabelEncoder()
ulke = le.fit_transform(data["ulke"]).reshape(-1, 1)
ohe = preprocessing.OneHotEncoder()
ulke = ohe.fit_transform(ulke).toarray()

# Gender ("cinsiyet"): a plain label encoding is used instead of one-hot
# columns (per the original note: only two gender values, so a single 0/1
# column avoids the dummy variable trap).
# The same LabelEncoder instance is refit here, so from this point on `le`
# describes the gender labels, not the country labels.
cinsiyet = le.fit_transform(data["cinsiyet"])

#### Avoiding the Dummy Variable Trap

In [5]:
# Drop the first one-hot column so the remaining indicator columns are not
# perfectly collinear (dummy variable trap).
# NOTE(review): this rebinds `ulke` in place, so the cell is not idempotent —
# running it a second time without re-running the encoding cell removes one
# more column.
ulke = ulke[:,1:]

## Dataframe Creation and Merging

In [6]:
# Name the one-hot country columns after the categories they actually encode
# instead of hard-coding them. LabelEncoder assigns integer codes in sorted
# label order and the one-hot columns follow those codes; because columns were
# dropped from the front, the surviving columns are the last categories in
# sorted order.
country_names = sorted(data["ulke"].unique())
remaining_countries = country_names[len(country_names) - ulke.shape[1]:]

ulke = pd.DataFrame(data = ulke, index = range(len(ulke)), columns = remaining_countries)
# Positional columns 2 and 3 of `data` are "kilo" and "yas".
kiloyas = pd.DataFrame(data = data.iloc[:, 2:4].values, index = range(len(data)), columns = ["kilo", "yas"])
cinsiyet = pd.DataFrame(data = cinsiyet, index = range(len(cinsiyet)), columns = ["cinsiyet"])

# Feature matrix: country indicators, weight, age, encoded gender (in that order).
x = pd.concat([ulke, kiloyas, cinsiyet], axis = 1)
# Target: positional column 1 of `data` ("boy").
y = data.iloc[:,1].values

## Splitting Dataset into Training and Test Set

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows as the test set; random_state is fixed so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.33, random_state = 0)

## Fitting Linear Regression Model to the Training Set

In [8]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression model on the training split.
regressor = LinearRegression()
# Kept as the last expression so the fitted estimator is shown as the cell output.
regressor.fit(x_train, y_train)

LinearRegression()

## Predicting Test Set Results

In [9]:
# Predict the target for the held-out test rows.
y_pred = regressor.predict(x_test)

from sklearn.metrics import r2_score
# NOTE: despite its name, `acc` holds the R^2 score (coefficient of
# determination) of the predictions, not a classification accuracy.
acc = r2_score(y_test, y_pred)
print(acc)

0.5269561623575657


## Backward Elimination

In [10]:
import statsmodels.api as sm

# Prepend a column of ones to the features (axis = 1 --> append as columns).
# The row count comes from x instead of a hard-coded 22, so the cell keeps
# working if the dataset size changes.
X = np.append(arr = np.ones((len(x), 1)).astype(int), values = x, axis = 1)

# NOTE(review): X (with the ones column) is not used by the fit below — X_l is
# built from x, so the model has no constant term, which is why the summary
# reports "R-squared (uncentered)". Confirm whether an intercept was intended;
# switching to X would shift every column index used in the following cells.

# OLS regression on all five feature columns of x.
X_l = x.iloc[:,[0,1,2,3,4]].values
X_l = np.array(X_l, dtype = float)
model = sm.OLS(y, X_l).fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.986
Model:,OLS,Adj. R-squared (uncentered):,0.982
Method:,Least Squares,F-statistic:,237.7
Date:,"Mon, 13 Dec 2021",Prob (F-statistic):,4.17e-15
Time:,21:52:22,Log-Likelihood:,-96.626
No. Observations:,22,AIC:,203.3
Df Residuals:,17,BIC:,208.7
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,30.1564,10.218,2.951,0.009,8.599,51.713
x2,-12.3143,13.652,-0.902,0.380,-41.116,16.488
x3,1.7522,0.241,7.258,0.000,1.243,2.262
x4,0.7649,0.573,1.335,0.199,-0.444,1.974
x5,40.6943,11.301,3.601,0.002,16.851,64.538

0,1,2,3
Omnibus:,0.405,Durbin-Watson:,1.468
Prob(Omnibus):,0.817,Jarque-Bera (JB):,0.433
Skew:,-0.276,Prob(JB):,0.805
Kurtosis:,2.589,Cond. No.,231.0


#### The variable with the highest p-value is eliminated.

In [11]:
# Refit without column 1 of x, the term with the highest p-value
# (x2, P>|t| = 0.380) in the previous summary.
X_l = x.iloc[:, [0, 2, 3, 4]].to_numpy(dtype = float)
model = sm.OLS(y, X_l).fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.985
Model:,OLS,Adj. R-squared (uncentered):,0.982
Method:,Least Squares,F-statistic:,300.0
Date:,"Mon, 13 Dec 2021",Prob (F-statistic):,3.32e-16
Time:,21:52:22,Log-Likelihood:,-97.14
No. Observations:,22,AIC:,202.3
Df Residuals:,18,BIC:,206.6
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,32.9403,9.690,3.400,0.003,12.583,53.298
x2,1.7616,0.240,7.342,0.000,1.257,2.266
x3,0.6319,0.551,1.147,0.266,-0.525,1.789
x4,38.0829,10.867,3.504,0.003,15.251,60.914

0,1,2,3
Omnibus:,0.366,Durbin-Watson:,1.435
Prob(Omnibus):,0.833,Jarque-Bera (JB):,0.515
Skew:,-0.202,Prob(JB):,0.773
Kurtosis:,2.369,Cond. No.,181.0


#### The variable with the highest p-value is eliminated.

In [12]:
# Refit on columns 0, 3 and 4 of x (column 2 is removed in this step).
# NOTE(review): in the previous summary the highest p-value belongs to x3
# (P>|t| = 0.266), which is column 3 of x, while column 2 (x2 there) has
# P>|t| = 0.000. Dropping column 2 therefore does not follow the
# "eliminate the highest p-value" rule — [0, 2, 4] looks like the intended
# selection. Confirm, then re-run this cell and the next one.
X_l = x.iloc[:, [0, 3, 4]].to_numpy(dtype = float)
model = sm.OLS(y, X_l).fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.941
Model:,OLS,Adj. R-squared (uncentered):,0.932
Method:,Least Squares,F-statistic:,101.0
Date:,"Mon, 13 Dec 2021",Prob (F-statistic):,7.43e-12
Time:,21:52:22,Log-Likelihood:,-112.37
No. Observations:,22,AIC:,230.7
Df Residuals:,19,BIC:,234.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,73.5400,15.478,4.751,0.000,41.143,105.937
x2,4.3321,0.432,10.029,0.000,3.428,5.236
x3,0.5588,18.656,0.030,0.976,-38.490,39.607

0,1,2,3
Omnibus:,0.124,Durbin-Watson:,1.17
Prob(Omnibus):,0.94,Jarque-Bera (JB):,0.117
Skew:,0.117,Prob(JB):,0.943
Kurtosis:,2.732,Cond. No.,64.0


In [13]:
# Refit without column 4 of x, the term with the highest p-value
# (x3, P>|t| = 0.976) in the previous summary.
X_l = x.iloc[:, [0, 3]].to_numpy(dtype = float)
model = sm.OLS(y, X_l).fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.941
Model:,OLS,Adj. R-squared (uncentered):,0.935
Method:,Least Squares,F-statistic:,159.4
Date:,"Mon, 13 Dec 2021",Prob (F-statistic):,5.14e-13
Time:,21:52:22,Log-Likelihood:,-112.37
No. Observations:,22,AIC:,228.7
Df Residuals:,20,BIC:,230.9
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,73.5651,15.065,4.883,0.000,42.141,104.990
x2,4.3409,0.307,14.130,0.000,3.700,4.982

0,1,2,3
Omnibus:,0.106,Durbin-Watson:,1.174
Prob(Omnibus):,0.949,Jarque-Bera (JB):,0.115
Skew:,0.108,Prob(JB):,0.944
Kurtosis:,2.72,Cond. No.,52.8
