In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,KFold,cross_val_score
import os
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [19]:
# Install a blanket "ignore" filter: no category, message or module is given,
# so every warning raised after this cell runs is silenced.
# NOTE(review): this presumably also hides any warnings the estimators fitted
# later in the notebook may emit (e.g. about convergence) — consider narrowing
# the filter to specific categories; confirm which warnings actually appear.
import warnings
warnings.filterwarnings('ignore')

In [20]:
# Directory that holds the dataset files read with relative paths below.
# It can be overridden without editing the notebook by setting the
# HOUSING_DATA_DIR environment variable; when that is unset the original
# hard-coded location is used, so existing behaviour is unchanged.
DATA_DIR = os.environ.get("HOUSING_DATA_DIR", "D:/Machine Learning/Datasets/Datasets")

# Switch the process working directory so relative file names resolve there.
os.chdir(DATA_DIR)

In [21]:
# Load the housing dataset; the relative file name is resolved against the
# current working directory. Then report the frame's (rows, columns).
housing_csv = 'Housing.csv'
df = pd.read_csv(housing_csv)
print(df.shape)

(546, 12)


In [4]:
# Dummy-encode the frame; drop_first=True omits the first level of every
# encoded column, leaving k-1 indicator columns for a k-level feature.
dum_df = pd.get_dummies(data=df, drop_first=True)

#### Comparing Linear, Ridge, Lasso and Elastic Net regression (default hyperparameters)

In [5]:
### Baseline: ordinary least squares (LinearRegression)

# Features are every column except the target; the target is 'price'.
x = dum_df.drop(columns=['price'])
y = dum_df['price']

# 5 shuffled folds with a fixed seed, so the same splits are produced on re-run.
kfold = KFold(n_splits=5, shuffle=True, random_state=24)

lr = LinearRegression()

# No `scoring` is passed, so each fold is scored with the estimator's own
# .score() (R^2 for scikit-learn regressors); one value per fold is returned.
results = cross_val_score(estimator=lr, X=x, y=y, cv=kfold)

# Per-fold scores, a blank line, then their average.
print("r2 squares for 5 fold cross validation is :", results, end="\n\n")
print("Mean of r2 squares for the k fold cros validation ", results.mean())

r2 squares for 5 fold cross validation is : [0.61570595 0.69337635 0.64279797 0.68809701 0.64542714]

Mean of r2 squares for the k fold cros validation  0.6570808836039296


In [6]:
### Ridge regression with its default penalty (alpha=1.0)
from sklearn.linear_model import Ridge

# Same feature/target split as for the other models: predict 'price'
# from all remaining columns.
y = dum_df['price']
x = dum_df.drop(columns='price')

# Fixed-seed, shuffled 5-fold splitter.
kfold = KFold(n_splits=5, shuffle=True, random_state=24)

rd = Ridge()

# One score per fold (the estimator's default .score(), i.e. R^2).
results = cross_val_score(estimator=rd, X=x, y=y, cv=kfold)

print("r2 squares for 5 fold cross validation is :", results)
print()
print("Mean of r2 squares for the k fold cros validation ", results.mean())

r2 squares for 5 fold cross validation is : [0.61440956 0.69518871 0.64340081 0.68797502 0.64601555]

Mean of r2 squares for the k fold cros validation  0.6573979310546161


In [7]:
### Lasso regression with its default penalty (alpha=1.0)
from sklearn.linear_model import Lasso

# Target column vs. everything else.
x = dum_df.drop(columns=['price'])
y = dum_df['price']

# Reproducible 5-fold split: shuffling is seeded with random_state=24.
kfold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=24,
)

ls = Lasso()

# Fit/score the model once per fold; the array holds the five fold scores.
results = cross_val_score(estimator=ls, X=x, y=y, cv=kfold)

print("r2 squares for 5 fold cross validation is :", results, end="\n\n")
print("Mean of r2 squares for the k fold cros validation ", results.mean())

r2 squares for 5 fold cross validation is : [0.61564998 0.69343296 0.64277184 0.6881019  0.64545422]

Mean of r2 squares for the k fold cros validation  0.6570821806267306


In [8]:
### Elastic Net regression with default settings (alpha=1.0, l1_ratio=0.5)
from sklearn.linear_model import ElasticNet

# Predict 'price' from all other columns of the encoded frame.
y = dum_df['price']
x = dum_df.drop(columns=['price'])

# Seeded, shuffled 5-fold cross-validation splitter.
kfold = KFold(n_splits=5, shuffle=True, random_state=24)

el = ElasticNet()

# Array of per-fold scores from the estimator's default scorer.
results = cross_val_score(estimator=el, X=x, y=y, cv=kfold)

print("r2 squares for 5 fold cross validation is :", results)
print()
print("Mean of r2 squares for the k fold cros validation ", results.mean())

r2 squares for 5 fold cross validation is : [0.48423761 0.61724948 0.58026324 0.59426232 0.56143989]

Mean of r2 squares for the k fold cros validation  0.5674905101968157


In [9]:
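# A single train_test_split scores the model on one arbitrary split; k-fold CV
# averages over k different splits, so it usually gives a more reliable
# estimate (at the cost of fitting the model k times).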

### Ridge regression using kfold

In [10]:
### Ridge regression: manual search for the best alpha with k-fold CV
from sklearn.linear_model import Ridge

x = dum_df.drop(columns=['price'])  # predictors
y = dum_df['price']                 # target

# Seeded 5-fold splitter, reused for every candidate alpha so that all
# candidates are scored on the same folds.
kfold = KFold(n_splits=5, shuffle=True, random_state=24)

# Candidate penalties 0, 0.25, ..., 9.75.
alphas = np.arange(0, 10, 0.25)

# Mean cross-validated score for each candidate, in the same order as `alphas`.
scores = [
    cross_val_score(Ridge(alpha=penalty), x, y, cv=kfold).mean()
    for penalty in alphas
]

# Position of the highest mean score (first one on ties).
i_max = np.argmax(scores)
print('Best Alpha : ', alphas[i_max])
print('Best Score : ', scores[i_max])

Best Alpha :  4.0
Best Score :  0.6577381120548891


### Lasso Regression using kfold

In [34]:
### Lasso regression: manual search for the best alpha with k-fold CV
from sklearn.linear_model import Lasso

x = dum_df.drop(['price'], axis=1)  # independent columns
y = dum_df['price']                 # target

# Seeded 5-fold splitter, shared by every candidate so scores are comparable.
kfold = KFold(n_splits=5, random_state=24, shuffle=True)

# Candidate penalties 0.25, 0.5, ..., 9.75.
# The grid deliberately starts above 0: scikit-learn advises against alpha=0
# with Lasso for numerical reasons (it is plain least squares — use
# LinearRegression for that), and the resulting warnings are easy to miss.
# NOTE(review): the recorded optimum (9.5) sits close to the upper end of this
# grid, so a wider range may score slightly higher — worth checking.
alphas = np.arange(0.25, 10, 0.25)

scores = []
for alpha in alphas:
    # A fresh Lasso per candidate; the mean of its five fold scores is kept.
    ls = Lasso(alpha=alpha)
    results = cross_val_score(ls, x, y, cv=kfold)
    scores.append(results.mean())

# Index of the highest mean score (first one on ties).
i_max = np.argmax(scores)
print('Best Alpha : ', alphas[i_max])
print('Best Score : ', scores[i_max])


Best Alpha :  9.5
Best Score :  0.657087435238261


### Elastic Net regression using kfold

In [22]:
### Elastic Net: manual grid search over alpha and l1_ratio with k-fold CV
from sklearn.linear_model import ElasticNet

x = dum_df.drop(columns=['price'])  # predictors
y = dum_df['price']                 # target

# Seeded 5-fold splitter, reused for every grid point.
kfold = KFold(n_splits=5, shuffle=True, random_state=24)

# Grid: alpha in 0, 0.25, ..., 9.75 and l1_ratio in 0.0, 0.1, ..., 0.9.
# NOTE(review): alpha=0 switches the penalty off entirely (scikit-learn advises
# using LinearRegression for that case), and l1_ratio=1.0 (pure L1) is not part
# of the grid — confirm both are intended.
alphas = np.arange(0, 10, 0.25)
l1 = np.arange(0, 1, 0.1)

# One [alpha, l1_ratio, mean score] row per grid point, alpha varying slowest.
scores = [
    [penalty, ratio,
     cross_val_score(ElasticNet(alpha=penalty, l1_ratio=ratio), x, y, cv=kfold).mean()]
    for penalty in alphas
    for ratio in l1
]

pd_scores = pd.DataFrame(scores, columns=['alpha','L1-ratio','r2'])

# Cell output: the row with the highest mean score.
pd_scores.sort_values(by='r2',ascending=False).iloc[0]


alpha       0.000000
L1-ratio    0.000000
r2          0.657081
Name: 0, dtype: float64

Ridge
Best Alpha :  4.0
Best Score :  0.6577381120548891

Lasso
Best Alpha :  9.5
Best Score :  0.657087435238261

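Elastic Net
alpha       0.000000
L1-ratio    0.000000
r2          0.657081
Name: 0, dtype: float64

Note: with alpha = 0 the Elastic Net penalty vanishes, so this "best" Elastic Net is just the unpenalised linear model — its r2 (0.657081) equals the plain LinearRegression mean r2 above (0.6570808836039296).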

### Finding the best hyperparameters and R² score without an explicit loop (GridSearchCV)

In [13]:
from sklearn.model_selection import GridSearchCV

In [15]:
## Ridge regression: tune alpha with GridSearchCV
# Relies on `x` and `y` (features / target) left in the namespace by an
# earlier cell.

# Seeded 5-fold splitter used to score every candidate.
kfold = KFold(n_splits=5, random_state=24, shuffle=True)

# Candidate penalties 0, 0.25, ..., 9.75, keyed by the estimator's parameter name.
alphas = np.arange(0, 10, 0.25)
params = dict(alpha=alphas)

ridge = Ridge()
gcv = GridSearchCV(estimator=ridge, param_grid=params, cv=kfold)
gcv.fit(x, y)

# Winning parameter setting and its mean cross-validated score.
print(gcv.best_params_)
print(gcv.best_score_)

{'alpha': 4.0}
0.6577381120548891


In [23]:
## Lasso regression: tune alpha with GridSearchCV
# Relies on `x` and `y` (features / target) left in the namespace by an
# earlier cell.

# Candidate penalties 0.25, 0.5, ..., 9.75.
# The grid starts above 0 on purpose: scikit-learn advises against alpha=0 with
# Lasso for numerical reasons (that case is ordinary least squares and should
# be fitted with LinearRegression instead).
alphas = np.arange(0.25, 10, 0.25)
params = {'alpha': alphas}

lasso = Lasso()

# Seeded 5-fold splitter used to score every candidate.
kfold = KFold(n_splits=5, shuffle=True, random_state=24)

gcv = GridSearchCV(lasso, param_grid=params, cv=kfold)
gcv.fit(x, y)

# Winning parameter setting and its mean cross-validated score.
print(gcv.best_params_)
print(gcv.best_score_)

{'alpha': 9.5}
0.657087435238261


In [24]:
## Elastic Net regression: tune alpha and l1_ratio with GridSearchCV
# Relies on `x` and `y` (features / target) left in the namespace by an
# earlier cell.

# Seeded 5-fold splitter used to score every grid point.
kfold = KFold(n_splits=5, random_state=24, shuffle=True)

# Full grid = every (alpha, l1_ratio) pair from these two ranges.
# NOTE(review): alpha=0 means no penalty at all, so that corner of the grid is
# effectively plain least squares — confirm it should be a candidate.
alphas = np.arange(0, 10, 0.25)
l1 = np.arange(0, 1, 0.1)
params = dict(alpha=alphas, l1_ratio=l1)

el = ElasticNet()
gcv = GridSearchCV(estimator=el, param_grid=params, cv=kfold)
gcv.fit(x, y)

# Winning parameter combination and its mean cross-validated score.
print(gcv.best_params_)
print(gcv.best_score_)

{'alpha': 0.0, 'l1_ratio': 0.0}
0.6570808836039296


In [35]:
# Dump the per-candidate results of whichever grid search `gcv` currently
# refers to (the most recently fitted one) to CSV, so the best score and
# parameters can be checked by hand.
results_csv = "D:/Machine Learning/Datasets/best_score_results1.csv"
pd_cv = pd.DataFrame(data=gcv.cv_results_)
pd_cv.to_csv(results_csv)