Supervised Learning Models: Regression Models

In [None]:
#------------------step 1: to import libraries-------------------------
import pandas as pd  # panda is used for data manipulation while working with datasets
import numpy as np   # numpy is used for numerical operations like in array and matrixes
import matplotlib.pyplot as plt   # matplotlib.pyplot is used for visualisation
import seaborn as sns    # seaborn is used with matplotlib.pyplot. it gives a better interface for drawing, graphics
from sklearn.model_selection import train_test_split   # used to slpit dataset into trai and test
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV    #  to create the linear regression model. relationship between dependent and independent variable
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error # this is used to evaluate the performance the linear regression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [None]:
#------------------step 2: to read csv --------------------------------
# Load the housing data from the hard-coded path into a DataFrame named `df`;
# every later cell works from this frame.
df = pd.read_csv(filepath_or_buffer="/content/Housing.csv")
# Last expression of the cell: in a notebook this displays the first rows.
df.head()

In [None]:
#-------------------step 3: data exploration---------------------------------
# DataFrame.info() writes its report (row count, column names, dtypes and
# non-null counts -- useful for spotting missing values) straight to stdout
# and returns None. It is therefore called bare: wrapping it in print() would
# add a stray "None" line after the report.
df.info()

# DataFrame.describe() *returns* a table of descriptive statistics for the
# numeric columns (count of non-null values, mean, standard deviation, min,
# 25% / 50% / 75% quartiles and max), so this one does need print().
print(df.describe())

In [None]:
# Column names used as model inputs (independent variables).
features = ['area', 'bedrooms', 'bathrooms', 'stories']
# Column to predict (dependent variable). Kept as a one-element list, so `y`
# below is a single-column DataFrame rather than a Series, and so that
# `features + target` can be concatenated elsewhere.
target = ['price']

# Slice the input matrix and the target out of the loaded frame.
x, y = df[features], df[target]

In [None]:
#------------------step 4: data cleaning and preprocessing -----------------------
# Remove every row whose 'price' is missing: a row without a target value
# cannot be used for training or evaluation. `subset` restricts the NaN check
# to that one column; dropna returns a new frame, which is rebound to `df`.
df = df.dropna(subset=['price'])

# For the remaining numeric columns, replace each missing value with the mean
# of its own column. The result is assigned back to the selected columns
# (no `inplace` argument is involved).
numeric_cols = df.select_dtypes(include=np.number).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# `x` and `y` were sliced from `df` before this cleaning ran, so they would
# still contain the dropped rows and any NaNs. Re-derive them from the cleaned
# frame so the train/test split and the models actually see the cleaned data.
x = df[features]
y = df[target]

In [None]:
# ----------------step: 5: data visualization---------------------------------
# Draw a grid of pairwise scatter plots (with per-column distributions on the
# diagonal) for the model inputs together with the target column.
sns.pairplot(df[[*features, *target]])
# Render the figure.
plt.show()

In [None]:
#----------------------step 6: to split the data into test andt train-------------------------
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
#------------------------step 7: build the regression model----------------------
# Expand the inputs into all polynomial and interaction terms up to degree 5
# (e.g. for features A and B this adds A*B, A**2, ... up to fifth-order terms).
poly = PolynomialFeatures(degree=5)
# Powers up to the fifth of unscaled inputs can differ by many orders of
# magnitude, which makes the L1/L2 penalties below act very unevenly across
# terms. Standardise the expanded terms, learning the scaling from the
# training rows only so nothing about the test rows leaks into the fit.
poly_scaler = StandardScaler()
x_train_poly = poly_scaler.fit_transform(poly.fit_transform(x_train))
x_test_poly = poly_scaler.transform(poly.transform(x_test))

# Plain least squares on the raw features, and on the expanded features.
linear_regression_model = LinearRegression().fit(x_train, y_train)
polynomial_regression_model = LinearRegression().fit(x_train_poly, y_train)

##################### cheking the best alpha value##################
# The search runs on the same polynomial features the final Ridge/Lasso models
# are trained on; an alpha tuned on the raw features would not carry over.
candidate_alphas = [0.01, 0.1, 1, 10, 100]

# `store_cv_values` is intentionally not passed: the per-alpha CV values were
# never read, and newer scikit-learn releases renamed that argument to
# `store_cv_results`, so passing it can fail depending on the installed version.
ridge_cv = RidgeCV(alphas=candidate_alphas)
ridge_cv.fit(x_train_poly, y_train)
print("best alpha value for ridge = ", ridge_cv.alpha_)

# LassoCV expects a 1-D target; y_train is a single-column frame, so flatten
# it here instead of relying on an implicit conversion (which warns).
# max_iter is raised above the default because coordinate descent on many
# correlated polynomial terms may need more iterations to converge.
lasso_cv = LassoCV(alphas=candidate_alphas, cv=5, max_iter=10000)
lasso_cv.fit(x_train_poly, np.ravel(y_train))
print("best apha value for lasso regression = ", lasso_cv.alpha_)
###########################################################

# Final regularised models use the alphas selected above rather than
# hard-coded values, so the search result is actually applied.
ridge_regression_model = Ridge(alpha=ridge_cv.alpha_).fit(x_train_poly, y_train)
lasso_regression_model = Lasso(alpha=lasso_cv.alpha_, max_iter=10000).fit(x_train_poly, y_train)

In [None]:
#---------------------step 8:make predections --------------------------------
# The plain linear model was trained on the raw features, so it predicts
# from x_test directly.
y_predict_linear_regression = linear_regression_model.predict(x_test)

# The other three models were trained on the polynomial features and must be
# given the transformed test matrix. Evaluated in order: polynomial, ridge,
# lasso.
(
    y_predict_polynomial_regression,
    y_predict_ridge_regression,
    y_predict_lasso_regresson,
) = (
    fitted.predict(x_test_poly)
    for fitted in (
        polynomial_regression_model,
        ridge_regression_model,
        lasso_regression_model,
    )
)

In [None]:
#-------------------step 9: evaluate the model's performance --------------------------------
# For each model, compute three test-set metrics from the held-out targets
# and that model's predictions, then print them one per line:
#   mean_absolute_error - average absolute prediction error
#   mean_squared_error  - average squared prediction error
#   r2_score            - coefficient of determination
# Each tuple below is filled in the order (MAE, MSE, R2).

# --- linear regression ---
mae_linear_regression, mse_linear_regression, r2_linear_regression = (
    metric(y_test, y_predict_linear_regression)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
print(
    f"mean_absolute_error_linear_regression ={mae_linear_regression}",
    f"mean_squared_error_linear_regression={mse_linear_regression}",
    f"r2_score_linear_regression={r2_linear_regression}",
    sep="\n",
)

# --- polynomial regression ---
mae_polynomial_regression, mse_polynomial_regression, r2_polynomial_regression = (
    metric(y_test, y_predict_polynomial_regression)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
print(
    f"mean_absolute_error_polynomial_regression ={mae_polynomial_regression}",
    f"mean_squared_error_polynomial_regression={mse_polynomial_regression}",
    f"r2_score_polynomial_regression={r2_polynomial_regression}",
    sep="\n",
)

# --- ridge regression ---
mae_ridge_regression, mse_ridge_regression, r2_ridge_regression = (
    metric(y_test, y_predict_ridge_regression)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
print(
    f"mean_absolute_error_ridge_regression={mae_ridge_regression}",
    f"mean_squared_error_ridge_regression={mse_ridge_regression}",
    f"r2_score_ridge_regression={r2_ridge_regression}",
    sep="\n",
)

# --- lasso regression ---
mae_lasso_regression, mse_lasso_regression, r2_lasso_regression = (
    metric(y_test, y_predict_lasso_regresson)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
print(
    f"mean_absolute_error_lasso_regression={mae_lasso_regression}",
    f"mean_squared_error_lasso_regression={mse_lasso_regression}",
    f"r2_score_lasso_regression={r2_lasso_regression}",
    sep="\n",
)

In [None]:
#--------------------step 10: visualize the predictions -----------------

# One (message, predictions) pair per panel, left to right. The message is
# printed to stdout just before the panel is drawn; the first panel (linear
# regression) has none.
_panels = (
    (None, y_predict_linear_regression),
    ("polynomial regression", y_predict_polynomial_regression),
    ("for ridge regression", y_predict_ridge_regression),
    ("for lasso regression", y_predict_lasso_regresson),
)

for _position, (_message, _predicted) in enumerate(_panels, start=1):
    if _message is not None:
        print(_message)
    # subplot(rows, columns, index): activate panel `_position` of a 1x4 grid.
    plt.subplot(1, 4, _position)
    # Scatter of true test-set prices against this model's predictions.
    plt.scatter(y_test, _predicted)
    plt.xlabel('actual price')
    plt.ylabel('predicted price')
    plt.title('actual vs predicted price')

# Adjust spacing between the panels, then render the figure.
plt.tight_layout()
plt.show()

In [None]:
# Print each fitted model's .score() on the test set (for these regressors
# that is the R^2 of its predictions). The linear model is scored on the raw
# test features; the other three on the polynomial test features they were
# trained with.
for _label, _model, _inputs in (
    ("linear regression=", linear_regression_model, x_test),
    ("polynomial regression=", polynomial_regression_model, x_test_poly),
    ("ridge regression=", ridge_regression_model, x_test_poly),
    ("lasso regression=", lasso_regression_model, x_test_poly),
):
    print(_label, _model.score(_inputs, y_test))