In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 


## Loading the Boston house pricing data

In [None]:
from sklearn.datasets import load_boston 

In [None]:
# Load the Boston housing data as a scikit-learn dataset object.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the installed version still provides it before running this notebook.
boston = load_boston()

In [None]:
# List the fields available on the loaded dataset object.
boston.keys()

## Let's check the description of the dataset

In [None]:
# Print the description text that ships with the dataset.
print(boston.DESCR)

In [None]:
# Print the raw feature array (one row per observation, one column per feature).
print(boston.data)

In [None]:
# Names of the feature columns; used as the DataFrame column labels in the next section.
boston.feature_names

## Preparing Dataset

In [None]:
# Wrap the raw feature array in a DataFrame, labelling each column with its feature name.
Dataset = pd.DataFrame(data=boston.data, columns=boston.feature_names)

In [None]:
# Display the feature DataFrame (the target column has not been added yet).
Dataset

In [None]:
# Add the regression target to the frame as a 'price' column.
Dataset['price'] = boston.target

In [None]:
# Display the frame again, now including the 'price' target column.
Dataset

In [None]:
# Column dtypes, non-null counts and memory usage.
Dataset.info()

## Summarizing the statistics of the data

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
Dataset.describe()

## Check for missing values (this is important)

In [None]:
# Number of missing values in each column.
Dataset.isnull().sum()

## EDA: Exploratory Data Analysis

In [None]:
# Pairwise correlation matrix of all columns (pandas uses Pearson correlation by default).
Dataset.corr()

In [None]:
import seaborn as sns

In [None]:
# Scatter-plot matrix of every pair of columns in Dataset.
# NOTE(review): draws one panel per column pair, so this cell may be slow to render.
sns.pairplot(Dataset)

In [None]:
# Crime rate against price, drawn through the explicit figure/axes interface.
fig, ax = plt.subplots()
ax.scatter(Dataset['CRIM'], Dataset['price'])
ax.set_xlabel('Crime rate')
ax.set_ylabel('price')

In [None]:
# RM against price, drawn through the explicit figure/axes interface.
fig, ax = plt.subplots()
ax.scatter(Dataset['RM'], Dataset['price'])
ax.set_xlabel('RM')
ax.set_ylabel('price')

In [None]:
# NOTE(review): seaborn is already imported as `sns` in an earlier cell, so this
# import is redundant; prefer keeping all imports in the first cell.
import seaborn as sns
# RM vs. price with a fitted regression line: the features are positively correlated.
sns.regplot(x = 'RM',y = 'price',data = Dataset)

In [None]:
# LSTAT vs. price with a fitted regression line: the features are negatively correlated.
sns.regplot(x = 'LSTAT',y = 'price',data = Dataset)

In [None]:
# CHAS vs. price with a fitted regression line. CHAS only takes the values 0 and 1,
# so the points fall in two vertical bands.
sns.regplot(x = 'CHAS',y = 'price',data = Dataset)

In [None]:
# PTRATIO vs. price with a fitted regression line: slightly negatively correlated —
# as PTRATIO increases, price tends to decrease.
sns.regplot(x = 'PTRATIO',y = 'price',data = Dataset)

# Dependent and independent features


### Here `price` is the only dependent feature; all the remaining columns are independent features

In [None]:
# Split the frame into the feature matrix X and the target vector y.
# Selecting by column name rather than by position keeps the split correct even if
# more columns are later added to Dataset or the column order changes.
X = Dataset.drop(columns='price')  # every column except the target
y = Dataset['price']               # the target: house price
X.head()

In [None]:
# Print the target series (house prices).
print(y)

## Train/test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Display the training features (still unscaled at this point).
X_train

In [None]:
# Display the test features (still unscaled at this point).
X_test

## Standardize the dataset

In [None]:
from sklearn.preprocessing import StandardScaler
# Create the scaler; it is fitted on the training features in the next cell.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training features only and replace X_train with the scaled array.
# NOTE(review): X_train is overwritten, so re-running this cell refits the scaler on
# already-scaled data — restart the kernel and run the cells in order instead.
X_train=scaler.fit_transform(X_train)

In [None]:
# Apply the scaling learned from the training data to the test features (no refit),
# so no information from the test set leaks into the scaler.
X_test = scaler.transform(X_test)

In [None]:
# Display the scaled training features (now the array returned by the scaler).
X_train

In [None]:
# Display the scaled test features (now the array returned by the scaler).
X_test

# Model Training

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Create an ordinary linear regression estimator with default settings.
regression = LinearRegression()

In [None]:
# Fit the linear model on the scaled training features and the training prices.
regression.fit(X_train,y_train)

In [None]:
# Print the fitted coefficients: one value per input feature.
print(regression.coef_)

In [None]:
# Print the fitted intercept term.
print(regression.intercept_)

In [None]:
# Show the estimator's configuration parameters (the settings it was created with).
regression.get_params()

In [None]:
# Predict prices for the scaled test features.
reg_pred = regression.predict(X_test)

In [None]:
# Display the predicted prices for the test set.
reg_pred

In [None]:
# Actual vs. predicted prices on the test set: y_test holds the true values for the
# rows in X_test, so points close to the diagonal indicate good predictions.
# Axis labels are added to match the other scatter plots in this notebook.
plt.scatter(y_test, reg_pred)
plt.xlabel('Actual price')
plt.ylabel('Predicted price')

## In the scatter plot, predicted prices increase with actual prices, which is the positive relationship we expect

In [None]:
# Residuals: the errors, i.e. the difference between the actual and the predicted prices.
residuals = y_test-reg_pred

In [None]:
# Display the residuals for the test set.
residuals  

In [None]:
# Plot the distribution of the residuals as a kernel density estimate.
sns.displot(residuals,kind = 'kde')

# Looking at the graph, most of our errors lie between -10 and +10; the few residuals
# between +10 and +30 can be considered outliers.


In [None]:
# Residuals vs. predictions: a roughly uniform scatter around zero (no visible
# pattern) suggests the linear fit is adequate.
# Axis labels are added to match the other scatter plots in this notebook.
plt.scatter(reg_pred, residuals)
plt.xlabel('Predicted price')
plt.ylabel('Residual')

In [None]:
from sklearn.metrics  import mean_squared_error
from sklearn.metrics  import mean_absolute_error

In [None]:
# Error metrics on the held-out test set. MSE is computed once and reused for RMSE,
# and every value is printed with a label so the output can be read on its own.
mae = mean_absolute_error(y_test, reg_pred)
mse = mean_squared_error(y_test, reg_pred)
rmse = np.sqrt(mse)
print('MAE :', mae)
print('MSE :', mse)
print('RMSE:', rmse)

# R-squared and adjusted R-squared

#### Formula
### $R^2 = 1 - \dfrac{SSR}{SST}$
#### where $R^2$ is the coefficient of determination, $SSR$ is the sum of squares of residuals, and $SST$ is the total sum of squares

In [None]:
from sklearn.metrics import r2_score
# R^2 of the predictions against the actual test prices.
score = r2_score(y_test,reg_pred)
score

### Adjusted $R^2 = 1 - \dfrac{(1 - R^2)(n - 1)}{n - k - 1}$
#### where:

#### $R^2$: the $R^2$ of the model; $n$: the number of observations; $k$: the number of predictor variables

In [None]:
# Adjusted R^2: penalises R^2 for the number of predictors used.
n_obs = len(y_test)           # n: number of observations in the test set
n_features = X_test.shape[1]  # k: number of predictor variables
1 - (1 - score) * (n_obs - 1) / (n_obs - n_features - 1)

# New Data Prediction

In [None]:
# Build a single "new" observation from the first row of the raw feature array;
# reshape(1, -1) turns it into a one-row 2-D array.
# NOTE(review): the scaler was fitted on a DataFrame while this is a bare array; some
# scikit-learn versions warn about missing feature names in that case — confirm.
new_data = boston.data[0].reshape(1,-1)
print(new_data)
new_data.shape

In [None]:
# Standardize the new observation with the scaler fitted on the training data.
scaler.transform(new_data)

In [None]:
# Predict the price for the new observation after standardizing it.
regression.predict(scaler.transform(new_data))

# Pickling the model file for deployment

In [None]:
import pickle

In [None]:
# Serialize the fitted model to disk for deployment. The context manager closes the
# file handle (flushing the data) even if pickling raises.
# NOTE(review): predictions also need the fitted `scaler`, which is not saved here.
with open('regmodel.pkl', 'wb') as model_file:
    pickle.dump(regression, model_file)

In [None]:
# Reload the model from disk to check the round trip; the context manager closes the file.
# Only unpickle files from a trusted source: pickle.load can execute arbitrary code.
with open('regmodel.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)

In [None]:
# Predict with the reloaded model on the standardized new observation; the result
# should match the prediction made with the in-memory model.
pickled_model.predict(scaler.transform(new_data))