In [3]:
#importing necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline

In [4]:
#importing more necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score

In [5]:
#Loading the original dataframe from a CSV file given by a relative path
#NOTE(review): the saved run of this cell ended in FileNotFoundError, so
#'kc_house_data.csv' must be present in the working directory before running
o_df = pd.read_csv('kc_house_data.csv')
o_df.head()

FileNotFoundError: [Errno 2] File b'kc_house_data.csv' does not exist: b'kc_house_data.csv'

In [None]:
#Viewing the shape and size of the dataframe
o_df.shape

In [None]:
#Viewing columns
o_df.columns

In [None]:
#Viewing data type for each feature
o_df.dtypes

In [None]:
#Viewing the descriptive statistics of the dataset
o_df.describe()

In [None]:
#Checking for null values: number of missing entries in each column
#(isnull() yields booleans; sum() adds the True values, whereas count()
#would only report the number of rows in every column)
o_df.isnull().sum()

In [None]:
#Converting dates into datetime objects for easy manipulation
o_df['date'] = pd.to_datetime(o_df['date'])

In [None]:
o_df.head()

In [None]:
#Creating a dataframe with only the position coordinates for home in the data
loc_data = o_df[['lat', 'long']]

In [None]:
loc_data.head()

In [None]:
#Viewing the original distributions for each feature in the dataframe
o_df.hist(figsize = (20,12))
plt.tight_layout()
plt.show()

In [None]:
#Creating dataframe for features I believe may be correlated with price
df = o_df[['price','bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'grade', 'zipcode']]

In [None]:
df.head()

In [None]:
#Viewing the distributions for each feature in the testing dataset
df.hist(figsize = (20,12))
plt.tight_layout()
plt.show()

In [None]:
df.head()

In [None]:
#Viewing each feature paired against each other to view correlations and see trends
sns.pairplot(df)

In [None]:
#Plotting the univariate distribution of every column in the testing dataframe

#Grid dimensions for the subplots
rows = 2
cols = 4

#One axes object per grid position
fig, ax = plt.subplots(nrows = rows, ncols = cols, figsize = (16,4))

#Walking the grid row by row; the running counter picks the matching column
col = df.columns
index = 0
for axes_row in ax:
    for panel in axes_row:
        sns.distplot(df[col[index]], ax = panel)
        index += 1

plt.tight_layout()

In [None]:
#Creating and viewing the correlation map
#Note: corrmat is bound to the df.corr method itself (no parentheses here),
#so it has to be called as corrmat() wherever the correlation matrix is needed
corrmat = df.corr
corrmat()

In [None]:
type(corrmat())

In [None]:
#Creating subplot
plt.subplots(figsize = (18,10))
#Heatmap showing the correlation between features in the testing dataframe
sns.heatmap(corrmat(), annot = True, annot_kws ={'size':20}, cmap='summer')

In [None]:
corrmat().index.values

In [None]:
#Function to retrieve the features whose correlation exceeds a given threshold
def getCorrelationFeatures(corrdata, threshold):
    """Select the entries of a correlation Series whose magnitude exceeds a threshold.

    Parameters
    ----------
    corrdata : pandas.Series
        Correlation coefficients indexed by feature name, e.g. one column of
        a correlation matrix.
    threshold : float
        Cut-off applied to the absolute correlation. The comparison is
        strict, so a value exactly equal to the threshold is not selected.

    Returns
    -------
    pandas.DataFrame
        One row per selected feature, indexed by feature name in the order
        of ``corrdata``, with a single column named 'Corr Value' holding the
        signed correlation.
    """
    # Boolean mask instead of a Python loop; NaN compares False and is dropped
    strong = corrdata.abs() > threshold
    return corrdata[strong].to_frame(name = 'Corr Value')

In [None]:
#Setting the threshold
threshold = 0.5

#The correlated features for price greater than 50%
corr_value = getCorrelationFeatures(corrmat()['price'], threshold)
corr_value

In [None]:
corr_value.index

In [None]:
#Creating a dataframe from the indices of the corr value
correlated_data = df[corr_value.index]
correlated_data.head()

In [None]:
#Viewing each feature in the correlated data paired against 
#each other to view correlations and see trends
sns.pairplot(correlated_data)

In [None]:
#Creating Subplots
plt.subplots(figsize = (18,10))
#Heatmap showing the correlation between correlated features in the testing dataframe
sns.heatmap(correlated_data.corr(), annot = True, annot_kws = {'size':20}, cmap="YlGnBu")

In [None]:
#X is a list of all of the features (also known as independent variables or inputs) excluding price
X = correlated_data.drop(labels = ['price'], axis = 1)
#y is the outcome, price
y = correlated_data['price']
X.head()

In [None]:
y.head()

In [None]:
X.shape

In [None]:
#Creating a train and test split, where the test size is 20% of the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
X_train.shape, X_test.shape

In [None]:
#Choosing a linear regression model to feed the data to
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
#Setting the test predicition for the linear model
y_predict = model.predict(X_test)

In [None]:
#Creating a dataframe for the predicted values and actual values from the regression
y1_df = pd.DataFrame(data = [y_predict, y_test])

In [None]:
#0 predicted values
#1 actual values
y1_df.T.head()

In [None]:
correlated_data.columns

In [None]:
#Scoring the test predictions: R2 score, mean absolute error and mean squared error
score, mae, mse = (
    metric(y_test, y_predict)
    for metric in (r2_score, mean_absolute_error, mean_squared_error)
)

for label, value in (('r2_score: ', score), ('mae: ', mae), ('mse: ', mse)):
    print(label, value)

In [None]:
#Creating empty lists for the performance metric function
total_features = []
total_features_name = []
selected_correlation_value = []
r2_scores = []
mae_value = []
mse_value = []

In [None]:
#Defining a function that logs a model's performance and returns the log as a dataframe
def performance_Metric(features, threshold, y_true, y_pred):
    """Score one set of predictions and append the result to the running log.

    Every call appends one entry to each of the notebook-level lists
    total_features, total_features_name, selected_correlation_value,
    r2_scores, mae_value and mse_value, so the returned table grows by one
    row per call.

    Parameters
    ----------
    features : pandas.DataFrame or iterable
        The data the model was built from. For a DataFrame the column labels
        are used as feature names; any other iterable is taken to hold the
        names themselves. A label equal to 'price' is skipped because price
        is the outcome variable, not a feature.
    threshold : float
        Correlation threshold to record alongside the scores.
    y_true, y_pred : array-like
        Actual and predicted target values.

    Returns
    -------
    pandas.DataFrame
        One row per recorded call with the columns 'Features',
        '# of Features', 'Corr Value', 'R2 Score', 'MAE' and 'MSE'.
    """
    score = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)

    # Count columns, not rows: len() of a DataFrame is its number of rows,
    # and str() of a DataFrame is the printed table rather than its names.
    names = [str(name) for name in getattr(features, 'columns', features)
             if name != 'price']

    total_features.append(len(names))
    total_features_name.append(', '.join(names))
    selected_correlation_value.append(threshold)
    r2_scores.append(score)
    mae_value.append(mae)
    mse_value.append(mse)

    # Each list becomes one row; transposing turns them into columns
    metric_dataframe = pd.DataFrame(data = [total_features_name, total_features,
                                             selected_correlation_value,
                                             r2_scores, mae_value, mse_value],
                                             index = ['Features', '# of Features', 'Corr Value',
                                             'R2 Score', 'MAE', 'MSE'])
    return metric_dataframe.T

In [None]:
performance_Metric(correlated_data, threshold, y_test, y_predict)

In [None]:
#Viewing the linear relationship between each correlated feature and price

#Every selected column except the target itself (price against price is a trivial line)
col = [name for name in correlated_data.columns if name != 'price']

#Two plots per row; the number of rows follows from how many features were selected,
#so a different threshold can neither run past the last column nor drop a feature
cols = 2
rows = max(1, -(-len(col) // cols))   #ceiling division, at least one row

#Creating a subplot to graph relationships
#squeeze = False keeps ax two-dimensional even when there is a single row
fig, ax = plt.subplots(nrows = rows, ncols = cols, figsize = (12, 2 * rows), squeeze = False)

#generating one regression plot per feature; panels without a feature are hidden
for index, panel in enumerate(ax.ravel()):
    if index < len(col):
        sns.regplot(x = correlated_data[col[index]], y = correlated_data['price'], ax = panel)
    else:
        panel.set_visible(False)

plt.tight_layout()

## Feature Exploration - Condition, Basement, Viewed, Waterfront 

In [None]:
o_df['condition'].head()

In [None]:
#Viewing the number of unique items in the condition feature
o_df['condition'].nunique()

In [None]:
#Viewing the unique values in the condition feature
o_df['condition'].unique()

In [None]:
#Viewing the count of each value in the condition feature
o_df['condition'].value_counts(sort = False)

In [None]:
#creating a bar chart to view the distibution of conditon
o_df['condition'].value_counts(sort = False).plot.bar()

## Exploring Basement

In [None]:
#Setting the sqft_basement feature equal to the total living space - the space above 
#If the house has a basement, it will return the space in sq ft.  It not, it will be 0
o_df['sqft_basement'] = o_df['sqft_living'] - o_df['sqft_above']

In [None]:
o_df.head()

In [None]:
o_df['sqft_basement'].head()

In [None]:
#Viewing the min and max values
o_df['sqft_basement'].min(), o_df['sqft_basement'].max()

In [None]:
o_df.head()

In [None]:
#Binary basement flag: 0 where sqft_basement is zero or negative, 1 for every other value
no_basement = o_df['sqft_basement'] <= 0
o_df['basement'] = (~no_basement).astype('int64')

In [None]:
#Viewing the count of each value in the new basement feature 
o_df['basement'].value_counts(ascending = False)

In [None]:
#Creating a bar chart to see the new distribution of the basement feature
o_df['basement'].value_counts().plot.bar()

## Exploring Viewed

In [None]:
o_df.head()

In [None]:
#Viewing the number of unique items in the view feature
o_df['view'].nunique()

In [None]:
#Viewing the unique values in the view feature
o_df['view'].unique()

In [None]:
#Viewing the count for each value in the view feature
o_df['view'].value_counts()

In [None]:
#Viewing the distribution for each unique value in the view feature
o_df['view'].value_counts().plot.bar()

In [None]:
#Creating a new feature called viewed making it binary
#0 for not viewed (a missing value counts as not viewed) and 1 for viewed
#fillna(0) comes first because NaN == 0 is False, which would otherwise
#flag every missing entry as viewed
o_df['viewed'] = (o_df['view'].fillna(0) != 0).astype('int64')

In [None]:
o_df['viewed'].value_counts().sort_values()

In [None]:
#Viewing the distribution for the viewed column
o_df['viewed'].value_counts().plot.bar()

In [None]:
o_df.head(5)

## Exploring Waterfront

In [None]:
o_df['waterfront'].head()

In [None]:
#Null values are being filled with 0s
#The result is assigned back: fillna returns a new Series, so without the
#assignment the column would keep its missing values
o_df['waterfront'] = o_df['waterfront'].fillna(0)
o_df['waterfront'].head()

In [None]:
o_df['waterfront'].nunique()

In [None]:
o_df['waterfront'].unique()

In [None]:
o_df['waterfront'].value_counts()

In [None]:
#Making waterfront binary.  0 for no waterfront (or a missing value) and 1 for anything else
#fillna(0) comes first because NaN == 0 is False, which would otherwise
#flag every missing entry as waterfront
o_df['waterfront'] = (o_df['waterfront'].fillna(0) != 0).astype('int64')

In [None]:
#Viewing the distribution of the waterfront feature
o_df['waterfront'].value_counts().plot.bar()

## Exploring Zip Code

In [None]:
o_df['zipcode'].head()

In [None]:
#Viewing the number of unique zipcodes
o_df['zipcode'].nunique()

In [None]:
#Viewing each unique zipcode
o_df['zipcode'].unique()

In [None]:
#Viewing the descriptive statistics for the datafame with the modified and added features 
o_df.describe()

In [None]:
#Creating the new dataframe with new and explored features
new_df = o_df[['price','bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'grade', 'zipcode', 'basement', 'viewed','condition', 'waterfront']]

## Testing Features using Dummy Variables

In [None]:
#create a list of features to dummy
todummy_list = ['basement', 'waterfront', 'viewed', 'condition', 'zipcode']

In [None]:
# Function to create dummies for the categorical variables used for modeling
def dummy_df(df, todummy_list):
    """Replace each listed column with its one-hot (dummy) encoding.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is not modified; a new frame is returned.
    todummy_list : iterable of str
        Names of the columns to encode. Each one is removed and its dummy
        columns, prefixed with the column name, are appended on the right in
        the order given.

    Returns
    -------
    pandas.DataFrame
        The untouched columns followed by the dummy columns. Missing values
        do not get a dummy column of their own (dummy_na=False).
    """
    #for every feature in the todummy list, add dummies and drop the original feature
    for x in todummy_list:
        dummies = pd.get_dummies(df[x], prefix=x, dummy_na=False)
        # axis is passed by keyword: the positional form drop(x, 1) is
        # rejected by pandas 2.x
        df = df.drop(x, axis=1)
        df = pd.concat([df, dummies], axis=1)
    return df

In [None]:
#Setting X to be the new and updated feature list with added dummies
X = dummy_df(new_df, todummy_list)
#Setting y to be the outcome variable, price
y = new_df['price']

In [None]:
#Dropping the price column as it is the outcome variable
X = X.drop(labels = ['price'], axis = 1)

In [None]:
X.head()

In [None]:
X.shape

## Linear Regression

In [None]:
#Splitting up the data for training and testing 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
#Viewing the size and shape of the training and testing features
X_train.shape, X_test.shape

In [None]:
#Fitting the linear regression
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
#Setting up the predicition for the test value
y_predict = model.predict(X_test)

In [None]:
#Creating a dataframe with both the predicted and actual y values
y_df = pd.DataFrame(data = [y_predict, y_test])

In [None]:
#0 predicted values
#1 actual values
y_df.T.head()

In [None]:
X.corr().head()

In [None]:
X.head()

In [None]:
threshold = 0.5

In [None]:
#Testing the performance of the new dataframe with engineered features
performance_Metric(X.corr(), threshold, y_test, y_predict)

## Ridge Regression

In [None]:
#Setting up the Ridge Regression
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', Ridge(alpha=3.8, fit_intercept=True))
]

ridge_pipe = Pipeline(steps)
ridge_pipe.fit(X_train, y_train)

In [None]:
# Predictions of the fitted ridge pipeline on the training and the test split
y_pred_ridge_train = ridge_pipe.predict(X_train)
y_pred_ridge_test = ridge_pipe.predict(X_test)

# R2 score on each split
r2_score_ridge_train = r2_score(y_train, y_pred_ridge_train)
r2_score_ridge_test = r2_score(y_test, y_pred_ridge_test)

# RMSE on the test split: square root of the mean squared error
mse_ridge_test = mean_squared_error(y_test, y_pred_ridge_test)
rmse_ridge = np.sqrt(mse_ridge_test)

for label, value in (
    ('R2_score (train): ', r2_score_ridge_train),
    ('R2_score (test): ', r2_score_ridge_test),
    ("RMSE: ", rmse_ridge),
):
    print(label, value)

## Lasso Regression

In [None]:
#Setting up the Lasso Regression
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', Lasso(alpha=0.012, fit_intercept=True, max_iter=3000))
]

lasso_pipe = Pipeline(steps)
lasso_pipe.fit(X_train, y_train)

In [None]:
# Predictions of the fitted lasso pipeline on the training and the test split
y_pred_lasso_train = lasso_pipe.predict(X_train)
y_pred_lasso_test = lasso_pipe.predict(X_test)

# R2 score on each split
r2_score_lasso_train = r2_score(y_train, y_pred_lasso_train)
r2_score_lasso_test = r2_score(y_test, y_pred_lasso_test)

# RMSE on the test split: square root of the mean squared error
mse_lasso_test = mean_squared_error(y_test, y_pred_lasso_test)
rmse_lasso = np.sqrt(mse_lasso_test)

for label, value in (
    ('R2_score (train): ', r2_score_lasso_train),
    ('R2_score (test): ', r2_score_lasso_test),
    ("RMSE: ", rmse_lasso),
):
    print(label, value)

In [None]:
#Visualising the linear prediction for price
plt.figure(figsize=(18,10))
#x and y are passed by keyword, matching the other regplot call in this notebook;
#newer seaborn releases no longer accept them positionally
sns.regplot(x=y_predict, y=y_test, scatter_kws={'alpha':0.5,'color':'lime'}, line_kws={'color':'blue','alpha':0.5})
plt.xlabel('Predictions')
plt.ylabel('Price')
plt.title("Linear Prediction for Price")
plt.show()