In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
#from mlxtend.frequent_patterns import apriori
#from mlxtend.frequent_patterns import association_rules
import seaborn as sns

In [2]:
house_data = pd.read_csv('HouseData.csv')

house_data.head()

Unnamed: 0.1,Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [3]:
target = house_data['price']
features = house_data.drop(['price'], axis=1)

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=.2)

In [5]:
X_train.shape, y_train.shape

((17290, 21), (17290,))

In [6]:
X_test.shape, y_test.shape

((4323, 21), (4323,))

In [7]:
features.shape

(21613, 21)

In [8]:
from math import sqrt
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [9]:
def adjusted_r2(r_square, labels, features):
    adj_r_square = 1 - ((1 - r_square) * (len(labels) - 1)) / (len(labels) - features.shape[1] - 1)
    return adj_r_square

In [10]:
def build_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    model = model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print('Training R2:{}'.format(model.score(X_train, y_train)))
    print()
    test_r2_score = r2_score(y_test, y_pred)
    print('Testing R2:{}'.format(test_r2_score))
    print('Testing adjusted R2:{}'.format(adjusted_r2(test_r2_score, y_test, X_test)))
    
    print()
    print('Mean square error: {}'.format(mean_squared_error(y_test, y_pred)))
    print('Root mean square error: {}'.format(sqrt(mean_squared_error(y_test, y_pred))))
    print('Mean Absolute error: {}'.format(mean_absolute_error(y_test, y_pred) ))
    
    return y_pred

In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [12]:
linear_regression = LinearRegression()
y_pred = build_and_evaluate_model(linear_regression, X_train, y_train, X_test, y_test)

ValueError: could not convert string to float: '20150105T000000'

In [None]:
df = pd.DataFrame({'Test': y_test, 'Predicted': y_pred})
df.sample(10)

In [None]:
plt.figure(figsize=(12, 8))
plt.scatter(y_test.values, y_pred)

In [None]:
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

def draw_learning_curves(X, y, estimator, cv=None):
    
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv,
                                                           train_sizes=np.linspace(.1, 1, 10))
    plt.figure(figsize=(12, 8))
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    
    plt.grid(True)
    
    plt.title("Learning Curves", fontsize=22)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    
    plt.plot(train_scores_mean, 'o-', color="g",
             label="Training score")
    plt.plot(test_scores_mean, 'o-', color="y",
             label="Testing score")
    
    plt.legend(loc="best")
    
    plt.show()
    return train_scores_mean, test_scores_mean

In [None]:
cv = ShuffleSplit(n_splits=10)
cv
ShuffleSplit(n_splits=10, random_state=None, test_size=None, train_size=None)

In [None]:
train_score, test_score = draw_learning_curves(X_train, y_train, linear_regression, cv)

In [None]:
decision_tree = DecisionTreeRegressor()
y_pred = build_and_evaluate_model(decision_tree, X_train, y_train, X_test, y_test)

In [None]:
train_score, test_score = draw_learning_curves(X_train, y_train, decision_tree, cv)

In [None]:
decision_tree_7 = DecisionTreeRegressor(max_depth=7)
y_pred = build_and_evaluate_model(decision_tree_7, X_train, y_train,X_test, y_test )

In [None]:
train_score, test_score = draw_learning_curves(X_train, y_train, decision_tree_7, cv)

In [None]:
k_neighbors = KNeighborsRegressor(n_neighbors=5)
y_pred = build_and_evaluate_model(k_neighbors, X_train, y_train,X_test, y_test)

In [None]:
train_score, test_score = draw_learning_curves(X_train, y_train, k_neighbors , cv)

In [None]:
gb_regressor_5 = GradientBoostingRegressor(n_estimators=5)
y_pred = build_and_evaluate_model(gb_regressor_5, X_train, y_train,X_test, y_test)

In [None]:
train_score, test_score = draw_learning_curves(X_train, y_train, gb_regressor_5 , cv)

In [None]:
gb_regressor_100 = GradientBoostingRegressor()
y_pred = build_and_evaluate_model(gb_regressor_100, X_train, y_train,X_test, y_test)

In [None]:
train_score, test_score = draw_learning_curves(X_train, y_train, gb_regressor_100 , cv)