In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output

# Show which files are present in the input directory.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
# Read kc_house_data.csv from the input directory, then print the
# column / dtype / non-null summary of the resulting frame.
DATA_PATH = "../input/kc_house_data.csv"
df = pd.read_csv(DATA_PATH)
df.info()

In [None]:
# Preview the first three rows of the freshly loaded frame.
df.head(3)

In [None]:
# Descriptive statistics of the frame's columns.
df.describe()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.model_selection import validation_curve
from sklearn.linear_model import Ridge
from sklearn.base import clone
from itertools import combinations


In [None]:
# Convert the "date" column to pandas datetimes, then preview the result.
parsed_dates = pd.to_datetime(df["date"])
df["date"] = parsed_dates
df.head(3)

In [None]:
# Positional column split:
#   y        -> column 2 (the regression target)
#   X        -> every column after the target (model inputs)
#   X_visual -> target plus inputs, used for the correlation heatmap
# Columns 0 and 1 are left out of all three.
# NOTE(review): relies on the CSV column order - confirm column 2 is the price.
TARGET_COL = 2
y = df.iloc[:, TARGET_COL]
X = df.iloc[:, TARGET_COL + 1:]
X_visual = df.iloc[:, TARGET_COL:]

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Global seaborn look for the figures that follow.
sns.set(context="notebook", style="whitegrid")
# Pairwise scatter matrix of X_visual, left disabled:
# sns.pairplot(X_visual, size=2.5)

In [None]:
# Column names, reused as tick labels for the correlation heatmap.
cols = X_visual.columns.tolist()

In [None]:
# Pearson correlation between every pair of columns in X_visual
# (rowvar=False: each column is treated as one variable).
cm = np.corrcoef(X_visual.values, rowvar=False)
sns.set(font_scale=1.5)
fig, ax = plt.subplots(figsize=(20, 25))
# Annotate each cell with its coefficient, formatted to two decimals.
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    fmt='.2f',
    xticklabels=cols,
    yticklabels=cols,
)

In [None]:
# Standardise the features. The scaler is fitted on the training rows only and
# then applied to both splits, so no test-set statistics enter the fit.
ss = StandardScaler().fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)


In [None]:
# Baseline model: ordinary least squares on the standardised features.
lr = LinearRegression().fit(X_train, y_train)
pred = lr.predict(X_train)    # predictions on the rows the model was fitted on
pred1 = lr.predict(X_test)    # predictions on the held-out rows

# Compare in-sample and held-out R^2.
train_r2 = r2_score(y_train, pred)
test_r2 = r2_score(y_test, pred1)
print("r2_score for training data: ", train_r2)
print("r2_score for test data: ", test_r2)

In [None]:
# Cross-validated learning curve for the linear model: ten training-set sizes
# spaced evenly from 10% to 100%, with the work spread over all CPU cores.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=lr,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    n_jobs=-1,
)
# Reduce along axis 1 to get one mean / standard deviation per training size.
train_mean = train_scores.mean(axis=1)
print(train_mean)
train_std = train_scores.std(axis=1)
test_mean = test_scores.mean(axis=1)
test_std = test_scores.std(axis=1)


In [None]:
# Learning curve of the linear model: mean cross-validated score against the
# number of training samples. learning_curve() was called without a `scoring`
# argument, so the values are the regressor's default score (R^2); the labels
# say so instead of "accuracy", which is a classification metric.
plt.plot(train_sizes, train_mean,
         color='blue', marker='o',
         markersize=5,
         label='training $R^2$')

plt.plot(train_sizes, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='validation $R^2$')

plt.grid()
plt.title('Learning curve: linear regression')
plt.xlabel('Number of training samples')
plt.ylabel('$R^2$ score')
plt.legend(loc='lower right')
# Only the 0.5-1.0 score band is shown; lower values fall outside the view.
plt.ylim([0.5, 1.0])
plt.show()

In [None]:
# Display the coefficients learned by the linear model fitted above.
lr.coef_

In [None]:
# Ridge regression with the estimator's default settings.
ridge = Ridge().fit(X_train, y_train)
pred2 = ridge.predict(X_train)    # predictions on the rows the model was fitted on
pred3 = ridge.predict(X_test)     # predictions on the held-out rows

ridge_train_r2 = r2_score(y_train, pred2)
ridge_test_r2 = r2_score(y_test, pred3)
print("R2 score for train ridge: ", ridge_train_r2)
print("r2_score for test data: ", ridge_test_r2)

In [None]:
# Validation curve for Ridge: cross-validated score as a function of the
# regularisation strength `alpha`.
# `validation_curve` is already imported from sklearn.model_selection in the
# notebook's import cell. The legacy `sklearn.learning_curve` module was removed
# in scikit-learn 0.20, so importing from it raises ImportError.
param_range = [1e-5, 1e-4, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
train_scores, test_scores = validation_curve(
    estimator=ridge,
    X=X_train,
    y=y_train,
    param_name='alpha',
    param_range=param_range)

# One mean / standard deviation per alpha value (reduced along axis 1).
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# No `scoring` argument was given, so the plotted values are the regressor's
# default score (R^2), not a classification accuracy.
plt.plot(param_range, train_mean,
         color='blue', marker='o',
         markersize=5,
         label='training $R^2$')

plt.plot(param_range, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='validation $R^2$')

plt.grid()
# alpha spans seven orders of magnitude, hence the logarithmic x-axis.
plt.xscale('log')
plt.legend(loc='lower right')
plt.title('Validation curve: ridge regression')
# The swept hyper-parameter is Ridge's `alpha`, not a parameter named C.
plt.xlabel('Regularisation strength alpha')
plt.ylabel('$R^2$ score')
plt.ylim([0.5, 1.0])
plt.show()


In [None]:
# Display the coefficients learned by the ridge model fitted above.
ridge.coef_

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial expansion of the training features, followed by an
# ordinary least-squares fit on the expanded matrix.
quad = PolynomialFeatures(degree=2)
X_quad = quad.fit_transform(X_train)
pr = LinearRegression(n_jobs=-1).fit(X_quad, y_train)

# R^2 of the quadratic model on the data it was fitted on.
quadratic_r2 = r2_score(y_train, pr.predict(X_quad))
print("r2  of 2: ", quadratic_r2)

In [None]:
# 10-fold cross-validated learning curve for the quadratic model: ten
# training-set sizes spaced evenly from 20% to 100%, run on all CPU cores.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=pr,
    X=X_quad,
    y=y_train,
    train_sizes=np.linspace(0.2, 1.0, 10),
    cv=10,
    n_jobs=-1,
)
# Reduce along axis 1 to get one mean / standard deviation per training size.
train_mean = train_scores.mean(axis=1)
train_std = train_scores.std(axis=1)
test_mean = test_scores.mean(axis=1)
test_std = test_scores.std(axis=1)
print(train_sizes)
print(test_mean)


In [None]:
# Learning curve of the degree-2 polynomial model: mean cross-validated score
# against the number of training samples. learning_curve() was called without a
# `scoring` argument, so the values are the regressor's default score (R^2); the
# labels say so instead of "accuracy", which is a classification metric.
plt.plot(train_sizes, train_mean,
         color='blue', marker='o',
         markersize=5,
         label='training $R^2$')

plt.plot(train_sizes, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='validation $R^2$')

plt.grid()
plt.title('Learning curve: degree-2 polynomial regression')
plt.xlabel('Number of training samples')
plt.ylabel('$R^2$ score')
plt.legend(loc='lower right')
# Only the 0-1 score band is shown; negative values fall outside the view.
plt.ylim([0, 1.0])
plt.show()

In [None]:
# Expand the test features with the transformer fitted on the training data,
# then predict with the quadratic model.
X_test_quad = quad.transform(X_test)
pred = pr.predict(X_test_quad)

# Print the shapes of the test features and the test targets.
print(X_test.shape)
print(y_test.shape)

In [None]:

# R^2 of the degree-2 polynomial model on the held-out test set.
quad_test_r2 = r2_score(y_test, pred)
print('r2_2', quad_test_r2)


Polynomial expansion adds a large number of features to the data. We therefore first select the most important features using a feature-selection technique, then fit higher-degree polynomial features on that reduced set and compare the r2_score on the training and test data.

# Selecting features through sequential backward selection

In [None]:
# `combinations` is part of the standard-library `itertools` module; scikit-learn
# has no `sklearn.itertools` package, so the original import raised an error.
from itertools import combinations

from sklearn.base import clone
