In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

%matplotlib inline

In [None]:
# Read kc_house_data.csv from ../data/datasets (path is relative to the notebook's
# working directory) into a DataFrame.
csv_path = os.path.join('..', 'data', 'datasets', 'kc_house_data.csv')
data = pd.read_csv(csv_path)
# Show (n_rows, n_columns) of the loaded table.
data.shape

In [None]:
# Display the first 100 rows of the loaded table.
data.iloc[:100]

In [None]:
# Per-column summary statistics of the loaded table, shown as the cell output.
summary_stats = data.describe()
summary_stats

In [None]:
# Parse the `date` column into timestamps and use them as the row index (named `pdate`).
# The original `date` column is left in place.
data = (
    data
    .assign(pdate=pd.to_datetime(data['date']))
    .set_index('pdate')
)

In [None]:
# Draw the sale price against the `pdate` index, then overlay its 10-row rolling mean.
price_series = data['price']
price_series.plot()
# NOTE(review): `rprice` is computed from `price` and is stored on `data`, so it will be
# part of any feature matrix later built from `data` unless it is dropped — confirm intended.
data['rprice'] = price_series.rolling(window=10).mean()
data['rprice'].plot()

In [None]:
# Bar chart of how many rows fall into each distinct `bedrooms` value.
bedroom_counts = data['bedrooms'].value_counts()
bedroom_counts.plot.bar()
plt.xlabel('Bedrooms')
plt.title('number of Bedroom')
plt.ylabel('Count')

In [None]:
# Scatter of price (x axis) against living-area square footage (y axis).
plt.scatter(x=data['price'], y=data['sqft_living'])
plt.title("Price vs Square Feet")

In [None]:
# Scatter of price (x axis) against the `long` column (y axis).
plt.scatter(x=data['price'], y=data['long'])
plt.title("Price vs Location of the area")

In [None]:
# Scatter of price (x axis) against the `lat` column (y axis), with labelled axes.
plt.scatter(x=data['price'], y=data['lat'])
plt.ylabel('Latitude')
plt.xlabel("Price")
plt.title("Latitude vs Price")

In [None]:
# Scatter of bedroom count (x axis) against price (y axis); rendered explicitly with show().
plt.scatter(x=data['bedrooms'], y=data['price'])
plt.xlabel("Bedrooms")
plt.ylabel("Price")
plt.title("Bedroom and Price ")
plt.show()

In [None]:
# Scatter of (sqft_living + sqft_basement) on the x axis against price on the y axis.
combined_sqft = data['sqft_living'] + data['sqft_basement']
plt.scatter(combined_sqft, data['price'])

In [None]:
# Scatter of the `waterfront` column (x axis) against price (y axis).
plt.scatter(x=data['waterfront'], y=data['price'])
plt.title("Waterfront vs Price ( 0= no waterfront)")

In [None]:
# Copy of `data` without the `id` and `price` columns; `train1` is rebuilt again
# before model training further down.
train1 = data.drop(columns=['id', 'price'])

In [None]:
# Show the first five rows of the feature frame.
train1.head(n=5)

In [None]:
# Bar chart of how many rows fall into each distinct `floors` value.
floor_counts = data['floors'].value_counts()
floor_counts.plot.bar()

In [None]:
# Scatter of the `floors` column (x axis) against price (y axis).
plt.scatter(x=data['floors'], y=data['price'])

In [None]:
# Scatter of the `condition` column (x axis) against price (y axis).
plt.scatter(x=data['condition'], y=data['price'])

In [None]:
# Scatter of the `zipcode` column (x axis) against price (y axis).
plt.scatter(x=data['zipcode'], y=data['price'])
plt.title("Which is the pricey location by zipcode?")

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression estimator created with its default settings; it is fitted on the
# training split in a later cell.
reg = LinearRegression()

In [None]:
# Build the modelling inputs: `labels` (the `price` target) and `train1` (the features).
#
# `rprice` is a rolling mean of `price` itself, so leaving it among the features leaks
# the target into the model. Its leading values are NaN as well, which would make
# dropna() throw away otherwise complete rows. Remove it before cleaning;
# errors='ignore' keeps this cell runnable when the column is not present.
data = data.drop(columns=['rprice'], errors='ignore').dropna()
labels = data['price']

# Binary sale-year flag: 1 when the sale happened in 2014, otherwise 0.
# The previous version compared each raw `date` value with the integer 2014; those
# values are full date stamps (they are parsed with pd.to_datetime to build the
# `pdate` index), so the comparison never matched and the flag was constant 0.
# Reading the year from the parsed index fixes that and keeps the cell idempotent.
data['date'] = (data.index.year == 2014).astype(int)

# Feature matrix: everything except the row identifier and the target.
train1 = data.drop(['id', 'price'], axis=1)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the rows for evaluation; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    train1,
    labels,
    test_size=0.10,
    random_state=2,
)

In [None]:
# Fit the linear model on the training split.
reg.fit(X=x_train, y=y_train)

In [None]:
# Score the fitted linear model on the held-out split (shown as the cell output).
reg.score(X=x_test, y=y_test)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# Two-step pipeline: Normalizer followed by a fresh LinearRegression, fitted on the
# training split and scored on the held-out split.
# NOTE(review): Normalizer rescales each sample (row) to unit norm rather than scaling
# each feature column — confirm this is intended and not meant to be StandardScaler.
row_normalizer = Normalizer()
model = make_pipeline(row_normalizer, LinearRegression())
model.fit(X=x_train, y=y_train)
model.score(X=x_test, y=y_test)

In [None]:
# Actual held-out prices (x axis) against the linear model's predictions (y axis).
reg_predictions = reg.predict(x_test)
plt.scatter(y_test, reg_predictions)

In [None]:
from sklearn import ensemble

# Gradient-boosted regression trees: 100 boosting stages with a learning rate of 0.1.
clf = ensemble.GradientBoostingRegressor(
    learning_rate=0.1,
    n_estimators=100,
)

In [None]:
# Fit the gradient-boosting model on the training split.
clf.fit(X=x_train, y=y_train)

In [None]:
# Score the fitted gradient-boosting model on the held-out split (shown as the cell output).
clf.score(X=x_test, y=y_test)

In [None]:
# Actual held-out prices (x axis) against the gradient-boosting predictions (y axis).
plt.scatter(x=y_test, y=clf.predict(x_test))

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive grid: 2 learning rates x 2 ensemble sizes, each evaluated with 3-fold CV.
parameters = dict(
    learning_rate=[0.1, 0.01],
    n_estimators=[100, 200],
)

# n_jobs=-1 runs the candidate fits in parallel on all available cores.
clf = GridSearchCV(
    estimator=ensemble.GradientBoostingRegressor(),
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
)

clf.fit(x_train, y_train)
# Held-out score of the search result, then the winning parameter combination.
print(clf.score(x_test, y_test))
print(clf.best_params_)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the randomized hyper-parameter search over gradient boosting.
# Fractional min_samples_split / min_samples_leaf values are given as floats, i.e.
# as a fraction of the training samples rather than an absolute count.
parameters = {
    "learning_rate": np.linspace(0.1, 0.001, 10),
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth": [3,5,8],
    "max_features": ["log2","sqrt"],
    "criterion": ["friedman_mse",  "squared_error"],
    "subsample": [0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators": [10, 20, 50, 100, 200]
}

# Only n_iter=10 candidates are drawn from the space above. Without a seed the drawn
# candidates (and, because subsample < 1 and max_features make the trees stochastic,
# the fitted models) differ on every run, so the printed score and best_params_ were
# not reproducible. Seed both the sampler and the estimator; 2 matches the seed
# already used for the train/test split.
clf = RandomizedSearchCV(
    ensemble.GradientBoostingRegressor(random_state=2),
    parameters,
    cv=3,
    n_jobs=-1,
    n_iter=10,
    random_state=2,
)

clf.fit(x_train, y_train)
# Held-out score of the best candidate found, then its parameter combination.
print(clf.score(x_test, y_test))
print(clf.best_params_)

In [None]:
# Actual held-out prices (x axis) against predictions from the most recently fitted `clf`.
search_predictions = clf.predict(x_test)
plt.scatter(y_test, search_predictions)