In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import RFE, f_regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error
import warnings
warnings.simplefilter(action='ignore', category=Warning)


# Loading Dataset

In [None]:
dataset = pd.read_csv('kc_house_data.csv')


# Exploratory Data Analysis


In [None]:
dataset.head()


In [None]:
dataset.shape


In [None]:
dataset.describe()


In [None]:
dataset.info()


In [None]:
dataset.isnull().values.any()


## Finding Correlation among the variables

In [None]:
# corr_data = dataset.drop(['id','date','price'], axis = 1)
# plt.figure(figsize=(20, 17))
# matrix = np.triu(corr_data.corr())
# sns.heatmap(corr_data.corr(), annot=True,
#             linewidth=.8, mask=matrix, cmap="rocket")


In [None]:
# Summary statistics with one row per column: a bar on the mean and a
# colour gradient on the standard deviation and on the median.
(
    dataset.describe()
    .T.style
    .bar(subset=['mean'], color='#606ff2')
    .background_gradient(subset=['std'], cmap='PuBu')
    .background_gradient(subset=['50%'], cmap='PuBu')
)


## Visualising Numerical Data

In [None]:
# Frame used for the distribution plots: everything except 'id' and 'date'.
plot_data = dataset.drop(columns=['id', 'date'])
# fig = plt.figure(figsize=(20, 20))
# for i in range(len(plot_data.columns)):
#     fig.add_subplot(np.ceil(len(plot_data.columns)/5), 5, i+1)
#     plot_data.iloc[:, i].hist(bins=20)
#     plt.title(plot_data.columns[i])
#     fig.tight_layout(pad=3.0)
# plt.show()


In [None]:
def distributionPlot(dataset):
    """Draw one distribution plot per column of ``dataset`` on a five-column grid.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are plotted, one subplot per column, in column
        order. Every column is handed to ``sns.distplot``.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    n_cols = 5
    # add_subplot requires integer grid dimensions; np.ceil returns a float.
    n_rows = int(np.ceil(len(dataset.columns) / n_cols))
    fig = plt.figure(figsize=(20, 20))
    for i in range(len(dataset.columns)):
        fig.add_subplot(n_rows, n_cols, i + 1)
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
        # histplot/displot are the documented replacements.
        sns.distplot(dataset.iloc[:, i], color="lightcoral", rug=True)
    # Lay the grid out once, after every subplot exists.
    fig.tight_layout(pad=3.0)


In [None]:
# distributionPlot(plot_data)


In [None]:
dataset['floors'].count()


In [None]:
def pieChartPlotter(dataset, columnName):
    """Show a pie chart of how often each distinct value of a column occurs.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the column to summarise.
    columnName : str
        Name of the column whose value counts are plotted; also used as the
        chart title.

    Returns
    -------
    None
        The chart is displayed with ``plt.show()``.
    """
    values = dataset[columnName].value_counts()
    # value_counts() is ordered by frequency whereas unique() is ordered by
    # first appearance, so the labels must come from the counts' own index to
    # stay aligned with the wedges.
    labels = list(values.index)
    fig, ax = plt.subplots(figsize=[10, 6])

    # One identical offset per wedge. Multiplying a list by the array returned
    # by unique() would scale the offset by the category values instead.
    patches, texts, autotexts = ax.pie(
        values, labels=labels, autopct='%1.2f%%', shadow=True,
        pctdistance=.5, explode=[0.06] * len(values))

    plt.legend(patches, labels, loc="best")
    plt.title(columnName, color='white', fontsize=14)
    plt.setp(texts, color='white', fontsize=20)
    # Colours every percentage label, so no per-index colouring is needed
    # (indexing autotexts[1] fails when the column has a single category).
    plt.setp(autotexts, size=10, color='white')
    plt.axis('equal')
    plt.tight_layout()
    plt.show()


In [None]:
# pieChartPlotter(dataset,'waterfront')
# pieChartPlotter(dataset, 'floors')


In [None]:
def countPlotter(dataset):
    """Draw a count plot for every column of ``dataset`` except ``'price'``.

    Plots are arranged on a two-column grid and the bars of each plot are
    ordered from the most to the least frequent value.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose non-price columns are plotted.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    feature_columns = [
        column for column in dataset.columns if not column == 'price']
    # add_subplot requires integer grid dimensions; np.ceil returns a float.
    n_rows = int(np.ceil(len(feature_columns) / 2))
    fig = plt.figure(figsize=(20, 20))
    # Subplot numbers start at 1 and must be consecutive, so they are counted
    # over the plotted columns rather than taken from the raw column index
    # (which is 0 for the first column and depends on where 'price' sits).
    for position, column in enumerate(feature_columns, start=1):
        ax = fig.add_subplot(n_rows, 2, position)
        sns.countplot(x=dataset[column],
                      order=dataset[column].value_counts().index,
                      ax=ax)
    # Lay the grid out once, after every subplot exists.
    fig.tight_layout(pad=3.0)


In [None]:
# Keep 'price' plus the columns that are not dropped below; this frame feeds
# the count plots and the grouped bar plots.
group_plot_data = dataset.drop(
    columns=[
        'id', 'date',
        'sqft_living15', 'sqft_lot15',
        'lat', 'long',
        'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement',
        'yr_built', 'yr_renovated',
        'zipcode',
    ]
)


In [None]:
# countPlotter(group_plot_data)


In [None]:
def groupBarPlotter(dataset):
    """Plot the mean ``'price'`` per distinct value of every other column.

    Each non-price column gets one horizontal bar chart on a two-column grid;
    bar length is the mean price of the rows sharing that value.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that contains a ``'price'`` column plus the grouping columns.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    feature_columns = [
        column for column in dataset.columns if not column == 'price']
    # add_subplot requires integer grid dimensions; np.ceil returns a float.
    n_rows = int(np.ceil(len(feature_columns) / 2))
    fig = plt.figure(figsize=(20, 20))
    # Subplot numbers start at 1, so count over the plotted columns instead of
    # reusing the raw column index.
    for position, column in enumerate(feature_columns, start=1):
        ax = fig.add_subplot(n_rows, 2, position)
        mean_price = dataset.groupby(column)['price'].mean()
        mean_price.plot.barh(ax=ax)
        # Label after plotting so the plotting call cannot overwrite it.
        ax.set_xlabel('price')
    # Lay the grid out once, after every subplot exists.
    fig.tight_layout(pad=3.0)


In [None]:
# groupBarPlotter(group_plot_data)


In [None]:
dataset.head()


# Data Preprocessing

In [None]:
# Predictor frame: every column except the identifier, the date and the target.
house = dataset.drop(columns=['id', 'date', 'price'])



In [None]:
def _plotSkewDiagnostics(series, columnName, stage, titleStage, color):
    """Print the fitted normal parameters of ``series`` and draw its distplot and probability plot."""
    from scipy import stats
    from scipy.stats import norm

    print(stage.capitalize() + ' Correcting')
    (mu, sigma) = norm.fit(series)
    print("Mu {} correcting {} : {}, Sigma {} correcting {} : {}".format(
        stage, columnName.capitalize(), mu,
        stage, columnName.capitalize(), sigma))
    plt.figure(figsize=(20, 10))
    plt.subplot(1, 2, 1)
    sns.distplot(series, fit=norm, color=color)
    plt.title(columnName.capitalize() +
              " Distplot " + titleStage + " Skewness Correction", color="black")
    plt.subplot(1, 2, 2)
    stats.probplot(series, plot=plt)
    plt.show()


def skewnessCorrector(dataset, columnName):
    """Box-Cox transform one column in place and plot it before and after.

    For the column as given and again after the transformation, the function
    prints the mean and standard deviation of a fitted normal distribution and
    shows a distplot next to a probability plot.

    The transformed values REPLACE ``dataset[columnName]``, so calling the
    function twice on the same column transforms it twice.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the column; modified in place.
    columnName : str
        Name of the column to correct. Its values must be strictly positive.

    Returns
    -------
    float
        The Box-Cox lambda fitted by ``scipy.stats.boxcox``.

    Raises
    ------
    ValueError
        If the column contains zero or negative values.

    Example
    -------
    1) Single column
        skewnessCorrector(Dataset, 'XYZ')
    2) Multiple columns
        skewColumnList = ['ABC', 'DEF', 'GHI']
        for column in skewColumnList:
            skewnessCorrector(Dataset, column)
    """
    from scipy.stats import boxcox

    # boxcox rejects non-positive data; fail before drawing anything so the
    # error names the offending column.
    if (dataset[columnName] <= 0).any():
        raise ValueError(
            "Box-Cox requires strictly positive data; column '{}' contains "
            "zero or negative values.".format(columnName))

    _plotSkewDiagnostics(dataset[columnName], columnName,
                         "before", "before", "lightcoral")

    # Applying BoxCox Transformation
    dataset[columnName], fitted_lambda = boxcox(dataset[columnName])

    _plotSkewDiagnostics(dataset[columnName], columnName,
                         "after", "After", "orange")
    return fitted_lambda


In [None]:
# Columns of `house` that are Box-Cox transformed in place, one call each.
skewcolumns = [
    'sqft_living',
    'sqft_lot',
    'floors',
    'condition',
    'grade',
    'sqft_above',
    'lat',
    'sqft_living15',
    'sqft_lot15',
]
for column in skewcolumns:
    skewnessCorrector(house, column)


In [None]:
# Feature matrix, target column vector and feature names for the ranking step.
X = house.values
# Select the target by name rather than by position so the cell does not depend
# on the column order of the CSV; the (n_samples, 1) shape is kept.
y = dataset[['price']].values
colnames = house.columns


In [None]:
X


In [None]:
y


In [None]:
X.shape


In [None]:
y.shape


# Feature Selection via RFE

In [None]:
# Define dictionary to store our rankings
ranks = {}
# Create our function which stores the feature rankings to the ranks dictionary


def ranking(ranks, names, order=1):
    """Scale a vector of feature scores to [0, 1] and key it by feature name.

    Parameters
    ----------
    ranks : array-like
        Scores, either one-dimensional or a single row in a 2-D array.
    names : iterable
        Feature names, paired with the scores in order.
    order : int, default 1
        Factor the scores are multiplied by before scaling; pass -1 when a
        smaller raw value means a better feature.

    Returns
    -------
    dict
        Feature name -> min-max scaled score rounded to two decimals.
    """
    scores = np.array(ranks)
    if scores.ndim == 1:
        scores = scores.reshape(1, -1)
    # MinMaxScaler works per column, so the scores are scaled as one column
    # and then read back as a flat row.
    scaled = MinMaxScaler().fit_transform(order * scores.T).T[0]
    return {name: round(value, 2) for name, value in zip(names, scaled)}


In [None]:
ranks = {}
def featureRanker(X, y, ranking, colnames):
    """Fit several regressors and rank every feature under each of them.

    Each estimator is fitted on ``(X, y)``. Its per-feature scores are the
    absolute ``coef_`` when the fitted estimator exposes coefficients, or
    ``feature_importances_`` otherwise. The linear regression is additionally
    used for recursive feature elimination, stored under ``"RFE"``.

    Estimators that expose neither attribute (for example an RBF-kernel SVR)
    are skipped with a printed message, because they provide no per-feature
    score. ``dual_coef_`` has one entry per support vector, not per feature,
    so it cannot be used as a feature ranking.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,) or (n_samples, 1)
        Target values; a single-column 2-D target is flattened.
    ranking : callable
        ``ranking(scores, names, order=1)`` returning a ``{name: score}``
        dict; called once per ranking method.
    colnames : sequence
        Feature names, in the column order of ``X``.

    Returns
    -------
    dict
        The module-level ``ranks`` dictionary, updated with one
        ``{feature: score}`` dict per ranking method.
    """
    # Estimators are sensitive to a column-vector target (warnings, 2-D coef_),
    # so a single-column target is flattened once here.
    target = np.asarray(y)
    if target.ndim == 2 and target.shape[1] == 1:
        target = target.ravel()

    # Hyper-parameters are passed to the constructors directly so that they
    # are actually applied to the fitted models.
    models = [
        ('lr', LinearRegression()),
        ('Ridge', Ridge(alpha=7)),
        ('Lasso', Lasso(alpha=.05)),
        ('Elastic', ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=0)),
        ('SVR', SVR(kernel='rbf')),
        ('RF', RandomForestRegressor(n_jobs=-1, n_estimators=100,
                                     random_state=0)),
        ('GBR', GradientBoostingRegressor(n_estimators=100, random_state=0)),
        ('XGBR', XGBRegressor(n_jobs=-1, n_estimators=100, random_state=0)),
        ('LGBM', LGBMRegressor(n_jobs=-1, n_estimators=100, random_state=0)),
    ]

    results = {}
    for name, estimator in models:
        print(estimator)
        estimator.fit(X, target)

        if name == 'lr':
            # Eliminate down to one feature so that every feature receives a
            # distinct elimination rank; rank 1 is best, hence order=-1.
            rfe = RFE(estimator, n_features_to_select=1)
            rfe.fit(X, target)
            results["RFE"] = ranking(
                list(map(float, rfe.ranking_)), colnames, order=-1)

        if hasattr(estimator, 'coef_'):
            scores = np.abs(estimator.coef_)
        elif hasattr(estimator, 'feature_importances_'):
            scores = estimator.feature_importances_
        else:
            print("Skipping {}: the fitted estimator exposes no per-feature "
                  "scores".format(name))
            continue
        results[name] = ranking(scores, colnames)

    # Existing callers read the module-level dictionary, so keep filling it.
    ranks.update(results)
    return ranks


In [None]:
ranks = featureRanker(X, y, ranking, colnames)


# Creating the Feature Ranking Matrix

We combine the scores from the various methods above and output them in matrix form for convenient viewing:

In [None]:
# Average every feature's score over the ranking methods. A "Mean" entry left
# over from a previous run of this cell is excluded so that re-running the cell
# does not fold the old mean into the new one.
scoring_methods = [method for method in ranks.keys() if method != "Mean"]
r = {}
for name in colnames:
    r[name] = round(np.mean([ranks[method][name]
                             for method in scoring_methods]), 2)

methods = sorted(scoring_methods)
ranks["Mean"] = r
methods.append("Mean")

# Tab-separated matrix: one row per feature, one column per ranking method.
print("\t%s" % "\t".join(methods))
for name in colnames:
    print("%s\t%s" % (name, "\t".join(map(str,
                                          [ranks[method][name] for method in methods]))))


The matrix above is not very easy on the eye. Therefore, let's collate the mean ranking score of each feature and plot it as a Seaborn bar plot.

In [None]:
# Mean scores as a two-column frame, best-ranked feature first
meanplot = pd.DataFrame(
    list(r.items()), columns=['Feature', 'Mean Ranking']
).sort_values('Mean Ranking', ascending=False)


In [None]:
# Let's plot the ranking of the features.
# seaborn renamed `factorplot` to `catplot` and its `size` argument to `height`.
sns.catplot(x="Mean Ranking", y="Feature", data=meanplot,
            kind="bar", height=4, aspect=1.9, palette='coolwarm')


As the feature ranking shows, the top 3 features are 'lat', 'grade' and 'waterfront', and the bottom 3 are 'yr_renovated', 'sqft_lot15' and 'sqft_lot'. This sort of feature ranking can be really useful, especially when a dataset has a great many features and you would like to trim those that contribute negligibly.

In [None]:
# Model inputs: the three top-ranked columns of `house`, and price as a column vector.
features = house[['lat', 'grade', 'waterfront']].values
target = dataset[['price']].values

In [None]:
features.shape

In [None]:
target.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    random_state=0,
    test_size=0.2,
)


In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)


In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Result tables filled by predictor(), keyed by short model name.
rmse_test = {}
rmse_train = {}
score = {}


def predictor(X_train, X_test, y_train, y_test):
    """Fit a fixed set of regressors and record their errors and scores.

    For every model the function stores, under the model's short name:

    * the root-mean-squared error on the training split in ``rmse_train``,
    * the root-mean-squared error on the test split in ``rmse_test``,
    * ``estimator.score(X_test, y_test)`` in ``score``.

    The three module-level dictionaries are updated in place and returned.

    Parameters
    ----------
    X_train, X_test : array-like of shape (n_samples, n_features)
        Training and test feature matrices.
    y_train, y_test : array-like of shape (n_samples,) or (n_samples, 1)
        Training and test targets; single-column 2-D targets are flattened.

    Returns
    -------
    tuple of dict
        ``(rmse_train, rmse_test, score)``.
    """
    def _flatten(values):
        # Estimators expect a 1-D target; a column vector triggers warnings.
        values = np.asarray(values)
        if values.ndim == 2 and values.shape[1] == 1:
            return values.ravel()
        return values

    def _rmse(actual, predicted):
        # Square root taken explicitly so the code does not depend on the
        # `squared` keyword of mean_squared_error.
        return float(np.sqrt(mean_squared_error(actual, predicted)))

    y_train = _flatten(y_train)
    y_test = _flatten(y_test)

    models = [
        # LinearRegression is built without `normalize`: the keyword is no
        # longer accepted by current scikit-learn.
        ('lr', LinearRegression()),
        ('Ridge', Ridge(alpha=7)),
        ('Lasso', Lasso(alpha=.05)),
        ('Elastic', ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=0)),
        ('RF', RandomForestRegressor(n_jobs=-1, n_estimators=100,
                                     random_state=0)),
        ('GBR', GradientBoostingRegressor(n_estimators=100, random_state=0)),
        ('XGBR', XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0,
                              subsample=0.75, colsample_bytree=1,
                              max_depth=70, random_state=0)),
        ('LGBM', LGBMRegressor(n_jobs=-1, n_estimators=100, random_state=0)),
    ]

    for name, estimator in models:
        estimator.fit(X_train, y_train)
        rmse_train[name] = _rmse(y_train, estimator.predict(X_train))
        rmse_test[name] = _rmse(y_test, estimator.predict(X_test))
        score[name] = estimator.score(X_test, y_test)
    return rmse_train, rmse_test, score


In [None]:
rmse_train, rmse_test, score = predictor(X_train, X_test, y_train, y_test)

In [None]:
rmse_train

In [None]:
rmse_test

In [None]:
score

In [None]:
# House price is a continuous target, so the booster needs a regression
# objective; 'binary:logistic' is a classification objective.
estimator = XGBRegressor(
    objective='reg:squarederror',
    n_jobs=4,
    random_state=42
)


In [None]:
# Hyper-parameter grid explored by the grid search.
parameters = dict(
    max_depth=range(2, 10),
    n_estimators=range(60, 220, 40),
    learning_rate=[0.1, 0.01, 0.05],
)


In [None]:
from sklearn.model_selection import GridSearchCV

# 'roc_auc' is a classification metric; candidates for this regression task
# are compared on (negated) root-mean-squared error instead.
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring='neg_root_mean_squared_error',
    n_jobs=10,
    cv=10,
    verbose=True
)


In [None]:
# best_estimator_ only exists after the search has been fitted.
grid_search.fit(X_train, np.ravel(y_train))
grid_search.best_estimator_
