Introduction

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Any results you write to the current directory are saved as output.
# Load the competition tables. The 'timestamp' column of the macro, train and
# test tables is parsed into datetimes while reading.
date_columns = ['timestamp']
df_macro = pd.read_csv('../input/macro.csv', parse_dates=date_columns)
df_train = pd.read_csv('../input/train.csv', parse_dates=date_columns)
df_test = pd.read_csv('../input/test.csv', parse_dates=date_columns)
df_sample_submission = pd.read_csv('../input/sample_submission.csv')

In [None]:
# Inputs for the baseline ("weak") model: for each table independently,
# discard every column that contains at least one missing value.
weak_model_train = df_train.dropna(axis='columns', how='any')
weak_model_macro = df_macro.dropna(axis='columns', how='any')
weak_model_test = df_test.dropna(axis='columns', how='any')

In [None]:
i1, i2 = set(weak_model_train.columns), set(weak_model_test.columns)

# Column names that remain in train but not in test
i1 - i2

In [None]:
# Column names that remain in test but not in train
i2 - i1

In [None]:
# Hard-coded column lists removed from each table
# (presumably the names reported by the two set differences above -- verify
# against those outputs if the input data changes).
test_columns_to_drop = ['floor', 'kitch_sq', 'material', 'max_floor', 'num_room']
train_columns_to_drop = ['green_part_2000', 'product_type']

weak_model_test = weak_model_test.drop(test_columns_to_drop, axis=1)
weak_model_train = weak_model_train.drop(train_columns_to_drop, axis=1)

In [None]:
# Display (n_rows, n_columns) of the test table after the column drops
weak_model_test.shape

In [None]:
# Display (n_rows, n_columns) of the train table after the column drops
weak_model_train.shape

In [None]:
# Build the training frame in one pipeline:
#   1. inner-join the macro table onto the training rows by 'timestamp',
#   2. remove the 'id' and 'timestamp' columns,
#   3. keep numeric columns only.
weak_model_union = (
    weak_model_train
    .merge(weak_model_macro, on='timestamp', how='inner')
    .drop(['id', 'timestamp'], axis=1)
    .select_dtypes(include=[np.number])
)

In [None]:
# Attach the macro table to every test row by 'timestamp'.
# A left join is used so that no test row can be dropped: the submission built
# at the end of the notebook pairs the predictions positionally with
# df_test.id, so the row count and row order of the test table must survive.
# Test rows whose timestamp is absent from macro get NaN macro features
# instead of silently disappearing.
n_test_rows = len(weak_model_test)
weak_model_test = weak_model_test.merge(weak_model_macro, on='timestamp', how='left')

# A left join only preserves the row count if macro holds at most one row per
# timestamp; fail loudly otherwise.
assert len(weak_model_test) == n_test_rows, \
    "merging macro changed the number of test rows"

# We remove id and timestamp
weak_model_test = weak_model_test.drop(['id', 'timestamp'], axis=1)

# We only keep continuous predictors
weak_model_test = weak_model_test.select_dtypes([np.number])

In [None]:
index_unions, index_test = set(weak_model_union.columns), set(weak_model_test.columns)

# Column names present in the training frame but absent from the test frame
index_unions - index_test

In [None]:
# Display (n_rows, n_columns) of the merged, numeric-only training frame
weak_model_union.shape

In [None]:
# Display (n_rows, n_columns) of the merged, numeric-only test frame
weak_model_test.shape

In [None]:
# Machine learning - metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import cross_validation, metrics #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search

# Machine learning - algorithms
from sklearn.ensemble import GradientBoostingRegressor

# Machine learning - preprocessing
from sklearn import preprocessing

In [None]:
# Separate the target column from the predictors and work on raw NumPy arrays.
target_column = 'price_doc'
Y = weak_model_union[target_column].values
X = weak_model_union.drop(target_column, axis=1).values

In [None]:
# Standardization: fit a StandardScaler on the feature matrix and replace X
# with its scaled version. The fitted scaler stays available as std_scale.
std_scale = preprocessing.StandardScaler()
X = std_scale.fit_transform(X)

In [None]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Plot training and cross-validation scores against training-set size.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        Forwarded to ``learning_curve``; an object of that type is cloned
        for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Minimum and maximum y values plotted. When None the y limits are
        not set explicitly.

    cv : int, cross-validation generator or an iterable, optional
        Cross-validation splitting strategy, forwarded unchanged to
        ``learning_curve``. Accepted forms are None (library default),
        an integer number of folds, a cross-validation generator, or an
        iterable yielding train/test splits.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).

    train_sizes : array-like, optional
        Training-set sizes forwarded to ``learning_curve``
        (default: five values evenly spaced between 0.1 and 1.0).

    Returns
    -------
    module
        The ``plt`` object the figure was drawn with, so the caller can
        call ``show()`` or ``savefig()`` on it.

    Notes
    -----
    The names ``plt`` and ``learning_curve`` are looked up at call time and
    must be defined in the notebook before this function is called.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # One entry per curve: (mean per size, std per size, colour, legend label).
    # Scores have one row per training size, hence the reduction over axis 1.
    curves = [
        (fit_scores.mean(axis=1), fit_scores.std(axis=1),
         "r", "Training score"),
        (cv_scores.mean(axis=1), cv_scores.std(axis=1),
         "g", "Cross-validation score"),
    ]
    plt.grid()

    # Shaded band of +/- one standard deviation around each mean curve.
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1,
                         color=colour)

    # Mean curves, drawn after both bands.
    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt


In [None]:
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

# Learning curve of a default gradient-boosting regressor, 5-fold CV,
# with the score axis restricted to [0.7, 1.01].
estimator = GradientBoostingRegressor()
title = "Learning Curves (GBR)"
cv = 5
plot_learning_curve(estimator=estimator, title=title, X=X, y=Y,
                    ylim=(0.7, 1.01), cv=cv)

plt.show()

In [None]:
# Hold out 33% of the rows for evaluation (fixed seed for reproducibility).
# train_test_split is taken from sklearn.model_selection -- the module this
# notebook already uses for learning_curve -- because the
# sklearn.cross_validation module is deprecated and absent from recent
# scikit-learn releases.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=5)

In [None]:
# Fit a default gradient-boosting regressor on the training split and keep
# its predictions for both splits.
gbr = GradientBoostingRegressor()
gbr.fit(X_train, Y_train)
pred_train, pred_test = gbr.predict(X_train), gbr.predict(X_test)

In [None]:
# Report the RMSLE of the gradient-boosting model on both splits, reusing the
# predictions already stored in pred_train / pred_test instead of calling
# gbr.predict() a second time.
# NOTE(review): RMSLE is defined in the next cell, so on a fresh
# "Restart & Run All" this cell raises NameError; the RMSLE cell should be
# moved above this one.
print("Fit a model X_train, and calculate RMSLE with Y_train:",
      RMSLE(pred_train, Y_train))
print("Fit a model X_train, and calculate RMSLE with X_test, Y_test:",
      RMSLE(pred_test, Y_test))

In [None]:
# Root Mean Squared Logarithmic Error
def RMSLE(predictions, a):
    """
    Root Mean Squared Logarithmic Error between predictions and actual values.

    Computes ``sqrt(mean((log(predictions + 1) - log(a + 1)) ** 2))``.

    Parameters
    ----------
    predictions : array-like
        Predicted values; every value must be greater than -1.
    a : array-like
        Actual values, same shape as ``predictions``; every value must be
        greater than -1.

    Returns
    -------
    float
        The RMSLE; 0.0 when predictions and actual values coincide.
    """
    predictions = np.asarray(predictions, dtype=float)
    a = np.asarray(a, dtype=float)

    # Per-sample log error. The differences must be squared *before* they are
    # averaged: summing them first lets errors of opposite sign cancel out
    # and makes the result grow with the number of samples.
    log_error = np.log1p(predictions) - np.log1p(a)

    # np.mean also avoids the explicit 1/n factor, which is integer division
    # (always 0) under Python 2.
    return np.sqrt(np.mean(np.square(log_error)))

In [None]:
import xgboost as xgb
# sklearn.cross_validation and sklearn.grid_search are deprecated and absent
# from recent scikit-learn releases; their replacements live in
# sklearn.model_selection, which this notebook already uses for learning_curve.
from sklearn.model_selection import GridSearchCV, KFold, train_test_split

SEED = 31337
rng = np.random.RandomState(SEED)  # kept available under its original name

# An integer seed makes every call to kf.split() yield the same 10 shuffled
# folds, so the manual loop below and the grid search are evaluated on
# identical splits.
kf = KFold(n_splits=10, shuffle=True, random_state=SEED)
err = []

# 10-fold cross-validated RMSLE of an XGBoost regressor with default settings.
for train_index, test_index in kf.split(X):
    xgb_model = xgb.XGBRegressor().fit(X[train_index], Y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = Y[test_index]
    err.append(RMSLE(predictions, actuals))

print("xgboost results, mean: {}, std: {}".format(np.mean(err), np.std(err)))

# Grid search over the number of trees at a fixed depth, on the same folds.
# No `scoring` argument is passed, so candidates are ranked with the
# estimator's own score() method, not with RMSLE.
xgb_model = xgb.XGBRegressor()

param_grid = {'max_depth' : [4],
              'n_estimators': [50, 100]
              }

grid_search = GridSearchCV(xgb_model,
                           param_grid=param_grid,
                           cv=kf,
                           verbose=1)

grid_search.fit(X, Y)
grid_search.best_score_, grid_search.best_params_

# Predict on the test set, then write the predictions to a CSV file

In [None]:
# Build the test feature matrix the same way the training matrix X was built:
#   - same feature columns, in the same order as weak_model_union minus the
#     target (a missing column raises KeyError instead of silently
#     misaligning features),
#   - transformed with std_scale, the scaler fitted on the training features.
# The models are trained on standardized X, so predicting on raw,
# unscaled test values would give meaningless results.
feature_columns = weak_model_union.columns.drop('price_doc')
X_validation = std_scale.transform(weak_model_test[feature_columns].values)

In [None]:
# Refit a default XGBoost regressor on the full training matrix, then predict
# the target for the test feature matrix.
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X, Y)
predictions_validation = xgb_model.predict(X_validation)

In [None]:
# Pair each test id with its predicted price_doc and write the submission
# file (without the DataFrame index) to the current directory.
submission_columns = {
    'id': df_test['id'].values,
    'price_doc': predictions_validation,
}
result_csv = pd.DataFrame(submission_columns)
result_csv.to_csv('result.csv', index=False)