In [35]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, normalize
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.grid_search import GridSearchCV

import seaborn as sns
import matplotlib.pyplot as plt
from seaborn import barplot
%matplotlib inline

# IMPORT TRAIN DATA

In [3]:
df = pd.read_csv('training_data_cleaned.csv', low_memory = False)
dfy = pd.read_csv('training_data_cleaned_y.csv', header=None)

In [25]:
X = np.array(df.values)
y = dfy.values.reshape(y.shape[0],)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.30)

In [38]:
test_dataframe = pd.read_csv('testing_data_cleaned.csv')
testing_X = test_dataframe.values

In [47]:
true_predicted_values = pd.read_csv('data/do_not_open/test_soln.csv')

In [48]:
true_predicted_values_list = true_predicted_values['SalePrice'].values

In [57]:
true_list = []
for i in true_predicted_values_list:
    true_list.append(float(i))
true_list = np.array(true_list)
true_list

array([ 31000.,  54000.,  26500., ...,  12500.,  10000.,  13000.])

In [60]:
model = RandomForestRegressor(bootstrap = True, criterion='mse', max_depth=None,
            max_features='auto', max_leaf_nodes = None,
            min_impurity_split = 1e-08, min_samples_leaf = 1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators = 100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [65]:
model.fit(X_train, y_train)

array([  5.66061857e-02,   2.46387629e-02,   3.52081989e-02,
         8.84527611e-04,   2.60492080e-01,   5.06227783e-03,
         2.33610858e-03,   7.66210969e-02,   1.99962294e-02,
         7.05708280e-02,   5.63437558e-02,   5.28107447e-03,
         4.53084005e-03,   2.27615940e-01,   3.68294908e-02,
         1.54203323e-02,   3.02936835e-04,   3.23086858e-04,
         1.87259254e-03,   1.13680239e-02,   6.67822750e-04,
         6.77635470e-05,   9.54870714e-04,   2.23708534e-04,
         1.46018906e-03,   4.63249418e-05,   1.40794764e-03,
         1.35922575e-02,   2.18788048e-03,   2.10385874e-03,
         9.00817702e-03,   2.21057294e-03,   3.26209218e-03,
         1.38869143e-03,   4.02763280e-03,   2.58455824e-03,
         1.56421541e-03,   2.94462398e-02,   9.35209739e-06,
         4.23015445e-06,   2.30931951e-04,   1.58329245e-03,
         1.90173180e-03,   1.10787519e-03,   5.28125452e-04,
         8.40832823e-04,   4.67949424e-04,   2.77881515e-03,
         1.52273097e-03,

In [68]:
feature_importance_list = np.column_stack((X, model.feature_importances_))
feature_importance_list

ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [62]:
model.score(X_test, y_test)

0.88413512336001554

In [63]:
predictions = model.predict(testing_X)

In [64]:
log_diff = np.log(predictions + 1) - np.log(true_list + 1)
err = np.sqrt(np.mean(log_diff**2))
err

0.45699142465806342