diff --git a/house-prices-py/house-prices-random-forest.ipynb b/house-prices-py/house-prices-random-forest.ipynb new file mode 100644 index 0000000..f9330fa --- /dev/null +++ b/house-prices-py/house-prices-random-forest.ipynb @@ -0,0 +1,1567 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.cross_validation import train_test_split\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DOTSCIENCE_INPUTS=[\"agent1\", \"agent2\"]\n", + "DOTSCIENCE_OUTPUTS=[\"model\"]\n", + "DOTSCIENCE_LABELS={\"model_type\": \"random_forest\"}\n" + ] + } + ], + "source": [ + "print('DOTSCIENCE_INPUTS=[\"agent1\", \"agent2\"]')\n", + "print('DOTSCIENCE_OUTPUTS=[\"model\"]')\n", + "print('DOTSCIENCE_LABELS={\"model_type\": \"random_forest\"}')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We are using a housing price dataset sourced from the Bay Area Home Sales Database and Zillow. The dataset is based on homes sold between January 2013 and December 2015." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "inputs = [pd.read_csv('./agent1/bay_area_zillow_agent1.csv'), pd.read_csv('./agent2/bay_area_zillow_agent2.csv')]\n", + "df = pd.concat(f for f in inputs)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "# randomise my dataframe rows to remove any ordering in the data\n", + "# TODO fix seed to preserve reproducibility\n", + "df = df.sample(frac=1).reset_index(drop=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0addressinfoz_addressbathroomsbedroomsfinishedsqftlastsolddatelastsoldpricelatitudelongitudeneighborhoodtotalroomsusecodeyearbuiltzestimatezindexvaluezipcodezpid
count11330.00000011330113301133011330.00000011330.00000011330.000000113301.133000e+0411330.00000011330.0000001133011330.0000001133011330.0000001.133000e+041.133000e+0411330.0000001.133000e+04
uniqueNaN107301123210684NaNNaNNaN954NaNNaNNaN71NaN10NaNNaNNaNNaNNaN
topNaNAddress: 1300 Eddy StreetSan FranciscoSales price: 850000Sales date: 02...1300 Eddy StNaNNaNNaN08/30/2013NaNNaNNaNMissionNaNSingleFamilyNaNNaNNaNNaNNaN
freqNaN535NaNNaNNaN46NaNNaNNaN540NaN5803NaNNaNNaNNaNNaN
mean9171.729214NaNNaNNaN1.9802292.6144751585.420918NaN1.263928e+0637.759711-122.436518NaN6.111562NaN1948.4981471.565695e+061.320205e+0694116.9120043.689973e+07
std4921.941074NaNNaNNaN1.0473581.299457921.978245NaN1.042079e+060.0255780.030743NaN12.125819NaN37.9111961.229417e+065.848170e+059.4008777.800741e+07
min2.000000NaNNaNNaN0.5000000.0000001.000000NaN5.350000e+0237.708170-122.510726NaN1.000000NaN1860.0000004.323850e+056.881000e+0594102.0000001.506329e+07
25%5039.750000NaNNaNNaN1.0000002.0000001019.000000NaN7.292500e+0537.739286-122.455157NaN4.000000NaN1916.0000009.052375e+059.829000e+0594110.0000001.510847e+07
50%9198.500000NaNNaNNaN2.0000002.0000001362.000000NaN9.900000e+0537.760513-122.432510NaN5.000000NaN1940.0000001.230758e+061.211900e+0694115.0000001.515697e+07
75%13374.750000NaNNaNNaN2.0000003.0000001876.000000NaN1.450000e+0637.781386-122.413359NaN7.000000NaN1986.0000001.731170e+061.480400e+0694123.0000005.970040e+07
max17632.000000NaNNaNNaN14.00000020.00000027275.000000NaN2.388900e+0737.806083-122.381201NaN1264.000000NaN2016.0000001.553325e+075.333500e+0694158.0000002.146999e+09
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 address \\\n", + "count 11330.000000 11330 \n", + "unique NaN 10730 \n", + "top NaN Address: 1300 Eddy Street \n", + "freq NaN 5 \n", + "mean 9171.729214 NaN \n", + "std 4921.941074 NaN \n", + "min 2.000000 NaN \n", + "25% 5039.750000 NaN \n", + "50% 9198.500000 NaN \n", + "75% 13374.750000 NaN \n", + "max 17632.000000 NaN \n", + "\n", + " info z_address \\\n", + "count 11330 11330 \n", + "unique 11232 10684 \n", + "top San FranciscoSales price: 850000Sales date: 02... 1300 Eddy St \n", + "freq 3 5 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN \n", + "\n", + " bathrooms bedrooms finishedsqft lastsolddate lastsoldprice \\\n", + "count 11330.000000 11330.000000 11330.000000 11330 1.133000e+04 \n", + "unique NaN NaN NaN 954 NaN \n", + "top NaN NaN NaN 08/30/2013 NaN \n", + "freq NaN NaN NaN 46 NaN \n", + "mean 1.980229 2.614475 1585.420918 NaN 1.263928e+06 \n", + "std 1.047358 1.299457 921.978245 NaN 1.042079e+06 \n", + "min 0.500000 0.000000 1.000000 NaN 5.350000e+02 \n", + "25% 1.000000 2.000000 1019.000000 NaN 7.292500e+05 \n", + "50% 2.000000 2.000000 1362.000000 NaN 9.900000e+05 \n", + "75% 2.000000 3.000000 1876.000000 NaN 1.450000e+06 \n", + "max 14.000000 20.000000 27275.000000 NaN 2.388900e+07 \n", + "\n", + " latitude longitude neighborhood totalrooms usecode \\\n", + "count 11330.000000 11330.000000 11330 11330.000000 11330 \n", + "unique NaN NaN 71 NaN 10 \n", + "top NaN NaN Mission NaN SingleFamily \n", + "freq NaN NaN 540 NaN 5803 \n", + "mean 37.759711 -122.436518 NaN 6.111562 NaN \n", + "std 0.025578 0.030743 NaN 12.125819 NaN \n", + "min 37.708170 -122.510726 NaN 1.000000 NaN \n", + "25% 37.739286 -122.455157 NaN 4.000000 NaN \n", + "50% 37.760513 -122.432510 NaN 5.000000 NaN \n", + "75% 37.781386 -122.413359 NaN 7.000000 NaN \n", + "max 37.806083 -122.381201 NaN 1264.000000 NaN \n", + "\n", + " yearbuilt zestimate 
zindexvalue zipcode zpid \n", + "count 11330.000000 1.133000e+04 1.133000e+04 11330.000000 1.133000e+04 \n", + "unique NaN NaN NaN NaN NaN \n", + "top NaN NaN NaN NaN NaN \n", + "freq NaN NaN NaN NaN NaN \n", + "mean 1948.498147 1.565695e+06 1.320205e+06 94116.912004 3.689973e+07 \n", + "std 37.911196 1.229417e+06 5.848170e+05 9.400877 7.800741e+07 \n", + "min 1860.000000 4.323850e+05 6.881000e+05 94102.000000 1.506329e+07 \n", + "25% 1916.000000 9.052375e+05 9.829000e+05 94110.000000 1.510847e+07 \n", + "50% 1940.000000 1.230758e+06 1.211900e+06 94115.000000 1.515697e+07 \n", + "75% 1986.000000 1.731170e+06 1.480400e+06 94123.000000 5.970040e+07 \n", + "max 2016.000000 1.553325e+07 5.333500e+06 94158.000000 2.146999e+09 " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe(include = \"all\")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Unnamed: 0', 'address', 'info', 'z_address', 'bathrooms', 'bedrooms',\n", + " 'finishedsqft', 'lastsolddate', 'lastsoldprice', 'latitude',\n", + " 'longitude', 'neighborhood', 'totalrooms', 'usecode', 'yearbuilt',\n", + " 'zestimate', 'zindexvalue', 'zipcode', 'zpid'],\n", + " dtype='object')" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "# drop unneeded columns\n", + "df.drop(df.columns[[0, 1, 2, 3, 11, 13, 14, 15, 16, 17, 18]], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
bathroomsbedroomsfinishedsqftlastsolddatelastsoldpricelatitudelongitudetotalrooms
count11330.00000011330.00000011330.000000113301.133000e+0411330.00000011330.00000011330.000000
uniqueNaNNaNNaN954NaNNaNNaNNaN
topNaNNaNNaN08/30/2013NaNNaNNaNNaN
freqNaNNaNNaN46NaNNaNNaNNaN
mean1.9802292.6144751585.420918NaN1.263928e+0637.759711-122.4365186.111562
std1.0473581.299457921.978245NaN1.042079e+060.0255780.03074312.125819
min0.5000000.0000001.000000NaN5.350000e+0237.708170-122.5107261.000000
25%1.0000002.0000001019.000000NaN7.292500e+0537.739286-122.4551574.000000
50%2.0000002.0000001362.000000NaN9.900000e+0537.760513-122.4325105.000000
75%2.0000003.0000001876.000000NaN1.450000e+0637.781386-122.4133597.000000
max14.00000020.00000027275.000000NaN2.388900e+0737.806083-122.3812011264.000000
\n", + "
" + ], + "text/plain": [ + " bathrooms bedrooms finishedsqft lastsolddate lastsoldprice \\\n", + "count 11330.000000 11330.000000 11330.000000 11330 1.133000e+04 \n", + "unique NaN NaN NaN 954 NaN \n", + "top NaN NaN NaN 08/30/2013 NaN \n", + "freq NaN NaN NaN 46 NaN \n", + "mean 1.980229 2.614475 1585.420918 NaN 1.263928e+06 \n", + "std 1.047358 1.299457 921.978245 NaN 1.042079e+06 \n", + "min 0.500000 0.000000 1.000000 NaN 5.350000e+02 \n", + "25% 1.000000 2.000000 1019.000000 NaN 7.292500e+05 \n", + "50% 2.000000 2.000000 1362.000000 NaN 9.900000e+05 \n", + "75% 2.000000 3.000000 1876.000000 NaN 1.450000e+06 \n", + "max 14.000000 20.000000 27275.000000 NaN 2.388900e+07 \n", + "\n", + " latitude longitude totalrooms \n", + "count 11330.000000 11330.000000 11330.000000 \n", + "unique NaN NaN NaN \n", + "top NaN NaN NaN \n", + "freq NaN NaN NaN \n", + "mean 37.759711 -122.436518 6.111562 \n", + "std 0.025578 0.030743 12.125819 \n", + "min 37.708170 -122.510726 1.000000 \n", + "25% 37.739286 -122.455157 4.000000 \n", + "50% 37.760513 -122.432510 5.000000 \n", + "75% 37.781386 -122.413359 7.000000 \n", + "max 37.806083 -122.381201 1264.000000 " + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe(include = \"all\")" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "bathrooms False\n", + "bedrooms False\n", + "finishedsqft False\n", + "lastsolddate False\n", + "lastsoldprice False\n", + "latitude False\n", + "longitude False\n", + "totalrooms False\n", + "dtype: bool" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# check none of our data is null or NaN\n", + "df.isnull().any()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "bathrooms float64\n", + "bedrooms int64\n", + 
"finishedsqft int64\n", + "lastsolddate object\n", + "lastsoldprice int64\n", + "latitude float64\n", + "longitude float64\n", + "totalrooms int64\n", + "dtype: object" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dtypes\n" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "df['bathrooms'] = df['bathrooms'].astype('int64', copy=False)\n", + "df['lastsolddate'] = pd.to_datetime(df['lastsolddate'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "bathrooms int64\n", + "bedrooms int64\n", + "finishedsqft int64\n", + "lastsolddate datetime64[ns]\n", + "lastsoldprice int64\n", + "latitude float64\n", + "longitude float64\n", + "totalrooms int64\n", + "dtype: object" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We hypothesise that `finishedsqft`, `bathrooms` and `bedrooms` are positively correlated with `lastsoldprice`. Let's plot these to see." + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.plot(x='finishedsqft', y='lastsoldprice', style='o')\n", + "df.plot(x='bathrooms', y='lastsoldprice', style='o')\n", + "df.plot(x='bedrooms', y='lastsoldprice', style='o')\n", + "df.plot(x='totalrooms', y='lastsoldprice', style='o')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's try a random forest model on those features, plus `longitude` and `latitude`, to predict `lastsoldprice`." + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "X = df[['bathrooms', 'bedrooms', 'finishedsqft', 'totalrooms', 'longitude', 'latitude']]\n", + "Y = df['lastsoldprice']" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "features = df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['bathrooms', 'bedrooms', 'finishedsqft', 'lastsolddate',\n", + " 'lastsoldprice', 'latitude', 'longitude', 'totalrooms'],\n", + " dtype='object')" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# All remaining dataframe columns (note: this includes the target `lastsoldprice` and `lastsolddate`, neither of which is a column of X):\n", + "features" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "# Split data into test and training set. 
Use random_state for reproducibility\n", + "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)\n", + "\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "param_grid = [\n", + " {'n_estimators': [3, 10, 12, 14], 'max_features': [1,2, 3, 4, 5, 6], 'bootstrap': [False]}\n", + "]\n", + "\n", + "rand_forest_regressor = RandomForestRegressor()\n", + "\n", + "# grid_search = GridSearchCV(rand_forest_regressor, param_grid, cv=5, scoring='neg_mean_squared_error')\n", + "\n", + "# grid_search.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['bootstrap', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rand_forest_regressor.get_params().keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "grid_search = GridSearchCV(rand_forest_regressor, param_grid, cv=5, scoring='neg_mean_squared_error')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'bootstrap': False, 'max_features': 1, 'n_estimators': 3},\n", + " {'bootstrap': False, 'max_features': 1, 'n_estimators': 10},\n", + " {'bootstrap': False, 'max_features': 1, 'n_estimators': 12},\n", + " {'bootstrap': False, 'max_features': 1, 'n_estimators': 14},\n", + " {'bootstrap': False, 'max_features': 2, 'n_estimators': 3},\n", + " {'bootstrap': False, 'max_features': 2, 'n_estimators': 10},\n", + " {'bootstrap': False, 'max_features': 2, 
'n_estimators': 12},\n", + " {'bootstrap': False, 'max_features': 2, 'n_estimators': 14},\n", + " {'bootstrap': False, 'max_features': 3, 'n_estimators': 3},\n", + " {'bootstrap': False, 'max_features': 3, 'n_estimators': 10},\n", + " {'bootstrap': False, 'max_features': 3, 'n_estimators': 12},\n", + " {'bootstrap': False, 'max_features': 3, 'n_estimators': 14},\n", + " {'bootstrap': False, 'max_features': 4, 'n_estimators': 3},\n", + " {'bootstrap': False, 'max_features': 4, 'n_estimators': 10},\n", + " {'bootstrap': False, 'max_features': 4, 'n_estimators': 12},\n", + " {'bootstrap': False, 'max_features': 4, 'n_estimators': 14},\n", + " {'bootstrap': False, 'max_features': 5, 'n_estimators': 3},\n", + " {'bootstrap': False, 'max_features': 5, 'n_estimators': 10},\n", + " {'bootstrap': False, 'max_features': 5, 'n_estimators': 12},\n", + " {'bootstrap': False, 'max_features': 5, 'n_estimators': 14},\n", + " {'bootstrap': False, 'max_features': 6, 'n_estimators': 3},\n", + " {'bootstrap': False, 'max_features': 6, 'n_estimators': 10},\n", + " {'bootstrap': False, 'max_features': 6, 'n_estimators': 12},\n", + " {'bootstrap': False, 'max_features': 6, 'n_estimators': 14}]" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#rand_forest_regressor.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "GridSearchCV(cv=5, error_score='raise',\n", + " estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n", + " max_features='auto', max_leaf_nodes=None,\n", + " min_impurity_decrease=0.0, min_impurity_split=None,\n", + " min_samples_leaf=1, min_samples_split=2,\n", + " 
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n", + " oob_score=False, random_state=None, verbose=0, warm_start=False),\n", + " fit_params=None, iid=True, n_jobs=1,\n", + " param_grid=[{'n_estimators': [3, 10, 12, 14], 'max_features': [1, 2, 3, 4, 5, 6], 'bootstrap': [False]}],\n", + " pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n", + " scoring='neg_mean_squared_error', verbose=0)" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grid_search.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "bootstrap False\n", + "max_features 1\n", + "n_estimators 3\n", + "bootstrap False\n", + "max_features 1\n", + "n_estimators 10\n", + "bootstrap False\n", + "max_features 1\n", + "n_estimators 12\n", + "bootstrap False\n", + "max_features 1\n", + "n_estimators 14\n", + "bootstrap False\n", + "max_features 2\n", + "n_estimators 3\n", + "bootstrap False\n", + "max_features 2\n", + "n_estimators 10\n", + "bootstrap False\n", + "max_features 2\n", + "n_estimators 12\n", + "bootstrap False\n", + "max_features 2\n", + "n_estimators 14\n", + "bootstrap False\n", + "max_features 3\n", + "n_estimators 3\n", + "bootstrap False\n", + "max_features 3\n", + "n_estimators 10\n", + "bootstrap False\n", + "max_features 3\n", + "n_estimators 12\n", + "bootstrap False\n", + "max_features 3\n", + "n_estimators 14\n", + "bootstrap False\n", + "max_features 4\n", + "n_estimators 3\n", + "bootstrap False\n", + "max_features 4\n", + "n_estimators 10\n", + "bootstrap False\n", + "max_features 4\n", + "n_estimators 12\n", + "bootstrap False\n", + "max_features 4\n", + "n_estimators 14\n", + "bootstrap False\n", + "max_features 5\n", + "n_estimators 3\n", + "bootstrap False\n", + "max_features 5\n", + "n_estimators 10\n", + "bootstrap False\n", + "max_features 5\n", + "n_estimators 
12\n", + "bootstrap False\n", + "max_features 5\n", + "n_estimators 14\n", + "bootstrap False\n", + "max_features 6\n", + "n_estimators 3\n", + "bootstrap False\n", + "max_features 6\n", + "n_estimators 10\n", + "bootstrap False\n", + "max_features 6\n", + "n_estimators 12\n", + "bootstrap False\n", + "max_features 6\n", + "n_estimators 14\n" + ] + } + ], + "source": [ + "for param_dict in grid_search.cv_results_['params']:\n", + " for key in param_dict:\n", + " print(key, param_dict[key])\n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'bootstrap': False, 'max_features': 1, 'n_estimators': 3},\n", + " {'bootstrap': False, 'max_features': 1, 'n_estimators': 10},\n", + " {'bootstrap': False, 'max_features': 1, 'n_estimators': 12},\n", + " {'bootstrap': False, 'max_features': 1, 'n_estimators': 14},\n", + " {'bootstrap': False, 'max_features': 2, 'n_estimators': 3},\n", + " {'bootstrap': False, 'max_features': 2, 'n_estimators': 10},\n", + " {'bootstrap': False, 'max_features': 2, 'n_estimators': 12},\n", + " {'bootstrap': False, 'max_features': 2, 'n_estimators': 14},\n", + " {'bootstrap': False, 'max_features': 3, 'n_estimators': 3},\n", + " {'bootstrap': False, 'max_features': 3, 'n_estimators': 10},\n", + " {'bootstrap': False, 'max_features': 3, 'n_estimators': 12},\n", + " {'bootstrap': False, 'max_features': 3, 'n_estimators': 14},\n", + " {'bootstrap': False, 'max_features': 4, 'n_estimators': 3},\n", + " {'bootstrap': False, 'max_features': 4, 'n_estimators': 10},\n", + " {'bootstrap': False, 'max_features': 4, 'n_estimators': 12},\n", + " {'bootstrap': False, 'max_features': 4, 'n_estimators': 14},\n", + " {'bootstrap': False, 'max_features': 5, 'n_estimators': 3},\n", + " {'bootstrap': False, 'max_features': 5, 'n_estimators': 10},\n", + " {'bootstrap': False, 'max_features': 5, 'n_estimators': 12},\n", + " {'bootstrap': False, 'max_features': 5, 
'n_estimators': 14},\n", + " {'bootstrap': False, 'max_features': 6, 'n_estimators': 3},\n", + " {'bootstrap': False, 'max_features': 6, 'n_estimators': 10},\n", + " {'bootstrap': False, 'max_features': 6, 'n_estimators': 12},\n", + " {'bootstrap': False, 'max_features': 6, 'n_estimators': 14}]" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#print(grid_search.cv_results_)\n", + "grid_search.cv_results_['params']" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split0_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True\n", + " warnings.warn(*warn_args, **warn_kwargs)\n", + "/opt/conda/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split1_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True\n", + " warnings.warn(*warn_args, **warn_kwargs)\n", + "/opt/conda/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split2_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True\n", + " warnings.warn(*warn_args, **warn_kwargs)\n", + "/opt/conda/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split3_train_score'), which will not be available by default any more in 0.21. 
If you need training scores, please set return_train_score=True\n", + " warnings.warn(*warn_args, **warn_kwargs)\n", + "/opt/conda/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split4_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True\n", + " warnings.warn(*warn_args, **warn_kwargs)\n", + "/opt/conda/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('mean_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True\n", + " warnings.warn(*warn_args, **warn_kwargs)\n", + "/opt/conda/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('std_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True\n", + " warnings.warn(*warn_args, **warn_kwargs)\n" + ] + }, + { + "data": { + "text/plain": [ + "{'mean_fit_time': array([ 0.02670035, 0.07218471, 0.0794826 , 0.09442725, 0.03206363,\n", + " 0.09923239, 0.11128678, 0.13725066, 0.04301653, 0.13199544,\n", + " 0.17645354, 0.19107656, 0.05362077, 0.16885095, 0.21285925,\n", + " 0.23253999, 0.06591096, 0.21372252, 0.23496523, 0.26321697,\n", + " 0.06686144, 0.22868457, 0.28511877, 0.33475099]),\n", + " 'std_fit_time': array([ 0.0027579 , 0.00708657, 0.00544732, 0.00600505, 0.00313444,\n", + " 0.00198584, 0.0021343 , 0.00471558, 0.00172394, 0.00476192,\n", + " 0.01372714, 0.01207889, 0.00258355, 0.00605446, 0.00831972,\n", + " 0.00932508, 0.00374727, 0.0087858 , 0.00829984, 0.00423302,\n", + " 0.00184826, 0.00382915, 0.02120295, 0.01036391]),\n", + " 'mean_score_time': array([ 0.00182452, 0.00407996, 0.00453982, 0.00528498, 0.00177431,\n", + " 0.00419474, 0.00507927, 0.00540996, 0.00154238, 0.00390286,\n", + 
" 0.00559421, 0.005477 , 0.00178895, 0.0042151 , 0.00505462,\n", + " 0.00534844, 0.00163779, 0.00430694, 0.00482154, 0.00492797,\n", + " 0.00133824, 0.0037324 , 0.00462227, 0.00533772]),\n", + " 'std_score_time': array([ 2.85087678e-04, 4.44251950e-04, 8.28719934e-05,\n", + " 3.31613740e-04, 2.64974638e-04, 4.98963847e-04,\n", + " 8.44094708e-04, 3.86222350e-04, 4.98128023e-05,\n", + " 1.73863773e-04, 5.80196938e-04, 5.62661409e-04,\n", + " 3.03510076e-04, 3.24203004e-04, 4.87569144e-04,\n", + " 3.82854244e-04, 7.05502451e-05, 8.13061547e-04,\n", + " 1.00646838e-03, 7.94383515e-05, 9.54047369e-05,\n", + " 2.45328589e-04, 4.12959738e-04, 4.63507548e-04]),\n", + " 'param_bootstrap': masked_array(data = [False False False False False False False False False False False False\n", + " False False False False False False False False False False False False],\n", + " mask = [False False False False False False False False False False False False\n", + " False False False False False False False False False False False False],\n", + " fill_value = ?),\n", + " 'param_max_features': masked_array(data = [1 1 1 1 2 2 2 2 3 3 3 3 4 4 4 4 5 5 5 5 6 6 6 6],\n", + " mask = [False False False False False False False False False False False False\n", + " False False False False False False False False False False False False],\n", + " fill_value = ?),\n", + " 'param_n_estimators': masked_array(data = [3 10 12 14 3 10 12 14 3 10 12 14 3 10 12 14 3 10 12 14 3 10 12 14],\n", + " mask = [False False False False False False False False False False False False\n", + " False False False False False False False False False False False False],\n", + " fill_value = ?),\n", + " 'params': [{'bootstrap': False, 'max_features': 1, 'n_estimators': 3},\n", + " {'bootstrap': False, 'max_features': 1, 'n_estimators': 10},\n", + " {'bootstrap': False, 'max_features': 1, 'n_estimators': 12},\n", + " {'bootstrap': False, 'max_features': 1, 'n_estimators': 14},\n", + " {'bootstrap': False, 
'max_features': 2, 'n_estimators': 3},\n", + " {'bootstrap': False, 'max_features': 2, 'n_estimators': 10},\n", + " {'bootstrap': False, 'max_features': 2, 'n_estimators': 12},\n", + " {'bootstrap': False, 'max_features': 2, 'n_estimators': 14},\n", + " {'bootstrap': False, 'max_features': 3, 'n_estimators': 3},\n", + " {'bootstrap': False, 'max_features': 3, 'n_estimators': 10},\n", + " {'bootstrap': False, 'max_features': 3, 'n_estimators': 12},\n", + " {'bootstrap': False, 'max_features': 3, 'n_estimators': 14},\n", + " {'bootstrap': False, 'max_features': 4, 'n_estimators': 3},\n", + " {'bootstrap': False, 'max_features': 4, 'n_estimators': 10},\n", + " {'bootstrap': False, 'max_features': 4, 'n_estimators': 12},\n", + " {'bootstrap': False, 'max_features': 4, 'n_estimators': 14},\n", + " {'bootstrap': False, 'max_features': 5, 'n_estimators': 3},\n", + " {'bootstrap': False, 'max_features': 5, 'n_estimators': 10},\n", + " {'bootstrap': False, 'max_features': 5, 'n_estimators': 12},\n", + " {'bootstrap': False, 'max_features': 5, 'n_estimators': 14},\n", + " {'bootstrap': False, 'max_features': 6, 'n_estimators': 3},\n", + " {'bootstrap': False, 'max_features': 6, 'n_estimators': 10},\n", + " {'bootstrap': False, 'max_features': 6, 'n_estimators': 12},\n", + " {'bootstrap': False, 'max_features': 6, 'n_estimators': 14}],\n", + " 'split0_test_score': array([ -3.73210453e+11, -3.75617421e+11, -3.53000648e+11,\n", + " -3.48841678e+11, -3.89588473e+11, -3.50343802e+11,\n", + " -3.38603327e+11, -3.36419514e+11, -4.35346491e+11,\n", + " -3.19536464e+11, -3.37024631e+11, -3.29556892e+11,\n", + " -4.12524444e+11, -3.31318687e+11, -3.43613585e+11,\n", + " -3.31097418e+11, -4.08422691e+11, -3.67054186e+11,\n", + " -3.69188263e+11, -3.73568342e+11, -5.09776978e+11,\n", + " -5.07857455e+11, -5.16109617e+11, -5.03238538e+11]),\n", + " 'split1_test_score': array([ -4.85921454e+11, -4.26055299e+11, -4.37122637e+11,\n", + " -3.57437502e+11, -4.85473380e+11, 
-4.07036025e+11,\n", + " -3.85138089e+11, -3.87314074e+11, -4.39112367e+11,\n", + " -3.46495062e+11, -3.70806046e+11, -4.14826225e+11,\n", + " -4.54859564e+11, -3.76532174e+11, -3.68583079e+11,\n", + " -3.86200349e+11, -4.99693692e+11, -3.74434248e+11,\n", + " -4.07055253e+11, -3.95715275e+11, -5.77531645e+11,\n", + " -5.70379520e+11, -5.75627608e+11, -5.74490267e+11]),\n", + " 'split2_test_score': array([ -3.53828995e+11, -3.04408880e+11, -2.89291056e+11,\n", + " -2.66893217e+11, -3.51380273e+11, -2.72608128e+11,\n", + " -3.02995686e+11, -2.88950680e+11, -3.31649826e+11,\n", + " -2.70944599e+11, -2.80415236e+11, -2.55037934e+11,\n", + " -3.80568173e+11, -2.84812762e+11, -2.85750408e+11,\n", + " -2.76453429e+11, -3.47291223e+11, -3.66580438e+11,\n", + " -2.92836064e+11, -2.89943075e+11, -4.97099654e+11,\n", + " -4.88324636e+11, -4.85433095e+11, -4.86742451e+11]),\n", + " 'split3_test_score': array([ -3.39816242e+11, -2.83434114e+11, -2.68707981e+11,\n", + " -2.82834452e+11, -3.69059905e+11, -3.10539503e+11,\n", + " -3.04013156e+11, -2.72275162e+11, -3.69934651e+11,\n", + " -2.88188104e+11, -2.82239079e+11, -3.07033906e+11,\n", + " -3.35689531e+11, -2.86390315e+11, -2.97174218e+11,\n", + " -3.12073956e+11, -3.83696120e+11, -3.01340113e+11,\n", + " -2.79114192e+11, -2.77464461e+11, -4.07529908e+11,\n", + " -4.03536456e+11, -4.00890599e+11, -4.08953559e+11]),\n", + " 'split4_test_score': array([ -3.94472822e+11, -3.01609342e+11, -2.82826344e+11,\n", + " -3.12078007e+11, -4.22748129e+11, -2.71236224e+11,\n", + " -2.89100625e+11, -3.02856998e+11, -3.74722610e+11,\n", + " -3.04669627e+11, -2.96513114e+11, -2.83199011e+11,\n", + " -4.26019631e+11, -3.65731558e+11, -3.10472419e+11,\n", + " -3.41221055e+11, -4.31300989e+11, -3.99737860e+11,\n", + " -3.86339668e+11, -3.81359891e+11, -5.84484502e+11,\n", + " -5.71717919e+11, -5.87681309e+11, -5.79422226e+11]),\n", + " 'mean_test_score': array([ -3.89447945e+11, -3.38229726e+11, -3.26193114e+11,\n", + " -3.13621413e+11, 
-4.03648259e+11, -3.22356266e+11,\n", + " -3.23972022e+11, -3.17565663e+11, -3.90158887e+11,\n", + " -3.05968482e+11, -3.13402600e+11, -3.17932259e+11,\n", + " -4.01933604e+11, -3.28957397e+11, -3.21121578e+11,\n", + " -3.29409454e+11, -4.14080230e+11, -3.61830028e+11,\n", + " -3.46909497e+11, -3.43613986e+11, -5.15283843e+11,\n", + " -5.08363134e+11, -5.13148819e+11, -5.10568484e+11]),\n", + " 'std_test_score': array([ 5.16266300e+10, 5.40327175e+10, 6.25603187e+10,\n", + " 3.54797081e+10, 4.72966041e+10, 5.13430957e+10,\n", + " 3.46673258e+10, 4.07575192e+10, 4.12533288e+10,\n", + " 2.59641526e+10, 3.52060003e+10, 5.44107709e+10,\n", + " 4.08343925e+10, 3.84214836e+10, 3.06470604e+10,\n", + " 3.59766998e+10, 5.10696945e+10, 3.25702517e+10,\n", + " 5.13563692e+10, 4.95820595e+10, 6.42383656e+10,\n", + " 6.20449245e+10, 6.75790690e+10, 6.28830243e+10]),\n", + " 'rank_test_score': array([16, 12, 9, 3, 19, 7, 8, 4, 17, 1, 2, 5, 18, 10, 6, 11, 20,\n", + " 15, 14, 13, 24, 21, 23, 22], dtype=int32),\n", + " 'split0_train_score': array([ -1.39474571e+09, -1.39557176e+09, -1.39474593e+09,\n", + " -1.39488215e+09, -1.39492085e+09, -1.39478511e+09,\n", + " -1.39516107e+09, -1.39478041e+09, -1.39474571e+09,\n", + " -1.39477849e+09, -1.39496611e+09, -1.39479348e+09,\n", + " -1.39509599e+09, -1.39474571e+09, -1.39531606e+09,\n", + " -1.39474571e+09, -1.39474571e+09, -1.39474571e+09,\n", + " -1.39474571e+09, -1.39474973e+09, -1.39474571e+09,\n", + " -1.39474571e+09, -1.39474571e+09, -1.39474571e+09]),\n", + " 'split1_train_score': array([ -3.71654798e+09, -3.71667824e+09, -3.71682614e+09,\n", + " -3.71659435e+09, -3.72530378e+09, -3.71657083e+09,\n", + " -3.71657912e+09, -3.71677602e+09, -3.71709522e+09,\n", + " -3.71654924e+09, -3.71654957e+09, -3.71656658e+09,\n", + " -3.71655148e+09, -3.71656124e+09, -3.71655148e+09,\n", + " -3.71674498e+09, -3.71654798e+09, -3.71655082e+09,\n", + " -3.71654820e+09, -3.71654898e+09, -3.71654798e+09,\n", + " -3.71654798e+09, -3.71654798e+09, 
-3.71654798e+09]),\n", + " 'split2_train_score': array([ -3.83041187e+09, -3.83046681e+09, -3.83129961e+09,\n", + " -3.83111138e+09, -3.83041187e+09, -3.83054584e+09,\n", + " -3.83076155e+09, -3.83104134e+09, -3.83041187e+09,\n", + " -3.83041344e+09, -3.83042318e+09, -3.83041776e+09,\n", + " -3.83041187e+09, -3.83041187e+09, -3.83041187e+09,\n", + " -3.83041187e+09, -3.83041187e+09, -3.83041187e+09,\n", + " -3.83041187e+09, -3.83041187e+09, -3.83041187e+09,\n", + " -3.83041187e+09, -3.83041187e+09, -3.83041187e+09]),\n", + " 'split3_train_score': array([ -3.82759258e+09, -3.82757983e+09, -3.82763665e+09,\n", + " -3.82771829e+09, -3.83186892e+09, -3.82777968e+09,\n", + " -3.82758574e+09, -3.82757857e+09, -3.82757857e+09,\n", + " -3.82757857e+09, -3.82757857e+09, -3.82757857e+09,\n", + " -3.82757857e+09, -3.82757857e+09, -3.82757857e+09,\n", + " -3.82757857e+09, -3.82757857e+09, -3.82757857e+09,\n", + " -3.82757857e+09, -3.82757857e+09, -3.82757857e+09,\n", + " -3.82757857e+09, -3.82757857e+09, -3.82757857e+09]),\n", + " 'split4_train_score': array([ -1.80517362e+09, -1.80562566e+09, -1.80531850e+09,\n", + " -1.80520222e+09, -1.80517362e+09, -1.80520514e+09,\n", + " -1.80536271e+09, -1.80517430e+09, -1.80517712e+09,\n", + " -1.80517393e+09, -1.80517362e+09, -1.80519409e+09,\n", + " -1.80517362e+09, -1.80517393e+09, -1.80517383e+09,\n", + " -1.80517510e+09, -1.80517362e+09, -1.80517362e+09,\n", + " -1.80517449e+09, -1.80517378e+09, -1.80517362e+09,\n", + " -1.80517362e+09, -1.80517362e+09, -1.80517362e+09]),\n", + " 'mean_train_score': array([ -2.91489435e+09, -2.91518446e+09, -2.91516536e+09,\n", + " -2.91510168e+09, -2.91753581e+09, -2.91497732e+09,\n", + " -2.91509004e+09, -2.91507013e+09, -2.91500170e+09,\n", + " -2.91489874e+09, -2.91493821e+09, -2.91491010e+09,\n", + " -2.91496231e+09, -2.91489426e+09, -2.91500636e+09,\n", + " -2.91493125e+09, -2.91489155e+09, -2.91489212e+09,\n", + " -2.91489177e+09, -2.91489259e+09, -2.91489155e+09,\n", + " -2.91489155e+09, 
-2.91489155e+09, -2.91489155e+09]),\n", + " 'std_train_score': array([ 1.08223574e+09, 1.08193741e+09, 1.08240484e+09,\n", + " 1.08233799e+09, 1.08420872e+09, 1.08227581e+09,\n", + " 1.08214290e+09, 1.08236379e+09, 1.08231375e+09,\n", + " 1.08222455e+09, 1.08217361e+09, 1.08221951e+09,\n", + " 1.08213549e+09, 1.08223527e+09, 1.08207363e+09,\n", + " 1.08226226e+09, 1.08223337e+09, 1.08223379e+09,\n", + " 1.08223323e+09, 1.08223236e+09, 1.08223337e+09,\n", + " 1.08223337e+09, 1.08223337e+09, 1.08223337e+09])}" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grid_search.cv_results_" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "grid_search.best_params_" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "ename": "NotFittedError", + "evalue": "This GridSearchCV instance is not fitted yet. 
Call 'fit' with appropriate arguments before using this method.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNotFittedError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0my_pred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgrid_search\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/sklearn/utils/metaestimators.py\u001b[0m in \u001b[0;36m\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 114\u001b[0m \u001b[0;31m# lambda, but not partial, allows help() to work with update_wrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 115\u001b[0;31m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 116\u001b[0m \u001b[0;31m# update the docstring of the returned function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 117\u001b[0m \u001b[0mupdate_wrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m/opt/conda/lib/python3.6/site-packages/sklearn/model_selection/_search.py\u001b[0m in \u001b[0;36mpredict\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m 465\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 466\u001b[0m \"\"\"\n\u001b[0;32m--> 467\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_is_fitted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'predict'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 468\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbest_estimator_\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 469\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/sklearn/model_selection/_search.py\u001b[0m in \u001b[0;36m_check_is_fitted\u001b[0;34m(self, method_name)\u001b[0m\n\u001b[1;32m 449\u001b[0m % (type(self).__name__, method_name))\n\u001b[1;32m 450\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 451\u001b[0;31m \u001b[0mcheck_is_fitted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'best_estimator_'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 452\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 453\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mif_delegate_has_method\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdelegate\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'best_estimator_'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'estimator'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_is_fitted\u001b[0;34m(estimator, attributes, msg, all_or_any)\u001b[0m\n\u001b[1;32m 766\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 767\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mall_or_any\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattr\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mattr\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mattributes\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 768\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mNotFittedError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m'name'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 769\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 770\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNotFittedError\u001b[0m: This GridSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this method." 
+     ]
+    }
+   ],
+   "source": [
+    "y_pred = grid_search.predict(X_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#rand_forest_regressor.score(X_test, y_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "-387803891893.90759"
+      ]
+     },
+     "execution_count": 50,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "grid_search.score(X_test, y_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# classification_report() is meant for discrete class labels. The target here\n",
+    "# is a continuous house price, so calling it raised\n",
+    "# 'Mix type of y not allowed'. Report regression error metrics on the\n",
+    "# held-out set instead (both are in the same units as the target).\n",
+    "from sklearn.metrics import mean_absolute_error, median_absolute_error\n",
+    "\n",
+    "print('Random Forest MAE: %.4f' % mean_absolute_error(y_test, y_pred))\n",
+    "print('Random Forest median absolute error: %.4f' % median_absolute_error(y_test, y_pred))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {}, +
"outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# grid_search.score() applies whatever scorer the search was configured with;\n",
+    "# the value it returned here (about -3.9e11) is not a plausible R squared, so\n",
+    "# negating it and labelling it R squared was misleading. Compute R squared\n",
+    "# explicitly from the predictions of the refitted best estimator instead.\n",
+    "from sklearn.metrics import r2_score\n",
+    "\n",
+    "# Kept in a variable because DOTSCIENCE_SUMMARY below reports it.\n",
+    "regressor_score = r2_score(y_test, grid_search.predict(X_test))\n",
+    "print('Random Forest coefficient of determination (R squared): %.4f' % regressor_score)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.metrics import mean_squared_error\n",
+    "\n",
+    "# The lin_* names are kept because 'lin_rmse' is the key reported in\n",
+    "# DOTSCIENCE_SUMMARY below; the model itself is a random forest.\n",
+    "# scikit-learn metrics take (y_true, y_pred) in that order.\n",
+    "lin_mse = mean_squared_error(y_test, y_pred)\n",
+    "lin_rmse = np.sqrt(lin_mse)\n",
+    "print('Random Forest RMSE: %.4f' % lin_rmse)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# json is imported here so this cell does not depend on an import made elsewhere.\n",
+    "import json\n",
+    "\n",
+    "print('DOTSCIENCE_PARAMETERS=' + json.dumps({\"features\": \", \".join(sorted(features))}))\n",
+    "\n",
+    "\n",
+    "print('DOTSCIENCE_SUMMARY=' + json.dumps({\"lin_rmse\": lin_rmse, \"regressor_score\": regressor_score}))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}