# Try Again with Values from Supplemental Info Section

## Load/Prep the Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [42]:
solar=pd.read_csv('../deepsolar_tract.csv',encoding = "ISO-8859-1")
solar.head()
solar_fields=pd.read_csv('../deepsolar fields.csv')
solar_fields.head()

Unnamed: 0,Field,Description,Unit,Data Type,Formula,Possible Values,Observed Max,Observed Min,Theoretical Min,Theoretical Max,Relevant Feature,Mentioned in Supplemental Info
0,Unnamed: 0,Index,,Numeric,,,72537.0,0.0,,,0,
1,tile_count,total number of tiles in census tract,,Numeric,,,4468.0,0.0,0.0,,0,
2,solar_system_count,Total number of solar systems in census tract,,Numeric,,,1535.0,0.0,0.0,,0,
3,total_panel_area,,,Numeric,,,592031.075,0.0,0.0,,0,
4,fips,FIPS identifier for the census tract,,String,,,,,,,0,


## Pre-process data; impute missing values using median

In [43]:
#define relevant features and dependent variable


features=solar_fields.loc[(solar_fields['Mentioned in Supplemental Info']==1)]['Field'].tolist()
all_variables=features+['number_of_solar_system_per_household']

solar2=solar[all_variables]

#Create binary version of number_of_solar_system_per_household for RF classifier

solar2['solar_flag']=solar2['number_of_solar_system_per_household'].apply(lambda x: int(x>0))
solar2=solar2.loc[np.isfinite(solar2['number_of_solar_system_per_household'])]

#designate independent variable frame
independent_vars=solar2.loc[:,~solar2.columns.isin(['number_of_solar_system_per_household','solar_flag'])]
X=independent_vars.values
yc=solar2['solar_flag'].values
yr=solar2['number_of_solar_system_per_household'].values

#impute missing values
missing_val_imputer=Imputer(strategy='median')
X=missing_val_imputer.fit_transform(X)


X_train, X_test, yc_train, yc_test,yr_train,yr_test = train_test_split(X, yc,yr, test_size=0.3, random_state=42)



print('training data shape: ',X_train.shape)
print('classifier train labels shape: ',yc_train.shape)
print('regressor train labels shape: ',yr_train.shape)
print('test data shape: ',X_test.shape)
print('classifier test labels shape: ',yc_test.shape)
print('regressor test labels shape: ',yr_test.shape)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


training data shape:  (50236, 95)
classifier train labels shape:  (50236,)
regressor train labels shape:  (50236,)
test data shape:  (21530, 95)
classifier test labels shape:  (21530,)
regressor test labels shape:  (21530,)


## Alternatively, pre-process values using group median by state

In [17]:
features=solar_fields.loc[(solar_fields['Mentioned in Supplemental Info']==1)]['Field'].tolist()
all_variables=features+['number_of_solar_system_per_household']+['state']

solar2=solar[all_variables]

#Create binary version of number_of_solar_system_per_household for RF classifier

solar2['solar_flag']=solar2['number_of_solar_system_per_household'].apply(lambda x: int(x>0))
solar2=solar2.loc[np.isfinite(solar2['number_of_solar_system_per_household'])]

#designate independent variable frame
independent_vars=solar2.loc[:,~solar2.columns.isin(['number_of_solar_system_per_household','solar_flag'])].groupby('state').transform(lambda x: x.fillna(x.median()))
X=independent_vars.values
yc=solar2['solar_flag'].values
yr=solar2['number_of_solar_system_per_household'].values

X_train, X_test, yc_train, yc_test,yr_train,yr_test = train_test_split(X, yc,yr, test_size=0.3, random_state=42)



print('training data shape: ',X_train.shape)
print('classifier train labels shape: ',yc_train.shape)
print('regressor train labels shape: ',yr_train.shape)
print('test data shape: ',X_test.shape)
print('classifier test labels shape: ',yc_test.shape)
print('regressor test labels shape: ',yr_test.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


training data shape:  (50236, 97)
classifier train labels shape:  (50236,)
regressor train labels shape:  (50236,)
test data shape:  (21530, 97)
classifier test labels shape:  (21530,)
regressor test labels shape:  (21530,)


## Train the Classifier

In [18]:
params_dict={'max_depth':(15, 20, 30), 'n_estimators':[100,150,200]}
classifier=RandomForestClassifier()
clf = GridSearchCV(classifier, params_dict, cv=3,n_jobs=-1)
clf.fit(X,yc)

GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': (15, 20, 30), 'n_estimators': [100, 150, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [19]:
clf.cv_results_



{'mean_fit_time': array([  57.39886419,   86.70215297,  118.13012513,   66.8765223 ,
         101.37996372,  133.39823429,   69.39873433,   94.48175327,
         104.51939106]),
 'mean_score_time': array([ 0.60033711,  0.94648743,  1.17688878,  0.7273314 ,  1.00923546,
         1.37408272,  0.67787266,  0.87169433,  0.98681275]),
 'mean_test_score': array([ 0.76888777,  0.76805172,  0.76876237,  0.7654739 ,  0.76532063,
         0.76529276,  0.76268707,  0.76310509,  0.76292395]),
 'mean_train_score': array([ 0.94787225,  0.94833905,  0.9480534 ,  0.99036452,  0.99044812,
         0.99038542,  0.9999582 ,  0.99997213,  1.        ]),
 'param_max_depth': masked_array(data = [15 15 15 20 20 20 30 30 30],
              mask = [False False False False False False False False False],
        fill_value = ?),
 'param_n_estimators': masked_array(data = [100 150 200 100 150 200 100 150 200],
              mask = [False False False False False False False False False],
        fill_value = ?),
 

In [20]:
clf.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [21]:
clf.best_score_

0.76888777415489229

In [59]:
classifier=RandomForestClassifier(max_depth=25,n_estimators=100, n_jobs=-1)
classifier.fit(X_train,yc_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=25, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [60]:
classifier.score(X_test,yc_test)

0.80780306549001391

In [46]:
feature_importances=classifier.feature_importances_
features=independent_vars.columns
feature_tuples=[(features[i],feature_importances[i]) for i in range(len(features))]
sorted_features=sorted(feature_tuples,reverse=True,key=lambda k: k[1])
for i in range(0,20):
    print(sorted_features[i])

('population_density', 0.081252234127436582)
('heating_fuel_coal_coke_rate', 0.038051572503679791)
('housing_unit_median_value', 0.029527141513755323)
('race_asian_rate', 0.027728178506559592)
('occupancy_vacant_rate', 0.026918870276855203)
('housing_unit_median_gross_rent', 0.021552282176967384)
('relative_humidity', 0.021169843676909057)
('daily_solar_radiation', 0.020809245180654315)
('average_household_income', 0.015701815804567931)
('education_high_school_graduate_rate', 0.015469313705492782)
('occupation_agriculture_rate', 0.015043275885951297)
('education_bachelor_rate', 0.014665439525420508)
('number_of_years_of_education', 0.01409047921496499)
('travel_time_less_than_10_rate', 0.014030473650021483)
('transportation_public_rate', 0.013894648541640148)
('mortgage_with_rate', 0.013893873208899786)
('transportation_car_alone_rate', 0.013598833437498976)
('race_white_rate', 0.012816918688231227)
('travel_time_10_19_rate', 0.012691837123245609)
('occupancy_owner_rate', 0.01251130328

### Feature Importances Using Permutation Importance

In [68]:
import rfpimp as rfp

In [74]:
rfp.importances(classifier,pd.DataFrame(X_test,columns=features),pd.DataFrame(yc_test,columns=['solar_flag']))

Unnamed: 0_level_0,Importance
Feature,Unnamed: 1_level_1
population_density,0.0192
daily_solar_radiation,0.0058
heating_fuel_coal_coke_rate,0.0052
relative_humidity,0.0044
race_asian_rate,0.0042
race_white_rate,0.0040
education_high_school_graduate_rate,0.0036
heating_fuel_electricity_rate,0.0032
mortgage_with_rate,0.0032
housing_unit_median_gross_rent,0.0030


## Train the Regressor

In [26]:
params_dict={'max_depth':(None, 10, 15,20), 'n_estimators':[100,150,200]}
regressor=RandomForestRegressor()
clf_r = GridSearchCV(regressor, params_dict, cv=3,n_jobs=-1)
clf_r.fit(X,yr)

GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': (None, 10, 15, 20), 'n_estimators': [100, 150, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [27]:
clf_r.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [28]:
clf_r.best_score_

0.18329555143788012

In [47]:
regressor=RandomForestRegressor(max_depth=None,n_estimators=150,n_jobs=-1)
regressor.fit(X_train,yr_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [48]:
regressor.score(X_test,yr_test)

0.42004730465774254

## Feature Importances

In [49]:
feature_importances=regressor.feature_importances_
features=independent_vars.columns
feature_tuples=[(features[i],feature_importances[i]) for i in range(len(features))]
sorted_features=sorted(feature_tuples,reverse=True,key=lambda k: k[1])
for i in range(0,20):
    print(sorted_features[i])

('age_more_than_85_rate', 0.12340178299804885)
('daily_solar_radiation', 0.10938378764856163)
('occupation_manufacturing_rate', 0.072060978504967413)
('age_25_34_rate', 0.052780783428892357)
('occupancy_owner_rate', 0.049191561545301597)
('education_high_school_graduate_rate', 0.036289482315891887)
('population_density', 0.028523913274468439)
('average_household_income', 0.021746736678951235)
('health_insurance_public_rate', 0.021060905285102034)
('occupation_construction_rate', 0.019733963219661345)
('household_type_family_rate', 0.016788052288447083)
('heating_fuel_gas_rate', 0.015048545899773138)
('avg_electricity_retail_rate', 0.014535654871655907)
('net_metering', 0.014532832357603071)
('frost_days', 0.014372983119111954)
('education_college_rate', 0.013505286712900967)
('voting_2016_dem_percentage', 0.012535762852915381)
('relative_humidity', 0.011840709674098204)
('earth_temperature_amplitude', 0.011675508850153013)
('average_household_size', 0.011554550068678001)


### Feature Importances Using Permutation Importance Calculation

In [75]:
rfp.importances(regressor,pd.DataFrame(X_test,columns=features),pd.DataFrame(yr_test,columns=['number_of_solar_system_per_household']))

Unnamed: 0_level_0,Importance
Feature,Unnamed: 1_level_1
age_more_than_85_rate,0.542441
daily_solar_radiation,0.197658
frost_days,0.082486
education_high_school_graduate_rate,0.074044
average_household_income,0.052126
occupancy_owner_rate,0.051699
population_density,0.043091
health_insurance_public_rate,0.036169
education_college_rate,0.029778
heating_fuel_gas_rate,0.026034


## Put it all Together

In [50]:
classifier_preds=classifier.predict(X_test)
classifier_preds[0:100]

array([1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1], dtype=int64)

In [51]:
regressor_preds=regressor.predict(X_test)
regressor_preds[0:100]

array([  7.63184897e-04,   3.41643126e-02,   5.21087440e-03,
         7.13294700e-04,   2.37500290e-04,   3.43188241e-03,
         1.17781383e-01,   2.49495925e-02,   8.44087353e-02,
         2.67774639e-02,   1.14458618e-03,   1.62076189e-03,
         1.32853109e-03,   1.58408605e-04,   4.61278267e-04,
         1.41956532e-02,   4.43256144e-02,   2.05605951e-02,
         6.46131704e-03,   5.28030101e-04,   3.56837311e-04,
         1.22059497e-01,   9.71670656e-03,   1.14557943e-02,
         1.78388322e-03,   9.55865575e-04,   3.58159138e-03,
         1.74140911e-03,   6.91442337e-03,   1.81767574e-03,
         2.58797171e-03,   6.05078847e-04,   2.92438016e-03,
         3.37513101e-03,   4.90853281e-03,   3.15760312e-02,
         4.68645328e-02,   2.79124037e-04,   5.12004947e-03,
         8.63487099e-05,   4.84770551e-04,   3.09784420e-03,
         1.53632904e-03,   1.76749954e-03,   1.81397409e-03,
         5.85976582e-02,   4.60087820e-04,   1.95697066e-02,
         8.12572525e-04,

In [52]:
final_preds=regressor_preds*classifier_preds
final_preds[0:100]

array([ 0.00076318,  0.03416431,  0.00521087,  0.00071329,  0.        ,
        0.00343188,  0.11778138,  0.        ,  0.08440874,  0.02677746,
        0.00114459,  0.00162076,  0.00132853,  0.        ,  0.        ,
        0.01419565,  0.04432561,  0.0205606 ,  0.00646132,  0.        ,
        0.        ,  0.1220595 ,  0.00971671,  0.01145579,  0.00178388,
        0.        ,  0.00358159,  0.00174141,  0.00691442,  0.00181768,
        0.00258797,  0.        ,  0.00292438,  0.00337513,  0.00490853,
        0.03157603,  0.04686453,  0.        ,  0.        ,  0.        ,
        0.        ,  0.00309784,  0.00153633,  0.0017675 ,  0.00181397,
        0.05859766,  0.        ,  0.01956971,  0.00081257,  0.        ,
        0.00128048,  0.0187409 ,  0.00164539,  0.00167301,  0.00555061,
        0.00164613,  0.04866622,  0.01281308,  0.        ,  0.00121752,
        0.07101307,  0.01504007,  0.03576257,  0.00127573,  0.00298409,
        0.00070192,  0.001302  ,  0.        ,  0.29400233,  0.00

In [53]:
from sklearn.metrics import r2_score

In [36]:
yr_test.shape

(21530,)

In [37]:
final_preds.shape

(71766,)

In [54]:
r2_score(yr_test,final_preds)

0.42418118368511337

## Write Predictions to Output File

In [65]:
solar_full=solar[features].replace([np.inf,' '],np.nan)
solar_full['number_of_solar_system_per_household']=solar['number_of_solar_system_per_household'].replace(np.inf,np.nan)
solar_full['fips']=solar['fips']
solar_full=solar_full.loc[np.isfinite(solar_full['number_of_solar_system_per_household'])]
solar_full=solar_full.assign(predicted_solar=final_preds)
solar_full.head(10)

Unnamed: 0,average_household_income,gini_index,heating_fuel_coal_coke,heating_fuel_electricity,heating_fuel_fuel_oil_kerosene,heating_fuel_gas,heating_fuel_housing_unit_count,heating_fuel_none,heating_fuel_other,heating_fuel_solar,...,net_metering,feedin_tariff,cooperate_tax,property_tax,sales_tax,rebate,avg_electricity_retail_rate,number_of_solar_system_per_household,fips,predicted_solar
0,70352.78987,0.349,200,448,98,1720,2527,10,51,0,...,34,0,0,25,12,0,9.46,0.0,27145011200,0.0
1,61727.0852,0.4074,20,379,17,1799,2230,0,15,0,...,34,0,0,25,12,0,9.46,0.006726,27145011301,0.001495
2,71496.88658,0.3926,69,440,45,2098,2698,29,17,0,...,34,0,0,25,12,0,9.46,0.001112,27145011302,0.001445
3,86840.15275,0.3949,188,442,61,1113,1833,13,16,0,...,34,0,0,25,12,0,9.46,0.0,27145011304,0.0
4,89135.3156,0.4463,96,497,47,1202,1917,31,44,0,...,34,0,0,25,12,0,9.46,0.002087,27145011400,0.001388
5,62225.90361,0.3847,72,759,81,1695,2656,45,4,0,...,34,0,0,25,12,0,9.46,0.0,27145011500,0.0
6,41068.93617,0.7166,3,222,5,193,470,16,31,0,...,34,0,0,25,12,0,9.46,0.004255,27145011600,0.002721
7,74073.83367,0.4317,108,382,190,783,1479,3,13,0,...,34,0,0,25,12,0,9.46,0.0,27145010500,0.0
8,69412.19243,0.3824,241,534,156,1735,2723,20,37,0,...,34,0,0,25,12,0,9.46,0.0,27145011100,0.0
9,82502.40707,0.3588,18,734,27,2419,3282,29,55,0,...,34,0,0,25,12,0,9.46,0.002742,27145010102,0.001954


In [66]:
relevant_fields=['number_of_solar_system_per_household']+features+['fips']
final_frame=pd.merge(solar[relevant_fields],solar_full[['fips','predicted_solar']],on=['fips'],how='left')
final_frame.head(10)

Unnamed: 0,number_of_solar_system_per_household,average_household_income,gini_index,heating_fuel_coal_coke,heating_fuel_electricity,heating_fuel_fuel_oil_kerosene,heating_fuel_gas,heating_fuel_housing_unit_count,heating_fuel_none,heating_fuel_other,...,diversity,net_metering,feedin_tariff,cooperate_tax,property_tax,sales_tax,rebate,avg_electricity_retail_rate,fips,predicted_solar
0,0.0,70352.78987,0.349,200,448,98,1720,2527,10,51,...,0.04696,34,0,0,25,12,0,9.46,27145011200,0.0
1,0.006726,61727.0852,0.4074,20,379,17,1799,2230,0,15,...,0.145934,34,0,0,25,12,0,9.46,27145011301,0.001495
2,0.001112,71496.88658,0.3926,69,440,45,2098,2698,29,17,...,0.00915,34,0,0,25,12,0,9.46,27145011302,0.001445
3,0.0,86840.15275,0.3949,188,442,61,1113,1833,13,16,...,0.187334,34,0,0,25,12,0,9.46,27145011304,0.0
4,0.002087,89135.3156,0.4463,96,497,47,1202,1917,31,44,...,0.090766,34,0,0,25,12,0,9.46,27145011400,0.001388
5,0.0,62225.90361,0.3847,72,759,81,1695,2656,45,4,...,0.081981,34,0,0,25,12,0,9.46,27145011500,0.0
6,0.004255,41068.93617,0.7166,3,222,5,193,470,16,31,...,0.470443,34,0,0,25,12,0,9.46,27145011600,0.002721
7,0.0,74073.83367,0.4317,108,382,190,783,1479,3,13,...,0.008239,34,0,0,25,12,0,9.46,27145010500,0.0
8,0.0,69412.19243,0.3824,241,534,156,1735,2723,20,37,...,0.029574,34,0,0,25,12,0,9.46,27145011100,0.0
9,0.002742,82502.40707,0.3588,18,734,27,2419,3282,29,55,...,0.120052,34,0,0,25,12,0,9.46,27145010102,0.001954


In [67]:
final_frame.to_csv('baseline_model_predictions.csv')

# Try Other Decision Tree Algorithms

### Gradient Boosting Classifier

In [13]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score

In [7]:
classifier=GradientBoostingClassifier()
classifier.fit(X_train,yc_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [8]:
classifier.score(X_test,yc_test)

0.81379470506270324

In [11]:
regressor=GradientBoostingRegressor()
regressor.fit(X_train,yr_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [12]:
regressor.score(X_test,yr_test)

0.32123964790412141

In [14]:
regressor_preds=regressor.predict(X_test)
classifier_preds=classifier.predict(X_test)
final_preds=regressor_preds*classifier_preds
r2_score(yr_test,final_preds)

0.32472191340330459

## XGBoost

In [15]:
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

In [16]:
classifier=XGBClassifier()
classifier.fit(X_train,yc_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [17]:
classifier.score(X_test,yc_test)

0.81216906641895026

In [18]:
regressor=XGBRegressor()
regressor.fit(X_train,yr_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [19]:
regressor.score(X_test,yr_test)

0.30330564225161372

In [20]:
regressor_preds=regressor.predict(X_test)
classifier_preds=classifier.predict(X_test)
final_preds=regressor_preds*classifier_preds
r2_score(yr_test,final_preds)

0.27814778838778842

## Adaboost

In [21]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import r2_score

In [22]:
classifier=AdaBoostClassifier(n_estimators=100)
classifier.fit(X_train,yc_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=100, random_state=None)

In [23]:
classifier.score(X_test,yc_test)

0.80780306549001391

In [24]:
regressor=AdaBoostRegressor(n_estimators=100)
regressor.fit(X_train,yr_train)

AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
         n_estimators=100, random_state=None)

In [25]:
regressor.score(X_test,yr_test)

-35.643734164492642

In [26]:
regressor_preds=regressor.predict(X_test)
classifier_preds=classifier.predict(X_test)
final_preds=regressor_preds*classifier_preds
r2_score(yr_test,final_preds)

-28.201460540731961

## Random Forest

In [27]:
classifier=RandomForestClassifier(n_jobs=-1)
classifier.fit(X_train,yc_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [28]:
classifier.score(X_test,yc_test)

0.79131444496052017

In [29]:
regressor=RandomForestRegressor(n_jobs=-1)
regressor.fit(X_train,yr_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [30]:
regressor.score(X_test,yr_test)

0.36700101017966047

In [31]:
regressor_preds=regressor.predict(X_test)
classifier_preds=classifier.predict(X_test)
final_preds=regressor_preds*classifier_preds
r2_score(yr_test,final_preds)

0.42858626503555475