In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

# Load and Prepare the Data

In [21]:
solar=pd.read_csv('../deepsolar_tract.csv',encoding = "ISO-8859-1")
solar.head()
solar_fields=pd.read_csv('../deepsolar fields.csv')
solar_fields.head()

Unnamed: 0,Field,Description,Unit,Data Type,Formula,Possible Values,Observed Max,Observed Min,Theoretical Min,Theoretical Max,Relevant Feature
0,Unnamed: 0,Index,,Numeric,,,72537.0,0.0,,,0
1,tile_count,total number of tiles in census tract,,Numeric,,,4468.0,0.0,0.0,,0
2,solar_system_count,Total number of solar systems in census tract,,Numeric,,,1535.0,0.0,0.0,,0
3,total_panel_area,,,Numeric,,,592031.075,0.0,0.0,,0
4,fips,FIPS identifier for the census tract,,String,,,,,,,0


In [46]:
#define relevant features and dependent variable


features=solar_fields.loc[(solar_fields['Relevant Feature']==1)]['Field'].tolist()
all_variables=features+['number_of_solar_system_per_household']

#drop rows with NaN values for now
solar2=solar[all_variables].replace([np.inf,' '],np.nan).dropna()

#Create binary version of number_of_solar_system_per_household for RF classifier

solar2['solar_flag']=solar2['number_of_solar_system_per_household'].apply(lambda x: int(x>0))


#create dummy variables for state
solar2=pd.get_dummies(solar2,columns=['state'])

#create binary version of vote dem win variables

solar2['voting_2016_dem_win']=solar2['voting_2016_dem_win'].apply(lambda x: int(x))
solar2['voting_2012_dem_win']=solar2['voting_2012_dem_win'].apply(lambda x: int(x))


#designate independent variable frame
independent_vars=solar2.loc[:,~solar2.columns.isin(['number_of_solar_system_per_household','solar_flag'])]

#create training and test data
shuffle = np.random.permutation(np.arange(independent_vars.shape[0]))
split_size=int(shuffle.shape[0]*0.8)

X,y=independent_vars.values[shuffle],solar2['solar_flag'].values[shuffle]
X_train,y_train=X[0:split_size],y[0:split_size]
X_dev,y_dev=X[split_size:],y[split_size:]
print('training data shape: ',X_train.shape)
print('training labels shape: ',y_train.shape)
print('dev data shape: ',X_dev.shape)
print('dev labels shape: ',y_dev.shape)




training data shape:  (36143, 142)
training labels shape:  (36143,)
dev data shape:  (9036, 142)
dev labels shape:  (9036,)


# Fit the Classifier

In [47]:
classifier=RandomForestClassifier()
classifier.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## Classifier Score on the Dev Set

In [48]:
classifier.score(X_dev,y_dev)

0.80190349712262066

## Feature Importance List - Top 20

In [53]:
feature_importances=classifier.feature_importances_
features=independent_vars.columns
feature_tuples=[(features[i],feature_importances[i]) for i in range(len(features))]
sorted_features=sorted(feature_tuples,reverse=True,key=lambda k: k[1])
for i in range(0,20):
    print(sorted_features[i])

('population_density', 0.038107178854703147)
('occupancy_vacant_rate', 0.02770418171630722)
('heating_fuel_coal_coke_rate', 0.02263082269050521)
('lon', 0.022240364183156319)
('housing_unit_median_gross_rent', 0.019521973176544125)
('education_high_school_graduate_rate', 0.016301678149650535)
('number_of_years_of_education', 0.016264585436631552)
('electricity_consume_total', 0.01541594707953781)
('race_asian_rate', 0.015224226141095819)
('occupation_agriculture_rate', 0.014533732072643796)
('voting_2012_dem_percentage', 0.013979679294780681)
('per_capita_income', 0.013690732732964127)
('electricity_consume_industrial', 0.013570484307351402)
('travel_time_10_19_rate', 0.013518525105643669)
('average_household_income', 0.012952429305080417)
('heating_design_temperature', 0.012410502982126145)
('travel_time_less_than_10_rate', 0.012351416786129023)
('travel_time_40_59_rate', 0.012145865917251438)
('transportation_car_alone_rate', 0.011945228499702046)
('race_white_rate', 0.01182067478840

### Observations

* 80% accuracy on dev set with no hyperparamter tuning
* list of important features is similar to feature importance list for the classifier in SolarForest