In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the DeepSolar Data and Fields List

In [2]:
# Read the tract-level DeepSolar table (decoded as ISO-8859-1).
solar = pd.read_csv('../../deepsolar_tract.csv', encoding="ISO-8859-1")
solar.head()  # mid-cell expression: evaluated, but only a cell's last expression is rendered

# Read the field dictionary (one row per column of the DeepSolar table) and preview it.
solar_fields = pd.read_csv('../deepsolar fields.csv')
solar_fields.head()

Unnamed: 0,Field,Description,Unit,Data Type,Formula,Possible Values,Observed Max,Observed Min,Theoretical Min,Theoretical Max,Relevant Feature
0,Unnamed: 0,Index,,Numeric,,,72537.0,0.0,,,0
1,tile_count,total number of tiles in census tract,,Numeric,,,4468.0,0.0,0.0,,0
2,solar_system_count,Total number of solar systems in census tract,,Numeric,,,1535.0,0.0,0.0,,0
3,total_panel_area,,,Numeric,,,592031.075,0.0,0.0,,0
4,fips,FIPS identifier for the census tract,,String,,,,,,,0


In [3]:
# Preview only the first rows; a bare `solar` asks the notebook to render the
# whole tract table, which bloats the saved output.
solar.head(10)

Unnamed: 0.1,Unnamed: 0,tile_count,solar_system_count,total_panel_area,fips,average_household_income,county,education_bachelor,education_college,education_doctoral,...,incentive_count_nonresidential,incentive_residential_state_level,incentive_nonresidential_state_level,net_metering,feedin_tariff,cooperate_tax,property_tax,sales_tax,rebate,avg_electricity_retail_rate
0,0,0.0,0.0,0.000000,27145011200,70352.789869,Stearns County,569,1690,13,...,39,11,13,34,0,0,25,12,0,9.46
1,1,25.0,21.0,1133.436461,27145011301,61727.085202,Stearns County,674,1434,108,...,39,11,13,34,0,0,25,12,0,9.46
2,2,3.0,3.0,64.505776,27145011302,71496.886583,Stearns County,854,1459,31,...,39,11,13,34,0,0,25,12,0,9.46
3,3,0.0,0.0,0.000000,27145011304,86840.152755,Stearns County,640,1116,68,...,39,11,13,34,0,0,25,12,0,9.46
4,4,5.0,5.0,164.583303,27145011400,89135.315597,Stearns County,654,1314,15,...,39,11,13,34,0,0,25,12,0,9.46
5,5,0.0,0.0,0.000000,27145011500,62225.903614,Stearns County,522,1395,24,...,39,11,13,34,0,0,25,12,0,9.46
6,6,2.0,2.0,25.299013,27145011600,41068.936170,Stearns County,49,278,32,...,39,11,13,34,0,0,25,12,0,9.46
7,7,0.0,0.0,0.000000,27145010500,74073.833671,Stearns County,242,867,10,...,39,11,13,34,0,0,25,12,0,9.46
8,8,0.0,0.0,0.000000,27145011100,69412.192435,Stearns County,527,1665,6,...,39,11,13,34,0,0,25,12,0,9.46
9,9,11.0,10.0,415.365350,27145010102,82502.407069,Stearns County,1582,1949,6,...,39,11,13,34,0,0,25,12,0,9.46


# Load/Test PySpark

In [3]:
from pyspark import SparkContext

# Reuse the active context when this cell is re-executed. Calling SparkContext()
# a second time in the same kernel raises, because only one context may be
# active per JVM; getOrCreate() makes the cell safe to re-run.
sc = SparkContext.getOrCreate()

In [4]:
import numpy as np

# Smoke test for the Spark context: distribute TOTAL random 2-D points, each
# coordinate drawn uniformly from [-1, 1), and summarise them on the cluster.
TOTAL = 100
dots = sc.parallelize(
    [2.0 * np.random.random(2) - 1.0 for _ in range(TOTAL)]
).cache()
print("Number of random points:", dots.count())

# Per-coordinate summary statistics computed by Spark.
stats = dots.stats()
print('Mean:', stats.mean())
print('stdev:', stats.stdev())

Number of random points: 100
Mean: [-0.10418646  0.0869734 ]
stdev: [ 0.60871921  0.57665841]


# Process DeepSolar Data

## Create Training/Dev Sets

In [5]:
# Fixed seed so the train/dev split (and every metric computed from it) is
# reproducible across kernel restarts.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8

# Fields flagged as relevant in the field dictionary, plus the raw target column.
features = solar_fields.loc[solar_fields['Relevant Feature'] == 1, 'Field'].tolist()
all_variables = features + ['number_of_solar_system_per_household']

# Independent variables: infinity and blank strings become NaN.
solar2 = solar[features].replace([np.inf, ' '], np.nan)
# Dependent variable: infinity and missing values become 0.
solar2['number_of_solar_system_per_household'] = (
    solar['number_of_solar_system_per_household'].replace([np.inf, np.nan], 0)
)

# Binary version of number_of_solar_system_per_household for the RF classifier:
# 1 when the value is positive, otherwise 0 (vectorised instead of a row-wise apply).
solar2['solar_flag'] = (solar2['number_of_solar_system_per_household'] > 0).astype(int)

# Dummy (one-hot) variables for state.
solar2 = pd.get_dummies(solar2, columns=['state'])

# Integer (0/1) version of the "Democrat win" indicators.
solar2['voting_2016_dem_win'] = solar2['voting_2016_dem_win'].astype(int)
solar2['voting_2012_dem_win'] = solar2['voting_2012_dem_win'].astype(int)

# Independent-variable frame: everything except the raw target and its binary flag.
independent_vars = solar2.loc[
    :, ~solar2.columns.isin(['number_of_solar_system_per_household', 'solar_flag'])
]

# Seeded row shuffle, then an 80/20 train/dev split.
shuffle = np.random.RandomState(SPLIT_SEED).permutation(independent_vars.shape[0])
split_size = int(shuffle.shape[0] * TRAIN_FRACTION)

X, y = independent_vars.values[shuffle], solar2['solar_flag'].values[shuffle]
X_train, y_train = X[:split_size], y[:split_size]
X_dev, y_dev = X[split_size:], y[split_size:]
print('training data shape: ',X_train.shape)
print('training labels shape: ',y_train.shape)
print('dev data shape: ',X_dev.shape)
print('dev labels shape: ',y_dev.shape)


training data shape:  (58029, 168)
training labels shape:  (58029,)
dev data shape:  (14508, 168)
dev labels shape:  (14508,)


## Train the Model/Make Predictions

In [6]:
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml.linalg import Vectors

In [20]:
#convert training data to LabeledPoint data type
# Pair every training label with its feature row as a LabeledPoint, the record
# type passed to the MLlib trainer.
data = [LabeledPoint(label, row) for label, row in zip(y_train, X_train)]

## Train the Model

In [22]:
# MLlib random forest on the training RDD: two classes, no features declared
# categorical, three trees, fixed seed.
model = RandomForest.trainClassifier(
    sc.parallelize(data),
    numClasses=2,
    categoricalFeaturesInfo={},
    numTrees=3,
    seed=42,
)

## Obtain Dev Set Accuracy

In [24]:
# Convert each dev-set row from a NumPy array into a plain Python list.
dev_set = [list(row) for row in X_dev]

In [27]:
# Distribute the dev rows, score them with the trained forest, and collect the
# predicted classes back to the driver as a Python list.
preds = (
    model
    .predict(sc.parallelize(dev_set))
    .collect()
)

In [29]:
# NumPy view of the collected predictions so they can be compared element-wise.
preds_array = np.asarray(preds)
preds_array

array([ 1.,  1.,  0., ...,  1.,  1.,  1.])

In [30]:
# Dev-set accuracy: fraction of rows whose predicted class equals the true flag.
(y_dev == preds_array).mean()

0.75372208436724564

# Alternative Implementation

In [155]:
from numpy import allclose
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.sql.session import SparkSession
from pyspark.sql import *
from pyspark.sql.types import *
# Obtain the session through the builder (the documented entry point). It wraps
# the already-running SparkContext and returns the existing session when this
# cell is re-executed, instead of constructing a new one each time.
spark = SparkSession.builder.getOrCreate()
from pyspark.ml.linalg import DenseVector
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

## Write Processed DataFrame to CSV (simpler than converting the pandas DataFrame to a PySpark DataFrame directly)

In [116]:
# Keep every column except the raw per-household count; the binary solar_flag
# stays in the frame as the label.
processed_frame = solar2.loc[
    :, solar2.columns != 'number_of_solar_system_per_household'
]
processed_frame.head()

Unnamed: 0,average_household_income,gini_index,per_capita_income,population_density,education_less_than_high_school_rate,education_high_school_graduate_rate,education_college_rate,education_bachelor_rate,education_master_rate,education_professional_school_rate,...,state_sd,state_tn,state_tx,state_ut,state_va,state_vt,state_wa,state_wi,state_wv,state_wy
0,70352.789869,0.349,26999.0,44.52005,0.073879,0.386324,0.371592,0.12511,0.034521,0.005717,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,61727.085202,0.4074,20951.0,482.6443,0.062836,0.217096,0.405887,0.190773,0.080668,0.012171,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,71496.886583,0.3926,28021.0,186.1673,0.064509,0.343973,0.32567,0.190625,0.061607,0.006696,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,86840.152755,0.3949,29275.0,84.1361,0.06684,0.31684,0.322917,0.185185,0.078125,0.010417,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,89135.315597,0.4463,32770.0,119.6323,0.048686,0.293309,0.392473,0.195341,0.050777,0.014934,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [118]:
# Persist the processed frame without the pandas index so Spark can read it back.
processed_frame.to_csv(
    path_or_buf='processed_frame.csv',
    index=False,
)

## Read CSV into a PySpark DataFrame

In [119]:
# Load the CSV written by the previous step; the first line is the header and
# column types are inferred from the data.
csv_reader = spark.read.format("csv")
csv_reader = csv_reader.option("header", True).option("inferSchema", True)
df = csv_reader.load("processed_frame.csv")

## Process DataFrame for Random Forest Classifier

In [141]:
# Reorder the column names so the label (solar_flag) comes first and all
# remaining columns follow in their original order.
cols = df.columns
new_cols = ['solar_flag'] + [name for name in cols if name != 'solar_flag']
new_cols

['solar_flag',
 'average_household_income',
 'gini_index',
 'per_capita_income',
 'population_density',
 'education_less_than_high_school_rate',
 'education_high_school_graduate_rate',
 'education_college_rate',
 'education_bachelor_rate',
 'education_master_rate',
 'education_professional_school_rate',
 'education_doctoral_rate',
 'race_white_rate',
 'race_black_africa_rate',
 'race_indian_alaska_rate',
 'race_asian_rate',
 'race_islander_rate',
 'race_other_rate',
 'race_two_more_rate',
 'employ_rate',
 'poverty_family_below_poverty_level_rate',
 'heating_fuel_gas_rate',
 'heating_fuel_electricity_rate',
 'heating_fuel_fuel_oil_kerosene_rate',
 'heating_fuel_coal_coke_rate',
 'heating_fuel_solar_rate',
 'heating_fuel_other_rate',
 'heating_fuel_none_rate',
 'median_household_income',
 'electricity_price_residential',
 'electricity_price_commercial',
 'electricity_price_industrial',
 'electricity_price_transportation',
 'electricity_price_overall',
 'electricity_consume_residential',


In [144]:
# Map each row to a (label, features) pair: the first selected column is the
# label, and every remaining column is packed into a DenseVector.
input_data = (
    df.select(new_cols)
    .rdd
    .map(lambda row: (row[0], DenseVector(row[1:])))
)

In [145]:
# Two-column modelling frame: "label" plus the assembled "features" vector.
mod_df = spark.createDataFrame(
    input_data,
    schema=["label", "features"],
)

## Create Model Pipeline

In [148]:
# Fit the label indexer on the full frame so it knows every label value; it
# writes the indexed label to "indexedLabel".
labelIndexer = StringIndexer(
    inputCol="label",
    outputCol="indexedLabel",
).fit(mod_df)

In [149]:
# Fit the feature indexer on the full frame. maxCategories=4 is VectorIndexer's
# threshold: features with more distinct values than that are treated as
# continuous. Output goes to "indexedFeatures".
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(mod_df)

In [150]:
# Seeded 80/20 random split into training and held-out test rows.
trainingData, testData = mod_df.randomSplit(weights=[0.8, 0.2], seed=1234)

In [151]:
# Ten-tree random forest over the indexed label and indexed features.
# A fixed seed is passed so the trained forest, and therefore the reported test
# accuracy, is reproducible across kernel restarts.
rf = RandomForestClassifier(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
    numTrees=10,
    seed=42,
)

In [157]:
# Translate the numeric "prediction" column back to the original label values
# using the mapping learned by labelIndexer.
labelConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=labelIndexer.labels,
)

In [158]:
# Chain the stages in execution order: index the label, index the features,
# train/apply the forest, then convert predictions back to label values.
pipeline = Pipeline(
    stages=[
        labelIndexer,
        featureIndexer,
        rf,
        labelConverter,
    ]
)

## Generate Predictions/Assess Accuracy

In [159]:
# Fit the whole pipeline on the training split.
# NOTE: this rebinds `model`, which earlier in the notebook held the MLlib forest.
model = pipeline.fit(dataset=trainingData)

In [160]:
# Run the fitted pipeline over the held-out test split.
predictions = model.transform(dataset=testData)

In [162]:
# Print the first 50 test rows: predicted label, true label, feature vector.
(
    predictions
    .select(["predictedLabel", "label", "features"])
    .show(n=50)
)

+--------------+-----+--------------------+
|predictedLabel|label|            features|
+--------------+-----+--------------------+
|             1|    0|[9770.23809524,0....|
|             1|    0|[11758.4745763,0....|
|             1|    0|[11797.2457627,0....|
|             1|    0|[17166.6666667,0....|
|             1|    0|[19851.1111110999...|
|             1|    0|[20153.2268796,0....|
|             1|    0|[20448.7021014000...|
|             1|    0|[20837.0873785999...|
|             1|    0|[21236.7384615,0....|
|             1|    0|[21519.3913043,0....|
|             1|    0|[21580.3738318,0....|
|             1|    0|[22810.3806228,0....|
|             1|    0|[23990.7788162,0....|
|             1|    0|[24295.5592105,0....|
|             1|    0|[24710.8023072999...|
|             0|    0|[24747.094431,0.4...|
|             1|    0|[25109.5679012,0....|
|             0|    0|[25166.9505963,0....|
|             1|    0|[25277.6448942,0....|
|             1|    0|[26601.676

In [163]:
# Accuracy of the pipeline's predictions against the indexed label, reported
# as its complement (the test error).
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
test_error = 1.0 - accuracy
print("Test Error = %g" % test_error)

Test Error = 0.215424


In [164]:
# Report the accuracy computed by the evaluator on the test split.
print('Test Accuracy: ', accuracy)

Test Accuracy:  0.7845761191970753


## Get Top Feature Importances

In [167]:
# Add the "indexedLabel" column to the full modelling frame (all rows, not
# just the training split).
td = labelIndexer.transform(dataset=mod_df)

In [170]:
# Display the DataFrame repr, which lists the column names and types.
td

DataFrame[label: bigint, features: vector, indexedLabel: double]

In [176]:
# Stand-alone forest used only for feature importances. It is fit on every row
# of `td` and reads the raw "features" vector (not "indexedFeatures"), so
# importance index i corresponds to position i of that vector.
# `rf_model` is the unfitted estimator; `results` is the fitted model.
# A fixed seed keeps the importance ranking stable between runs.
rf_model = RandomForestClassifier(
    labelCol="indexedLabel",
    featuresCol="features",
    numTrees=10,
    seed=42,
)
results = rf_model.fit(td)

In [181]:
# Per-feature importance vector of the fitted forest, indexed by position in
# the "features" vector.
feature_importances=results.featureImportances

In [186]:
# Display the importance vector; features with no recorded importance are omitted from the repr.
feature_importances

SparseVector(168, {0: 0.0083, 1: 0.0003, 2: 0.0355, 3: 0.0596, 4: 0.0101, 6: 0.0007, 7: 0.001, 8: 0.0002, 9: 0.0025, 10: 0.0027, 11: 0.0292, 12: 0.0012, 13: 0.0018, 14: 0.0034, 16: 0.011, 18: 0.0046, 19: 0.0281, 20: 0.0027, 21: 0.0016, 22: 0.0082, 23: 0.1605, 24: 0.0, 25: 0.0068, 26: 0.0004, 27: 0.0028, 28: 0.0056, 29: 0.0007, 30: 0.002, 31: 0.0, 32: 0.0349, 33: 0.0002, 34: 0.0002, 35: 0.0702, 36: 0.0303, 37: 0.0004, 38: 0.0593, 39: 0.0307, 40: 0.0033, 41: 0.012, 42: 0.0023, 43: 0.0016, 46: 0.0033, 48: 0.023, 49: 0.0199, 50: 0.0054, 51: 0.0005, 52: 0.0003, 53: 0.0081, 54: 0.0028, 55: 0.0063, 56: 0.0002, 57: 0.0005, 60: 0.0001, 62: 0.0002, 66: 0.0034, 67: 0.0018, 68: 0.0, 69: 0.0, 73: 0.0041, 75: 0.0001, 76: 0.0076, 77: 0.0009, 78: 0.0025, 79: 0.0007, 80: 0.0841, 81: 0.0005, 82: 0.0245, 83: 0.0009, 87: 0.0001, 88: 0.0023, 89: 0.004, 90: 0.0025, 91: 0.0109, 92: 0.0019, 93: 0.0011, 94: 0.0017, 95: 0.0009, 96: 0.0148, 97: 0.0006, 99: 0.0191, 100: 0.0074, 101: 0.0018, 104: 0.0005, 106: 0.03

In [194]:
# Importance of the first feature as a plain scalar. The previous line could
# not run: it had an unbalanced ")", referenced an undefined `x`, and called
# `as_matrix()`, which NumPy arrays do not provide.
feature_importances.toArray()[0]

0.0083428282483971761

In [199]:
# Dense array of importances, one entry per feature position.
fia = feature_importances.toArray()

# Pair each importance with its column name. new_cols[0] is the label, so
# feature position i corresponds to new_cols[i + 1].
fi_list = [(new_cols[pos + 1], weight) for pos, weight in enumerate(fia)]

# Rank from most to least important and show the top 20.
sorted_list = sorted(fi_list, key=lambda pair: pair[1], reverse=True)
sorted_list[:20]


[('heating_fuel_coal_coke_rate', 0.160457441212084),
 ('occupancy_vacant_rate', 0.084051285683999721),
 ('electricity_consume_industrial', 0.070194270795195052),
 ('population_density', 0.059638866475150411),
 ('housing_unit_median_value', 0.059342263850459376),
 ('number_of_years_of_education', 0.03993639152962386),
 ('per_capita_income', 0.035468022235511813),
 ('electricity_price_overall', 0.034898140216468768),
 ('housing_unit_median_gross_rent', 0.030686357763874416),
 ('electricity_consume_total', 0.03026152063089731),
 ('race_white_rate', 0.029167970513115832),
 ('poverty_family_below_poverty_level_rate', 0.028112067412654131),
 ('mortgage_with_rate', 0.024512071819863392),
 ('relative_humidity', 0.022955361512356592),
 ('daily_solar_radiation', 0.019902084471418797),
 ('travel_time_average', 0.019092064615510114),
 ('incentive_count_residential', 0.018124043950105088),
 ('health_insurance_public_rate', 0.014752364458470706),
 ('lon', 0.012042746861574699),
 ('race_other_rate', 