In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the DeepSolar Data and Fields List

In [3]:
# Tract-level DeepSolar table, read with ISO-8859-1 encoding.
solar = pd.read_csv('../../deepsolar_tract.csv', encoding="ISO-8859-1")
# Only the last expression of a cell is rendered, so this preview is not shown.
solar.head()
# Field dictionary describing each column (includes the 'Relevant Feature' flag).
solar_fields = pd.read_csv('../deepsolar fields.csv')
solar_fields.head()

Unnamed: 0,Field,Description,Unit,Data Type,Formula,Possible Values,Observed Max,Observed Min,Theoretical Min,Theoretical Max,Relevant Feature
0,Unnamed: 0,Index,,Numeric,,,72537.0,0.0,,,0
1,tile_count,total number of tiles in census tract,,Numeric,,,4468.0,0.0,0.0,,0
2,solar_system_count,Total number of solar systems in census tract,,Numeric,,,1535.0,0.0,0.0,,0
3,total_panel_area,,,Numeric,,,592031.075,0.0,0.0,,0
4,fips,FIPS identifier for the census tract,,String,,,,,,,0


In [105]:
# Compare household and housing-unit counts side by side.
solar.loc[:, ['household_count', 'housing_unit_count']]

Unnamed: 0,household_count,housing_unit_count
0,2527,2931
1,2230,2356
2,2698,2909
3,1833,2226
4,1917,2004
5,2656,2845
6,470,559
7,1479,1619
8,2723,3588
9,3282,3566


# Load/Test PySpark

In [4]:
from pyspark import SparkContext

# getOrCreate() reuses the already-running context when this cell is executed
# again; a second bare SparkContext() in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

In [5]:
import numpy as np

TOTAL = 100

# Build TOTAL random 2-element points, each coordinate scaled from [0, 1) to
# [-1, 1), distribute them as an RDD and cache it for the two actions below.
dots = sc.parallelize(
    [2.0 * np.random.random(2) - 1.0 for _ in range(TOTAL)]
).cache()
print("Number of random points:", dots.count())

# Per-coordinate summary statistics computed by Spark.
stats = dots.stats()
print('Mean:', stats.mean())
print('stdev:', stats.stdev())

Number of random points: 100
Mean: [ 0.08095369 -0.04892929]
stdev: [ 0.56633402  0.60561417]


# Train Model Using Spark ML

In [32]:
from numpy import allclose
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.sql.session import SparkSession
from pyspark.sql import *
from pyspark.sql.types import *
# Wrap the existing SparkContext `sc` in a SparkSession; `spark` is the entry
# point used below for reading CSV files and creating DataFrames.
spark = SparkSession(sc)
from pyspark.ml.linalg import DenseVector
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import RegressionEvaluator

## Pre-process for Spark ML

In [None]:
# Names of the fields flagged as model inputs (Relevant Feature == 1).
features = solar_fields.loc[solar_fields['Relevant Feature'] == 1, 'Field'].tolist()

# Independent variables: map +infinity and single-space placeholders to NaN.
# NOTE(review): -np.inf is not in the replacement list; confirm it cannot occur.
solar2 = solar[features].replace([np.inf, ' '], np.nan)

# Dependent variable: infinite and missing values are set to 0.
solar2['number_of_solar_system_per_household'] = (
    solar['number_of_solar_system_per_household'].replace([np.inf, np.nan], 0)
)

# Binary target for the RF classifier: 1 where the per-household value is > 0.
solar2['solar_flag'] = solar2['number_of_solar_system_per_household'].gt(0).astype('int64')

# Encode state as integer category codes.
solar2['state'] = solar2['state'].astype('category').cat.codes

# Cast the vote-winner columns to integers.
solar2['voting_2016_dem_win'] = solar2['voting_2016_dem_win'].astype('int64')
solar2['voting_2012_dem_win'] = solar2['voting_2012_dem_win'].astype('int64')

## Write solar2 to csv - cleaner than trying to convert pandas df to pyspark df

In [65]:
# Persist the pre-processed frame without the pandas index column.
solar2.to_csv(path_or_buf='ml_frame.csv', index=False)

## Read csv into pyspark df

In [87]:
# Read the CSV with a header row and let Spark infer each column's type.
df = spark.read.csv("ml_frame.csv", header=True, inferSchema=True)

In [88]:
# Print the inferred column names and types to check the CSV was parsed as expected.
df.printSchema()

root
 |-- average_household_income: double (nullable = true)
 |-- gini_index: double (nullable = true)
 |-- per_capita_income: double (nullable = true)
 |-- population_density: double (nullable = true)
 |-- state: integer (nullable = true)
 |-- education_less_than_high_school_rate: double (nullable = true)
 |-- education_high_school_graduate_rate: double (nullable = true)
 |-- education_college_rate: double (nullable = true)
 |-- education_bachelor_rate: double (nullable = true)
 |-- education_master_rate: double (nullable = true)
 |-- education_professional_school_rate: double (nullable = true)
 |-- education_doctoral_rate: double (nullable = true)
 |-- race_white_rate: double (nullable = true)
 |-- race_black_africa_rate: double (nullable = true)
 |-- race_indian_alaska_rate: double (nullable = true)
 |-- race_asian_rate: double (nullable = true)
 |-- race_islander_rate: double (nullable = true)
 |-- race_other_rate: double (nullable = true)
 |-- race_two_more_rate: double (nullable 

## Process Spark DataFrame for the Random Forest Models - Create Train and Test Sets

In [89]:
cols = df.columns
# Every column except the two targets is an independent variable.
indep_vars = [
    name for name in cols
    if name not in ('solar_flag', 'number_of_solar_system_per_household')
]
# Target column first, followed by the independent variables.
classifier_cols = ['solar_flag', *indep_vars]
regressor_cols = ['number_of_solar_system_per_household', *indep_vars]

In [90]:
def _label_and_features(row):
    """Split a row into (first column, DenseVector of the remaining columns)."""
    return (row[0], DenseVector(row[1:]))


# 80/20 train/test split with a fixed seed.
trainingData, testData = df.randomSplit([0.8, 0.2], seed=1234)

# Classifier inputs: solar_flag as label, independent variables as features.
classifier_train = trainingData.select(classifier_cols).rdd.map(_label_and_features)
classifier_test = testData.select(classifier_cols).rdd.map(_label_and_features)

# Regressor inputs: number_of_solar_system_per_household as label.
regressor_train = trainingData.select(regressor_cols).rdd.map(_label_and_features)
regressor_test = testData.select(regressor_cols).rdd.map(_label_and_features)

In [91]:
# Turn each (label, features) RDD into a two-column DataFrame.
classifier_train_df = spark.createDataFrame(classifier_train, schema=["label", "features"])
classifier_test_df = spark.createDataFrame(classifier_test, schema=["label", "features"])

regressor_train_df = spark.createDataFrame(regressor_train, schema=["label", "features"])
regressor_test_df = spark.createDataFrame(regressor_test, schema=["label", "features"])

### Generate Full classifier and regressor frames as reference for featureIndexer

In [92]:
# Same (label, features) layout as the train/test sets, built from the
# unsplit frame `df`.
classifier_data = df.select(classifier_cols).rdd.map(
    lambda row: (row[0], DenseVector(row[1:]))
)
regressor_data = df.select(regressor_cols).rdd.map(
    lambda row: (row[0], DenseVector(row[1:]))
)

classifier_df = spark.createDataFrame(classifier_data, schema=["label", "features"])
regressor_df = spark.createDataFrame(regressor_data, schema=["label", "features"])

## Create Model Pipeline - Classifier

In [93]:
#index the labels and the features for the random forest model
# Both indexers are fitted on the full frame (classifier_df), not only on the
# training split.
labelIndexer = StringIndexer(
    inputCol="label",
    outputCol="indexedLabel",
).fit(classifier_df)
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=50,
).fit(classifier_df)

# Random forest over the indexed columns, plus a converter that maps the
# predicted index back to the original label value.
rf = RandomForestClassifier(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
    numTrees=100,
    maxBins=50,
)
labelConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=labelIndexer.labels,
)

# Chain the stages, fit on the training split, then score the test split.
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])
model = pipeline.fit(classifier_train_df)
predictions = model.transform(classifier_test_df)

## Evaluate Classifier Accuracy

In [94]:
# Accuracy is computed in index space: indexedLabel against prediction.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print('test set accuracy: ',accuracy)

test set accuracy:  0.7901055682053405


## Get Classifier Feature Importances

In [95]:
# Stage 2 of the fitted pipeline is the random forest model.
rfModel = model.stages[2]
feature_importances = rfModel.featureImportances
fia = feature_importances.toArray()

# Pair each importance with the independent variable at the same position,
# then rank from most to least important.
fi_list = [(indep_vars[pos], fia[pos]) for pos in range(fia.shape[0])]
sorted_list = sorted(fi_list, key=lambda pair: pair[1], reverse=True)
sorted_list[:20]

[('population_density', 0.13118768896957744),
 ('heating_fuel_coal_coke_rate', 0.098633403376446646),
 ('race_asian_rate', 0.052678830389025426),
 ('occupancy_vacant_rate', 0.051736586298504035),
 ('occupation_agriculture_rate', 0.03639380860602482),
 ('electricity_price_commercial', 0.031224988615582552),
 ('housing_unit_median_gross_rent', 0.027445022386295172),
 ('education_high_school_graduate_rate', 0.022922427733089234),
 ('electricity_consume_total', 0.021794422291053633),
 ('property_tax', 0.021552085043743004),
 ('number_of_years_of_education', 0.019682853553454553),
 ('relative_humidity', 0.019045283843342403),
 ('electricity_price_overall', 0.018920983671550116),
 ('electricity_consume_commercial', 0.017355679171692374),
 ('electricity_consume_residential', 0.016814018507011842),
 ('mortgage_with_rate', 0.016302518215167486),
 ('race_black_africa_rate', 0.015803695962652881),
 ('housing_unit_median_value', 0.015743282759629565),
 ('transportation_public_rate', 0.015573952011

## Create Model Pipeline - RandomForestRegressor

In [96]:
#define the feature indexer
# Feature indexer fitted on the full regressor frame.
featureIndexer2 = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=50,
).fit(regressor_df)

# Random forest regressor over the indexed features.
rf2 = RandomForestRegressor(
    featuresCol="indexedFeatures",
    numTrees=200,
    maxBins=50,
)

# Chain the stages, fit on the training split, then score the test split.
pipeline2 = Pipeline(stages=[featureIndexer2, rf2])
model2 = pipeline2.fit(regressor_train_df)
predictions2 = model2.transform(regressor_test_df)

In [61]:
# Inspect the first 50 scored test rows.
predictions2.show(n=50)

+-----+--------------------+--------------------+--------------------+
|label|            features|     indexedFeatures|          prediction|
+-----+--------------------+--------------------+--------------------+
|  0.0|[9770.23809524,0....|[9770.23809524,0....|0.001449081326517553|
|  0.0|[11758.4745763,0....|[11758.4745763,0....|0.028789590601968798|
|  0.0|[11797.2457627,0....|[11797.2457627,0....| 0.01017962048883469|
|  0.0|[17166.6666667,0....|[17166.6666667,0....|0.001813317692320...|
|  0.0|[19851.1111110999...|[19851.1111110999...|0.001341892594864...|
|  0.0|[20153.2268796,0....|[20153.2268796,0....|0.001384800277668636|
|  0.0|[20448.7021014000...|[20448.7021014000...|0.008925279480280261|
|  0.0|[20837.0873785999...|[20837.0873785999...|0.009356201710655963|
|  0.0|[21236.7384615,0....|[21236.7384615,0....|0.007192531952161102|
|  0.0|[21519.3913043,0....|[21519.3913043,0....|0.005539057246963736|
|  0.0|[21580.3738318,0....|[21580.3738318,0....|0.001376358549263...|
|  0.0

## Assess R^2 for Regression Model

In [97]:
# R^2 of the regressor's predictions against the label column on the test split.
evaluator = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="r2",
)
r2 = evaluator.evaluate(predictions2)
print("R^2 on test data = %g" % r2)

R^2 on test data = 0.199125


## Regression Model Feature Importances

In [98]:
# Stage 1 of the fitted regression pipeline is the random forest model.
rfModel2 = model2.stages[1]
feature_importances2 = rfModel2.featureImportances
fia2 = feature_importances2.toArray()

# Pair each importance with the independent variable at the same position,
# then rank from most to least important.
fi_list2 = [(indep_vars[pos], fia2[pos]) for pos in range(fia2.shape[0])]
sorted_list2 = sorted(fi_list2, key=lambda pair: pair[1], reverse=True)
sorted_list2[:20]

[('state', 0.084087104752229566),
 ('heating_fuel_electricity_rate', 0.065892949906152451),
 ('electricity_price_residential', 0.057150443926521161),
 ('occupancy_owner_rate', 0.053728231346459293),
 ('electricity_price_commercial', 0.044322663264767102),
 ('voting_2012_gop_percentage', 0.034745174064165017),
 ('electricity_price_industrial', 0.033559489325648346),
 ('median_household_income', 0.031770318844977996),
 ('education_high_school_graduate_rate', 0.028625187619716191),
 ('average_household_income', 0.025791543883914945),
 ('voting_2016_dem_percentage', 0.025598945532754037),
 ('electricity_consume_residential', 0.021572830497412737),
 ('electricity_price_overall', 0.01942859823129214),
 ('population_density', 0.018874743085619702),
 ('electricity_consume_commercial', 0.018014974864441174),
 ('household_type_family_rate', 0.017064164720980733),
 ('relative_humidity', 0.0161713084536949),
 ('transportation_public_rate', 0.014736279963133189),
 ('heating_fuel_gas_rate', 0.014204

## Overall R^2 for SolarForest

In [99]:
# Read the decoded `predictedLabel` column, not the raw `prediction` column.
# `prediction` is the StringIndexer index, which is assigned by label
# frequency (most frequent label -> 0.0), so `prediction == 0` does not
# necessarily mean "solar_flag == 0". `predictedLabel` is the original label
# rendered as a string by IndexToString; convert it back to a number so the
# downstream `== 0` check tests the actual flag value.
# NOTE(review): these values are later paired with the regressor predictions
# by list position, which assumes both collect() calls return the test rows
# in the same order - confirm, or join the two frames on a key instead.
classifier_preds=predictions.select('predictedLabel').collect()
classifier_preds_list=[float(row.predictedLabel) for row in classifier_preds]

In [100]:
# Pull the regressor's predictions to the driver as a plain Python list.
regressor_preds = predictions2.select('prediction').collect()
regressor_preds_list = [row.prediction for row in regressor_preds]

In [101]:
def new_preds(classifier,regressor):
    """Combine classifier and regressor outputs into one prediction list.

    For each position in ``classifier``: emit 0 when the classifier value
    equals 0, otherwise emit the ``regressor`` value at the same position.

    Returns a list with one entry per element of ``classifier``.
    """
    return [
        0 if flag == 0 else regressor[pos]
        for pos, flag in enumerate(classifier)
    ]

In [102]:
# Combined prediction: regressor output gated by the classifier output.
new_predictions_list = new_preds(classifier_preds_list, regressor_preds_list)

# Actual values come from the label column of the regressor's scored test frame.
actual_vals = predictions2.select('label').collect()
actual_vals_list = [row.label for row in actual_vals]

In [103]:
from sklearn.metrics import r2_score

In [104]:
# R^2 of the combined classifier+regressor predictions on the test split.
r2_score(y_true=actual_vals_list, y_pred=new_predictions_list)

-0.049687838551428776