In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

# Load and Prepare the Data

In [21]:
solar=pd.read_csv('../deepsolar_tract.csv',encoding = "ISO-8859-1")
solar.head()
solar_fields=pd.read_csv('../deepsolar fields.csv')
solar_fields.head()

Unnamed: 0,Field,Description,Unit,Data Type,Formula,Possible Values,Observed Max,Observed Min,Theoretical Min,Theoretical Max,Relevant Feature
0,Unnamed: 0,Index,,Numeric,,,72537.0,0.0,,,0
1,tile_count,total number of tiles in census tract,,Numeric,,,4468.0,0.0,0.0,,0
2,solar_system_count,Total number of solar systems in census tract,,Numeric,,,1535.0,0.0,0.0,,0
3,total_panel_area,,,Numeric,,,592031.075,0.0,0.0,,0
4,fips,FIPS identifier for the census tract,,String,,,,,,,0


In [46]:
#define relevant features and dependent variable


features=solar_fields.loc[(solar_fields['Relevant Feature']==1)]['Field'].tolist()
all_variables=features+['number_of_solar_system_per_household']

#drop rows with NaN values for now
solar2=solar[all_variables].replace([np.inf,' '],np.nan).dropna()

#Create binary version of number_of_solar_system_per_household for RF classifier

solar2['solar_flag']=solar2['number_of_solar_system_per_household'].apply(lambda x: int(x>0))


#create dummy variables for state
solar2=pd.get_dummies(solar2,columns=['state'])

#create binary version of vote dem win variables

solar2['voting_2016_dem_win']=solar2['voting_2016_dem_win'].apply(lambda x: int(x))
solar2['voting_2012_dem_win']=solar2['voting_2012_dem_win'].apply(lambda x: int(x))


#designate independent variable frame
independent_vars=solar2.loc[:,~solar2.columns.isin(['number_of_solar_system_per_household','solar_flag'])]

#create training and test data
shuffle = np.random.permutation(np.arange(independent_vars.shape[0]))
split_size=int(shuffle.shape[0]*0.8)

X,y=independent_vars.values[shuffle],solar2['solar_flag'].values[shuffle]
X_train,y_train=X[0:split_size],y[0:split_size]
X_dev,y_dev=X[split_size:],y[split_size:]
print('training data shape: ',X_train.shape)
print('training labels shape: ',y_train.shape)
print('dev data shape: ',X_dev.shape)
print('dev labels shape: ',y_dev.shape)




training data shape:  (36143, 142)
training labels shape:  (36143,)
dev data shape:  (9036, 142)
dev labels shape:  (9036,)


# Fit the Classifier

In [47]:
classifier=RandomForestClassifier()
classifier.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## Classifier Score on the Dev Set

In [48]:
classifier.score(X_dev,y_dev)

0.80190349712262066

## Feature Importance List - Top 20

In [53]:
feature_importances=classifier.feature_importances_
features=independent_vars.columns
feature_tuples=[(features[i],feature_importances[i]) for i in range(len(features))]
sorted_features=sorted(feature_tuples,reverse=True,key=lambda k: k[1])
for i in range(0,20):
    print(sorted_features[i])

('population_density', 0.038107178854703147)
('occupancy_vacant_rate', 0.02770418171630722)
('heating_fuel_coal_coke_rate', 0.02263082269050521)
('lon', 0.022240364183156319)
('housing_unit_median_gross_rent', 0.019521973176544125)
('education_high_school_graduate_rate', 0.016301678149650535)
('number_of_years_of_education', 0.016264585436631552)
('electricity_consume_total', 0.01541594707953781)
('race_asian_rate', 0.015224226141095819)
('occupation_agriculture_rate', 0.014533732072643796)
('voting_2012_dem_percentage', 0.013979679294780681)
('per_capita_income', 0.013690732732964127)
('electricity_consume_industrial', 0.013570484307351402)
('travel_time_10_19_rate', 0.013518525105643669)
('average_household_income', 0.012952429305080417)
('heating_design_temperature', 0.012410502982126145)
('travel_time_less_than_10_rate', 0.012351416786129023)
('travel_time_40_59_rate', 0.012145865917251438)
('transportation_car_alone_rate', 0.011945228499702046)
('race_white_rate', 0.01182067478840

### Observations

* 80% accuracy on dev set with no hyperparamter tuning
* list of important features is similar to feature importance list for the classifier in SolarForest

# Try Again with Values from Supplemental Info Section

## Load/Prep the Data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [3]:
solar=pd.read_csv('../deepsolar_tract.csv',encoding = "ISO-8859-1")
solar.head()
solar_fields=pd.read_csv('../deepsolar fields.csv')
solar_fields.head()

Unnamed: 0,Field,Description,Unit,Data Type,Formula,Possible Values,Observed Max,Observed Min,Theoretical Min,Theoretical Max,Relevant Feature,Mentioned in Supplemental Info
0,Unnamed: 0,Index,,Numeric,,,72537.0,0.0,,,0,
1,tile_count,total number of tiles in census tract,,Numeric,,,4468.0,0.0,0.0,,0,
2,solar_system_count,Total number of solar systems in census tract,,Numeric,,,1535.0,0.0,0.0,,0,
3,total_panel_area,,,Numeric,,,592031.075,0.0,0.0,,0,
4,fips,FIPS identifier for the census tract,,String,,,,,,,0,


In [5]:
#define relevant features and dependent variable


features=solar_fields.loc[(solar_fields['Mentioned in Supplemental Info']==1)]['Field'].tolist()
all_variables=features+['number_of_solar_system_per_household']

solar2=solar[all_variables]

#Create binary version of number_of_solar_system_per_household for RF classifier

solar2['solar_flag']=solar2['number_of_solar_system_per_household'].apply(lambda x: int(x>0))
solar2=solar2.loc[np.isfinite(solar2['number_of_solar_system_per_household'])]

#designate independent variable frame
independent_vars=solar2.loc[:,~solar2.columns.isin(['number_of_solar_system_per_household','solar_flag'])]
X=independent_vars.values
yc=solar2['solar_flag'].values
yr=solar2['number_of_solar_system_per_household'].values

#impute missing values
missing_val_imputer=Imputer(strategy='median')
X=missing_val_imputer.fit_transform(X)


X_train, X_test, yc_train, yc_test,yr_train,yr_test = train_test_split(X, yc,yr, test_size=0.3, random_state=42)



print('training data shape: ',X_train.shape)
print('classifier train labels shape: ',yc_train.shape)
print('regressor train labels shape: ',yr_train.shape)
print('test data shape: ',X_test.shape)
print('classifier test labels shape: ',yc_test.shape)
print('regressor test labels shape: ',yr_test.shape)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


training data shape:  (50236, 97)
classifier train labels shape:  (50236,)
regressor train labels shape:  (50236,)
test data shape:  (21530, 97)
classifier test labels shape:  (21530,)
regressor test labels shape:  (21530,)


## Run the Classifier

In [6]:
classifier=RandomForestClassifier()
classifier.fit(X_train,yc_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [7]:
classifier.score(X_test,yc_test)

0.78857408267533668

### Try with GridSearch

In [40]:
params_dict={'max_depth':(15, 20, 30), 'n_estimators':[100,150,200]}
classifier=RandomForestClassifier()
clf = GridSearchCV(classifier, params_dict, cv=3,n_jobs=-1)
clf.fit(X,yc)

GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': (15, 20, 30), 'n_estimators': [100, 150, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [41]:
clf.cv_results_



{'mean_fit_time': array([  64.03800201,   94.74899475,  130.92498597,   76.42666507,
         110.05466549,  141.88466334,   75.21966537,   96.4893326 ,
         105.43634653]),
 'mean_score_time': array([ 0.63266706,  0.99101329,  1.47533043,  0.73066735,  1.124669  ,
         1.48134168,  0.75333563,  0.86800043,  1.05031784]),
 'mean_test_score': array([ 0.76976563,  0.77010005,  0.77110331,  0.76826074,  0.76728534,
         0.76813533,  0.76526489,  0.76615668,  0.76681158]),
 'mean_train_score': array([ 0.94578213,  0.94663908,  0.94668785,  0.98827439,  0.98915921,
         0.98922888,  0.99993033,  0.99996516,  0.99996516]),
 'param_max_depth': masked_array(data = [15 15 15 20 20 20 30 30 30],
              mask = [False False False False False False False False False],
        fill_value = ?),
 'param_n_estimators': masked_array(data = [100 150 200 100 150 200 100 150 200],
              mask = [False False False False False False False False False],
        fill_value = ?),
 

In [42]:
clf.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [43]:
clf.best_score_

0.77110330797313487

In [12]:
classifier=RandomForestClassifier(max_depth=15,n_estimators=200, n_jobs=-1)
classifier.fit(X_train,yc_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [13]:
classifier.score(X_test,yc_test)

0.81007895959126797

In [14]:
feature_importances=classifier.feature_importances_
features=independent_vars.columns
feature_tuples=[(features[i],feature_importances[i]) for i in range(len(features))]
sorted_features=sorted(feature_tuples,reverse=True,key=lambda k: k[1])
for i in range(0,20):
    print(sorted_features[i])

('population_density', 0.082371804053878298)
('heating_fuel_housing_unit_count', 0.035031509309243591)
('race_asian_rate', 0.027140577303938134)
('housing_unit_median_value', 0.025006715476245001)
('occupancy_vacant_rate', 0.023619916748656565)
('daily_solar_radiation', 0.023169724684760785)
('heating_fuel_coal_coke', 0.020340427680199977)
('housing_unit_median_gross_rent', 0.019417423133018016)
('relative_humidity', 0.018114988880803339)
('heating_fuel_gas', 0.017656120357699968)
('average_household_income', 0.015430440620901032)
('education_high_school_graduate_rate', 0.014970768580587896)
('number_of_years_of_education', 0.01457771882572215)
('heating_fuel_electricity', 0.014551685727852879)
('race_white_rate', 0.014435951709916952)
('education_bachelor_rate', 0.014281799690752979)
('mortgage_with_rate', 0.013547239530027797)
('travel_time_less_than_10_rate', 0.013347216996470335)
('travel_time_average', 0.013344409332333346)
('transportation_public_rate', 0.013264758818592355)


# Train the Regressor

In [44]:
params_dict={'max_depth':(None, 10, 15), 'n_estimators':[100,150,200]}
regressor=RandomForestRegressor()
clf_r = GridSearchCV(regressor, params_dict, cv=3,n_jobs=-1)
clf_r.fit(X,yr)

GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': (None, 10, 15), 'n_estimators': [100, 150, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [46]:
clf_r.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=15,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [47]:
clf_r.best_score_

0.16963068462854103

In [8]:
regressor=RandomForestRegressor(max_depth=15,n_estimators=150,n_jobs=-1)
regressor.fit(X_train,yr_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=15,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [9]:
regressor.score(X_test,yr_test)

0.44434322263741666

## Feature Importances

In [11]:
feature_importances=regressor.feature_importances_
features=independent_vars.columns
feature_tuples=[(features[i],feature_importances[i]) for i in range(len(features))]
sorted_features=sorted(feature_tuples,reverse=True,key=lambda k: k[1])
for i in range(0,20):
    print(sorted_features[i])

('age_more_than_85_rate', 0.11642611854656532)
('daily_solar_radiation', 0.10586856316714562)
('age_25_34_rate', 0.077091593721001772)
('occupation_manufacturing_rate', 0.072977960182347201)
('heating_fuel_housing_unit_count', 0.065991014442417728)
('occupancy_owner_rate', 0.049155040732826512)
('education_high_school_graduate_rate', 0.03762831556129953)
('population_density', 0.029652075736730611)
('occupation_construction_rate', 0.029447385505135218)
('average_household_income', 0.022477050653042677)
('frost_days', 0.014519909074967587)
('relative_humidity', 0.014031481288856961)
('net_metering', 0.013601183241591075)
('voting_2016_gop_percentage', 0.0129401024253794)
('heating_fuel_electricity', 0.012614881116415792)
('household_type_family_rate', 0.012346258444754211)
('education_college_rate', 0.011147643647710466)
('occupancy_vacant_rate', 0.010407835619004668)
('earth_temperature_amplitude', 0.0098384891444286155)
('avg_electricity_retail_rate', 0.0095585214218685873)


# Put it all Together

In [50]:
classifier=RandomForestClassifier(max_depth=15,n_estimators=200, n_jobs=-1)
classifier.fit(X_train,yc_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [51]:
classifier.score(X_test,yc_test)

0.81133302368787741

In [62]:
classifier_preds=classifier.predict(X)
classifier_preds[0:100]

array([0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [63]:
regressor_preds=regressor.predict(X)
regressor_preds[0:100]

array([ 0.00114169,  0.0014949 ,  0.00144492,  0.00159351,  0.00138759,
        0.00113158,  0.00272081,  0.00113158,  0.00109523,  0.0019542 ,
        0.0014949 ,  0.0014949 ,  0.00152301,  0.0012744 ,  0.00119131,
        0.0014949 ,  0.00113512,  0.00112879,  0.00115699,  0.0014949 ,
        0.00152301,  0.0014949 ,  0.00321333,  0.0014949 ,  0.0019942 ,
        0.0014949 ,  0.00175004,  0.00112353,  0.0011263 ,  0.01416651,
        0.00748623,  0.00747806,  0.01461936,  0.0086219 ,  0.00542014,
        0.01203697,  0.02050741,  0.01121039,  0.00745423,  0.01468637,
        0.00780877,  0.01718609,  0.00823749,  0.00537207,  0.01747512,
        0.01221843,  0.00588148,  0.02147911,  0.01045148,  0.00524687,
        0.00762797,  0.00847698,  0.01284661,  0.02334697,  0.01394182,
        0.00696577,  0.00932254,  0.00280443,  0.00520237,  0.0073258 ,
        0.00764751,  0.00723688,  0.00448796,  0.00703137,  0.02156461,
        0.00480691,  0.01105882,  0.00792109,  0.02423347,  0.01

In [64]:
final_preds=regressor_preds*classifier_preds
final_preds[0:100]

array([ 0.        ,  0.0014949 ,  0.00144492,  0.        ,  0.00138759,
        0.        ,  0.00272081,  0.        ,  0.        ,  0.0019542 ,
        0.0014949 ,  0.0014949 ,  0.00152301,  0.        ,  0.00119131,
        0.0014949 ,  0.00113512,  0.        ,  0.00115699,  0.0014949 ,
        0.00152301,  0.0014949 ,  0.00321333,  0.0014949 ,  0.0019942 ,
        0.0014949 ,  0.00175004,  0.        ,  0.        ,  0.01416651,
        0.00748623,  0.00747806,  0.01461936,  0.0086219 ,  0.00542014,
        0.01203697,  0.02050741,  0.01121039,  0.00745423,  0.01468637,
        0.00780877,  0.01718609,  0.00823749,  0.00537207,  0.01747512,
        0.01221843,  0.00588148,  0.02147911,  0.01045148,  0.00524687,
        0.00762797,  0.00847698,  0.01284661,  0.02334697,  0.01394182,
        0.00696577,  0.00932254,  0.00280443,  0.00520237,  0.0073258 ,
        0.00764751,  0.00723688,  0.00448796,  0.00703137,  0.02156461,
        0.00480691,  0.01105882,  0.00792109,  0.02423347,  0.01

In [56]:
from sklearn.metrics import r2_score

In [57]:
r2_score(yr_test,final_preds)

0.46463879012921439

In [65]:
solar_full=solar[features].replace([np.inf,' '],np.nan)
solar_full['number_of_solar_system_per_household']=solar['number_of_solar_system_per_household'].replace(np.inf,np.nan)
solar_full['fips']=solar['fips']
solar_full=solar_full.loc[np.isfinite(solar_full['number_of_solar_system_per_household'])]
solar_full=solar_full.assign(predicted_solar=final_preds)
solar_full.head(10)

Unnamed: 0,average_household_income,gini_index,heating_fuel_coal_coke,heating_fuel_electricity,heating_fuel_fuel_oil_kerosene,heating_fuel_gas,heating_fuel_housing_unit_count,heating_fuel_none,heating_fuel_other,heating_fuel_solar,...,net_metering,feedin_tariff,cooperate_tax,property_tax,sales_tax,rebate,avg_electricity_retail_rate,number_of_solar_system_per_household,fips,predicted_solar
0,70352.78987,0.349,200,448,98,1720,2527,10,51,0,...,34,0,0,25,12,0,9.46,0.0,27145011200,0.0
1,61727.0852,0.4074,20,379,17,1799,2230,0,15,0,...,34,0,0,25,12,0,9.46,0.006726,27145011301,0.001495
2,71496.88658,0.3926,69,440,45,2098,2698,29,17,0,...,34,0,0,25,12,0,9.46,0.001112,27145011302,0.001445
3,86840.15275,0.3949,188,442,61,1113,1833,13,16,0,...,34,0,0,25,12,0,9.46,0.0,27145011304,0.0
4,89135.3156,0.4463,96,497,47,1202,1917,31,44,0,...,34,0,0,25,12,0,9.46,0.002087,27145011400,0.001388
5,62225.90361,0.3847,72,759,81,1695,2656,45,4,0,...,34,0,0,25,12,0,9.46,0.0,27145011500,0.0
6,41068.93617,0.7166,3,222,5,193,470,16,31,0,...,34,0,0,25,12,0,9.46,0.004255,27145011600,0.002721
7,74073.83367,0.4317,108,382,190,783,1479,3,13,0,...,34,0,0,25,12,0,9.46,0.0,27145010500,0.0
8,69412.19243,0.3824,241,534,156,1735,2723,20,37,0,...,34,0,0,25,12,0,9.46,0.0,27145011100,0.0
9,82502.40707,0.3588,18,734,27,2419,3282,29,55,0,...,34,0,0,25,12,0,9.46,0.002742,27145010102,0.001954


In [66]:
relevant_fields=['number_of_solar_system_per_household']+features+['fips']
final_frame=pd.merge(solar[relevant_fields],solar_full[['fips','predicted_solar']],on=['fips'],how='left')
final_frame.head(10)

Unnamed: 0,number_of_solar_system_per_household,average_household_income,gini_index,heating_fuel_coal_coke,heating_fuel_electricity,heating_fuel_fuel_oil_kerosene,heating_fuel_gas,heating_fuel_housing_unit_count,heating_fuel_none,heating_fuel_other,...,diversity,net_metering,feedin_tariff,cooperate_tax,property_tax,sales_tax,rebate,avg_electricity_retail_rate,fips,predicted_solar
0,0.0,70352.78987,0.349,200,448,98,1720,2527,10,51,...,0.04696,34,0,0,25,12,0,9.46,27145011200,0.0
1,0.006726,61727.0852,0.4074,20,379,17,1799,2230,0,15,...,0.145934,34,0,0,25,12,0,9.46,27145011301,0.001495
2,0.001112,71496.88658,0.3926,69,440,45,2098,2698,29,17,...,0.00915,34,0,0,25,12,0,9.46,27145011302,0.001445
3,0.0,86840.15275,0.3949,188,442,61,1113,1833,13,16,...,0.187334,34,0,0,25,12,0,9.46,27145011304,0.0
4,0.002087,89135.3156,0.4463,96,497,47,1202,1917,31,44,...,0.090766,34,0,0,25,12,0,9.46,27145011400,0.001388
5,0.0,62225.90361,0.3847,72,759,81,1695,2656,45,4,...,0.081981,34,0,0,25,12,0,9.46,27145011500,0.0
6,0.004255,41068.93617,0.7166,3,222,5,193,470,16,31,...,0.470443,34,0,0,25,12,0,9.46,27145011600,0.002721
7,0.0,74073.83367,0.4317,108,382,190,783,1479,3,13,...,0.008239,34,0,0,25,12,0,9.46,27145010500,0.0
8,0.0,69412.19243,0.3824,241,534,156,1735,2723,20,37,...,0.029574,34,0,0,25,12,0,9.46,27145011100,0.0
9,0.002742,82502.40707,0.3588,18,734,27,2419,3282,29,55,...,0.120052,34,0,0,25,12,0,9.46,27145010102,0.001954


In [67]:
final_frame.to_csv('baseline_model_predictions.csv')