<a href="https://colab.research.google.com/github/edwardb1203/GoogleSunroofML/blob/main/GoogleSunroofML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [41]:
# We are using the Google Sunroof data set taken from Kaggle
# Project Sunroof computes how much sunlight hits your roof in a year. It takes into account:
# Google's database of imagery and maps
# 3D modeling of your roof
# Shadows cast by nearby structures and trees
# All possible sun positions over the course of a year
# Historical cloud and temperature patterns that might affect solar energy production 
url = 'https://raw.githubusercontent.com/edwardb1203/GoogleSunroofML/main/sunroof_solar_potential_by_censustract.csv'
# url_city_data = 'https://raw.githubusercontent.com/edwardb1203/GoogleSunroofML/main/project-sunroof-city-09082017.csv'
df=pd.read_csv(url)
# Let's take a look at the data
df.head()

Unnamed: 0,carbon_offset_metric_tons,count_qualified,existing_installs_count,install_size_kw_buckets,kw_median,kw_total,lat_avg,lat_max,lat_min,lng_avg,...,region_name,state_name,yearly_sunlight_kwh_e,yearly_sunlight_kwh_f,yearly_sunlight_kwh_kw_threshold_avg,yearly_sunlight_kwh_median,yearly_sunlight_kwh_n,yearly_sunlight_kwh_s,yearly_sunlight_kwh_total,yearly_sunlight_kwh_w
0,2159.887389,586,12,[[0171][5231][10114][1545][2016][252][352][451...,7.25,5219.0,34.26704,34.272362,34.261761,-118.432716,...,6037104401,California,1014756.0,2539152.0,1285.247909,10863.01795,1075291.0,1760043.0,7922383.0,1533140.0
1,11392.316227,369,4,[[060][550][1024][1518][2023][2515][3012][3515...,28.0,28391.25,43.156368,43.162289,43.15097,-77.60704,...,36055009400,New York,733882.7,28158550.0,947.75,30689.598633,173981.0,1342966.0,31176060.0,766685.5
2,6902.286133,788,3,[[0364][5313][1057][1513][207][258][305][351][...,5.25,7352.25,42.277794,42.288818,42.266918,-89.066589,...,17201000700,Illinois,1024630.0,2850205.0,980.05,5823.500452,134264.2,3155397.0,8337565.0,1173068.0
3,3238.933474,432,1,[[0320][567][1020][1512][251][302][351][401][4...,3.25,3598.25,40.422962,40.42894,40.418259,-79.993729,...,42003180300,Pennsylvania,1203715.0,1432569.0,948.6,3724.817041,73181.9,760424.8,3912445.0,442553.9
4,6111.873688,827,32,[[0125][5219][10227][15124][2039][2520][3015][...,11.5,15098.5,33.844044,33.847179,33.839771,-117.933113,...,6059086602,California,2070519.0,11834610.0,1260.55,16996.447681,1691435.0,3551939.0,22418120.0,3269615.0


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48722 entries, 0 to 48721
Data columns (total 31 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   carbon_offset_metric_tons             48722 non-null  float64
 1   count_qualified                       48722 non-null  int64  
 2   existing_installs_count               48722 non-null  int64  
 3   install_size_kw_buckets               48664 non-null  object 
 4   kw_median                             48664 non-null  float64
 5   kw_total                              48664 non-null  float64
 6   lat_avg                               48722 non-null  float64
 7   lat_max                               48722 non-null  float64
 8   lat_min                               48722 non-null  float64
 9   lng_avg                               48722 non-null  float64
 10  lng_max                               48722 non-null  float64
 11  lng_min        

The columns we want to consider are: 
<br> yearly_sunlight_kwh_kw_threshold_avg
<br>yearly_sunlight_kwh_f
<br>yearly_sunlight_kwh_n
<br>yearly_sunlight_kwh_s
<br>yearly_sunlight_kwh_e
<br>yearly_sunlight_kwh_w
<br>number_of_panels_f
<br>number_of_panels_n
<br>number_of_panels_s
<br>number_of_panels_e
<br>number_of_panels_w
<br>lat_avg 
<br> lng_avg
<br>kw_total

In [43]:
# Let's clean our data
# A function to determine missing values
def missing_value(df):
    percent_missing = df.isnull().sum() * 100 / len(df)
    q = list(percent_missing)
    missing_value_df = pd.DataFrame({'Column Name': df.columns, 'Percent Missing % ': q})
    # I only want to see the missing values
    return missing_value_df.loc[missing_value_df['Percent Missing % '] > 0]

In [44]:
# Let's take a look
missing_value(df)

Unnamed: 0,Column Name,Percent Missing %
3,install_size_kw_buckets,0.119043
4,kw_median,0.119043
5,kw_total,0.119043
14,number_of_panels_median,0.119043
17,number_of_panels_total,0.119043
22,state_name,0.002052
25,yearly_sunlight_kwh_kw_threshold_avg,0.002052
26,yearly_sunlight_kwh_median,0.119043
29,yearly_sunlight_kwh_total,0.119043


In [47]:
# The only column we want that is missing values is yearly_sunlight_kwh_kw_threshold_avg, with 0.002052% of values missing
# Let's drop them
desired_columns = ['carbon_offset_metric_tons','yearly_sunlight_kwh_kw_threshold_avg', 'yearly_sunlight_kwh_f', 'yearly_sunlight_kwh_n', 'yearly_sunlight_kwh_s', 'yearly_sunlight_kwh_e','yearly_sunlight_kwh_w','number_of_panels_f','number_of_panels_n','number_of_panels_s','number_of_panels_e','number_of_panels_w','lat_avg','lng_avg','kw_total']
df = df.dropna(subset=['yearly_sunlight_kwh_kw_threshold_avg'])
df = df[desired_columns]
df.head()

Unnamed: 0,carbon_offset_metric_tons,yearly_sunlight_kwh_kw_threshold_avg,yearly_sunlight_kwh_f,yearly_sunlight_kwh_n,yearly_sunlight_kwh_s,yearly_sunlight_kwh_e,yearly_sunlight_kwh_w,number_of_panels_f,number_of_panels_n,number_of_panels_s,number_of_panels_e,number_of_panels_w,lat_avg,lng_avg,kw_total
0,2159.887389,1285.247909,2539152.0,1075291.0,1760043.0,1014756.0,1533140.0,6574,3159,4352,2964,3827,34.26704,-118.432716,5219.0
1,11392.316227,947.75,28158550.0,173981.0,1342966.0,733882.7,766685.5,102635,693,4578,2760,2899,43.156368,-77.60704,28391.25
2,6902.286133,980.05,2850205.0,134264.2,3155397.0,1024630.0,1173068.0,9963,535,10649,3841,4421,42.277794,-89.066589,7352.25
3,3238.933474,948.6,1432569.0,73181.9,760424.8,1203715.0,442553.9,5001,299,2529,4920,1644,40.422962,-79.993729,3598.25
4,6111.873688,1260.55,11834610.0,1691435.0,3551939.0,2070519.0,3269615.0,31555,5146,8933,5933,8827,33.844044,-117.933113,15098.5


In [56]:
# Splitting features and labels
features = df[desired_columns]
target = df['carbon_offset_metric_tons']
# Splitting train and test data
Xtrain, Xtest, Ytrain, Ytest = train_test_split(features,target,test_size = 0.25,random_state = 1)
print(Xtrain)

       carbon_offset_metric_tons  yearly_sunlight_kwh_kw_threshold_avg  \
41362               12449.995368                           1246.950000   
40515               35001.045118                           1052.300000   
24568                4873.783567                           1003.850000   
39765                5213.060434                           1285.258533   
40397                2085.536195                            977.500000   
...                          ...                                   ...   
43724               26628.307361                           1116.900000   
32511                2547.045929                           1160.250000   
5192                17900.842290                           1138.150000   
12172               11644.847401                            932.450000   
33003               12584.295800                            997.050000   

       yearly_sunlight_kwh_f  yearly_sunlight_kwh_n  yearly_sunlight_kwh_s  \
41362           1.863228e+07     

In [52]:
# Regression Model
linear_regression_model = LinearRegression()
# Training
linear_regression_model.fit(Xtrain,Ytrain)

ValueError: ignored

In [None]:
# How does the model do?
# r_sq is a number between 0 and 1 that measures how well a statistical model predicts an outcome
r_sq = linear_regression_model.score(Xtrain, Ytrain)
print(f"coefficient of determination: {r_sq}")

In [None]:
# Test model
carbon_offset = linear_regression_model.predict(Xtest)
print(f"predicted response:\n{carbon_offset}")