In [None]:
import json
import numpy as np
import pandas as pd
import pickle
import warnings
from pandas.core.common import SettingWithCopyWarning
from pandas.tseries.offsets import MonthEnd
from sklearn.decomposition import PCA

In [None]:
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
    
with open("cfg.json", "r") as jin:
    cfg = json.loads(jin.read())

# rewrite to ensure formatting
with open("cfg.json", "w") as jout:
    json.dump(cfg, jout, indent=4)

In [None]:
def find_anomalies(data):
    #define a list to accumlate anomalies
    anomalies = []
    
    # Set upper and lower limit to 3 standard deviation
    random_data_std = np.std(data)
    random_data_mean = np.mean(data)
    anomaly_cut_off = random_data_std * 3
    
    lower_limit  = random_data_mean - anomaly_cut_off 
    upper_limit = random_data_mean + anomaly_cut_off
    #print(lower_limit, upper_limit)
    # Generate outliers
    for outlier in data:
        if outlier > upper_limit or outlier < lower_limit:
            anomalies.append(outlier)
    return anomalies  

In [15]:
def installation_data_res(files):
    for i, file in enumerate(files):
        bk1 = pd.read_csv(file,encoding= 'unicode_escape', low_memory = False)
        if i == 0:
            bk = bk1.copy()
        else:
            bk = pd.concat([bk,bk1])
    
    bk = bk.replace({-9999:np.NaN}).replace({"-9999":np.NaN})


    
    bk_P = bk[(bk['customer_segment'] == 'RES') & (bk.zip_code != 'redacted')]
    bk_P = bk_P[cfg["filter_cols"]]

    bk_P['battery_rated_capacity_kWh'] = bk_P['battery_rated_capacity_kWh'].fillna(0)
    bk_P['installation_date'] = pd.to_datetime(bk_P['installation_date'])
    bk_P = bk_P.dropna(subset=['zip_code'])
    bk_P['zip_code'] = bk_P['zip_code'].apply(lambda a: str(a).split('-')[0].zfill(5))
    
    bk_P['install_month'] = bk_P.installation_date.dt.strftime('%Y-%m')
    bk_P['install_month'] = pd.to_datetime(bk_P['install_month']) + MonthEnd(1)
    
    bk_P_3 = bk_P[bk_P.system_size_DC <= 25]
    
    bk_P_rt = bk_P_3[(bk_P_3.ground_mounted != 1) & (bk_P_3.third_party_owned != 1) & (bk_P_3.battery_rated_capacity_kWh == 0)]
    bk_P_rt['cost_per_kW'] = bk_P_rt.total_installed_price/bk_P_rt.system_size_DC
    costs_df = bk_P_rt.groupby('install_month').cost_per_kW.mean().reset_index().dropna()
    
    bk_P_rt['days_from_first'] = int(str(bk_P_rt.installation_date - bk_P_rt.installation_date.min()).split()[0])
    bk_P_rt_cost = bk_P_rt.dropna(subset = ['cost_per_kW'])
    bk_P_rt_cost['PCA_cost'] = PCA(n_components = 1).fit_transform(bk_P_rt_cost[['days_from_first','cost_per_kW']])
    outliers = find_anomalies(bk_P_rt_cost.PCA_cost)
    bk_P_rt_cost = bk_P_rt_cost[~bk_P_rt_cost.PCA_cost.isin(outliers)]
    costs_df2 = bk_P_rt_cost.groupby('install_month').cost_per_kW.mean().reset_index().dropna()
    
    bk_P_rt_cost['rebate_per_kW'] = bk_P_rt_cost.rebate_or_grant/bk_P_rt_cost.system_size_DC
    rebate_df = bk_P_rt_cost.groupby('install_month').rebate_or_grant.mean().reset_index().dropna()
    
    total_RES_size_25 = bk_P_rt.groupby('zip_code').system_size_DC.count().reset_index().rename(columns = {'system_size_DC':'number_installs'})
    
    sub_25_map = dict(zip(total_RES_size_25.zip_code, total_RES_size_25.number_installs))
    
    return sub_25_map

In [16]:
files = [cfg["data_dir"] + cfg["berkeley_1"],
         cfg["data_dir"] + cfg["berkeley_2"]]

sub_25 = installation_data_res(files)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bk_P_rt['cost_per_kW'] = bk_P_rt.total_installed_price/bk_P_rt.system_size_DC
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bk_P_rt['days_from_first'] = int(str(bk_P_rt.installation_date - bk_P_rt.installation_date.min()).split()[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bk_P_rt_cost['PCA

In [17]:
with open('../data/sub_25.p', 'wb') as fp:
    pickle.dump(sub_25, fp, protocol=pickle.HIGHEST_PROTOCOL)