In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import sklearn as sk
import seaborn as sb

from math import nan
from collections import Counter

# Short alias; used further down to rebuild a DataFrame from a dict of row dicts.
dfd = pd.DataFrame.from_dict

Loading data

In [None]:
# Read the upgrade table and promote the saved 'Unnamed: 0' column to the row index.
df = (
    pd.read_csv('SDGEPropertyUpgradeData.csv')  # replace with own filepath
    .set_index('Unnamed: 0')
)
df.head()

# note: will need to select only the time series data

Overall descriptive stats (uncomment whichever one)

In [None]:
# set indexes accordingly (this goes for the whole notebook)
# Only the columns from position 4 onward are summarised here.

stat_cols = list(df.columns)[4:]
#df[stat_cols].mean(axis=0)
df[stat_cols].median(axis=0)

Relative Statistics: here a specific upgrade is selected to analyze how its installation corresponds with that of other upgrades, which in this case means plotting the distribution of relative ages (age differences between upgrades).

In [None]:
# keep only the rows that have a value for the upgrade in question

u = 'Electrical Panel Upgrade'
has_upgrade = df[u].notna()
sub_df = df.loc[has_upgrade]
#sub_df = sub_df[~sub_df['Solar PV'].notnull()]
sub_df.head()

In [4]:
# This function takes all the time data for a property's upgrades and centers it on the upgrade of interest
# so the ages become ages relative to the upgrade of interest. A negative number implies an upgrade was installed after,
# while positive implies prior.
def center(row, zero, upgrades):
    """Rewrite the upgrade ages in ``row`` as ages relative to the ``zero`` upgrade.

    Parameters
    ----------
    row : mutable mapping
        One property's record, keyed by column name. Modified in place.
    zero : hashable
        Key of the upgrade to center on; ``row[zero]`` must be convertible to float.
    upgrades : iterable
        Keys of the entries in ``row`` to re-express relative to ``row[zero]``.

    Returns
    -------
    None
        The result is written back into ``row``.

    Notes
    -----
    Entries that are ``None`` or an empty string are left untouched. NaN entries
    stay NaN (NaN minus anything is NaN). An age of exactly 0 is a real value and
    is centered like any other; a plain truthiness test would have skipped it.
    """
    # Read the reference age once, before the loop overwrites row[zero] with 0.0.
    val = float(row[zero])
    for upgrade in upgrades:
        age = row[upgrade]
        # Skip only genuinely missing entries, not falsy-but-valid ones such as 0.
        if age is None or (isinstance(age, str) and age == ''):
            continue
        row[upgrade] = float(age) - val

In [None]:
# centering data
# Each property becomes a plain dict of column -> value so center() can rewrite it in place.

data = sub_df.to_dict(orient='index')

zero = u
upgrades = list(df.columns)[4:] # change index

for prop in data.values():
    center(prop, zero, upgrades)

c_df = dfd(data, orient='index')
c_df.head()

In [None]:
# pulling only the time series data (numeric data)

time_series_cols = list(c_df.columns)[4:] # change index
df_data = c_df[time_series_cols]
df_data.head()

In [123]:
# optional export: write the summary statistics of the relative ages to a csv file

#df_data.mean(0)
epu_rl = df_data.describe()
epu_rl.to_csv(path_or_buf='EPU Relative Age Data.csv')

In [None]:
# this cell plots the distribution of relative ages, currently set up to plot the distributions
# of two upgrades relative to the upgrade of interest

# setting upgrades
u_other_1 = 'Kitchen Remodel'
u_other_2 = 'Bathroom Remodel'

# getting correct portions of data to plot (rows where the other upgrade is present)
other_1 = df_data.loc[df_data[u_other_1].notna()]
other_2 = df_data.loc[df_data[u_other_2].notna()]

# if you want a subset
#other_sub = other[(other[u_other] >= -1) & (other[u_other] <= 14)]

# getting relative ages and plotting
o_relative_age_1 = list(other_1[u_other_1])
o_relative_age_2 = list(other_2[u_other_2])
relative_ages = {
    u_other_1: o_relative_age_1,
    u_other_2: o_relative_age_2,
}
sb.displot(relative_ages, bins=30)

The next section checks the overall distribution of upgrades by year. It requires a different data format, in which the time series data contains the date of installation. The permit data files could work well for this: select only the permits where the desired upgrades are present, then pull the 'EffectiveDate' column.

In [None]:
# Load the install-date table. NOTE: this rebinds `df`, so the cells below work on this
# file rather than on the data loaded at the top of the notebook.
df = pd.read_csv('sf_utime.csv') # replace file
df.head()

In [12]:
# picking out upgrades and setting dates to be year only

u1 = 'Kitchen Remodel'
u2 = 'Bathroom Remodel'
e = 'Electrical Panel Upgrade'

# this is an old version, would need to update
# NOTE: `solar` holds the values for u1; the name is kept because the plotting cell reads it.
# Entries equal to the string '0' are dropped, then the first four characters are kept.
solar = df.loc[df[u1] != '0', u1].apply(lambda date: date[:4])

epu = df.loc[df[e] != '0', e].apply(lambda date: date[:4])

ou = df.loc[df[u2] != '0', u2].apply(lambda date: date[:4])

In [None]:
# selecting portion of data and plotting results
# entries equal to '1900' are left out of every series before converting to int

s_data = [int(yr) for yr in solar if yr != '1900']
e_data = [int(yr) for yr in epu if yr != '1900']
u_data = [int(yr) for yr in ou if yr != '1900']
#sub_solar_data = [i for i in s_data if i >= 1980]
install_years = {
    'Electrical Panel Upgrades': e_data,
    u1: s_data,
    u2: u_data,
}
sb.displot(install_years, binwidth=2)

With Property Age

Loading and preparing data

In [None]:
# Load the table used in the sections below. NOTE: `df` is rebound again here.
df = pd.read_csv('sf_time_final.csv')
df.info()

In [None]:
# keep the columns from position 5 up to (not including) the last two
middle_cols = list(df.columns)[5:-2]
data_df = df[middle_cols]
data_df.head()

Descriptive stats

In [None]:
# per-column statistics (swap in whichever line is needed)
# data_df.mean(axis=0)
data_df.median(axis=0)
# data_df.describe()

Upgrade vs. year built (raw distribution)

In [None]:
# pulling specific upgrade: rows with a value in the chosen upgrade column

u = 'Electrical Panel Upgrade'
upgrade_present = df[u].notna()
sub_df = df.loc[upgrade_present]
sub_df.head()

In [None]:
# plotting upgrade vs. year built distribution

yr_blt = list(sub_df['Year Built'])
# subset from 1960 onward for the finer-grained histogram
yr_blt_s = list(filter(lambda yr: yr >= 1960, yr_blt))

sb.displot(yr_blt, binwidth=4)
sb.displot(yr_blt_s, binwidth=1, kde=True)
sb.displot(yr_blt, kind='ecdf')


Upgrade likelihood vs. year built

In [None]:
# preparing all data

all_years = list(df['Year Built'])
year_data = Counter(all_years)
epu_year_data = Counter(yr_blt) # needs previous section

# Sort the years first and compute the ratios in that same order, so that
# likelihoods[i] belongs to years[i]. Building the ratios in the Counter's
# insertion order while sorting years separately leaves the two lists misaligned.
years = sorted(year_data.keys())
likelihoods = [epu_year_data[year]/year_data[year] for year in years]

# Rebuild both tallies over the fixed range 1900-2023 so every year in the
# range has an entry, even when no home was built that year.
year_data = {i: 0 for i in range(1900, 2024)}
epu_year_data = {i: 0 for i in range(1900, 2024)}

for year in all_years:
    if year in year_data:
        year_data[year] += 1

# to handle divide by zero error: a year with no homes gets a denominator of 1;
# its upgrade count is 0 as well, so the resulting proportion is still 0
for year in year_data:
    if year_data[year] == 0:
        year_data[year] = 1

for year in yr_blt:
    if year in epu_year_data:
        epu_year_data[year] += 1

In [None]:
# calculating likelihoods: share of homes built in each year that have the upgrade

build_years = range(1900, 2024)
p = [epu_year_data[yr] / year_data[yr] for yr in build_years]
print(p)

In [None]:
# plots results (set up for electrical panel upgrades)

x = np.arange(1900, 2024)

# explicit figure/axes interface; same figure size, line and labels as before
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(x, p)
ax.set_xlabel("Year Built")
ax.set_ylabel("Proportion of Homes with Electrical Panel Upgrade")
ax.set_title("Likelihood of Electrical Panel Upgrade vs. Year Home Built")

Finally, this section plots the distributions of upgrades vs. time since last sale date. It also plots the bivariate distribution of year built and time since last sale date.

In [None]:
# instead of this just take the columns you need out of the main property data dataframe
# The table is kept under its own name (df_sd), so `df` is not rebound by this cell.

df_sd = pd.read_csv('sf_u_tenure.csv') 
df_sd.head()

In [None]:
# keep the columns from position 5 up to (not including) the last two, then summarise
tenure_cols = list(df_sd.columns)[5:-2]
data_df = df_sd[tenure_cols]
#print(data_df.mean(axis=0))
print(data_df.median(axis=0))


In [None]:
# preparing data and plotting for upgrade count vs. time since last sale date

e = 'Electrical Panel Upgrade'
u = 'Bathroom Remodel'

# rows that have a value for each upgrade, and the corresponding column of values
epu_upgrades = data_df.loc[data_df[e].notna()]
epu_data_t = epu_upgrades[e]
other_upgrades = data_df.loc[data_df[u].notna()]
odt = other_upgrades[u]

# preparing multiple sets of upgrade data to look at tighter time spans
edt_s = [t for t in epu_data_t if t <= 5]
edt_xs = [t for t in edt_s if t <= 2]
odt_s = [t for t in odt if t <= 5]

full_span = {e: epu_data_t, u: odt}
short_span = {e: edt_s, u: odt_s}
sb.displot(full_span, binwidth=1)
sb.displot(short_span, binwidth=0.25)
sb.displot(edt_xs, bins=12)
# sb.displot(edt_sm, binwidth=2)

In [None]:
# Rows of the full tenure table that have an electrical panel upgrade value.
# Select on `e` (the electrical panel column), not `u`: at this point `u` is
# 'Bathroom Remodel', which would drop every property without a bathroom
# remodel from the bivariate plot built from this frame.
epu_upgrades_full = df_sd[df_sd[e].notnull()]
epu_upgrades_full.head()

In [None]:
# plotting bivariate distribution

# keep rows with an upgrade value of at most 5 and a build year of 1900 or later
recent_upgrade = epu_upgrades_full[e] <= 5
built_1900_on = epu_upgrades_full['Year Built'] >= 1900
sub = epu_upgrades_full[recent_upgrade & built_1900_on]

from_sale = list(sub[e])
yb = list(sub['Year Built'])

sb.displot(x=from_sale, y=yb, binwidth=(2, 5))