In [None]:
import os
import pandas as pd
pd.set_option('display.max_columns', 100, 'display.max_rows', 150)
import numpy as np
import matplotlib.pyplot as plt
import config

# Register the BigQuery IPython extension so that the %%bigquery cell magic
# used by the query cell in this notebook is available.
%load_ext google.cloud.bigquery

# Tell the Google client libraries where to find the credentials file.
# The path is read from the local `config` module rather than being
# hardcoded in the notebook.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=config.GOOGLE_APPLICATION_CREDENTIALS

In [None]:
%%bigquery --use_rest_api ZRI_MF
-- Pull every column and every row of the Multi_Family table; the cell magic
-- stores the query result in the notebook variable named on the magic line (ZRI_MF).
SELECT *
FROM `high-empire-220313.ZRI.Multi_Family`

In [None]:
# Index the frame by RegionID while still keeping RegionID as an ordinary
# column (drop=False); the index is what the later index-based merges join on.
ZRI_MF = ZRI_MF.set_index(keys='RegionID', drop=False)

# Positional slice: skip the first 7 columns and the last 4, keep everything between.
# NOTE(review): presumed to be exactly the "_YYYY_MM" ZRI month columns -- confirm
# against the table schema, since a column-order change would silently shift this.
months_only = ZRI_MF.iloc[:, slice(7, -4)]

In [None]:
# Missing ZRI values per month column, sorted from fewest to most missing.
# Author's observation: early months (closer to 2010) generally have more
# missing values.
missing_per_month = pd.DataFrame(months_only.isnull().sum()).sort_values(0)

# Share of rows missing in each month. NOTE: despite the variable name this is
# a fraction in [0, 1]; the value is left unchanged here so any later cell that
# reads it keeps working, and it is scaled to a true percentage only for the plot.
percent_missing_month = missing_per_month / len(months_only)

# Column labels look like "_2010_09"; parse them into real timestamps so the
# x-axis is chronological rather than lexical.
percent_missing_month.index = pd.to_datetime(percent_missing_month.index, format="_%Y_%m")

# Earliest month first.
percent_missing_month = percent_missing_month.sort_index()

# Plot - Percent of ZRI Scores Missing per Month.
# Multiply by 100 so the plotted values match the "Percent" title and y-label
# (previously the axis showed 0-1 fractions under a percent label).
fig, ax = plt.subplots()
ax.plot(percent_missing_month * 100)
ax.set_title('Percent of ZRI Scores Missing per Month')
ax.set_xlabel('Month/Year')
ax.set_ylabel('Percent of ZRI Scores Missing')
plt.show()

In [None]:
# GOAL: find every row (zipcode) that is missing at least one ZRI value.

# Cell-by-cell Boolean mask: True wherever a monthly ZRI value is absent.
is_NaN = months_only.isna()

# Collapse the mask across the month columns: True for a row with any gap.
row_has_NaN = is_NaN.any(axis='columns')

# Keep only the rows that are missing at least one ZRI score.
rows_with_NaN = months_only.loc[row_has_NaN]

# Recorded result from the original run: 1632 out of 1861 rows have at least
# one missing value (87.69%).
rows_with_NaN

In [None]:
# How many ZRI values are actually recorded (non-null) for each zipcode.
ZRI_num = months_only.count(axis='columns')

# One-column frame of those counts, with the default column label 0 renamed to
# 'ZRIs'. NOTE: despite "missing" in the variable name, this holds the number
# of recorded scores, not the number of missing ones.
pd_num_missing_ZRI = pd.DataFrame(ZRI_num).rename(columns={0: 'ZRIs'})

# For each distinct score count, the number of zipcodes that have exactly that
# many recorded ZRI scores.
count_missing_num = pd_num_missing_ZRI.groupby('ZRIs')['ZRIs'].count().to_frame()

# Plot: 'Number of zipcodes having 0 - 109 ZRI scores'
plt.plot(count_missing_num)
plt.title('Number of zipcodes having 0 - 109 ZRI scores')
plt.xlabel('Number of ZRI scores')
plt.ylabel('Number of Zipcodes')
# Note: zipcodes with exactly 67 or 68 scores break from the overall pattern.

In [None]:
# Row masks for zipcodes with exactly 67 / exactly 68 recorded ZRI scores.
# Filtering rows on the 'ZRIs' column keeps the counts as integers. The previous
# form, frame[frame == n].dropna(), replaced non-matching cells with NaN first,
# which upcast the counts to float and would drop the wrong rows if the frame
# ever gained a second column.
has_67_scores = pd_num_missing_ZRI['ZRIs'] == 67
has_68_scores = pd_num_missing_ZRI['ZRIs'] == 68

# zipcodes that have exactly 67 ZRI scores
zri_67 = pd_num_missing_ZRI.loc[has_67_scores]

# zipcodes that have exactly 68 ZRI scores
zri_68 = pd_num_missing_ZRI.loc[has_68_scores]

# Combine the two groups (67-score rows first, then 68-score rows).
zri_67_68 = pd.concat([zri_67, zri_68])

# Attach the full ZRI_MF record to each group by joining on the shared index
# (ZRI_MF is indexed by RegionID, as are the count frames derived from it).
df_zri_67 = pd.merge(zri_67, ZRI_MF, left_index=True, right_index=True)
df_zri_68 = pd.merge(zri_68, ZRI_MF, left_index=True, right_index=True)
df_zri_67_68 = pd.merge(zri_67_68, ZRI_MF, left_index=True, right_index=True)