In [None]:
import os
import pandas as pd
pd.set_option('display.max_columns', 100, 'display.max_rows', 150)
import numpy as np
import matplotlib.pyplot as plt
import config
import pickle

#Load google.cloud.bigquery
%load_ext google.cloud.bigquery

# Set the credentials path for Google Cloud access. The value is read from the
# local `config` module, so no path is hard-coded in this notebook.
# NOTE(review): presumably consumed by the BigQuery client behind the %%bigquery magic - confirm.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=config.GOOGLE_APPLICATION_CREDENTIALS

In [None]:
%%bigquery --use_rest_api ZRI_MF
-- Fetch every column and row of the multi-family ZRI table; later cells use the result as `ZRI_MF`.
SELECT *
FROM `high-empire-220313.ZRI.Multi_Family`

In [None]:
# Use 'RegionName' as the row index while also keeping it as a regular column.
ZRI_MF = ZRI_MF.set_index('RegionName', drop=False)

# Keep only the ZRI month/year columns.
# NOTE(review): this assumes the first seven columns are non-month metadata and
# that the column order returned by the query is stable - confirm against the table schema.
months_only = ZRI_MF.iloc[:, 7:]

In [None]:
# Display the month-only ZRI frame for a visual check.
months_only

In [None]:
# Month columns from the first one through '_2015_01'. Label-based .loc slices
# include the end label, so January 2015 itself is part of this subset.
early_months = months_only.loc[:, :'_2015_01']

In [None]:
# Month columns from the first one through '_2014_01'. Label-based .loc slices
# include the end label, so January 2014 itself is part of this subset.
early_early_months = months_only.loc[:, :'_2014_01']

In [None]:
# Missing values by month.
# Generally speaking, early months (closer to 2010) have more missing values.
# One row per month column, sorted ascending by the number of missing values.
missing_per_month = pd.DataFrame(months_only.isnull().sum()).sort_values(0)

# Percent missing by month: share of rows with no value in that month, scaled
# to 0-100 so the numbers match the "Percent" wording in the name and the plot labels.
percent_missing_month = missing_per_month / len(months_only) * 100

# Column labels follow the "_YYYY_MM" pattern; parse them into datetimes so the
# x-axis is a real time axis.
percent_missing_month.index = pd.to_datetime(percent_missing_month.index, format="_%Y_%m")

# Sort chronologically (earliest first).
percent_missing_month = percent_missing_month.sort_index()

# Create the "Percent of ZRI Scores Missing per Month" plot.
plt.plot(percent_missing_month)
plt.title('Percent of ZRI Scores Missing per Month')
plt.xlabel('Month/Year')
plt.ylabel('Percent of ZRI Scores Missing');

In [None]:
# Display the missing-value count table (one row per month column, as built by an earlier cell).
missing_per_month 

In [None]:
# Display the per-month missing share computed by an earlier cell.
percent_missing_month

In [None]:
# GOAL: find every row that is missing at least one ZRI value.

# Element-wise mask: True wherever a monthly ZRI value is missing.
is_NaN = months_only.isna()

# Per-row flag: True when any month in that row is missing.
row_has_NaN = is_NaN.any(axis='columns')

# Subset of months_only restricted to the flagged rows.
rows_with_NaN = months_only.loc[row_has_NaN]

# Author's note from a previous run: 1632 out of 1861 rows have at least one
# missing value (87.69%).
rows_with_NaN

In [None]:
# Boolean mask over rows: True where every month column is missing.
row_all_NaN = is_NaN.all(axis=1)

# Rows of months_only that have no recorded ZRI value at all.
rows_all_NaN = months_only[row_all_NaN]

# NOTE(review): the original cell re-bound `rows_with_NaN` to this all-NaN
# subset. The binding is kept so any later code relying on it sees the same
# frame; prefer `rows_all_NaN` in new code.
rows_with_NaN = rows_all_NaN

# Zip codes (index labels) whose rows are entirely NaN. The index of the boolean
# mask itself contains every row, so take the index of the filtered frame instead.
zipcodes_all_NA = list(rows_all_NaN.index)

In [None]:
# Number of recorded (non-null) ZRI values for each zip code, across all months.
ZRI_num = months_only.count(axis='columns')

# Same counts as a one-column DataFrame whose column is named 'ZRIs'.
# (Despite the variable name, these are recorded counts, not missing counts.)
pd_num_missing_ZRI = ZRI_num.to_frame(name='ZRIs')

# For each distinct count, how many zip codes have exactly that many ZRI scores.
count_missing_num = pd_num_missing_ZRI.groupby('ZRIs')['ZRIs'].count().to_frame()

# Plot: number of zip codes (y) against number of recorded ZRI scores (x).
plt.plot(count_missing_num)
plt.title('Number of zipcodes having 0 - 109 ZRI scores')
plt.xlabel('Number of ZRI scores')
plt.ylabel('Number of Zipcodes')
# Author's note: having 67 and 68 scores breaks from the pattern.

In [None]:
################# Same analysis as the full-range cell, restricted to the early years #################
# Number of recorded (non-null) ZRI values for each zip code within early_early_months.
ZRI_num = early_early_months.count(axis='columns')

# Same counts as a one-column DataFrame whose column is named 'ZRIs'.
# (Despite the variable name, these are recorded counts, not missing counts.)
pd_num_missing_ZRI = ZRI_num.to_frame(name='ZRIs')

# For each distinct count, how many zip codes have exactly that many ZRI scores.
count_missing_num = pd_num_missing_ZRI.groupby('ZRIs')['ZRIs'].count().to_frame()

# Plot: '2010-2014: Number of zipcodes having 0 - 41 ZRI scores'.
plt.plot(count_missing_num)
plt.title('2010-2014: Number of zipcodes having 0 - 41 ZRI scores')
plt.xlabel('Number of ZRI scores')
plt.ylabel('Number of Zipcodes')
# NOTE(review): the original trailing remark about 67 and 68 scores looks copied
# from the full-range cell; per the title, counts here top out at 41.

In [None]:
# Number of recorded (non-null) ZRI values for each zip code within early_months.
ZRI_num = early_months.count(axis='columns')

# Same counts as a one-column DataFrame whose column is named 'ZRIs'.
# (Despite the variable name, these are recorded counts, not missing counts.)
pd_num_missing_ZRI = ZRI_num.to_frame(name='ZRIs')

# For each distinct count, how many zip codes have exactly that many ZRI scores.
count_missing_num = pd_num_missing_ZRI.groupby('ZRIs')['ZRIs'].count().to_frame()

# Plot: '2010-2015: Number of zipcodes having 0 - 53 ZRI scores'.
plt.plot(count_missing_num)
plt.title('2010-2015: Number of zipcodes having 0 - 53 ZRI scores')
plt.xlabel('Number of ZRI scores')
plt.ylabel('Number of Zipcodes')
# NOTE(review): the original trailing remark about 67 and 68 scores looks copied
# from the full-range cell; per the title, counts here top out at 53.

In [None]:
################# Same analysis again, but just the early years #################
# NOTE(review): this repeats the earlier early_early_months cell. It also re-binds
# ZRI_num to the early_early_months counts, which any later use of ZRI_num will see.
# Number of recorded (non-null) ZRI values for each zip code within early_early_months.
ZRI_num = early_early_months.count(axis='columns')

# Same counts as a one-column DataFrame whose column is named 'ZRIs'.
# (Despite the variable name, these are recorded counts, not missing counts.)
pd_num_missing_ZRI = ZRI_num.to_frame(name='ZRIs')

# For each distinct count, how many zip codes have exactly that many ZRI scores.
count_missing_num = pd_num_missing_ZRI.groupby('ZRIs')['ZRIs'].count().to_frame()

# Plot: '2010-2014: Number of zipcodes having 0 - 41 ZRI scores'.
plt.plot(count_missing_num)
plt.title('2010-2014: Number of zipcodes having 0 - 41 ZRI scores')
plt.xlabel('Number of ZRI scores')
plt.ylabel('Number of Zipcodes')
# NOTE(review): the original trailing remark about 67 and 68 scores looks copied
# from the full-range cell; per the title, counts here top out at 41.

In [None]:
 # Table form: for each number of recorded ZRI scores (index, highest first),
 # how many zip codes have exactly that many. Uses whichever ZRI_num was bound
 # most recently (the early_early_months counts when the notebook runs top to bottom).
ZRI_num.value_counts().sort_index(ascending=False).to_frame()

In [None]:
############## Same per-month missing-value analysis, using the early years only (early_early_months)
# Missing values by month.
# Generally speaking, early months (closer to 2010) have more missing values.
# One row per month column, sorted ascending by the number of missing values.
missing_per_month = pd.DataFrame(early_early_months.isnull().sum()).sort_values(0)

# Percent missing by month: share of rows with no value in that month, scaled
# to 0-100 so the numbers match the "Percent" wording in the name and the plot labels.
# The column slice keeps every row, so its length is the total number of zip codes.
percent_missing_month = missing_per_month / len(early_early_months) * 100

# Column labels follow the "_YYYY_MM" pattern; parse them into datetimes so the
# x-axis is a real time axis.
percent_missing_month.index = pd.to_datetime(percent_missing_month.index, format="_%Y_%m")

# Sort chronologically (earliest first).
percent_missing_month = percent_missing_month.sort_index()

# Create plot - Percent of ZRI Scores Missing per Month.
plt.plot(percent_missing_month)
plt.xticks(rotation=70)
plt.title('2010-2014: Percent of ZRI Scores Missing per Month')
plt.xlabel('Month/Year')
plt.ylabel('Percent of ZRI Scores Missing');

In [None]:

############## Same per-month missing-value analysis, using the months through '_2015_01' (early_months)
# Missing values by month.
# Generally speaking, early months (closer to 2010) have more missing values.
# One row per month column, sorted ascending by the number of missing values.
missing_per_month = pd.DataFrame(early_months.isnull().sum()).sort_values(0)

# Percent missing by month: share of rows with no value in that month, scaled
# to 0-100 so the numbers match the "Percent" wording in the name and the plot labels.
# The column slice keeps every row, so its length is the total number of zip codes.
percent_missing_month = missing_per_month / len(early_months) * 100

# Column labels follow the "_YYYY_MM" pattern; parse them into datetimes so the
# x-axis is a real time axis.
percent_missing_month.index = pd.to_datetime(percent_missing_month.index, format="_%Y_%m")

# Sort chronologically (earliest first).
percent_missing_month = percent_missing_month.sort_index()

# Create plot - Percent of ZRI Scores Missing per Month.
plt.xticks(rotation=70)
plt.plot(percent_missing_month)
plt.title('2010-2015:Percent of ZRI Scores Missing per Month')
plt.xlabel('Month/Year')
plt.ylabel('Percent of ZRI Scores Missing');

##### Conclusion: significantly more zip codes were given ZRI scores starting in 2014.

##### Due to the sparsity from 2010-2014, we will focus on and primarily use the years 2014-2019.