In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os

In [2]:
# Work from the sibling `Data` directory (one level up from the current working
# directory, then into `Data`) so the bare file names used below resolve there.
os.chdir(Path.cwd().parent.joinpath('Data'))

# Constructing the Modeling Space

In [3]:
# Load the three input tables from the current working directory.
# The statements are independent of one another, so their order is irrelevant.
spm_data = pd.read_csv(Path('spms_only_cleaned.csv'))
pit_ard_data = pd.read_csv(Path('pit_ard_merged_cleaned.csv'))
homeless_students_coc_rates = pd.read_csv(Path('doe_homeless_students_coc_agg_rates.csv'))

In [4]:
# Replace the string `year` label with an integer built from everything after
# its first five characters.
# NOTE(review): presumably the raw labels look like '2014-2015' (school year),
# so this keeps the ending year — confirm against the CSV.
# Run once per load: after this cell `year` is numeric and no longer sliceable.
homeless_students_coc_rates['year'] = (
    homeless_students_coc_rates['year'].str[5:].astype('int64')
)
homeless_students_coc_rates.head()

Unnamed: 0,year,cocnumber,doe_total_rate,doe_unsheltered_rate,doe_sheltered_rate,doe_hotel_motel_rate,doe_doubled_up_rate
0,2015,AK-500,62,-46,39,-45,114
1,2016,AK-500,688,64,268,12,344
2,2017,AK-500,-429,13,-365,-70,-7
3,2015,AK-501,118,11,86,-14,4
4,2016,AK-501,237,33,16,37,164


In [5]:
# Use the same CoC key column name (`cocnumber`) as the other tables so the
# frames can be merged on it later.
pit_ard_data = pit_ard_data.rename(columns={'coc_number': 'cocnumber'})
pit_ard_data.head()

Unnamed: 0,year,cocnumber,coc_number_and_name,pprn,estimated_ard,bonus,coc_planning,State,coc_name,coc_category,...,pit_homeless_parenting_youth_age_18_to_24,pit_sheltered_es_homeless_parenting_youth_age_18_to_24,pit_sheltered_th_homeless_parenting_youth_age_18_to_24,pit_sheltered_total_homeless_parenting_youth_age_18_to_24,pit_unsheltered_homeless_parenting_youth_age_18_to_24,pit_homeless_children_of_parenting_youth,pit_sheltered_es_homeless_children_of_parenting_youth,pit_sheltered_th_homeless_children_of_parenting_youth,pit_sheltered_total_homeless_children_of_parenting_youth,pit_unsheltered_homeless_children_of_parenting_youth
0,2018,AK-500,AK-500-ANCHORAGE,1293735,2847097,170826,85413,AK,Anchorage CoC,Other Urban CoCs,...,14.0,6.0,8.0,14.0,0.0,14.0,5.0,9.0,14.0,0.0
1,2018,AK-501,AK-501-ALASKA BALANCE OF STATE,756772,776502,46590,23295,AK,Alaska Balance of State CoC,Rural CoCs,...,11.0,8.0,1.0,9.0,2.0,12.0,9.0,1.0,10.0,2.0
2,2018,AL-500,"AL-500-BIRMINGHAM,JEFFERSON,ST.CLAIR,SHELBY CO...",6505662,9109238,546554,273277,AL,"Birmingham/Jefferson, St. Clair, Shelby Counti...",Suburban CoCs,...,3.0,1.0,2.0,3.0,0.0,7.0,1.0,6.0,7.0,0.0
3,2018,AL-501,"AL-501-MOBILE CITY & COUNTY,BALDWIN COUNTY",3230092,3741586,224495,112248,AL,Mobile City & County/Baldwin County CoC,Other Urban CoCs,...,9.0,4.0,1.0,5.0,4.0,12.0,8.0,1.0,9.0,3.0
4,2018,AL-502,"AL-502-FLORENCE,NORTHWEST ALABAMA",629035,447309,37742,18871,AL,Florence/Northwest Alabama CoC,Rural CoCs,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Prepare the SPM table for merging:
#   * discard the free-text `coc_name` column, which is not used downstream here;
#   * rename `coc_number` to `cocnumber` to match the other tables' key column.
# `errors='ignore'` keeps the cell safe to re-run: on a second execution
# `coc_name` is already gone and the rename is a no-op, instead of a KeyError.
# (The original also had a duplicated `spm_data = spm_data = ...` assignment.)
spm_data = (
    spm_data
    .drop(columns='coc_name', errors='ignore')
    .rename(columns={'coc_number': 'cocnumber'})
)
spm_data.head()

Unnamed: 0,year,state,cocnumber,total_returns,0_to_6_month_returns,6_to_12_month_returns,12_to_24_month_returns
0,2015,AK,AK-500,680,119,31,43
1,2016,AK,AK-500,871,172,41,48
2,2017,AK,AK-500,998,131,40,65
3,2015,AK,AK-501,89,7,3,8
4,2016,AK,AK-501,168,3,4,6


In [7]:
# Build the modeling table: one row per (year, cocnumber) present in all three
# sources (inner joins).
# The join keys are spelled out rather than left to pandas' "all shared column
# names" default, so an unexpected shared column cannot silently become a key.
modeling_data_space = (
    spm_data
    .merge(homeless_students_coc_rates, on=['year', 'cocnumber'], how='inner')
    .merge(
        # Only the funding columns and the CoC category are taken from the PIT/ARD table.
        pit_ard_data[['year', 'cocnumber', 'pprn', 'estimated_ard', 'bonus',
                      'coc_planning', 'coc_category']],
        on=['year', 'cocnumber'],
        how='inner',
    )
)
# The file name now carries the `.csv` extension; the original wrote an
# extension-less file named 'modeling_data_space'.
# NOTE(review): update any reader that opens the old extension-less name.
modeling_data_space.to_csv('modeling_data_space.csv', index=False)
modeling_data_space.head()

Unnamed: 0,year,state,cocnumber,total_returns,0_to_6_month_returns,6_to_12_month_returns,12_to_24_month_returns,doe_total_rate,doe_unsheltered_rate,doe_sheltered_rate,doe_hotel_motel_rate,doe_doubled_up_rate,pprn,estimated_ard,bonus,coc_planning,coc_category
0,2015,AK,AK-500,680,119,31,43,62,-46,39,-45,114,1138309,2824705,423706,84741,Other Urban CoCs
1,2016,AK,AK-500,871,172,41,48,688,64,268,12,344,1169576,2891188,144559,86736,Other Urban CoCs
2,2017,AK,AK-500,998,131,40,65,-429,13,-365,-70,-7,1210110,2849941,170996,85498,Other Urban CoCs
3,2015,AK,AK-501,89,7,3,8,118,11,86,-14,4,602107,807072,121061,24212,Rural CoCs
4,2016,AK,AK-501,168,3,4,6,237,33,16,37,164,669635,738551,36928,22157,Rural CoCs


# Modeling Space EDA & Transformations


In [8]:
# Number of rows in the merged modeling table.
modeling_data_space.shape[0]

1154

In [9]:
# Count of missing values in each column of the modeling table.
modeling_data_space.isnull().sum()

year                      0
state                     0
cocnumber                 0
total_returns             0
0_to_6_month_returns      0
6_to_12_month_returns     0
12_to_24_month_returns    0
doe_total_rate            0
doe_unsheltered_rate      0
doe_sheltered_rate        0
doe_hotel_motel_rate      0
doe_doubled_up_rate       0
pprn                      0
estimated_ard             0
bonus                     0
coc_planning              0
coc_category              0
dtype: int64

In [10]:
# National totals per year: sum every numeric column across all CoCs.
# `numeric_only=True` is explicit because pandas >= 2.0 would otherwise
# concatenate the string columns (state, cocnumber, coc_category) instead of
# leaving them out.
# NOTE(review): the doe_*_rate columns are summed like counts — confirm that
# adding rates across CoCs is the intended aggregation.
national_time_series = modeling_data_space.groupby('year').sum(numeric_only=True)
# Keep the index when saving: `year` lives in the index, so `index=False`
# wrote rows with no year label. The file name typo
# ('national_time_series.to_csv') is also corrected.
national_time_series.to_csv('national_time_series.csv')
national_time_series.head()

Unnamed: 0_level_0,total_returns,0_to_6_month_returns,6_to_12_month_returns,12_to_24_month_returns,doe_total_rate,doe_unsheltered_rate,doe_sheltered_rate,doe_hotel_motel_rate,doe_doubled_up_rate,pprn,estimated_ard,bonus,coc_planning
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2015,356373,34211,17121,20395,-68228,-4406,-9236,1505,-51027,1388668379,1800277047,288727511,53632237
2016,400592,37441,19168,24281,74198,6458,13637,4416,46006,1483403314,1843886524,99584690,55276148
2017,453260,41519,21157,25821,39392,6400,-5528,4930,32479,1546650856,1887209663,123302439,56943343


In [11]:
# Per-state, per-year totals of every numeric column.
# `numeric_only=True` keeps string columns (cocnumber, coc_category) out of the
# sum on pandas >= 2.0, where they would otherwise be concatenated.
states_time_series = (
    modeling_data_space
    .groupby(['state', 'year'])
    .sum(numeric_only=True)
)
# `state` and `year` are the index after the groupby; writing the index keeps
# them in the CSV (the original `index=False` saved unlabeled rows).
states_time_series.to_csv('states_time_series.csv')
states_time_series.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_returns,0_to_6_month_returns,6_to_12_month_returns,12_to_24_month_returns,doe_total_rate,doe_unsheltered_rate,doe_sheltered_rate,doe_hotel_motel_rate,doe_doubled_up_rate,pprn,estimated_ard,bonus,coc_planning
state,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
AK,2015,769,126,34,51,180,-35,125,-59,118,1740416,3631777,544767,108953
AK,2016,1039,175,45,54,925,97,284,49,508,1839211,3629739,181487,108893
AK,2017,1270,144,51,75,-914,-12,-583,-121,-213,1917406,3588242,215294,107647
AL,2015,2154,282,105,130,279,-166,342,-60,89,14935229,16031815,2968469,593694
AL,2016,2010,210,75,105,-5117,-58,-502,-148,-4650,15327065,16898755,991250,594750


In [12]:
# Per-state totals over all years for every numeric column.
# `numeric_only=True` prevents pandas >= 2.0 from concatenating the string
# columns; the summed `year` column is meaningless as a total and is dropped.
state_totals = (
    modeling_data_space
    .groupby('state')
    .sum(numeric_only=True)
    .drop(columns='year')
)
state_totals.head()

Unnamed: 0_level_0,total_returns,0_to_6_month_returns,6_to_12_month_returns,12_to_24_month_returns,doe_total_rate,doe_unsheltered_rate,doe_sheltered_rate,doe_hotel_motel_rate,doe_doubled_up_rate,pprn,estimated_ard,bonus,coc_planning
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AK,3078,445,130,180,191,50,-174,-131,413,5497033,10849758,941548,325493
AL,6070,794,331,376,-3051,-116,-38,-173,-2801,46252496,49750370,5148322,1782747
AR,3724,328,148,193,1376,-9,-253,24,909,15398501,13887371,1866693,633538
AZ,26156,2820,1464,1703,-2927,-141,-2055,592,-1819,79625393,112928388,9914011,3387851
CA,120277,9436,5419,7164,-41026,604,-3046,2062,-39736,604692963,997498904,86662820,25307886


In [13]:
# Collapse the yearly national table into a single Series: one grand total per
# column, summed down the rows (i.e. across all years).
national_totals = national_time_series.sum(axis='index')
national_totals.head()

total_returns             1210225
0_to_6_month_returns       113171
6_to_12_month_returns       57446
12_to_24_month_returns      70497
doe_total_rate              45362
dtype: int64

In [19]:
# For each state, take the CoC category that occurs most often among its rows
# and one-hot encode it (columns `coc_category_<category>`).
# `value_counts()` sorts by descending count, so `.index[0]` is the most
# frequent category; NOTE(review): ties are resolved by whatever order
# value_counts returns — confirm that is acceptable.
# `dtype=int` keeps the 0/1 encoding; pandas >= 2.0 defaults to booleans.
coc_plurality_by_state = pd.get_dummies(
    modeling_data_space[['state', 'coc_category']]
    .groupby('state')
    .agg(lambda categories: categories.value_counts().index[0]),
    dtype=int,
)
# Both frames are indexed by `state`, so a plain index join attaches the
# indicator columns to the state totals. The original follow-up rename of
# 'coc_category' was a no-op (that column no longer exists after get_dummies)
# and has been removed; the resulting column names are unchanged.
state_totals_plural_coc_category = state_totals.join(coc_plurality_by_state)
# Write the index: it holds the state code, which `index=False` discarded.
state_totals_plural_coc_category.to_csv('state_totals_plural_coc_category.csv')
state_totals_plural_coc_category.head()

Unnamed: 0_level_0,total_returns,0_to_6_month_returns,6_to_12_month_returns,12_to_24_month_returns,doe_total_rate,doe_unsheltered_rate,doe_sheltered_rate,doe_hotel_motel_rate,doe_doubled_up_rate,pprn,estimated_ard,bonus,coc_planning,coc_category_Major Cities,coc_category_Other Urban CoCs,coc_category_Rural CoCs,coc_category_Suburban CoCs
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
AK,3078,445,130,180,191,50,-174,-131,413,5497033,10849758,941548,325493,0,1,0,0
AL,6070,794,331,376,-3051,-116,-38,-173,-2801,46252496,49750370,5148322,1782747,0,1,0,0
AR,3724,328,148,193,1376,-9,-253,24,909,15398501,13887371,1866693,633538,0,1,0,0
AZ,26156,2820,1464,1703,-2927,-141,-2055,592,-1819,79625393,112928388,9914011,3387851,1,0,0,0
CA,120277,9436,5419,7164,-41026,604,-3046,2062,-39736,604692963,997498904,86662820,25307886,0,0,0,1


In [20]:
# Per-CoC totals over all years for every numeric column, plus one-hot columns
# for the CoC's category.
# `numeric_only=True` prevents pandas >= 2.0 from concatenating string columns;
# the summed `year` column is meaningless as a total and is dropped.
coc_totals = (
    modeling_data_space
    .groupby('cocnumber')
    .sum(numeric_only=True)
    .drop(columns='year')
)
# The category is taken from each CoC's first row.
# NOTE(review): assumes a CoC's category does not change between years — confirm.
# `dtype=int` keeps the 0/1 encoding (pandas >= 2.0 defaults to booleans).
# Both frames are indexed by `cocnumber`, so an index join lines them up.
coc_totals = coc_totals.join(
    pd.get_dummies(
        modeling_data_space[['cocnumber', 'coc_category']].groupby('cocnumber').first(),
        dtype=int,
    )
)
# Write the index: it holds `cocnumber`, which `index=False` discarded.
coc_totals.to_csv('coc_totals.csv')
coc_totals.head()

Unnamed: 0_level_0,total_returns,0_to_6_month_returns,6_to_12_month_returns,12_to_24_month_returns,doe_total_rate,doe_unsheltered_rate,doe_sheltered_rate,doe_hotel_motel_rate,doe_doubled_up_rate,pprn,estimated_ard,bonus,coc_planning,coc_category_Major Cities,coc_category_Other Urban CoCs,coc_category_Rural CoCs,coc_category_Suburban CoCs
cocnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
AK-500,2549,422,112,156,321,31,-58,-103,451,3517995,8565834,739261,256975,0,1,0,0
AK-501,529,23,18,24,-130,19,-116,-28,-38,1979038,2283924,202287,68518,0,0,1,0
AL-500,2440,478,190,204,-716,5,-118,84,-479,17177292,25715142,2204214,771454,0,0,0,1
AL-501,1336,141,68,83,-357,-118,106,-335,-16,8866321,11411779,991702,342353,0,1,0,0
AL-502,0,0,0,0,-168,-19,-46,-2,-107,1116419,938041,61523,33492,0,0,1,0


In [79]:
# Totals per CoC category over all rows (numeric columns only; `year` dropped).
# The original assigned this to `coc_category_total` but then read
# `coc_category_totals`, which only worked when that name was left over in the
# kernel from an earlier run; the per-category table now has that name.
coc_category_totals = (
    modeling_data_space
    .groupby('coc_category')
    .sum(numeric_only=True)
    .drop(columns='year')
)
# Collapse the four categories into two groups by adding their total rows.
coc_urban = coc_category_totals.loc['Major Cities', :] + coc_category_totals.loc['Other Urban CoCs', :]
coc_non_urban = coc_category_totals.loc['Suburban CoCs', :] + coc_category_totals.loc['Rural CoCs', :]

# Two-row summary table: one row per group, same columns as the category totals.
coc_category_total = pd.DataFrame(index = ['Urban CoCs', 'Non Urban CoCs'], columns = coc_urban.index, data = [coc_urban, coc_non_urban])
coc_category_total

Unnamed: 0,total_returns,0_to_6_month_returns,6_to_12_month_returns,12_to_24_month_returns,doe_total_rate,doe_unsheltered_rate,doe_sheltered_rate,doe_hotel_motel_rate,doe_doubled_up_rate,pprn,estimated_ard,bonus,coc_planning
Urban CoCs,540423,56950,28211,34750,18511,6320,-878,3597,11179,2142461120,2971250100,260184439,77807368
Non Urban CoCs,669802,56221,29235,35747,26851,2132,-249,7254,16279,2276261429,2560123134,251430201,88044360


# Applying Tests to Datasets

