In [34]:
#importing modules
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

In [43]:
#setting file path to read
voc_path = Path("Resources/voc_rehab.csv")
homeless_path = Path("Resources/experiencing_homelessness-gender_demographics.csv")
success_path = Path('Resources/successful_closures.csv')
coc_county_path = Path('Resources/County-CoC Mapping.csv')

# Read with Pandas
vocation_df = pd.read_csv(voc_path)
vocation_df.head()

Unnamed: 0,Year,County,Sex,Number of Consumers,Annotation Code,Count Annotation Desc,Percent,Percent Annotation Desc
0,2014,Alameda,Not Reported,4.0,,,,
1,2014,Alameda,Female,2356.0,,,,
2,2014,Alameda,Male,2391.0,,,,
3,2014,Alpine,Female,,1.0,Cell suppressed for small number. A score over...,1.0,This value represents less than or equal to 1 ...
4,2014,Amador,Female,,1.0,Cell suppressed for small number. A score over...,1.0,This value represents less than or equal to 1 ...


In [36]:
#shows what columns to determine relevance for vocational data
vocation_df.columns

Index(['Year', 'County', 'Sex', 'Number of Consumers', 'Annotation Code',
       'Count Annotation Desc', 'Percent', 'Percent Annotation Desc'],
      dtype='object')

In [37]:
#removing irrelevant columns
del vocation_df["Annotation Code"]
del vocation_df["Count Annotation Desc"]
del vocation_df["Percent"]
del vocation_df["Percent Annotation Desc"]
del vocation_df["Sex"]

#removing commas from numerical data
vocation_df = vocation_df.replace(',', '', regex=True)
vocation_df.head()

Unnamed: 0,Year,County,Number of Consumers
0,2014,Alameda,4.0
1,2014,Alameda,2356.0
2,2014,Alameda,2391.0
3,2014,Alpine,
4,2014,Amador,


In [38]:
#filling na values w/0
vocation_df = vocation_df.fillna(0)

#reducing data to year 2017
vocation_df = vocation_df.loc[vocation_df['Year'] == 2017, :]

#filtering data by year
vocation_df = vocation_df.set_index('Year')

#changing 'number of consumers' to type int
vocation_df = vocation_df.astype({'Number of Consumers': int})

vocation_df.head()

Unnamed: 0_level_0,County,Number of Consumers
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2017,Alameda,1879
2017,Alameda,2145
2017,Alameda,2
2017,Alpine,0
2017,Alpine,0


In [39]:
#adding together total number of consumers per coutny per year
vocation_df = vocation_df.groupby(['Year', 'County'])['Number of Consumers'].sum().reset_index()
vocation_df

Unnamed: 0,Year,County,Number of Consumers
0,2017,Alameda,4026
1,2017,Alpine,0
2,2017,Amador,33
3,2017,Butte,883
4,2017,Calaveras,36
5,2017,Colusa,36
6,2017,Contra Costa,2682
7,2017,Del Norte,202
8,2017,El Dorado,344
9,2017,Fresno,2544


In [40]:
#introducing homeless data
homeless_df = pd.read_csv(homeless_path)
homeless_df

Unnamed: 0,CALENDAR_YEAR,COC_ID,COC_NAME,GENDER,EXPERIENCING_HOMELESSNESS_CNT
0,2017,All,California,Female,79356
1,2017,All,California,Male,101589
2,2017,All,California,Non-Singular Gender,142
3,2017,All,California,Questioning Gender,*
4,2017,All,California,Transgender,674
...,...,...,...,...,...
1459,2023 Q1,CA-614,San Luis Obispo County CoC,Female,989
1460,2023 Q1,CA-614,San Luis Obispo County CoC,Male,1239
1461,2023 Q1,CA-614,San Luis Obispo County CoC,Non-Singular Gender,*
1462,2023 Q1,CA-614,San Luis Obispo County CoC,Transgender,*


In [41]:
#cleaning homeless data
#renaming year column
homeless_df = homeless_df.rename(columns={'CALENDAR_YEAR': 'Year', 'EXPERIENCING_HOMELESSNESS_CNT': 'Homeless_count'})

#removing extra columns
del homeless_df['COC_ID']
del homeless_df['GENDER']

#removing extra parts from names/numbers
homeless_df = homeless_df.replace('CoC', '', regex=True)
homeless_df = homeless_df.replace('Q1', '', regex=True)

#replacing * with 0
homeless_df = homeless_df.loc[homeless_df['Homeless_count']!='*']

#changing types to ints
homeless_df = homeless_df.astype({'Year': int, 'COC_NAME': str, 'Homeless_count': int})

#narrowing time range to match with other data
homeless_df = homeless_df.loc[homeless_df["Year"] < 2018, :]

#grouping together by county
homeless_df = homeless_df.groupby(['Year', 'COC_NAME'])['Homeless_count'].sum().reset_index()

homeless_df

Unnamed: 0,Year,COC_NAME,Homeless_count
0,2017,Alameda County,6596
1,2017,"Alpine, Inyo, Mono Counties",161
2,2017,"Amador, Calaveras, Mariposa, Tuolumne Counties",383
3,2017,Butte County,1270
4,2017,California,183228
5,2017,"Colusa, Glenn, Trinity Counties",198
6,2017,Contra Costa County,4782
7,2017,El Dorado County,133
8,2017,"Fresno, Madera Counties",2225
9,2017,Glendale (Los Angeles County),336


In [44]:
#intoducing COC to County data
coc_county_df = pd.read_csv(coc_county_path)
coc_county_df

Unnamed: 0,County,Continuum of Care
0,Alameda County,Alameda County CoC
1,Alpine County,"Alpine, Inyo, Mono Counties CoC"
2,Inyo County,"Alpine, Inyo, Mono Counties CoC"
3,Mono County,"Alpine, Inyo, Mono Counties CoC"
4,Amador County,"Amador, Calaveras, Mariposa, Tuolumne Counties..."
5,Calaveras County,"Amador, Calaveras, Mariposa, Tuolumne Counties..."
6,Mariposa County,"Amador, Calaveras, Mariposa, Tuolumne Counties..."
7,Tuolumne County,"Amador, Calaveras, Mariposa, Tuolumne Counties..."
8,Butte County,Butte County CoC
9,California,California


In [20]:
#introducing successful vocational services
success_df = pd.read_csv(success_path)
success_df

Unnamed: 0,Year,County,Occupation,Successful Closures,Annotation Code,Count Annotation Desc,Percent,Percent Annotation Desc
0,2014,Alameda,Not Reported,67.0,,,,
1,2014,Alameda,Clerical and Administrative Support,114.0,,,,
2,2014,Alameda,Community and Social Service Occupations,3.0,,,,
3,2014,Alameda,Computer and Mathematical Occupations,5.0,,,,
4,2014,Alameda,"Education, Training, and Library Occupations",4.0,,,,
...,...,...,...,...,...,...,...,...
1935,2017,Yuba,"Professional, Paraprofessional and Technical",,1.0,Cell suppressed for small number. A score over...,1.0,This value represents less than or equal to 1 ...
1936,2017,Yuba,Protective Service Occupations,,1.0,Cell suppressed for small number. A score over...,1.0,This value represents less than or equal to 1 ...
1937,2017,Yuba,RSA Special Occupations and Miscellaneous,,1.0,Cell suppressed for small number. A score over...,1.0,This value represents less than or equal to 1 ...
1938,2017,Yuba,Sales and Related Occupations,,1.0,Cell suppressed for small number. A score over...,1.0,This value represents less than or equal to 1 ...


In [21]:
#displaying columns to display relevance
success_df.columns

Index(['Year', 'County', 'Occupation', 'Successful Closures',
       'Annotation Code', 'Count Annotation Desc', 'Percent',
       'Percent Annotation Desc'],
      dtype='object')

In [22]:
#removing irrelevant columns
del success_df['Occupation']
del success_df['Annotation Code']
del success_df['Count Annotation Desc']
del success_df['Percent']
del success_df['Percent Annotation Desc']

#replacing na w/ value of 0
success_df = success_df.fillna(0)
success_df

Unnamed: 0,Year,County,Successful Closures
0,2014,Alameda,67.0
1,2014,Alameda,114.0
2,2014,Alameda,3.0
3,2014,Alameda,5.0
4,2014,Alameda,4.0
...,...,...,...
1935,2017,Yuba,0.0
1936,2017,Yuba,0.0
1937,2017,Yuba,0.0
1938,2017,Yuba,0.0


In [23]:
#reducing data to year 2017
success_df = success_df.loc[success_df["Year"] == 2017 , :]

#adding total number of successful closures per county per year
success_df = success_df.groupby(['Year', 'County'])['Successful Closures'].sum().reset_index()
success_df

Unnamed: 0,Year,County,Successful Closures
0,2017,Alameda,368.0
1,2017,Amador,1.0
2,2017,Butte,104.0
3,2017,Calaveras,0.0
4,2017,Colusa,0.0
5,2017,Contra Costa,253.0
6,2017,Del Norte,1.0
7,2017,El Dorado,33.0
8,2017,Fresno,145.0
9,2017,Glenn,14.0


In [30]:
#merging together vocational data
merged_vocation_df = pd.merge(vocation_df, success_df, on=['Year', 'County'], how='inner')
merged_vocation_df

Unnamed: 0,Year,County,Number of Consumers,Successful Closures
0,2017,Alameda,4026,368.0
1,2017,Amador,33,1.0
2,2017,Butte,883,104.0
3,2017,Calaveras,36,0.0
4,2017,Colusa,36,0.0
5,2017,Del Norte,202,1.0
6,2017,El Dorado,344,33.0
7,2017,Fresno,2544,145.0
8,2017,Glenn,188,14.0
9,2017,Humboldt,639,74.0


In [31]:
#organized data by year by county
merged_vocation_df = merged_vocation_df.set_index(['County'])
merged_vocation_df

Unnamed: 0_level_0,Year,Number of Consumers,Successful Closures
County,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alameda,2017,4026,368.0
Amador,2017,33,1.0
Butte,2017,883,104.0
Calaveras,2017,36,0.0
Colusa,2017,36,0.0
Del Norte,2017,202,1.0
El Dorado,2017,344,33.0
Fresno,2017,2544,145.0
Glenn,2017,188,14.0
Humboldt,2017,639,74.0


In [32]:
#merging all data frames into one and adjusting formating
master_df = pd.merge(merged_vocation_df, homeless_df, on=['Year', 'County'], how='left')
pd.options.display.float_format = '{:,.2f}'.format
master_df

Unnamed: 0,Year,Number of Consumers,Successful Closures,COC_NAME,Homeless_count
0,2017,4026,368.00,Alameda County,6596
1,2017,4026,368.00,"Alpine, Inyo, Mono Counties",161
2,2017,4026,368.00,"Amador, Calaveras, Mariposa, Tuolumne Counties",383
3,2017,4026,368.00,Butte County,1270
4,2017,4026,368.00,California,183228
...,...,...,...,...,...
2503,2017,211,1.00,Sonoma County,3086
2504,2017,211,1.00,Stanislaus County,6187
2505,2017,211,1.00,Tehama County,395
2506,2017,211,1.00,Ventura County,1926


In [None]:
#scatter plot for # of consumers and successful closures
master_df.plot(kind= 'scatter',x= 'Successful Closures', y= 'Unemployment', loglog= True)