# Imports

In [2]:
import json
import pandas as pd
import numpy as np
import sqlalchemy as db
import statsmodels.api as sm

from sodapy import Socrata

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

# Data Scraping

### Data: DHS Daily Report
- Source: https://data.cityofnewyork.us/Social-Services/DHS-Daily-Report/k46n-sa2m

This dataset includes the daily number of individuals residing in the Department of Homeless Services (DHS) shelter system and the daily number of families applying to the DHS shelter system.

---

### 1. Authorization

- **Step 1** Save your credentials in a JSON file in the following format:

```
{"app_token": "Enter your app token here"}
```

- **Step 2** Name the JSON file "creds.json" and save it in the directory where this Jupyter notebook is located.

In [3]:
# Read "creds.json"; the context manager closes the file handle once it is parsed
with open('./creds.json', 'r') as creds_file:
    socrata_creds = json.load(creds_file)

In [4]:
# Create the Socrata client for the NYC OpenData domain, authenticated with your app token
nyc_open_data_domain = "data.cityofnewyork.us"
client = Socrata(nyc_open_data_domain, socrata_creds['app_token'])

### 2. Return the dataset as Pandas Dataframe
Use the Socrata Open Data API (SODA) to return the dataset as a list of dictionaries and convert it into a pandas DataFrame.
SODA provides programmatic access to NYC OpenData datasets.
- Source: https://dev.socrata.com/foundry/data.cityofnewyork.us/k46n-sa2m

In [5]:
# Fetch the DHS Daily Report records by their "Dataset Identifier"
results = client.get("k46n-sa2m", limit=10_000)

# Load the fetched records into a pandas DataFrame
daily_df = pd.DataFrame.from_records(results)

# Data Wrangling

In [6]:
# Inspect the column dtypes and non-null counts of the downloaded data
daily_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2230 entries, 0 to 2229
Data columns (total 13 columns):
date_of_census                                             2230 non-null object
total_adults_in_shelter                                    2230 non-null object
total_children_in_shelter                                  2230 non-null object
total_individuals_in_shelter                               2230 non-null object
single_adult_men_in_shelter                                2230 non-null object
single_adult_women_in_shelter                              2230 non-null object
total_single_adults_in_shelter                             2230 non-null object
families_with_children_in_shelter                          2230 non-null object
adults_in_families_with_children_in_shelter                2230 non-null object
children_in_families_with_children_in_shelter              2230 non-null object
total_individuals_in_families_with_children_in_shelter_    2230 non-null object
adult_familie

- Every column has dtype 'object'. Convert the `date_of_census` column to datetime and the rest of the columns to numeric types in order to make time-series analysis easier.
- No null value observed.

In [7]:
# Convert the `date_of_census` column to datetime and set it as the index.
# Only the part before 'T' (the calendar date) is kept; the remaining string
# is dash-separated (YYYY-MM-DD), so the format string must use dashes too.
census_dates = daily_df['date_of_census'].str.split('T').str[0]
daily_df.index = pd.to_datetime(census_dates, format="%Y-%m-%d")
daily_df = daily_df.drop('date_of_census', axis=1)

In [8]:
# Convert every remaining column to a numeric dtype, one column at a time
daily_df = daily_df.apply(pd.to_numeric)

In [9]:
# Check that the data cleaning was executed correctly.
daily_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2230 entries, 2019-11-30 to 2013-08-21
Data columns (total 12 columns):
total_adults_in_shelter                                    2230 non-null int64
total_children_in_shelter                                  2230 non-null int64
total_individuals_in_shelter                               2230 non-null int64
single_adult_men_in_shelter                                2230 non-null int64
single_adult_women_in_shelter                              2230 non-null int64
total_single_adults_in_shelter                             2230 non-null int64
families_with_children_in_shelter                          2230 non-null int64
adults_in_families_with_children_in_shelter                2230 non-null int64
children_in_families_with_children_in_shelter              2230 non-null int64
total_individuals_in_families_with_children_in_shelter_    2230 non-null int64
adult_families_in_shelter                                  2230 non-null int64
indivi

- The index of the dataset is now 'DatetimeIndex' and all columns are numeric.
- No null values.

---

In [10]:
# (rows, columns) of the cleaned daily data
daily_df.shape

(2230, 12)

In [11]:
# (rows, columns) after resampling to calendar-day frequency
daily_df.resample('D').mean().shape

(2293, 12)

In [12]:
# Count the null values per column after resampling to calendar-day frequency
daily_df.resample('D').mean().isnull().sum()

total_adults_in_shelter                                    63
total_children_in_shelter                                  63
total_individuals_in_shelter                               63
single_adult_men_in_shelter                                63
single_adult_women_in_shelter                              63
total_single_adults_in_shelter                             63
families_with_children_in_shelter                          63
adults_in_families_with_children_in_shelter                63
children_in_families_with_children_in_shelter              63
total_individuals_in_families_with_children_in_shelter_    63
adult_families_in_shelter                                  63
individuals_in_adult_families_in_shelter                   63
dtype: int64

When resampled by day ('D'), 63 additional rows are created. This indicates that some days are missing from the data; resampling inserts those days as rows filled with null values. Resample to weekly averages instead so that no period is left without data.

In [13]:
# (rows, columns) after resampling to weekly frequency
daily_df.resample('W').mean().shape

(328, 12)

In [14]:
# Count the null values per column after resampling to weekly frequency
daily_df.resample('W').mean().isnull().sum()

total_adults_in_shelter                                    0
total_children_in_shelter                                  0
total_individuals_in_shelter                               0
single_adult_men_in_shelter                                0
single_adult_women_in_shelter                              0
total_single_adults_in_shelter                             0
families_with_children_in_shelter                          0
adults_in_families_with_children_in_shelter                0
children_in_families_with_children_in_shelter              0
total_individuals_in_families_with_children_in_shelter_    0
adult_families_in_shelter                                  0
individuals_in_adult_families_in_shelter                   0
dtype: int64

In [154]:
# Average the daily counts within each week, then order the weeks newest-first
weekly_df = (
    daily_df
    .resample('W')
    .mean()
    .sort_index(ascending=False)
)

In [155]:
# Preview the first rows of the weekly averages (sorted newest-first)
weekly_df.head()

Unnamed: 0_level_0,total_adults_in_shelter,total_children_in_shelter,total_individuals_in_shelter,single_adult_men_in_shelter,single_adult_women_in_shelter,total_single_adults_in_shelter,families_with_children_in_shelter,adults_in_families_with_children_in_shelter,children_in_families_with_children_in_shelter,total_individuals_in_families_with_children_in_shelter_,adult_families_in_shelter,individuals_in_adult_families_in_shelter
date_of_census,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019-12-01,38273.166667,21709.333333,59982.5,12128.833333,4590.5,16719.333333,12195.166667,16308.5,21709.333333,38017.833333,2492.333333,5245.333333
2019-11-24,38490.571429,21801.571429,60292.142857,12251.142857,4613.428571,16864.571429,12230.142857,16361.857143,21801.571429,38163.428571,2499.714286,5264.142857
2019-11-17,38533.142857,21885.857143,60419.0,12178.285714,4616.142857,16794.428571,12265.142857,16425.428571,21885.857143,38311.285714,2523.571429,5313.285714
2019-11-10,38291.857143,21797.285714,60089.142857,12072.285714,4551.428571,16623.714286,12213.857143,16347.571429,21797.285714,38144.857143,2523.571429,5320.571429
2019-11-03,38242.0,21747.142857,59989.142857,12013.714286,4556.285714,16570.0,12205.714286,16342.142857,21747.142857,38089.285714,2526.428571,5329.857143


# Database Construction

A PostgreSQL database was constructed using Heroku.

- Source: https://www.heroku.com/postgres

---
### Sign in

- **Step 1** Save your PostgreSQL URL in a JSON file in the following format:

```
{"url":"Enter your PostgreSQL URL here"}
```

- **Step 2** Name the JSON file "database_cred.json" and save it in the directory where this Jupyter notebook is located.

In [5]:
# Load your PostgreSQL credential into `database_creds`; the context manager
# closes the file handle once it is parsed
with open('./database_cred.json', 'r') as database_creds_file:
    database_creds = json.load(database_creds_file)

In [6]:
# Build the SQLAlchemy engine from the stored PostgreSQL URL
database_url = database_creds['url']
engine = db.create_engine(database_url)

# Metadata container, plus an open connection used to run the SQL statements
metadata = db.MetaData()
connection = engine.connect()

### Create table `ny_dhs_weekly`

In [160]:
# Build the CREATE TABLE statement from the columns of `weekly_df`, the frame
# whose rows are inserted into this table, so the table's column order always
# matches the inserted values (rather than relying on the key order of the
# first raw API record).
column_defs = ["date_of_census DATE"] + [f"{col} INT" for col in weekly_df.columns]
sql = f"CREATE TABLE ny_dhs_weekly ({', '.join(column_defs)})"

connection.execute(sql)

<sqlalchemy.engine.result.ResultProxy at 0x1c1ae23f60>

### Insert values in 'weekly_df' dataframe to `ny_dhs_weekly` table 

In [161]:
# Build a single multi-row INSERT statement from `weekly_df`.
# The target columns are listed explicitly so the statement does not depend on
# the physical column order of the table.
column_list = ", ".join(["date_of_census", *weekly_df.columns])

value_rows = []
for n_row in range(len(weekly_df)):
    week_end = weekly_df.index[n_row].date()
    # The table columns are INT, so each weekly mean is truncated with int().
    counts = ", ".join(str(int(value)) for value in weekly_df.iloc[n_row])
    value_rows.append(f"('{week_end}', {counts})")

# An INSERT with an empty VALUES list is not valid SQL, so skip the statement
# entirely when there are no rows to write.
if value_rows:
    sql = f"INSERT INTO ny_dhs_weekly ({column_list}) VALUES " + ", ".join(value_rows)
    connection.execute(sql)

<sqlalchemy.engine.result.ResultProxy at 0x1c1aed84a8>

### Query `ny_dhs_weekly` to see if the database is correctly constructed.

In [7]:
# Select every row and column of `ny_dhs_weekly`; the same query string is
# reused below to load the table into pandas.
sql = """
SELECT *
FROM ny_dhs_weekly
"""
# Run the query once on the open connection to confirm it executes
connection.execute(sql)

<sqlalchemy.engine.result.ResultProxy at 0x113fb6198>

In [8]:
# Load the result of the SELECT query into a pandas DataFrame
database_df = pd.read_sql_query(sql, engine)

In [9]:
# Preview the first rows read back from the database
database_df.head()

Unnamed: 0,date_of_census,total_adults_in_shelter,total_children_in_shelter,total_individuals_in_shelter,single_adult_men_in_shelter,single_adult_women_in_shelter,total_single_adults_in_shelter,families_with_children_in_shelter,adults_in_families_with_children_in_shelter,children_in_families_with_children_in_shelter,total_individuals_in_families_with_children_in_shelter_,adult_families_in_shelter,individuals_in_adult_families_in_shelter
0,2019-12-01,38273,21709,59982,12128,4590,16719,12195,16308,21709,38017,2492,5245
1,2019-11-24,38490,21801,60292,12251,4613,16864,12230,16361,21801,38163,2499,5264
2,2019-11-17,38533,21885,60419,12178,4616,16794,12265,16425,21885,38311,2523,5313
3,2019-11-10,38291,21797,60089,12072,4551,16623,12213,16347,21797,38144,2523,5320
4,2019-11-03,38242,21747,59989,12013,4556,16570,12205,16342,21747,38089,2526,5329


In [10]:
# Inspect the dtypes and non-null counts of the data read back from the database
database_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 328 entries, 0 to 327
Data columns (total 13 columns):
date_of_census                                             328 non-null object
total_adults_in_shelter                                    328 non-null int64
total_children_in_shelter                                  328 non-null int64
total_individuals_in_shelter                               328 non-null int64
single_adult_men_in_shelter                                328 non-null int64
single_adult_women_in_shelter                              328 non-null int64
total_single_adults_in_shelter                             328 non-null int64
families_with_children_in_shelter                          328 non-null int64
adults_in_families_with_children_in_shelter                328 non-null int64
children_in_families_with_children_in_shelter              328 non-null int64
total_individuals_in_families_with_children_in_shelter_    328 non-null int64
adult_families_in_shelter           