In [95]:
from datetime import date
import random
import pandas as pd

In [96]:
def pick_date(): 
    start_date = date.today().replace(day=1, month=1).toordinal()
    end_date = date.today().replace(day=31, month=12).toordinal()
    random_day = date.fromordinal(random.randint(start_date, end_date))
    return random_day

In [97]:
patient_ids = []
er_visit_dates = [] 

num_users = 1000

for i in range(num_users):
    dates_per_user = random.randint(1,5)
    for j in range(dates_per_user):
        patient_ids.append(i)
        er_visit_dates.append(pick_date())

In [98]:
df = pd.DataFrame({"patient_id": patient_ids, "er_visit_date": er_visit_dates})

In [99]:
df.head(20)

Unnamed: 0,patient_id,er_visit_date
0,0,2020-11-25
1,0,2020-09-07
2,0,2020-10-13
3,0,2020-08-29
4,1,2020-10-14
5,1,2020-09-30
6,1,2020-09-18
7,2,2020-07-05
8,2,2020-08-02
9,2,2020-02-13


In [100]:
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())

In [101]:
#cumulative visits

In [102]:
pysqldf("SELECT * FROM df LIMIT 20")

Unnamed: 0,patient_id,er_visit_date
0,0,2020-11-25
1,0,2020-09-07
2,0,2020-10-13
3,0,2020-08-29
4,1,2020-10-14
5,1,2020-09-30
6,1,2020-09-18
7,2,2020-07-05
8,2,2020-08-02
9,2,2020-02-13


In [103]:
df_month = pysqldf("""
SELECT 
    strftime('%m', er_visit_date) as month, COUNT(patient_id) as er_visits
FROM
    df
GROUP BY 
    month
""")

In [104]:
df_month

Unnamed: 0,month,er_visits
0,1,253
1,2,255
2,3,259
3,4,256
4,5,242
5,6,235
6,7,260
7,8,233
8,9,283
9,10,240


In [105]:
# month on month change, as a percentage

In [106]:
pysqldf("""
SELECT 
    a.month, a.er_visits, b.month, b.er_visits, (a.er_visits * 1.0) / (b.er_visits * 1.0) - 1 as pct_change
FROM 
    df_month a
LEFT JOIN
    df_month b
ON
    a.month -1 = b.month + 0
""")

Unnamed: 0,month,er_visits,month.1,er_visits.1,pct_change
0,1,253,,,
1,2,255,1.0,253.0,0.007905
2,3,259,2.0,255.0,0.015686
3,4,256,3.0,259.0,-0.011583
4,5,242,4.0,256.0,-0.054688
5,6,235,5.0,242.0,-0.028926
6,7,260,6.0,235.0,0.106383
7,8,233,7.0,260.0,-0.103846
8,9,283,8.0,233.0,0.214592
9,10,240,9.0,283.0,-0.151943


#### exercise
Try building a more realistic distribution of ER visits. How would you model random events that send large numbers of people to the ER? How would you model predictable fluctuations in ER visits? How would you model the high frequency of ER visits from a small number of ER patients?

### Repeat ER Visitors  

What percentage of ER visitors were admitted in a previous month?

In [134]:
pysqldf("""

WITH prev_visits AS (

SELECT 
    strftime('%m', a.er_visit_date) as month, COUNT(a.patient_id) as visits
FROM 
    df a 
JOIN
    df b
ON
    a.patient_id = b.patient_id
AND
    strftime('%m', a.er_visit_date) - 1 = strftime('%m', b.er_visit_date) + 0
GROUP BY
    month
    
),

total_visits AS (

SELECT
    strftime('%m', er_visit_date) as month, COUNT(patient_id) as visits
FROM 
    df
GROUP BY
    month
)


SELECT
    (a.visits * 1.0) / (b.visits * 1.0)
FROM 
    prev_visits a
JOIN
    total_visits b
ON a.month = b.month
""")

Unnamed: 0,(a.visits * 1.0) / (b.visits * 1.0)
0,0.227451
1,0.258687
2,0.21875
3,0.235537
4,0.2
5,0.215385
6,0.270386
7,0.24735
8,0.270833
9,0.209091
