In [9]:
# TODO: refactor this into a reusable utility module

import random
from itertools import chain
import pandas as pd
import networkx as nx
from itertools import combinations  
import requests 
import numpy as np
from datetime import datetime, date, timedelta

class DataGenerator:
    """Generate synthetic hospital discharge and clinical-note tables.

    On construction the generator fetches random doctor and patient names
    from the randomuser.me web API, builds a list of candidate dates between
    ``start`` and ``end`` and then fills two pandas DataFrames:

    * ``dis_df``  -- one row per patient with the columns ``id``,
      ``patient``, ``arrive_date``, ``discharge_date``, ``disposition``
      and ``age``.
    * ``note_df`` -- between 1 and ``max_notes_per_discharge`` rows per
      discharge with the columns ``discharge_id``, ``dr`` and ``date``.

    Args:
        num_docs: Number of doctor names to request.
        num_patients: Number of patient names to request (one discharge is
            generated per patient).
        max_notes_per_discharge: Inclusive upper bound on notes per discharge.
        max_stay_per_patient: Inclusive upper bound on the stay length in days.
        start: First candidate arrival date. Defaults to one year before
            ``end`` (at midnight).
        end: Exclusive upper bound of the candidate arrival dates. Defaults
            to ``datetime.now()``.

    Raises:
        requests.HTTPError: If the name service answers with an error status.
        requests.Timeout: If the name service does not answer in time.
    """

    # Seconds to wait for the name service before giving up instead of
    # blocking forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, num_docs=10, num_patients=100, max_notes_per_discharge=10,
                 max_stay_per_patient=6, start=None, end=None):
        self.MAX_NOTES_PER_DIS = max_notes_per_discharge
        self.MAX_STAY_PER_PATIENT = max_stay_per_patient
        self.drs = list(self._get_random_names(num_docs))
        self.patients = list(self._get_random_names(num_patients))
        self.end = datetime.now() if end is None else end
        self.start = self._default_start(self.end) if start is None else start
        self._create_date_range()
        self._generate_data()

    @staticmethod
    def _default_start(end):
        """Return midnight of the day one calendar year before ``end``.

        Feb 29 has no counterpart in the preceding (non-leap) year, so it is
        mapped to Feb 28 rather than letting ``datetime(...)`` raise
        ``ValueError``.
        """
        previous_year = end.year - 1
        if end.month == 2 and end.day == 29:
            return datetime(previous_year, 2, 28)
        return datetime(previous_year, end.month, end.day)

    def _get_random_names(self, count):
        """Fetch ``count`` random "First Last" names from randomuser.me."""
        response = requests.get(
            f'https://randomuser.me/api?results={count}&inc=name&nat=us,gb,es',
            timeout=self.REQUEST_TIMEOUT,
        )
        # Fail loudly on HTTP errors instead of with an obscure JSON/KeyError.
        response.raise_for_status()
        return [f"{user['name']['first']} {user['name']['last']}"
                for user in response.json()['results']]

    def _get_date(self):
        """Return a random entry of the candidate date range."""
        return random.choice(self.date_range)

    def _generate_notes_for_discharge(self, discharge):
        """Return 1..MAX_NOTES_PER_DIS note tuples for one discharge tuple.

        Each note is ``(discharge_id, dr, date)``; the id and date are taken
        from positions 0 and 2 (id and arrival date) of ``discharge`` and the
        doctor is drawn at random from ``self.drs``.
        """
        discharge_id, note_date = discharge[0], discharge[2]
        note_count = random.randint(1, self.MAX_NOTES_PER_DIS)
        return [(discharge_id, random.choice(self.drs), note_date)
                for _ in range(note_count)]

    def _create_date_range(self):
        """Populate ``self.date_range`` with daily steps in [start, end)."""
        days = np.arange(self.start, self.end, np.timedelta64(1, 'D'))
        # Convert numpy datetime64 values back to plain Python objects so
        # that ``timedelta`` arithmetic works on them later.
        self.date_range = list(days.astype(object))

    def _generate_notes(self, discharges):
        """Yield the list of notes for every discharge, in order."""
        for discharge in discharges:
            yield self._generate_notes_for_discharge(discharge)

    def _get_random_timedelta(self):
        """Return a random stay length of 1..MAX_STAY_PER_PATIENT days."""
        return timedelta(days=random.randint(1, self.MAX_STAY_PER_PATIENT))

    def _flip(self):
        """Return 1 with a probability of about 10 %, otherwise 0."""
        return 0 if random.random() > .1 else 1

    def _generate_data(self):
        """Build ``self.dis_df`` and ``self.note_df`` from random draws."""
        discharges = []
        for discharge_id, patient in enumerate(self.patients):
            arrive_date = self._get_date()
            discharges.append((
                discharge_id,
                patient,
                arrive_date,
                arrive_date + self._get_random_timedelta(),
                self._flip(),
                random.randint(65, 80),
            ))

        notes = list(chain.from_iterable(self._generate_notes(discharges)))

        dis_columns = ['id', 'patient', 'arrive_date', 'discharge_date', 'disposition', 'age']
        self.dis_df = pd.DataFrame(discharges, columns=dis_columns)

        note_columns = ['discharge_id', 'dr', 'date']
        self.note_df = pd.DataFrame(notes, columns=note_columns)
    
if __name__ == "__main__":
    # Build a large synthetic data set, preview both tables and persist
    # them as CSV files next to the other project data.
    generator = DataGenerator(num_patients=5000, num_docs=500)

    # Preview order: notes first, then discharges (head followed by shape).
    for frame in (generator.note_df, generator.dis_df):
        print(frame.head())
        print(frame.shape)

    csv_targets = (
        (generator.note_df, '../data/notes_w_disposition_large.csv'),
        (generator.dis_df, '../data/discharges_w_disposition_large.csv'),
    )
    for frame, path in csv_targets:
        frame.to_csv(path, index=False)



   discharge_id               dr       date
0             0  Guillermo Reyes 2020-06-07
1             0       Marc Vidal 2020-06-07
2             0  Nicolas Santana 2020-06-07
3             0    Rosario Reyes 2020-06-07
4             1     Isaac Vargas 2020-04-02
(27328, 3)
   id            patient arrive_date discharge_date  disposition  age
0   0     Kathryn Morris  2020-06-07     2020-06-10            0   71
1   1  Alexander Ramirez  2020-04-02     2020-04-05            0   71
2   2       Arron Walker  2020-09-20     2020-09-23            0   80
3   3       Judith Brown  2021-01-12     2021-01-14            0   80
4   4   Gloria Hernandez  2020-03-25     2020-03-31            0   72
(5000, 6)
