In [1]:
# TODO: refactor this into a utility.

import random
from itertools import chain
import pandas as pd
import networkx as nx
from itertools import combinations  
import requests 
import numpy as np
from datetime import datetime, date, timedelta

class DataGenerator:
    """Generate synthetic hospital discharge and clinical-note tables.

    Doctor and patient names are fetched from the randomuser.me web API, so
    constructing an instance performs HTTP requests.  Every patient gets one
    stay whose arrival date is drawn at random from ``date_range`` and whose
    length is 1..``max_stay_per_patient`` days.  Every stay gets
    1..``max_notes_per_discharge`` notes, each written by a randomly chosen
    doctor and dated with the stay's arrival date.

    Attributes set by the constructor:
        MAX_NOTES_PER_DIS: upper bound on notes per discharge.
        MAX_STAY_PER_PATIENT: upper bound on stay length in days.
        drs, patients: lists of "First Last" name strings.
        start, end: bounds of the arrival-date range.
        date_range: candidate arrival dates, one per day from ``start`` up
            to but excluding ``end``.
        dis_df: DataFrame with columns ``id``, ``patient``, ``arrive_date``
            and ``discharge_date`` (one row per patient).
        note_df: DataFrame with columns ``discharge_id``, ``dr`` and ``date``.
    """

    # Seconds to wait for the name service before giving up instead of
    # blocking forever on an unresponsive server.
    _REQUEST_TIMEOUT = 10

    def __init__(self, num_docs=10, num_patients=100, max_notes_per_discharge=10,
                 max_stay_per_patient=6, start=None, end=None):
        """Fetch names, build the date range and generate both tables.

        Args:
            num_docs: number of doctor names to request.
            num_patients: number of patient names to request.
            max_notes_per_discharge: inclusive upper bound on notes per stay.
            max_stay_per_patient: inclusive upper bound on stay length (days).
            start: first candidate arrival date; defaults to midnight one
                calendar year before ``end``.
            end: exclusive end of the arrival-date range; defaults to now.
        """
        self.MAX_NOTES_PER_DIS = max_notes_per_discharge
        self.MAX_STAY_PER_PATIENT = max_stay_per_patient
        self.drs = self._get_random_names(num_docs)
        self.patients = self._get_random_names(num_patients)
        # Identity checks: ``== None`` can be overridden by the operand type.
        self.end = datetime.now() if end is None else end
        self.start = self._one_year_before(self.end) if start is None else start
        self._create_date_range()
        self._generate_data()

    @staticmethod
    def _one_year_before(moment):
        """Return midnight on the same calendar day one year before *moment*.

        Feb 29 is mapped to Feb 28 because the year before a leap year is
        never a leap year; building the date naively raises ``ValueError``.
        """
        day = moment.day
        if moment.month == 2 and day == 29:
            day = 28
        return datetime(moment.year - 1, moment.month, day)

    def _get_random_names(self, count):
        """Return "First Last" names fetched from randomuser.me.

        Raises:
            requests.RequestException: on timeout, connection failure or a
                non-success HTTP status.
        """
        response = requests.get(
            f'https://randomuser.me/api?results={count}&inc=name&nat=us,gb,es',
            timeout=self._REQUEST_TIMEOUT,
        )
        # Fail with the HTTP error itself rather than a later JSON/KeyError.
        response.raise_for_status()
        return [f"{user['name']['first']} {user['name']['last']}"
                for user in response.json()['results']]

    def _get_date(self):
        """Return a random entry of ``self.date_range``."""
        return random.choice(self.date_range)

    def _generate_notes_for_discharge(self, discharge):
        """Return 1..MAX_NOTES_PER_DIS note tuples for one discharge tuple.

        Each note is ``(discharge id, random doctor, arrival date)``, taken
        from positions 0 and 2 of *discharge*.
        """
        discharge_id, arrive_date = discharge[0], discharge[2]
        note_count = random.randint(1, self.MAX_NOTES_PER_DIS)
        return [(discharge_id, random.choice(self.drs), arrive_date)
                for _ in range(note_count)]

    def _create_date_range(self):
        """Populate ``self.date_range`` with one entry per day in [start, end)."""
        days = np.arange(self.start, self.end, np.timedelta64(1, 'D'))
        # Convert to Python objects so ``timedelta`` arithmetic works on them.
        self.date_range = list(days.astype(object))

    def _generate_notes(self, discharges):
        """Yield one list of note tuples per discharge, in input order."""
        for discharge in discharges:
            yield self._generate_notes_for_discharge(discharge)

    def _get_random_timedelta(self):
        """Return a random stay length of 1..MAX_STAY_PER_PATIENT days."""
        return timedelta(days=random.randint(1, self.MAX_STAY_PER_PATIENT))

    def _generate_data(self):
        """Build ``self.dis_df`` and ``self.note_df`` from the current state."""
        discharges = []
        for discharge_id, patient in enumerate(self.patients):
            arrive_date = self._get_date()
            discharge_date = arrive_date + self._get_random_timedelta()
            discharges.append((discharge_id, patient, arrive_date, discharge_date))

        notes = list(chain.from_iterable(self._generate_notes(discharges)))

        self.dis_df = pd.DataFrame(
            discharges, columns=['id', 'patient', 'arrive_date', 'discharge_date'])
        self.note_df = pd.DataFrame(notes, columns=['discharge_id', 'dr', 'date'])
    
if __name__ == "__main__":
    # Build a data set with the default sizes, then show the first rows of the
    # notes table followed by its (rows, columns) shape, one per line.
    data = DataGenerator()
    print(data.note_df.head(), data.note_df.shape, sep="\n")



   discharge_id                   dr       date
0             0    Dolores Hernandez 2021-01-09
1             0           Mario Dunn 2021-01-09
2             1         Dora Gregory 2020-08-21
3             1  Patricia Washington 2020-08-21
4             1           Ivan Ramos 2020-08-21
(516, 3)
