In [None]:
import pandas as pd
import random

# Load the dataset
df_visits = pd.read_csv('df_melt_sorted_fin.csv')

# Distinct visit identifiers, so the same visit cannot be drawn twice
unique_visit_ids = df_visits['VISIT_ID'].unique()

# Draw up to 10 visits with a dedicated, seeded generator: the sample is the
# same on every Restart & Run All, the global `random` state is untouched, and
# a file with fewer than 10 visits no longer raises ValueError.
SAMPLE_SEED = 42
visit_sampler = random.Random(SAMPLE_SEED)
sample_size = min(10, len(unique_visit_ids))
random_visit_ids = visit_sampler.sample(list(unique_visit_ids), sample_size)

# Print the activity sequence of each sampled visit
for visit_id in random_visit_ids:
    visit_data = df_visits[df_visits['VISIT_ID'] == visit_id]
    # NOTE(review): 'Start_Time' is presumably still raw CSV text here (it is only
    # parsed with pd.to_datetime further down), so this sort is chronological only
    # if the text format sorts like a timestamp - confirm.
    activities_sequence = visit_data[['Activity', 'Start_Time']].sort_values('Start_Time')
    print(f"Visit ID: {visit_id}\n", activities_sequence)


In [None]:
# Show the distinct values of the 'Activity' column in the raw frame.
df_visits['Activity'].unique()

In [None]:

# Columns that together identify one time slot of one visit
key_columns = ['VISIT_ID', 'Start_Time']

# Number of (non-null) activities recorded in every visit/time slot
grouped = df_visits.groupby(key_columns)['Activity'].count()

# Slots that hold two or more activities
same_time_activities = grouped.loc[grouped.ge(2)]

# The (VISIT_ID, Start_Time) pairs of those slots
valid_indices = same_time_activities.index

# Build the same pair for every row of the raw frame and keep the rows whose
# pair is among the shared slots
row_keys = pd.MultiIndex.from_frame(df_visits[key_columns])
is_shared_slot = row_keys.isin(valid_indices)
result = df_visits.loc[is_shared_slot].reset_index(drop=True)

result.info()


In [None]:
# Activity pairs that are accepted when they share one timestamp within a visit
allowed_combinations = [
    set(pair)
    for pair in (
        ('Assessment', 'Triage'),
        ('Triage', 'Providing service'),
    )
]

In [None]:
# Group rows by ('VISIT_ID', 'Start_Time') and keep the GroupBy object itself
# (no aggregation), so the rows of each group can be iterated over afterwards.
grouped = df_visits.groupby(['VISIT_ID', 'Start_Time'])

In [None]:
# Accumulator for the row labels of groups that fail the combination check;
# it starts out empty and is filled by the loop over the groups.
invalid_indices = list()

In [None]:
# Walk over every (visit, timestamp) group and record the rows of groups whose
# simultaneous activities are not an allowed combination.
for (visit_id, start_time), group in grouped:
    # Distinct activities recorded at this timestamp
    activity_set = set(group['Activity'])

    # A single distinct activity cannot be an invalid combination
    if len(activity_set) <= 1:
        continue

    # Exactly matching one of the allowed sets means the group is kept
    if activity_set in allowed_combinations:
        continue

    # Anything else is flagged: remember all row labels of the group
    invalid_indices.extend(group.index.tolist())

In [None]:
# Remove the flagged rows (by index label) from the raw frame; df_visits itself
# is left unchanged because drop() returns a new frame.
filtered_df = df_visits.drop(index=invalid_indices)

# Summary of the frame after the removal
filtered_df.info()

In [None]:
# Summary of the raw frame, for comparison with filtered_df.info().
df_visits.info()

In [None]:
# Preview the first rows of the filtered frame.
filtered_df.head()

In [None]:
# Take an independent copy: a plain assignment would make both names refer to
# the same object, so later column writes on clean_df_visits (e.g. parsing
# 'Start_Time') would silently change filtered_df as well.
clean_df_visits = filtered_df.copy()

In [None]:
# Row counts before and after removing the invalid same-time groups.
print(f"Original DataFrame size: {df_visits.shape[0]}")
print(f"Cleaned DataFrame size: {clean_df_visits.shape[0]}")



In [None]:
# Summary of the cleaned frame.
clean_df_visits.info()

In [None]:
# Distinct activities that remain in the cleaned frame.
clean_df_visits['Activity'].unique()

In [None]:
import pandas as pd

# Parse 'Start_Time' into datetimes. assign() builds a new frame instead of
# writing into the existing object, so any other variable still bound to the
# pre-conversion frame is not modified, and re-running this cell is harmless.
clean_df_visits = clean_df_visits.assign(
    Start_Time=pd.to_datetime(clean_df_visits['Start_Time'])
)

# Earliest and latest 'Start_Time' of every visit.
# Note that 'End_Time' is therefore a visit-level value (the start of the
# visit's last recorded activity), not the end of an individual activity.
time_stats = (
    clean_df_visits.groupby('VISIT_ID')['Start_Time']
    .agg(['min', 'max'])
    .rename(columns={'min': 'Earliest_Time', 'max': 'End_Time'})
    .reset_index()
)

# Visit duration as a Timedelta (difference of two datetime columns)
time_stats['Duration'] = time_stats['End_Time'] - time_stats['Earliest_Time']

# Attach the three visit-level columns to every row of the visit
clean_df_visits_dur = clean_df_visits.merge(
    time_stats[['VISIT_ID', 'Earliest_Time', 'End_Time', 'Duration']],
    on='VISIT_ID',
    how='left',
)



In [None]:
# Summary of the frame after the visit-level time columns were merged in.
clean_df_visits_dur.info()

In [None]:

# Preview the first 10 rows; after the merge every row carries the visit-level
# 'Earliest_Time', 'End_Time' and 'Duration' columns.
clean_df_visits_dur.head(10)

In [None]:
# Distinct facility names present in the data.
clean_df_visits_dur['FACILITY_NAME'].unique()

In [None]:
# Facilities whose visits are retained
facilities_to_keep = [
    'Health Sciences Centre - St. Johns',
    'St. Clares Mercy Hospital - St. Johns',
    'Dr. G.B. Cross Memorial Hospital - Clarenville',
    'Carbonear General Hospital - Carbonear'
]

# Boolean row selector: True where FACILITY_NAME is one of the names above
at_kept_facility = clean_df_visits_dur['FACILITY_NAME'].isin(facilities_to_keep)

# Rows belonging to the retained facilities
df_visits_dur_filtered = clean_df_visits_dur.loc[at_kept_facility]


In [None]:
# Define the sequences of activities to keep
sequence1 = ['Triage', 'Providing service', 'Assessment', 'Patient departed']
sequence2 = ['Triage', 'Providing service', 'Patient departed']
sequence3 = ['Triage', 'Providing service', 'Assessment', 'Making admit decision', 'Admitting patient', 'Patient departed', 'Patient discharge']

# Function to check if the visit activities match one of the sequences
def check_sequence(group, allowed_sequences=None):
    """Return True when a visit's activities, ordered by 'Start_Time', equal an allowed sequence.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one visit; must have 'Start_Time' and 'Activity' columns.
    allowed_sequences : iterable of sequences of str, optional
        Accepted activity orders. When omitted, sequence1, sequence2 and
        sequence3 are looked up at call time, which matches the previous
        behaviour of this function.

    Returns
    -------
    bool
    """
    if allowed_sequences is None:
        allowed_sequences = (sequence1, sequence2, sequence3)
    # A stable sort keeps rows with identical 'Start_Time' in their existing
    # order, so the outcome for tied timestamps is deterministic.
    activities = group.sort_values('Start_Time', kind='stable')['Activity'].tolist()
    return any(activities == list(candidate) for candidate in allowed_sequences)

# Keep every visit (across all facilities) whose ordered activities pass
# check_sequence; filter() retains all rows of the visits that pass.
all_visits_by_id = clean_df_visits_dur.groupby('VISIT_ID')
filtered_visits = all_visits_by_id.filter(check_sequence)

# Number of distinct visits that survived the filter
unique_visit_count = filtered_visits['VISIT_ID'].nunique()

unique_visit_count


In [None]:
import pandas as pd


# Step 1: Sort by VISIT_ID and Start_Time to ensure the sequence.
# sort_values returns a new frame (filtered_visits is not modified); the stable
# sort keeps rows with identical timestamps in their existing order.
sorted_visits = filtered_visits.sort_values(by=['VISIT_ID', 'Start_Time'], kind='stable')

# Start of the following activity within the same visit
# (NaT on the last activity of every visit).
sorted_visits['Next_Start_Time'] = sorted_visits.groupby('VISIT_ID')['Start_Time'].shift(-1)

# Per-activity duration in hours.
# 'End_Time' is the visit-level latest 'Start_Time' (identical on every row of
# a visit), so 'End_Time - Start_Time' measured the time left until the end of
# the visit, not the activity's own duration. The data only carries start
# timestamps, so an activity is taken to last until the next activity of the
# same visit starts; the last activity of a visit has no successor and is NaN,
# which mean()/std() skip.
sorted_visits['Activity_Duration'] = (
    sorted_visits['Next_Start_Time'] - sorted_visits['Start_Time']
).dt.total_seconds() / 3600.0

# Transition to the next activity, in hours.
# The previous formula (next start minus the visit-level 'End_Time') could never
# be positive. With start-only timestamps a separate hand-over gap cannot be
# observed, so the step is the gap between consecutive starts.
# NOTE(review): this makes Step_Duration equal to Activity_Duration - confirm
# that no per-activity end timestamp is available upstream.
sorted_visits['Step_Duration'] = sorted_visits['Activity_Duration']

# Standard deviation and mean of the duration, per activity type
std_activity_durations = sorted_visits.groupby('Activity')['Activity_Duration'].std()

mean_activity_durations = sorted_visits.groupby('Activity')['Activity_Duration'].mean()

# Standard deviation of the transition to the next activity, per activity type
std_step_durations = sorted_visits.groupby('Activity')['Step_Duration'].std()

std_activity_durations, mean_activity_durations

In [None]:
# Distinct activities present in the facility-filtered frame.
df_visits_dur_filtered['Activity'].unique()

In [None]:
# Define the sequences of activities to keep
sequence1 = ['Triage', 'Providing service', 'Assessment', 'Patient departed']
sequence2 = ['Triage', 'Providing service', 'Patient departed']

# Function to check if the visit activities match one of the sequences
def check_sequence(group, allowed_sequences=None):
    """Return True when a visit's activities, ordered by 'Start_Time', equal an allowed sequence.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one visit; must have 'Start_Time' and 'Activity' columns.
    allowed_sequences : iterable of sequences of str, optional
        Accepted activity orders. When omitted, sequence1 and sequence2 are
        looked up at call time, which matches the previous behaviour of this
        function.

    Returns
    -------
    bool
    """
    if allowed_sequences is None:
        allowed_sequences = (sequence1, sequence2)
    # A stable sort keeps rows with identical 'Start_Time' in their existing
    # order, so the outcome for tied timestamps is deterministic.
    activities = group.sort_values('Start_Time', kind='stable')['Activity'].tolist()
    return any(activities == list(candidate) for candidate in allowed_sequences)

# Keep the visits of the selected facilities whose ordered activities pass
# check_sequence; filter() retains all rows of the visits that pass.
facility_visits_by_id = df_visits_dur_filtered.groupby('VISIT_ID')
filtered_visits = facility_visits_by_id.filter(check_sequence)

# Number of distinct visits that survived the filter
filtered_visits['VISIT_ID'].nunique()

In [None]:
import pandas as pd

# 'Duration' is a visit-level Timedelta repeated on every row of a visit.
# Convert it to plain numbers; assign() returns a new frame, which avoids
# writing columns into the result of a filter (chained-assignment warnings)
# and keeps the cell safe to re-run.
duration_seconds = filtered_visits['Duration'].dt.total_seconds()
filtered_visits = filtered_visits.assign(
    Duration_seconds=duration_seconds,
    Duration_hours=duration_seconds / 3600,
)

# Spread of the visit duration. The statistic is taken over ONE row per visit:
# computing it over all rows would count a visit once per activity and so give
# visits with more activities more weight.
visit_durations = filtered_visits.drop_duplicates(subset='VISIT_ID')
std_duration_seconds = visit_durations['Duration_seconds'].std()
std_duration_hours = visit_durations['Duration_hours'].std()

print(f"Standard Deviation of Duration in Seconds: {std_duration_seconds} seconds")
print(f"Standard Deviation of Duration in Hours: {std_duration_hours} hours")


In [None]:
# Preview the first rows of the sequence-filtered visits.
filtered_visits.head()

In [None]:
# Collapse 'Duration_hours' to one value per visit (mean over the visit's rows)
mean_duration_hours_per_visit = (
    filtered_visits
    .groupby('VISIT_ID')['Duration_hours']
    .agg('mean')
)

# Average of those per-visit values, so each visit contributes once
overall_mean_duration_hours = mean_duration_hours_per_visit.mean()
overall_mean_duration_hours

In [None]:
import pandas as pd

# Sort by VISIT_ID and Start_Time to ensure the sequence.
# sort_values returns a new frame (filtered_visits is not modified); the stable
# sort keeps rows with identical timestamps in their existing order.
sorted_visits = filtered_visits.sort_values(by=['VISIT_ID', 'Start_Time'], kind='stable')

# Start of the following activity within the same visit
# (NaT on the last activity of every visit).
sorted_visits['Next_Start_Time'] = sorted_visits.groupby('VISIT_ID')['Start_Time'].shift(-1)

# Per-activity duration in hours.
# 'End_Time' is the visit-level latest 'Start_Time' (identical on every row of
# a visit), so 'End_Time - Start_Time' measured the time left until the end of
# the visit, not the activity's own duration. The data only carries start
# timestamps, so an activity is taken to last until the next activity of the
# same visit starts; the last activity of a visit has no successor and is NaN,
# which mean()/std() skip.
sorted_visits['Activity_Duration'] = (
    sorted_visits['Next_Start_Time'] - sorted_visits['Start_Time']
).dt.total_seconds() / 3600.0

# Transition to the next activity, in hours.
# The previous formula (next start minus the visit-level 'End_Time') could never
# be positive. With start-only timestamps a separate hand-over gap cannot be
# observed, so the step is the gap between consecutive starts.
# NOTE(review): this makes Step_Duration equal to Activity_Duration - confirm
# that no per-activity end timestamp is available upstream.
sorted_visits['Step_Duration'] = sorted_visits['Activity_Duration']

# Standard deviation and mean of the duration, per activity type
std_activity_durations = sorted_visits.groupby('Activity')['Activity_Duration'].std()

mean_activity_durations = sorted_visits.groupby('Activity')['Activity_Duration'].mean()

# Standard deviation of the transition to the next activity, per activity type
std_step_durations = sorted_visits.groupby('Activity')['Step_Duration'].std()

std_activity_durations, mean_activity_durations

In [None]:
# 'Duration_hours' values found on the 'Assessment' rows.
# NOTE(review): 'Duration_hours' is derived from the visit-level 'Duration'
# column, so these look like durations of the visits that contain an
# assessment rather than of the assessment step itself - confirm intent.
is_assessment = filtered_visits['Activity'].eq('Assessment')
assessment_durations = filtered_visits.loc[is_assessment, 'Duration_hours']

# Distinct values, plus summary statistics of their distribution
unique_assessment_durations = assessment_durations.unique()
assessment_durations_stats = assessment_durations.describe()

unique_assessment_durations, assessment_durations_stats


In [None]:
# Number of distinct visits in the cleaned frame (all facilities).
clean_df_visits_dur['VISIT_ID'].nunique()

## Creating a single-row DataFrame

In [None]:
# Summary of the frame that the time-of-day column is merged into below.
clean_df_visits_dur.info()

In [None]:

# First recorded 'Start_Time' of every visit, as a two-column frame
# ('VISIT_ID', 'First_Start_Time')
earliest_times = (
    clean_df_visits
    .groupby('VISIT_ID')['Start_Time']
    .min()
    .rename('First_Start_Time')
    .reset_index()
)

# Define a function to categorize 'First_Start_Time' into the time ranges we discussed
def categorize_time_of_day(time):
    """Map a timestamp to a coarse time-of-day label based on its hour.

    Parameters
    ----------
    time : datetime-like
        Any object exposing an ``hour`` attribute (e.g. ``pandas.Timestamp``,
        ``datetime.datetime``). Missing values (``NaT``/``None``/NaN) are accepted.

    Returns
    -------
    str or None
        'Early Morning to Morning' for hours 0-7,
        'Late Morning to Late Afternoon' for hours 8-15,
        'Evening to Night' for hours 16-23,
        and None for a missing timestamp.
    """
    # NaT has hour == nan, which fails every comparison below and would
    # otherwise fall through to 'Evening to Night'.
    if pd.isna(time):
        return None
    hour = time.hour
    if hour < 8:
        return 'Early Morning to Morning'
    if hour < 16:
        return 'Late Morning to Late Afternoon'
    # From 16:00 until midnight
    return 'Evening to Night'

# Label every visit by the time of day of its first recorded activity
earliest_times['TIME_OF_THE_DAY'] = earliest_times['First_Start_Time'].map(categorize_time_of_day)

# Attach the label to every row of the corresponding visit
time_of_day_by_visit = earliest_times[['VISIT_ID', 'TIME_OF_THE_DAY']]
df_visits_dur_hours = clean_df_visits_dur.merge(time_of_day_by_visit, on='VISIT_ID', how='left')

# Display the first few rows of the updated DataFrame
print(df_visits_dur_hours.head())


In [None]:
# Summary of the frame after 'TIME_OF_THE_DAY' was merged in.
df_visits_dur_hours.info()

In [None]:
# Write the frame to CSV in the working directory, without the index column.
df_visits_dur_hours.to_csv('df_visits_dur_timeofday-singlerow.csv', index=False)

In [None]:
# Summary of the frame that was just exported.
df_visits_dur_hours.info()

## Randomly ordering activities that share the same start time

In [None]:
# Preview the first rows before the same-timestamp activities are separated.
df_visits_dur_hours.head()

In [None]:
# Number of rows in every (VISIT_ID, Start_Time) slot
slot_sizes = df_visits_dur_hours.groupby(['VISIT_ID', 'Start_Time']).size()

# Slots that are shared by more than one row
duplicate_groups = slot_sizes[slot_sizes > 1]

# Report the shared slots, if any
if duplicate_groups.empty:
    print("No groups with the same Start_Time within the same VISIT_ID found.")
else:
    print("There are still groups with the same Start_Time within the same VISIT_ID:")
    print(duplicate_groups)


In [None]:
# Seed for the tie-breaking shuffle, so the exported order is reproducible
TIE_BREAK_SEED = 42
SORT_KEYS = ['VISIT_ID', 'Start_Time']

# Sort the dataframe to ensure proper ordering (stable: ties keep their order)
df_visits_dur_hours_sorted = df_visits_dur_hours.sort_values(by=SORT_KEYS, kind='stable')

# Rows that share their (VISIT_ID, Start_Time) with at least one other row
mask = df_visits_dur_hours_sorted.duplicated(subset=SORT_KEYS, keep=False)

# Shuffle the tied rows, then number them 1, 2, ... within each tie in shuffled
# order. Numbering the rows in their sorted order (as before) is deterministic,
# not random; the seeded shuffle is what makes the order within a tie random
# yet repeatable.
shuffled_ties = df_visits_dur_hours_sorted.loc[mask, SORT_KEYS].sample(
    frac=1, random_state=TIE_BREAK_SEED
)
tie_rank = shuffled_ties.groupby(SORT_KEYS).cumcount() + 1

# Turn the ranks into second offsets (0 s for rows that are not part of a tie)
# and apply them; the addition is aligned on the row index.
offsets = pd.to_timedelta(tie_rank, unit='s').reindex(
    df_visits_dur_hours_sorted.index, fill_value=pd.Timedelta(0)
)
df_visits_dur_hours_sorted['Start_Time'] = df_visits_dur_hours_sorted['Start_Time'] + offsets

# Re-sort so the row order follows the adjusted timestamps.
# A shifted row can in principle land on a genuine event a few seconds later
# in the same visit, so re-run the duplicate check after this step.
df_visits_dur_hours_sorted = df_visits_dur_hours_sorted.sort_values(by=SORT_KEYS, kind='stable')

In [None]:

# Number of rows in every (VISIT_ID, Start_Time) slot after the offsets
slot_sizes_after = df_visits_dur_hours_sorted.groupby(['VISIT_ID', 'Start_Time']).size()

# Slots that are still shared by more than one row
duplicate_groups = slot_sizes_after[slot_sizes_after > 1]

# Report the remaining shared slots, if any
if duplicate_groups.empty:
    print("No groups with the same Start_Time within the same VISIT_ID found.")
else:
    print("There are still groups with the same Start_Time within the same VISIT_ID:")
    print(duplicate_groups)



In [None]:
# Summary of the frame after the same-timestamp rows were offset.
df_visits_dur_hours_sorted.info()

In [None]:
# Preview the first rows of the adjusted frame.
df_visits_dur_hours_sorted.head()

In [None]:
# Write the adjusted frame to CSV in the working directory, without the index column.
df_visits_dur_hours_sorted.to_csv('df_visits_dur_timeofday-randomorder.csv', index=False)