In [37]:
import pandas as pd
import numpy as np
import uuid
import random
from faker import Faker
import bcrypt
from datetime import datetime, timedelta
import os
import time
import math

In [38]:
fake = Faker()

# Seed every seedable random source used in this notebook so that the
# random / numpy / Faker draws repeat on Restart Kernel -> Run All.
# uuid.uuid4() and bcrypt.gensalt() draw from OS entropy and are not affected.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)

# Dataset size knobs read by the generator functions.
num_users = 5000  # number of user rows to generate
num_projects = 50  # number of project rows to generate

num_feedback_answers = 8000  # number of feedback-answer rows to generate

In [39]:
def generateUsers():
    """Build the column data for the users table.

    Produces ``num_users`` rows. Each row has a Faker email that is unique
    within the result, a role drawn from a weighted distribution, and a Faker
    name. Every row carries the same bcrypt hash of one placeholder password.

    Returns:
        dict: column name -> list of values, with keys ``'email'``,
        ``'pass'``, ``'role'`` and ``'name'`` (all lists have equal length).
    """
    roles = ['admin', 'user', 'manager', 'employee']
    weights = [10, 50, 20, 20]
    # Loop-invariant: normalise the weights into probabilities only once.
    role_probabilities = np.array(weights) / sum(weights)

    email_ids = []
    seen_emails = set()  # O(1) duplicate check instead of scanning the list
    passwords = []
    roles_list = []
    names = []

    # Hashed once and reused for every user; bcrypt is deliberately slow.
    password = bcrypt.hashpw('org_pass_1234'.encode('utf-8'), bcrypt.gensalt()).decode('utf-8')

    # Generate and store random data for each row
    for _ in range(num_users):
        # Redraw until the email has not been handed out before.
        email = fake.email()
        while email in seen_emails:
            email = fake.email()
        seen_emails.add(email)

        role = np.random.choice(roles, p=role_probabilities)
        name = fake.name()

        email_ids.append(email)
        passwords.append(password)
        roles_list.append(role)
        names.append(name)

    # Column-oriented dict, ready to be wrapped in a DataFrame by the caller
    data = {
        'email': email_ids,
        'pass': passwords,
        'role': roles_list,
        'name': names
    }

    return data

# Materialise the generated user columns as a DataFrame.
users_df = pd.DataFrame(generateUsers())
# Bare name as the last expression: the notebook renders the frame as the cell output.
users_df

Unnamed: 0,email,pass,role,name
0,wbriggs@example.net,$2b$12$CfQEFX9iEiWxEgaqGJO5wO0Yw5RUHPK7zes274R...,user,Steven Fox
1,ywilson@example.com,$2b$12$CfQEFX9iEiWxEgaqGJO5wO0Yw5RUHPK7zes274R...,employee,William Harvey
2,wilsonanthony@example.org,$2b$12$CfQEFX9iEiWxEgaqGJO5wO0Yw5RUHPK7zes274R...,admin,Wayne Walters
3,bstephens@example.net,$2b$12$CfQEFX9iEiWxEgaqGJO5wO0Yw5RUHPK7zes274R...,employee,Marilyn Watson
4,eschaefer@example.net,$2b$12$CfQEFX9iEiWxEgaqGJO5wO0Yw5RUHPK7zes274R...,employee,Ashley Stokes
...,...,...,...,...
4995,mritter@example.net,$2b$12$CfQEFX9iEiWxEgaqGJO5wO0Yw5RUHPK7zes274R...,user,Jessica Hernandez
4996,emily52@example.org,$2b$12$CfQEFX9iEiWxEgaqGJO5wO0Yw5RUHPK7zes274R...,user,Michele Peck
4997,julia63@example.org,$2b$12$CfQEFX9iEiWxEgaqGJO5wO0Yw5RUHPK7zes274R...,employee,Alicia English
4998,wallerheather@example.com,$2b$12$CfQEFX9iEiWxEgaqGJO5wO0Yw5RUHPK7zes274R...,manager,Dennis Martinez


In [40]:
def generateProjects():
    """Build the column data for the projects table.

    Creates ``num_projects`` rows, each with a UUID4 string id, a name made of
    a Faker company name plus one of 'Project' / 'Solution' / 'Initiative',
    and a domain picked at random from a fixed list. Ids and names are redrawn
    until they are unique within the result.

    Returns:
        dict: column name -> list, with keys ``'id'``, ``'name'``, ``'domain'``.
    """
    project_domain_choices = [
        'Financial Services',
        'Healthcare',
        'Education Technology',
        'E-commerce',
        'Transportation and Logistics',
        'Renewable Energy',
        'Artificial Intelligence and Machine Learning',
        'Cybersecurity',
        'Media and Entertainment',
        'Telecommunications',
    ]

    def first_unused(make_candidate, taken):
        # Keep drawing candidates until one is not already in `taken`.
        candidate = make_candidate()
        while candidate in taken:
            candidate = make_candidate()
        return candidate

    def draw_id():
        return str(uuid.uuid4())

    def draw_name():
        return fake.company() + " " + fake.word(ext_word_list=['Project', 'Solution', 'Initiative'])

    ids, names, domains = [], [], []

    for _ in range(num_projects):
        new_id = first_unused(draw_id, ids)
        new_name = first_unused(draw_name, names)
        new_domain = random.choice(project_domain_choices)

        ids.append(new_id)
        names.append(new_name)
        domains.append(new_domain)

    return {
        'id': ids,
        'name': names,
        'domain': domains
    }

# Materialise the generated project columns as a DataFrame.
project_df = pd.DataFrame(generateProjects())
# Bare name as the last expression: the notebook renders the frame as the cell output.
project_df

Unnamed: 0,id,name,domain
0,1246e477-bca6-4c1e-913b-2a12cb3325ae,Ward Ltd Project,Artificial Intelligence and Machine Learning
1,0b776229-ce8b-4cff-ba8f-f6be0d908328,Saunders LLC Solution,Healthcare
2,bdfed411-79b4-4208-86e7-ef7362580a04,Green Ltd Project,Education Technology
3,54f468f9-02e7-48c9-9d87-e8afd5e508ba,Johnson-Duncan Initiative,Artificial Intelligence and Machine Learning
4,914fa739-f1da-40f4-83ed-fd690363046b,"Jones, Meyer and Smith Solution",Telecommunications
5,d5962b60-bc79-421c-b338-39f9770c8a88,Parker-Osborn Project,Cybersecurity
6,137b761a-ec7c-425f-8c96-0e2e0615e494,"Thompson, Adkins and Harris Initiative",Healthcare
7,b82b4770-3724-4b89-8a9b-e057588d933e,"Taylor, Elliott and Trevino Solution",Media and Entertainment
8,655659e4-cd5a-4c19-84d9-530ade35266b,Moreno Ltd Project,Financial Services
9,1f3fa3c4-0a1e-4a93-a0af-c8a8f5e48439,"Wilson, Jackson and Green Solution",Education Technology


In [41]:
def generateFeedbackQuestions():
    """Build the column data for the feedback-questions table.

    Pairs each question from a fixed list with a freshly generated UUID4
    string; an id is redrawn if it collides with one already issued.

    Returns:
        dict: column name -> list, with keys ``'id'`` and ``'question'``.
    """
    feedback_questions = [
        'How satisfied are you with the overall progress of the project?',
        'Rate the communication between team members on a scale of 1 to 5',
        'Did you find the project timeline realistic? Please rate from 1 to 5',
        'How would you rate the quality of work delivered by the team?',
        'Rate the effectiveness of the project management on a scale of 1 to 5',
        'Were the project goals clearly defined? Please rate from 1 to 5',
        'How satisfied are you with the level of collaboration within the team?',
        'Rate the accuracy of the project budget estimation from 1 to 5',
        'Did the project meet your expectations? Please rate from 1 to 5',
        'How would you rate the problem-solving skills of the team?',
        'Rate the level of stakeholder engagement on a scale of 1 to 5',
        'Were the project risks effectively managed? Please rate from 1 to 5',
        'How satisfied are you with the project deliverables?',
        'Rate the level of innovation demonstrated in the project from 1 to 5',
        'Did the project meet the specified deadlines? Please rate from 1 to 5',
        'How would you rate the level of client satisfaction with the project outcomes?',
        'Rate the effectiveness of the project feedback mechanisms on a scale of 1 to 5',
        'Were the project resources allocated efficiently? Please rate from 1 to 5',
        'How satisfied are you with the level of transparency in project communication?',
        'Rate the level of adaptability shown by the team members on a scale of 1 to 5',
    ]

    issued_ids = []
    remaining = len(feedback_questions)
    while remaining > 0:
        candidate = str(uuid.uuid4())
        # Redraw on the (practically impossible) collision with an earlier id.
        while candidate in issued_ids:
            candidate = str(uuid.uuid4())
        issued_ids.append(candidate)
        remaining -= 1

    return {
        'id': issued_ids,
        'question': feedback_questions
    }

# Materialise the generated question columns as a DataFrame.
feedbackQuestion_df = pd.DataFrame(generateFeedbackQuestions())
# Bare name as the last expression: the notebook renders the frame as the cell output.
feedbackQuestion_df

Unnamed: 0,id,question
0,b8cb65a3-272a-41ca-a1ba-2691c561e237,How satisfied are you with the overall progres...
1,62ae49a5-859a-4659-8333-114149fcc2e1,Rate the communication between team members on...
2,ba7a3c95-17d2-4a00-ae47-266a58b7e6d2,Did you find the project timeline realistic? P...
3,d1d33464-4bf2-4b2c-bf9d-7c73e64a2ddd,How would you rate the quality of work deliver...
4,e0033b5b-134f-435d-a24c-5f258a6b6830,Rate the effectiveness of the project manageme...
5,cf02cc06-32bb-4b96-87a3-62fdda3c2154,Were the project goals clearly defined? Please...
6,405b9801-a845-4267-877b-68c998043775,How satisfied are you with the level of collab...
7,48abba80-f7f3-44f4-aa57-0e96b980fdbb,Rate the accuracy of the project budget estima...
8,5d7afee3-b00b-4966-b1c7-6108d4cac23b,Did the project meet your expectations? Please...
9,db2df44d-9801-4186-96a8-18f27e43623a,How would you rate the problem-solving skills ...


In [42]:
def generateProjectsToUsers():
    """Build the user <-> project assignment table.

    Every email in ``users_df`` is assigned a random, duplicate-free subset of
    the project names in ``project_df``; the subset size is drawn uniformly
    from 1 up to the number of projects. One output row is produced per
    (project, user) pair.

    Returns:
        dict: column name -> list, with keys ``'projectName'`` and
        ``'userEmail'``.
    """
    # Loop-invariant: convert the project-name column once rather than per user.
    all_project_names = list(project_df['name'])
    project_count = project_df['name'].shape[0]

    project_names = []
    user_emails = []
    for user_email in users_df['email']:
        # Draw the subset size first, then the subset itself.
        subset_size = random.randint(1, project_count)
        projects = random.sample(all_project_names, k=subset_size)
        for project in projects:
            user_emails.append(user_email)
            project_names.append(project)

    data = {
        'projectName': project_names,
        'userEmail': user_emails
    }
    return data

# Materialise the project/user assignment pairs as a DataFrame.
projectsToUsers_df = pd.DataFrame(generateProjectsToUsers())
# Bare name as the last expression: the notebook renders the frame as the cell output.
projectsToUsers_df

Unnamed: 0,projectName,userEmail
0,"Francis, Quinn and Cook Initiative",wbriggs@example.net
1,Ross and Sons Solution,wbriggs@example.net
2,Barnett-Harris Initiative,wbriggs@example.net
3,"Gonzales, Martin and Carter Initiative",wbriggs@example.net
4,"Castillo, Frey and Stewart Project",wbriggs@example.net
...,...,...
127273,Kim-Gutierrez Solution,rodneyfischer@example.org
127274,"Wilson, Jackson and Green Solution",rodneyfischer@example.org
127275,Johnson-Duncan Initiative,rodneyfischer@example.org
127276,"Hill, Hernandez and Jones Project",rodneyfischer@example.org


In [43]:
def generateProjectsToFeedbackQuestions():
    """Build the project <-> feedback-question assignment table.

    For each project name in ``project_df`` a random, duplicate-free subset of
    the question texts in ``feedbackQuestion_df`` is drawn; the subset size
    ranges from 0 to the number of questions, so a project may end up with no
    questions. One output row is produced per (project, question) pair.

    Returns:
        dict: column name -> list, with keys ``'projectName'`` and
        ``'questionName'``.
    """
    pairs = []
    for project in project_df['name']:
        question_pool = list(feedbackQuestion_df['question'])
        how_many = random.randint(0, feedbackQuestion_df['question'].shape[0])
        chosen = random.sample(question_pool, k=how_many)
        pairs.extend((project, question) for question in chosen)

    return {
        'projectName': [project for project, _ in pairs],
        'questionName': [question for _, question in pairs]
    }

# Materialise the project/question assignment pairs as a DataFrame.
projectsToFeedbackQuestions_df = pd.DataFrame(generateProjectsToFeedbackQuestions())
# Bare name as the last expression: the notebook renders the frame as the cell output.
projectsToFeedbackQuestions_df

Unnamed: 0,projectName,questionName
0,Ward Ltd Project,Rate the level of adaptability shown by the te...
1,Ward Ltd Project,Rate the effectiveness of the project manageme...
2,Ward Ltd Project,Rate the level of stakeholder engagement on a ...
3,Ward Ltd Project,How would you rate the quality of work deliver...
4,Ward Ltd Project,Were the project risks effectively managed? Pl...
...,...,...
516,"Hill, Hernandez and Jones Project",Rate the communication between team members on...
517,Davis LLC Initiative,How satisfied are you with the level of collab...
518,Davis LLC Initiative,Rate the effectiveness of the project manageme...
519,Davis LLC Initiative,Rate the communication between team members on...


In [44]:
def generate_random_sunday():
    """Pick a random Sunday from roughly the last two years.

    A date between today and 730 days ago is drawn, then moved back to the
    nearest Sunday on or before it.

    Returns:
        dict: ``'original'`` holds the ``datetime`` of that Sunday and
        ``'formated'`` the same date rendered as ``DD-MM-YYYY``.
    """
    now = datetime.today()
    days_back = random.randint(0, 365 * 2)
    candidate = now - timedelta(days=days_back)

    # weekday(): Monday == 0 ... Sunday == 6, so this is the number of days
    # elapsed since the most recent Sunday (0 when already on a Sunday).
    days_since_sunday = (candidate.weekday() + 1) % 7
    sunday = candidate - timedelta(days=days_since_sunday)

    result = {}
    result['formated'] = sunday.strftime('%d-%m-%Y')
    result['original'] = sunday
    return result

def _hours_for_project(day_budget, projects_in_week):
    """Pick the hours logged on one day for one project (or ``None``).

    When ``random.random()`` is at most 0.3 no entry is made and ``None`` is
    returned. Otherwise the day's hour budget is split evenly across the
    week's projects (rounded up) and a value between 1 and that share is
    drawn; a share of 1 or less always yields 1.
    """
    if random.random() <= 0.3:
        return None
    share = math.ceil(day_budget / projects_in_week)
    return 1 if share <= 1 else random.randint(1, share)


def generateTimesheets():
    """Build the column data for the weekly timesheets table.

    For about 90% of the users (the rest are skipped at random) a random
    number of distinct weeks is generated, each starting on a random Sunday.
    Within a week up to four of the user's projects are picked, and for each
    project one row per selected activity is emitted. Rows of the same
    project and week share the same comment and the same d0..d6 hours.

    A progress line is printed for every user that is not skipped.

    Returns:
        dict: column name -> list, with keys ``'userEmail'``,
        ``'projectSelected'``, ``'dateStart'``, ``'dateEnd'``,
        ``'taskSelected'`` (always ``None``), ``'commentAdded'``,
        ``'d0'`` .. ``'d6'`` and ``'activityName'``.
    """
    comments_choices = ["Excited to get started!", "Looking forward to the challenge", "Ready to dive in", "Feeling motivated", "Can't wait to see the end result", "Feeling optimistic about this project", "Hopeful for a successful outcome", "Eager to collaborate with the team"]
    hours_choices = list(range(0, 12))
    weekend_weights = [3, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1]
    normal_weights = [1, 1, 1, 1, 1, 1, 1, 5, 10, 9, 3, 1]
    activity_names_choices = ['BAU Activity', 'Sales Activity']

    # Loop-invariant probability vectors; d0 and d6 use the weekend weights,
    # d1..d5 the normal ones.
    weekend_probabilities = np.array(weekend_weights) / sum(weekend_weights)
    normal_probabilities = np.array(normal_weights) / sum(normal_weights)
    day_probabilities = [weekend_probabilities] + [normal_probabilities] * 5 + [weekend_probabilities]

    # Group the assignments once. Doing this inside the per-user loop rescans
    # the whole assignment table for every user and dominates the run time.
    projects_by_user = projectsToUsers_df.groupby('userEmail')['projectName'].agg(list)

    user_emails = []
    date_starts = []
    date_ends = []
    projects_selected = []
    tasks_selected = []
    comments_added = []
    day_columns = [[] for _ in range(7)]  # day_columns[i] collects the d<i> values
    activity_names = []

    num_timesheets = 0

    for current_num_user in range(num_users):
        start_time = time.time()

        # Skip roughly one user in ten so that some users have no timesheets.
        if random.random() <= 0.1:
            continue
        user_email = users_df['email'].iloc[current_num_user]

        project_list_under_user = projects_by_user.get(user_email, [])

        num_timesheets_for_current_user = random.randint(0, len(project_list_under_user))

        # Week starts already used for this user; each week appears at most once.
        # NOTE(review): generate_random_sunday() can only return a limited
        # number of distinct Sundays (about two years' worth), so the retry
        # loop below would never end for a user with more projects than that.
        used_week_starts = set()

        for _ in range(num_timesheets_for_current_user):

            random_sunday = generate_random_sunday()
            while random_sunday['formated'] in used_week_starts:
                random_sunday = generate_random_sunday()

            date_start = random_sunday['formated']
            date_end = (random_sunday['original'] + timedelta(days=6)).strftime('%d-%m-%Y')

            used_week_starts.add(date_start)

            # A user books at most 4 projects in a week, and never more than
            # the number of projects they are assigned to.
            num_timesheets_for_random_day = random.randint(0, 4)
            if num_timesheets_for_random_day > len(project_list_under_user):
                num_timesheets_for_random_day = random.randint(0, len(project_list_under_user))

            project_names = random.sample(project_list_under_user, k=num_timesheets_for_random_day)

            # Total hours available on each of the seven days of this week.
            daily_budgets = [np.random.choice(hours_choices, p=probabilities) for probabilities in day_probabilities]

            for project_name in project_names:

                task_selected = None
                comment_added = random.choice(comments_choices) if random.random() > 0.5 else None

                # project_names is non-empty here, so the divisor is at least 1.
                daily_hours = [_hours_for_project(budget, num_timesheets_for_random_day) for budget in daily_budgets]

                for activity_name in random.sample(activity_names_choices, k=random.randint(1, len(activity_names_choices))):

                    user_emails.append(user_email)
                    projects_selected.append(project_name)
                    date_starts.append(date_start)
                    date_ends.append(date_end)
                    tasks_selected.append(task_selected)
                    comments_added.append(comment_added)
                    for column, hours in zip(day_columns, daily_hours):
                        column.append(hours)
                    activity_names.append(activity_name)

                    num_timesheets += 1
        print(f'Added timesheets for: {user_email:30s}  | current_num_user: {current_num_user:4d} | total_users: {num_users:4d} | timesheet_count: {num_timesheets:8d} | ETA: {((num_users - current_num_user) * (time.time() - start_time)) / 60:4.0f} min | elapsed_time: {(time.time() - start_time) * 1000:4.0f} ms')

    data = {
        'userEmail': user_emails,
        'projectSelected': projects_selected,
        'dateStart': date_starts,
        'dateEnd': date_ends,
        'taskSelected': tasks_selected,
        'commentAdded': comments_added,
        'd0': day_columns[0],
        'd1': day_columns[1],
        'd2': day_columns[2],
        'd3': day_columns[3],
        'd4': day_columns[4],
        'd5': day_columns[5],
        'd6': day_columns[6],
        'activityName': activity_names
    }

    return data

# Materialise the generated timesheet columns as a DataFrame.
timesheets_df = pd.DataFrame(generateTimesheets())
# Bare name as the last expression: the notebook renders the frame as the cell output.
timesheets_df

Added timesheets for: wbriggs@example.net             | current_num_user:    0 | total_users: 5000 | timesheet_count:       31 | ETA:    6 min | elapsed_time:   74 ms
Added timesheets for: ywilson@example.com             | current_num_user:    1 | total_users: 5000 | timesheet_count:       31 | ETA:    6 min | elapsed_time:   74 ms
Added timesheets for: bstephens@example.net           | current_num_user:    3 | total_users: 5000 | timesheet_count:       73 | ETA:    6 min | elapsed_time:   72 ms
Added timesheets for: eschaefer@example.net           | current_num_user:    4 | total_users: 5000 | timesheet_count:      116 | ETA:    6 min | elapsed_time:   78 ms
Added timesheets for: mariah97@example.net            | current_num_user:    5 | total_users: 5000 | timesheet_count:      191 | ETA:    6 min | elapsed_time:   74 ms
Added timesheets for: toddbonilla@example.com         | current_num_user:    6 | total_users: 5000 | timesheet_count:      203 | ETA:   15 min | elapsed_time:  177 m

Unnamed: 0,userEmail,projectSelected,dateStart,dateEnd,taskSelected,commentAdded,d0,d1,d2,d3,d4,d5,d6,activityName
0,wbriggs@example.net,Chaney Inc Solution,04-09-2022,10-09-2022,,,,1.0,1.0,2.0,,2.0,1.0,Sales Activity
1,wbriggs@example.net,"Wilson, Jackson and Green Solution",04-09-2022,10-09-2022,,Excited to get started!,1.0,1.0,,1.0,1.0,,,BAU Activity
2,wbriggs@example.net,"Wilson, Jackson and Green Solution",04-09-2022,10-09-2022,,Excited to get started!,1.0,1.0,,1.0,1.0,,,Sales Activity
3,wbriggs@example.net,"Gonzales, Martin and Carter Initiative",04-09-2022,10-09-2022,,Excited to get started!,,1.0,2.0,,2.0,2.0,,Sales Activity
4,wbriggs@example.net,"Gonzales, Martin and Carter Initiative",04-09-2022,10-09-2022,,Excited to get started!,,1.0,2.0,,2.0,2.0,,BAU Activity
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170853,rodneyfischer@example.org,Olson-Craig Solution,25-12-2022,31-12-2022,,Ready to dive in,2.0,2.0,8.0,1.0,7.0,,,Sales Activity
170854,rodneyfischer@example.org,Olson-Craig Solution,25-12-2022,31-12-2022,,Ready to dive in,2.0,2.0,8.0,1.0,7.0,,,BAU Activity
170855,rodneyfischer@example.org,Saunders LLC Solution,10-12-2023,16-12-2023,,,,1.0,9.0,,6.0,4.0,1.0,Sales Activity
170856,rodneyfischer@example.org,Briggs-Jennings Project,22-05-2022,28-05-2022,,,1.0,3.0,,,,10.0,,BAU Activity


In [45]:
def generateFeedbackAnswers():
    """Build the column data for the feedback-answers table.

    Produces ``num_feedback_answers`` rows. Each row combines a random
    (project, question) pair from ``projectsToFeedbackQuestions_df`` with the
    user and week of a random timesheet row of the same project. About 80% of
    the rows get a numeric answer from 1 to 5 and about 20% get a text answer;
    the two are drawn independently, so a row may have both or neither.

    Only projects that have at least one timesheet row are considered, since a
    project without timesheets provides no user or week to attach to.

    Returns:
        dict: column name -> list, with keys ``'projectName'``,
        ``'feedbackQuestionName'``, ``'dateStart'``, ``'dateEnd'``,
        ``'userEmail'``, ``'checkedAnswer'`` and ``'textAnswer'``.

    Raises:
        ValueError: if answers are requested but no project has both a
            feedback question and a timesheet row.
    """

    text_answer_choices = ['Excellent', 'Good', 'Average', 'Fair', 'Poor']

    # Split the timesheets per project once instead of boolean-filtering the
    # whole frame on every iteration.
    timesheets_by_project = {
        project: rows
        for project, rows in timesheets_df.groupby('projectSelected', sort=False)
    }

    # Restrict the draw to pairs whose project can supply a timesheet row.
    has_timesheets = projectsToFeedbackQuestions_df['projectName'].isin(list(timesheets_by_project))
    eligible_pairs = projectsToFeedbackQuestions_df[has_timesheets]
    if num_feedback_answers > 0 and eligible_pairs.empty:
        raise ValueError('no project has both feedback questions and timesheets to draw answers from')

    project_names = []
    feedback_question_names = []
    checked_answers = []
    text_answers = []
    date_starts = []
    date_ends = []
    user_emails = []

    for _ in range(num_feedback_answers):

        random_pair = eligible_pairs.sample()
        project_name = random_pair['projectName'].iloc[0]
        feedback_question_name = random_pair['questionName'].iloc[0]

        corresponding_random_timesheet = timesheets_by_project[project_name].sample()
        date_start = corresponding_random_timesheet['dateStart'].iloc[0]
        date_end = corresponding_random_timesheet['dateEnd'].iloc[0]
        user_email = corresponding_random_timesheet['userEmail'].iloc[0]

        checked_answer = random.randint(1, 5) if random.random() > 0.2 else None
        text_answer = random.choice(text_answer_choices) if random.random() > 0.8 else None

        project_names.append(project_name)
        feedback_question_names.append(feedback_question_name)
        date_starts.append(date_start)
        date_ends.append(date_end)
        user_emails.append(user_email)
        checked_answers.append(checked_answer)
        text_answers.append(text_answer)

    data = {
        'projectName': project_names,
        'feedbackQuestionName': feedback_question_names,
        'dateStart': date_starts,
        'dateEnd': date_ends,
        'userEmail': user_emails,
        'checkedAnswer': checked_answers,
        'textAnswer': text_answers
    }

    return data

# Materialise the generated feedback-answer columns as a DataFrame.
feedbackAnswers_df = pd.DataFrame(generateFeedbackAnswers())
# Bare name as the last expression: the notebook renders the frame as the cell output.
feedbackAnswers_df

Unnamed: 0,projectName,feedbackQuestionName,dateStart,dateEnd,userEmail,checkedAnswer,textAnswer
0,"Castillo, Frey and Stewart Project",How satisfied are you with the level of transp...,09-07-2023,15-07-2023,jonathonorozco@example.org,3.0,
1,Wheeler and Sons Initiative,How satisfied are you with the project deliver...,26-11-2023,02-12-2023,lucasmiddleton@example.org,3.0,Fair
2,"Jones, Meyer and Smith Solution",Did the project meet your expectations? Please...,04-09-2022,10-09-2022,caitlinparker@example.net,,
3,Barnett-Harris Initiative,Rate the accuracy of the project budget estima...,27-11-2022,03-12-2022,lblankenship@example.com,3.0,
4,Green Ltd Project,Were the project resources allocated efficient...,01-05-2022,07-05-2022,walterschristopher@example.org,,
...,...,...,...,...,...,...,...
7995,Ward Ltd Project,Were the project resources allocated efficient...,01-10-2023,07-10-2023,shudson@example.org,5.0,
7996,Ross and Sons Solution,Rate the accuracy of the project budget estima...,31-07-2022,06-08-2022,matthew56@example.net,3.0,Good
7997,Allison and Sons Project,How would you rate the quality of work deliver...,06-11-2022,12-11-2022,brianlee@example.org,4.0,
7998,"Roberts, Brown and Lee Initiative",Did the project meet your expectations? Please...,09-10-2022,15-10-2022,rsmith@example.com,2.0,


In [46]:
# Create the output directory if needed; exist_ok avoids the separate
# check-then-create step and makes re-running this cell safe.
os.makedirs('csvs', exist_ok=True)

# Write each generated table to its own CSV file, without the DataFrame index.
users_df.to_csv('csvs/users.csv', index=False)
project_df.to_csv('csvs/projects.csv', index=False)
feedbackQuestion_df.to_csv('csvs/feedbackQuestions.csv', index=False)
projectsToUsers_df.to_csv('csvs/projectsToUsers.csv', index=False)
projectsToFeedbackQuestions_df.to_csv('csvs/projectsToFeedbackQuestions.csv', index=False)
timesheets_df.to_csv('csvs/timesheets.csv', index=False)
feedbackAnswers_df.to_csv('csvs/feedbackAnswers.csv', index=False)