In [18]:
import pandas as pd
import numpy as np
import uuid
import random
from faker import Faker
import bcrypt
from datetime import datetime, timedelta
import os
import time

In [2]:
# Shared Faker instance used by the generator functions in this notebook.
fake = Faker()

# Seed every seedable random source so the random/numpy/Faker draws repeat
# on a fresh Restart & Run All. Note: uuid.uuid4(), bcrypt.gensalt() and
# datetime.today() used elsewhere in this notebook are not controlled by
# these seeds, so ids, the password hash and the date window still vary.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)

# Dataset sizes.
num_users = 5000             # rows generated for the users table
num_projects = 50            # rows generated for the projects table

num_feedback_answers = 8000  # rows generated for the feedback-answers table

In [3]:
def generateUsers():
    """Generate the column data for the synthetic users table.

    Produces ``num_users`` rows, each with a unique fake email address, a
    weighted-random role and a fake full name. Every row carries the same
    bcrypt hash of the placeholder password 'org_pass_1234', which is
    computed a single time before the loop.

    Reads the module-level ``num_users`` and ``fake``.

    Returns:
        dict: column name -> list of values with keys 'email', 'pass',
        'role' and 'name'; every list has length ``num_users``.
    """
    roles = ['admin', 'user', 'manager', 'employee']
    weights = [10, 50, 20, 20]
    # Normalise the weights once instead of on every loop iteration.
    role_probabilities = np.array(weights) / sum(weights)

    email_ids = []
    passwords = []
    roles_list = []
    names = []
    # Mirror of email_ids as a set: O(1) duplicate checks instead of
    # scanning the ever-growing list for each candidate address.
    seen_emails = set()

    # Hashed once and shared by all rows.
    password = bcrypt.hashpw('org_pass_1234'.encode('utf-8'), bcrypt.gensalt()).decode('utf-8')

    for _ in range(num_users):
        # Redraw until the address has not been handed out yet.
        email = fake.email()
        while email in seen_emails:
            email = fake.email()
        seen_emails.add(email)

        role = np.random.choice(roles, p=role_probabilities)
        name = fake.name()

        email_ids.append(email)
        passwords.append(password)
        roles_list.append(role)
        names.append(name)

    data = {
        'email': email_ids,
        'pass': passwords,
        'role': roles_list,
        'name': names
    }

    return data

# Build the users table from the generated columns; the bare name on the
# last line makes the frame the cell's displayed output.
users_df = pd.DataFrame(generateUsers())
users_df

Unnamed: 0,email,pass,role,name
0,orodriguez@example.net,$2b$12$vlvNioIxzbg7Z/JCBmQ8MuGAMKBWKyZRH9gHACA...,manager,Erin Flores
1,huntergood@example.org,$2b$12$vlvNioIxzbg7Z/JCBmQ8MuGAMKBWKyZRH9gHACA...,admin,Jonathan Solomon
2,tommy77@example.com,$2b$12$vlvNioIxzbg7Z/JCBmQ8MuGAMKBWKyZRH9gHACA...,user,Virginia Wagner
3,tgalvan@example.org,$2b$12$vlvNioIxzbg7Z/JCBmQ8MuGAMKBWKyZRH9gHACA...,manager,Monica Barnes
4,rachael85@example.net,$2b$12$vlvNioIxzbg7Z/JCBmQ8MuGAMKBWKyZRH9gHACA...,user,Casey King
...,...,...,...,...
4995,lbutler@example.com,$2b$12$vlvNioIxzbg7Z/JCBmQ8MuGAMKBWKyZRH9gHACA...,employee,Gina Mathis
4996,lopezmegan@example.net,$2b$12$vlvNioIxzbg7Z/JCBmQ8MuGAMKBWKyZRH9gHACA...,manager,Sarah Adkins
4997,grant48@example.com,$2b$12$vlvNioIxzbg7Z/JCBmQ8MuGAMKBWKyZRH9gHACA...,user,Patricia Morris
4998,alyssacampbell@example.net,$2b$12$vlvNioIxzbg7Z/JCBmQ8MuGAMKBWKyZRH9gHACA...,admin,Alejandra Gray


In [4]:
def generateProjects():
    """Generate the column data for the synthetic projects table.

    Produces ``num_projects`` rows. Each row has a UUID4 string id, a name
    made of a fake company name plus one of 'Project' / 'Solution' /
    'Initiative', and a uniformly chosen business domain. Ids and names are
    redrawn until they are unique within the table.

    Returns:
        dict: column name -> list of values with keys 'id', 'name' and
        'domain'.
    """
    project_domain_choices = [
        'Financial Services',
        'Healthcare',
        'Education Technology',
        'E-commerce',
        'Transportation and Logistics',
        'Renewable Energy',
        'Artificial Intelligence and Machine Learning',
        'Cybersecurity',
        'Media and Entertainment',
        'Telecommunications',
    ]

    def _first_unused(draw, taken):
        """Call ``draw`` repeatedly until it yields a value not in ``taken``."""
        value = draw()
        while value in taken:
            value = draw()
        return value

    ids, names, domains = [], [], []

    for _ in range(num_projects):
        # Draw order per row: id first, then name, then domain.
        ids.append(_first_unused(lambda: str(uuid.uuid4()), ids))
        names.append(_first_unused(
            lambda: fake.company() + " " + fake.word(ext_word_list=['Project', 'Solution', 'Initiative']),
            names,
        ))
        domains.append(random.choice(project_domain_choices))

    return {'id': ids, 'name': names, 'domain': domains}

# Build the projects table from the generated columns and display it as the
# cell output.
project_df = pd.DataFrame(generateProjects())
project_df

Unnamed: 0,id,name,domain
0,a85bb34f-62cb-45d4-84cc-60df2047ffba,Jackson-Ingram Solution,Renewable Energy
1,ec6b31c8-f460-40eb-aeec-970f0e044e90,"Morgan, Riley and Patrick Initiative",Financial Services
2,77207041-d18c-4d63-a70f-be2dc8b3734a,Anderson Ltd Initiative,Artificial Intelligence and Machine Learning
3,0289e5a4-ef81-4482-8b62-e842cd508949,"Gonzalez, Bowman and Hebert Project",Renewable Energy
4,ee359c63-64d1-4251-b37b-359068b2f122,"Smith, Thompson and White Initiative",Artificial Intelligence and Machine Learning
5,afae96fe-9694-490a-a870-38daf8d1a933,Buck Inc Solution,Media and Entertainment
6,73a36609-4e9c-4c79-8cf3-9624318a4479,Young-Rojas Project,Education Technology
7,567563c0-4e8a-4b7b-84fd-c3e1a6292b3a,Johnson-Daniel Solution,Education Technology
8,91d67992-940d-4427-8566-11524d9a0ae3,Graham-Hernandez Project,Media and Entertainment
9,c05c589c-fd21-447c-9a11-78c851eab260,Johnson-Briggs Solution,E-commerce


In [5]:
def generateFeedbackQuestions():
    """Generate the column data for the feedback-questions table.

    Pairs each of the fixed feedback questions with a freshly generated
    UUID4 string id (redrawn in the unlikely case of a repeat).

    Returns:
        dict: {'id': list of id strings, 'question': list of question
        texts}, both in the same order and of the same length.
    """
    feedback_questions = [
        'How satisfied are you with the overall progress of the project?',
        'Rate the communication between team members on a scale of 1 to 5',
        'Did you find the project timeline realistic? Please rate from 1 to 5',
        'How would you rate the quality of work delivered by the team?',
        'Rate the effectiveness of the project management on a scale of 1 to 5',
        'Were the project goals clearly defined? Please rate from 1 to 5',
        'How satisfied are you with the level of collaboration within the team?',
        'Rate the accuracy of the project budget estimation from 1 to 5',
        'Did the project meet your expectations? Please rate from 1 to 5',
        'How would you rate the problem-solving skills of the team?',
        'Rate the level of stakeholder engagement on a scale of 1 to 5',
        'Were the project risks effectively managed? Please rate from 1 to 5',
        'How satisfied are you with the project deliverables?',
        'Rate the level of innovation demonstrated in the project from 1 to 5',
        'Did the project meet the specified deadlines? Please rate from 1 to 5',
        'How would you rate the level of client satisfaction with the project outcomes?',
        'Rate the effectiveness of the project feedback mechanisms on a scale of 1 to 5',
        'Were the project resources allocated efficiently? Please rate from 1 to 5',
        'How satisfied are you with the level of transparency in project communication?',
        'Rate the level of adaptability shown by the team members on a scale of 1 to 5',
    ]

    ids = []
    for _question in feedback_questions:
        # One id per question; redraw on the (improbable) collision.
        question_id = str(uuid.uuid4())
        while question_id in ids:
            question_id = str(uuid.uuid4())
        ids.append(question_id)

    return {'id': ids, 'question': feedback_questions}

# Build the feedback-questions table from the generated columns and display
# it as the cell output.
feedbackQuestion_df = pd.DataFrame(generateFeedbackQuestions())
feedbackQuestion_df

Unnamed: 0,id,question
0,5caf438a-c4f0-4c8b-a8d9-83ae86a79444,How satisfied are you with the overall progres...
1,13baad5f-bc17-47cb-bb02-67933b396b5e,Rate the communication between team members on...
2,266dfcc6-7494-4203-a7bb-e5613d7b68ef,Did you find the project timeline realistic? P...
3,4633947d-a230-43fc-93fa-57c925530385,How would you rate the quality of work deliver...
4,c0c7dbdd-6aca-486e-a612-6cd458e61b45,Rate the effectiveness of the project manageme...
5,16805aa3-3dcc-4f19-bcd4-646124295ea4,Were the project goals clearly defined? Please...
6,e5a26fda-abe7-422a-86be-f50d6ec3b44f,How satisfied are you with the level of collab...
7,0bdc677a-ef9f-4744-b66a-45c0826a1280,Rate the accuracy of the project budget estima...
8,3005152a-3632-4daf-9cea-e5d9ab8b0976,Did the project meet your expectations? Please...
9,ec1ab2b0-fecd-4264-acec-87b169733f44,How would you rate the problem-solving skills ...


In [6]:
def generateProjectsToUsers():
    """Generate the project <-> user membership link table.

    Every email in ``users_df`` is assigned a random, duplicate-free subset
    of the project names in ``project_df``; the subset size is uniform
    between 1 and the number of projects, so each user has at least one
    project.

    Returns:
        dict: {'projectName': [...], 'userEmail': [...]} with one entry per
        (project, user) membership.
    """
    # Loop-invariant: materialise the candidate names once instead of
    # converting the column to a list again for every user.
    all_project_names = list(project_df['name'])

    project_names = []
    user_emails = []
    for user_email in users_df['email']:
        assigned = random.sample(all_project_names, k=random.randint(1, len(all_project_names)))
        project_names.extend(assigned)
        user_emails.extend([user_email] * len(assigned))

    data = {
        'projectName': project_names,
        'userEmail': user_emails
    }
    return data

# Build the project/user link table from the generated columns and display
# it as the cell output.
projectsToUsers_df = pd.DataFrame(generateProjectsToUsers())
projectsToUsers_df

Unnamed: 0,projectName,userEmail
0,"Bird, Stevenson and Munoz Initiative",orodriguez@example.net
1,Young-Rojas Project,orodriguez@example.net
2,Jacobs-Ward Solution,orodriguez@example.net
3,Gould-Johnson Project,orodriguez@example.net
4,"Thomas, Carter and Lucas Initiative",orodriguez@example.net
...,...,...
128632,Nolan PLC Project,brownfrancisco@example.org
128633,Arnold-Hudson Solution,brownfrancisco@example.org
128634,"Gonzalez, Bowman and Hebert Project",brownfrancisco@example.org
128635,Rodriguez-Hunt Project,brownfrancisco@example.org


In [7]:
def generateProjectsToFeedbackQuestions():
    """Generate the project <-> feedback-question link table.

    Every project name in ``project_df`` is linked to a random,
    duplicate-free subset of the question texts in ``feedbackQuestion_df``;
    the subset size is uniform between 0 and the number of questions, so a
    project may end up with no questions at all.

    Returns:
        dict: {'projectName': [...], 'questionName': [...]} with one entry
        per (project, question) link.
    """
    # Loop-invariant: materialise the candidate questions once instead of
    # converting the column to a list again for every project.
    all_questions = list(feedbackQuestion_df['question'])

    project_names = []
    question_names = []
    for project in project_df['name']:
        linked = random.sample(all_questions, k=random.randint(0, len(all_questions)))
        project_names.extend([project] * len(linked))
        question_names.extend(linked)

    data = {
        'projectName': project_names,
        'questionName': question_names
    }
    return data

# Build the project/question link table from the generated columns and
# display it as the cell output.
projectsToFeedbackQuestions_df = pd.DataFrame(generateProjectsToFeedbackQuestions())
projectsToFeedbackQuestions_df

Unnamed: 0,projectName,questionName
0,Jackson-Ingram Solution,How would you rate the problem-solving skills ...
1,Jackson-Ingram Solution,Rate the effectiveness of the project feedback...
2,Jackson-Ingram Solution,How satisfied are you with the level of transp...
3,"Morgan, Riley and Patrick Initiative",How satisfied are you with the overall progres...
4,"Morgan, Riley and Patrick Initiative",Rate the effectiveness of the project manageme...
...,...,...
571,Cole Inc Project,Rate the accuracy of the project budget estima...
572,Cole Inc Project,Rate the communication between team members on...
573,Cole Inc Project,Did you find the project timeline realistic? P...
574,Cole Inc Project,How would you rate the quality of work deliver...


In [24]:
def generate_random_sunday():
    """Pick a random Sunday from roughly the last two years.

    A moment 0 to 730 days before now is drawn uniformly and then moved back
    to the nearest Sunday on or before it.

    Returns:
        dict: {'formated': the date as a 'dd-mm-YYYY' string,
               'original': the same moment as a ``datetime``}.
    """
    days_back = random.randint(0, 365 * 2)
    candidate = datetime.today() - timedelta(days=days_back)

    # weekday() is 0 for Monday ... 6 for Sunday, so (weekday + 1) % 7 is the
    # number of days separating the candidate from the preceding Sunday
    # (0 when the candidate already is a Sunday).
    sunday = candidate - timedelta(days=(candidate.weekday() + 1) % 7)

    return {
        'formated': sunday.strftime('%d-%m-%Y'),
        'original': sunday,
    }

def generateTimesheets():
    """Generate the column data for the synthetic weekly timesheets table.

    Roughly 90% of the users (decided by a random draw per user) receive
    timesheets. For each such user a random non-empty subset of their
    assigned projects is picked. For each picked project, 0 to 12 distinct
    weeks are drawn; a week starts on a random Sunday (``dateStart``) and
    ends six days later (``dateEnd``). Each week is emitted once per randomly
    chosen activity (none, one or both of 'BAU Activity' / 'Sales Activity'),
    so a week for which no activity is chosen produces no row, and a week
    with both activities produces two rows with identical hours.

    ``d0`` .. ``d6`` hold the hours for the seven days of the week starting
    at ``dateStart``; each is ``None`` with probability 0.3. ``d0`` and
    ``d6`` draw their per-project budget from the weekend weights, the other
    days from the normal weights. ``taskSelected`` is always ``None``.

    Reads the module-level ``num_users``, ``users_df`` and
    ``projectsToUsers_df`` and prints one progress line per processed user.

    Returns:
        dict: column name -> list of values with keys 'userEmail',
        'projectSelected', 'dateStart', 'dateEnd', 'taskSelected',
        'commentAdded', 'd0' .. 'd6' and 'activityName'.
    """
    comments_choices = ["Excited to get started!", "Looking forward to the challenge", "Ready to dive in", "Feeling motivated", "Can't wait to see the end result", "Feeling optimistic about this project", "Hopeful for a successful outcome", "Eager to collaborate with the team"]
    hours_choices = list(range(0, 13))
    weekend_weights = [6, 6, 5, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    normal_weights = [1, 1, 1, 1, 6, 5, 4, 1, 1, 1, 1, 1, 1]
    activity_names_choices = ['BAU Activity', 'Sales Activity']

    # Normalise the weight vectors once instead of on every draw.
    weekend_probabilities = np.array(weekend_weights) / sum(weekend_weights)
    normal_probabilities = np.array(normal_weights) / sum(normal_weights)
    # Distribution per day column: d0 and d6 are weekend days, d1..d5 normal.
    day_columns = ['d0', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6']
    day_probabilities = [weekend_probabilities] + [normal_probabilities] * 5 + [weekend_probabilities]

    # Loop-invariant lookup userEmail -> list of project names. Building it
    # inside the user loop regrouped the whole link table for every user.
    projects_by_user = projectsToUsers_df.groupby('userEmail')['projectName'].agg(list)

    # Output columns, declared in the order they appear in the result.
    data = {
        'userEmail': [],
        'projectSelected': [],
        'dateStart': [],
        'dateEnd': [],
        'taskSelected': [],
        'commentAdded': [],
        'd0': [],
        'd1': [],
        'd2': [],
        'd3': [],
        'd4': [],
        'd5': [],
        'd6': [],
        'activityName': []
    }

    num_timesheets = 0

    for current_num_user in range(num_users):
        start_time = time.time()

        # About 10% of the users get no timesheets at all.
        if not random.random() > 0.1:
            continue
        user_email = users_df['email'].iloc[current_num_user]

        project_list_under_user = projects_by_user.get(user_email, [])
        if not project_list_under_user:
            # No assigned projects: random.randint(1, 0) below would raise.
            continue

        num_timesheets_for_current_user = random.randint(1, len(project_list_under_user))
        project_names = random.sample(project_list_under_user, k=num_timesheets_for_current_user)

        for project_name in project_names:

            # Number of distinct weeks to generate for this user/project.
            num_timesheets_for_project = random.choice(hours_choices)

            # Per-day hour budget for the project, later split across weeks.
            hours_budget_per_day = [
                np.random.choice(hours_choices, p=probabilities)
                for probabilities in day_probabilities
            ]

            # Week starts already used for this user/project.
            used_week_starts = set()

            for _ in range(num_timesheets_for_project):

                # Redraw until the week has not been used for this project.
                random_sunday = generate_random_sunday()
                while random_sunday['formated'] in used_week_starts:
                    random_sunday = generate_random_sunday()

                date_start = random_sunday['formated']
                date_end = (random_sunday['original'] + timedelta(days=6)).strftime('%d-%m-%Y')
                used_week_starts.add(date_start)

                task_selected = None
                comment_added = random.choice(comments_choices) if random.random() > 0.5 else None

                # Hours per day: None with probability 0.3, otherwise a value
                # between 0 and the budget's per-week share. The division is
                # safe because this loop only runs when the count is >= 1.
                day_hours = [
                    random.randint(0, round(budget / num_timesheets_for_project)) if random.random() > 0.3 else None
                    for budget in hours_budget_per_day
                ]

                chosen_activities = random.sample(activity_names_choices, k=random.randint(0, len(activity_names_choices)))
                for activity_name in chosen_activities:
                    data['userEmail'].append(user_email)
                    data['projectSelected'].append(project_name)
                    data['dateStart'].append(date_start)
                    data['dateEnd'].append(date_end)
                    data['taskSelected'].append(task_selected)
                    data['commentAdded'].append(comment_added)
                    for day_column, hours in zip(day_columns, day_hours):
                        data[day_column].append(hours)
                    data['activityName'].append(activity_name)

                    num_timesheets += 1
        print(f'Added timesheets for: {user_email:30s}  | current_num_user: {current_num_user:4d} | total_users: {num_users:4d} | timesheet_count: {num_timesheets:8d} | elapsed_time: {(time.time() - start_time) * 1000:.0f} ms')

    return data

# Build the timesheets table from the generated columns (this also emits the
# per-user progress lines) and display it as the cell output.
timesheets_df = pd.DataFrame(generateTimesheets())
timesheets_df

Added timesheets for: orodriguez@example.net          | current_num_user:    0 | total_users: 5000 | timesheet_count:       63 | elapsed_time: 77 ms
Added timesheets for: tommy77@example.com             | current_num_user:    2 | total_users: 5000 | timesheet_count:       77 | elapsed_time: 74 ms
Added timesheets for: tgalvan@example.org             | current_num_user:    3 | total_users: 5000 | timesheet_count:      248 | elapsed_time: 72 ms
Added timesheets for: rachael85@example.net           | current_num_user:    4 | total_users: 5000 | timesheet_count:      303 | elapsed_time: 73 ms
Added timesheets for: lisachoi@example.net            | current_num_user:    5 | total_users: 5000 | timesheet_count:      413 | elapsed_time: 68 ms
Added timesheets for: christopher77@example.com       | current_num_user:    6 | total_users: 5000 | timesheet_count:      425 | elapsed_time: 68 ms
Added timesheets for: jennifer20@example.org          | current_num_user:    7 | total_users: 5000 | times

Unnamed: 0,userEmail,projectSelected,dateStart,dateEnd,taskSelected,commentAdded,d0,d1,d2,d3,d4,d5,d6,activityName
0,orodriguez@example.net,Jacobs-Ward Solution,10-07-2022,16-07-2022,,Ready to dive in,0.0,,0.0,,,0.0,,BAU Activity
1,orodriguez@example.net,Jacobs-Ward Solution,21-01-2024,27-01-2024,,Feeling optimistic about this project,0.0,0.0,2.0,1.0,0.0,0.0,0.0,BAU Activity
2,orodriguez@example.net,Jacobs-Ward Solution,05-03-2023,11-03-2023,,,0.0,,,0.0,1.0,0.0,0.0,Sales Activity
3,orodriguez@example.net,Jacobs-Ward Solution,05-03-2023,11-03-2023,,,0.0,,,0.0,1.0,0.0,0.0,BAU Activity
4,orodriguez@example.net,Jacobs-Ward Solution,11-06-2023,17-06-2023,,,0.0,0.0,0.0,0.0,1.0,0.0,,BAU Activity
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352248,brownfrancisco@example.org,"Smith, Thompson and White Initiative",28-08-2022,03-09-2022,,,0.0,,0.0,1.0,1.0,,0.0,BAU Activity
352249,brownfrancisco@example.org,"Smith, Thompson and White Initiative",02-10-2022,08-10-2022,,Looking forward to the challenge,0.0,1.0,1.0,0.0,0.0,0.0,,Sales Activity
352250,brownfrancisco@example.org,"Turner, Shah and Clark Solution",10-12-2023,16-12-2023,,,0.0,1.0,0.0,,0.0,0.0,0.0,Sales Activity
352251,brownfrancisco@example.org,"Turner, Shah and Clark Solution",10-04-2022,16-04-2022,,Can't wait to see the end result,,,,1.0,,0.0,0.0,BAU Activity


In [25]:
def generateFeedbackAnswers():
    """Generate the column data for the synthetic feedback-answers table.

    Each of the ``num_feedback_answers`` rows combines a random
    (project, question) link from ``projectsToFeedbackQuestions_df`` with a
    random row of ``timesheets_df`` for the same project, which supplies the
    week (``dateStart`` / ``dateEnd``) and the answering user. A 1-5
    ``checkedAnswer`` is present with probability 0.8 and a ``textAnswer``
    with probability 0.2; the two are drawn independently, so a row can have
    both, one or neither.

    Returns:
        dict: column name -> list of values with keys 'projectName',
        'feedbackQuestionName', 'dateStart', 'dateEnd', 'userEmail',
        'checkedAnswer' and 'textAnswer'.

    Raises:
        ValueError: if rows are requested but no linked project has any
            timesheet to draw from.
    """
    text_answer_choices = ['Excellent', 'Good', 'Average', 'Fair', 'Poor']

    # Split the timesheets by project once. Filtering the full timesheet
    # frame with a boolean mask on every iteration repeats the same scan.
    timesheets_by_project = {
        project: rows for project, rows in timesheets_df.groupby('projectSelected')
    }

    # Only links whose project has at least one timesheet can be answered;
    # sampling from an empty selection would raise part-way through.
    answerable_links = projectsToFeedbackQuestions_df[
        projectsToFeedbackQuestions_df['projectName'].isin(list(timesheets_by_project))
    ]
    if num_feedback_answers > 0 and answerable_links.empty:
        raise ValueError('No project has both a feedback question and a timesheet to sample from')

    project_names = []
    feedback_question_names = []
    checked_answers = []
    text_answers = []
    date_starts = []
    date_ends = []
    user_emails = []

    for _ in range(num_feedback_answers):

        random_link = answerable_links.sample()
        project_name = random_link['projectName'].iloc[0]
        feedback_question_name = random_link['questionName'].iloc[0]

        corresponding_random_timesheet = timesheets_by_project[project_name].sample()
        date_start = corresponding_random_timesheet['dateStart'].iloc[0]
        date_end = corresponding_random_timesheet['dateEnd'].iloc[0]
        user_email = corresponding_random_timesheet['userEmail'].iloc[0]

        checked_answer = random.randint(1, 5) if random.random() > 0.2 else None
        text_answer = random.choice(text_answer_choices) if random.random() > 0.8 else None

        project_names.append(project_name)
        feedback_question_names.append(feedback_question_name)
        date_starts.append(date_start)
        date_ends.append(date_end)
        user_emails.append(user_email)
        checked_answers.append(checked_answer)
        text_answers.append(text_answer)

    data = {
        'projectName': project_names,
        'feedbackQuestionName': feedback_question_names,
        'dateStart': date_starts,
        'dateEnd': date_ends,
        'userEmail': user_emails,
        'checkedAnswer': checked_answers,
        'textAnswer': text_answers
    }

    return data

# Build the feedback-answers table from the generated columns and display it
# as the cell output.
feedbackAnswers_df = pd.DataFrame(generateFeedbackAnswers())
feedbackAnswers_df

Unnamed: 0,projectName,feedbackQuestionName,dateStart,dateEnd,userEmail,checkedAnswer,textAnswer
0,Hopkins-Lopez Initiative,Were the project risks effectively managed? Pl...,23-04-2023,29-04-2023,stevenbaldwin@example.com,4.0,Excellent
1,"Gonzalez, Bowman and Hebert Project",Rate the communication between team members on...,07-08-2022,13-08-2022,donald12@example.net,,
2,"Bowman, Clark and Trevino Initiative",Were the project resources allocated efficient...,31-03-2024,06-04-2024,alyssabrewer@example.net,1.0,Poor
3,Buck Inc Solution,Rate the effectiveness of the project manageme...,14-04-2024,20-04-2024,whitemarissa@example.net,5.0,Excellent
4,"Ryan, Garner and Huber Solution",Rate the level of stakeholder engagement on a ...,07-04-2024,13-04-2024,millerlaura@example.org,4.0,
...,...,...,...,...,...,...,...
7995,Curtis Group Initiative,How satisfied are you with the project deliver...,22-05-2022,28-05-2022,davisderrick@example.org,4.0,
7996,Brewer Group Project,Did you find the project timeline realistic? P...,29-01-2023,04-02-2023,smooney@example.com,4.0,
7997,Butler and Sons Solution,How satisfied are you with the level of transp...,04-09-2022,10-09-2022,reesejillian@example.net,4.0,
7998,Hopkins-Lopez Initiative,How satisfied are you with the overall progres...,18-12-2022,24-12-2022,christopher78@example.org,5.0,


In [26]:
# Write every generated table to its own CSV file under csvs/, without the
# DataFrame index column.
# exist_ok=True creates the directory only when it is missing, replacing the
# separate exists() check and keeping the cell safe to re-run.
os.makedirs('csvs', exist_ok=True)

csv_exports = [
    (users_df, 'csvs/users.csv'),
    (project_df, 'csvs/projects.csv'),
    (feedbackQuestion_df, 'csvs/feedbackQuestions.csv'),
    (projectsToUsers_df, 'csvs/projectsToUsers.csv'),
    (projectsToFeedbackQuestions_df, 'csvs/projectsToFeedbackQuestions.csv'),
    (timesheets_df, 'csvs/timesheets.csv'),
    (feedbackAnswers_df, 'csvs/feedbackAnswers.csv'),
]
for table_df, csv_path in csv_exports:
    table_df.to_csv(csv_path, index=False)