In [1]:
import random
from pathlib import Path

import pandas as pd

# Define sample projects and their tasks
projects = {
    "Customer Insights Dashboard": [
        "Define dashboard KPIs",
        "Extract customer data",
        "Design dashboard layout",
        "Develop visualization components",
        "Optimize dashboard performance",
        "Validate data accuracy",
        "Deploy dashboard",
        "Gather user feedback",
        "Iterate based on feedback"
    ],
    "Fraud Detection Model": [
        "Collect transaction data",
        "Perform exploratory data analysis",
        "Feature engineering",
        "Train fraud detection model",
        "Evaluate model performance",
        "Optimize model hyperparameters",
        "Deploy model",
        "Monitor model performance",
        "Retrain model periodically"
    ],
    "Data Access Automation": [
        "Define role-based access levels",
        "Build access control mechanisms",
        "Implement authentication layers",
        "Ensure compliance with governance policies",
        "Develop logging and audit features",
        "Automate access requests",
        "Perform security testing",
        "Deploy access automation system",
        "Monitor system effectiveness"
    ],
    "Marketing Attribution System": [
        "Collect campaign data",
        "Build attribution model",
        "Develop scoring algorithm",
        "Implement model in data pipeline",
        "Validate attribution results",
        "Optimize scoring parameters",
        "Deploy attribution model",
        "Automate reporting",
        "Monitor campaign impact"
    ],
    "Real-time Inventory Management": [
        "Integrate warehouse data sources",
        "Design real-time data ingestion pipeline",
        "Develop inventory tracking dashboard",
        "Implement demand forecasting",
        "Optimize supply chain analytics",
        "Develop alerting mechanisms",
        "Deploy inventory management system",
        "Monitor stock levels",
        "Improve demand predictions"
    ]
}

# Define team members
teams = {
    "Business Intelligence": [
        "BI Lead", "Senior BI Analyst", "Mid-Level BI Analyst", "Junior BI Analyst"
    ],
    "Data Engineering": [
        "Data Engineering Lead", "Senior Data Engineer", "Mid-Level Data Engineer", "Junior Data Engineer"
    ],
    "Data Science": [
        "Data Science Lead", "Senior Data Scientist", "Mid-Level Data Scientist", "Junior Data Scientist"
    ],
    "Data Governance": [
        "Data Governance Lead", "Senior Data Governance Analyst", "Mid-Level Data Governance Analyst", "Junior Data Governance Analyst"
    ]
}

# Create an empty list to hold the data
data = []

# Assign tasks to random people from relevant teams
for project, tasks in projects.items():
    for task in tasks:
        # Randomly choose 2-4 people from relevant teams
        relevant_teams = random.sample(list(teams.keys()), k=2)  # Pick 2 departments involved
        crew = []
        for team in relevant_teams:
            crew.extend(random.sample(teams[team], k=random.randint(2, 3)))  # Pick 2-3 people from each
        # Convert crew list to a comma-separated string
        crew_str = ",".join(crew)
        data.append({"project": project, "task": task, "crew": crew_str})

# Convert the data to a pandas DataFrame
df = pd.DataFrame(data)

In [2]:
# Preview the first 10 generated (project, task, crew) rows
df.head(10)

Unnamed: 0,project,task,crew
0,Customer Insights Dashboard,Define dashboard KPIs,"Junior BI Analyst,BI Lead,Mid-Level Data Gover..."
1,Customer Insights Dashboard,Extract customer data,"Data Governance Lead,Junior Data Governance An..."
2,Customer Insights Dashboard,Design dashboard layout,"Junior Data Engineer,Senior Data Engineer,Mid-..."
3,Customer Insights Dashboard,Develop visualization components,"Data Science Lead,Junior Data Scientist,Junior..."
4,Customer Insights Dashboard,Optimize dashboard performance,"Senior Data Scientist,Mid-Level Data Scientist..."
5,Customer Insights Dashboard,Validate data accuracy,"Senior Data Engineer,Junior Data Engineer,Mid-..."
6,Customer Insights Dashboard,Deploy dashboard,"Senior BI Analyst,Mid-Level BI Analyst,Data En..."
7,Customer Insights Dashboard,Gather user feedback,"Senior BI Analyst,Junior BI Analyst,Mid-Level ..."
8,Customer Insights Dashboard,Iterate based on feedback,"Mid-Level BI Analyst,BI Lead,Data Engineering ..."
9,Fraud Detection Model,Collect transaction data,"Mid-Level Data Scientist,Data Science Lead,Sen..."


In [3]:
# Summarize column dtypes and non-null counts of the generated frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   project  45 non-null     object
 1   task     45 non-null     object
 2   crew     45 non-null     object
dtypes: object(3)
memory usage: 1.2+ KB


In [4]:
# Split the comma-separated crew string into a list and emit one row per member.
# The split is applied to a derived frame (assign) instead of overwriting
# df['crew']: the in-place version replaced the strings with lists, so running
# the cell a second time failed because lists have no .split().
df_exploded = df.assign(crew=df['crew'].str.split(',')).explode('crew')

# Each exploded row keeps its source row's index label, so labels repeat
# once per crew member.
df_exploded.head(10)

Unnamed: 0,project,task,crew
0,Customer Insights Dashboard,Define dashboard KPIs,Junior BI Analyst
0,Customer Insights Dashboard,Define dashboard KPIs,BI Lead
0,Customer Insights Dashboard,Define dashboard KPIs,Mid-Level Data Governance Analyst
0,Customer Insights Dashboard,Define dashboard KPIs,Data Governance Lead
0,Customer Insights Dashboard,Define dashboard KPIs,Senior Data Governance Analyst
1,Customer Insights Dashboard,Extract customer data,Data Governance Lead
1,Customer Insights Dashboard,Extract customer data,Junior Data Governance Analyst
1,Customer Insights Dashboard,Extract customer data,Mid-Level Data Governance Analyst
1,Customer Insights Dashboard,Extract customer data,Junior BI Analyst
1,Customer Insights Dashboard,Extract customer data,BI Lead


In [5]:
# Employee lookup table: distinct crew values in first-seen order, keyed by a
# 1-based surrogate emp_id placed as the leading column.
df_crew = df_exploded['crew'].drop_duplicates().to_frame()
df_crew.insert(0, 'emp_id', range(1, len(df_crew) + 1))

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the first export of the notebook.
Path('data').mkdir(parents=True, exist_ok=True)
df_crew.to_csv('data/employees.csv', index=False)

# Preview goes last: only a cell's final expression is rendered, so the
# original head() placed before to_csv() was never displayed.
df_crew.head()

In [6]:
# Project lookup table: distinct project names in first-seen order, keyed by a
# 1-based surrogate proj_id placed as the leading column. The index labels of
# the first occurrence of each project are retained.
df_project = (
    df_exploded[['project']]
    .drop_duplicates()
    .assign(proj_id=lambda frame: range(1, len(frame) + 1))
    [['proj_id', 'project']]
)

df_project.head()

Unnamed: 0,proj_id,project
0,1,Customer Insights Dashboard
9,2,Fraud Detection Model
18,3,Data Access Automation
27,4,Marketing Attribution System
36,5,Real-time Inventory Management


In [7]:
# Write the project lookup table (proj_id, project) without the DataFrame index
df_project.to_csv('data/projects.csv', index=False)

In [8]:
# Task lookup table: distinct task names in first-seen order, keyed by a
# 1-based surrogate task_id placed as the leading column.
# NOTE(review): tasks are de-duplicated by name alone, so two projects that
# used the same task name would share one task_id - confirm that is intended.
df_task = df_exploded['task'].drop_duplicates().to_frame()
df_task.insert(0, 'task_id', range(1, len(df_task) + 1))
df_task.to_csv('data/tasks.csv', index=False)

# Preview goes last: only a cell's final expression is rendered, so the
# original head() placed before to_csv() was never displayed.
df_task.head()

In [9]:
# explode() left repeated index labels (one per source row); replace them
# with a fresh 0..n-1 index and discard the old labels (drop=True)
df_exploded = df_exploded.reset_index(drop=True)
df_exploded.head()

Unnamed: 0,project,task,crew
0,Customer Insights Dashboard,Define dashboard KPIs,Junior BI Analyst
1,Customer Insights Dashboard,Define dashboard KPIs,BI Lead
2,Customer Insights Dashboard,Define dashboard KPIs,Mid-Level Data Governance Analyst
3,Customer Insights Dashboard,Define dashboard KPIs,Data Governance Lead
4,Customer Insights Dashboard,Define dashboard KPIs,Senior Data Governance Analyst


In [11]:
# Attach the surrogate emp_id to every assignment row (left join on crew).
# validate='many_to_one' makes pandas raise MergeError if df_crew ever holds a
# repeated crew value, instead of silently duplicating assignment rows.
df_exploded = df_exploded.merge(df_crew, on='crew', how='left', validate='many_to_one')
df_exploded.head()

Unnamed: 0,project,task,crew,emp_id
0,Customer Insights Dashboard,Define dashboard KPIs,Junior BI Analyst,1
1,Customer Insights Dashboard,Define dashboard KPIs,BI Lead,2
2,Customer Insights Dashboard,Define dashboard KPIs,Mid-Level Data Governance Analyst,3
3,Customer Insights Dashboard,Define dashboard KPIs,Data Governance Lead,4
4,Customer Insights Dashboard,Define dashboard KPIs,Senior Data Governance Analyst,5


In [12]:
# Attach the surrogate proj_id to every assignment row (left join on project).
# validate='many_to_one' makes pandas raise MergeError if df_project ever holds
# a repeated project value, instead of silently duplicating assignment rows.
df_exploded = df_exploded.merge(df_project, on='project', how='left', validate='many_to_one')

In [13]:
# Attach the surrogate task_id to every assignment row (left join on task).
# validate='many_to_one' makes pandas raise MergeError if df_task ever holds a
# repeated task value, instead of silently duplicating assignment rows.
df_exploded = df_exploded.merge(df_task, on='task', how='left', validate='many_to_one')

In [14]:
# Preview the assignment rows with the emp_id, proj_id and task_id keys attached
df_exploded.head()

Unnamed: 0,project,task,crew,emp_id,proj_id,task_id
0,Customer Insights Dashboard,Define dashboard KPIs,Junior BI Analyst,1,1,1
1,Customer Insights Dashboard,Define dashboard KPIs,BI Lead,2,1,1
2,Customer Insights Dashboard,Define dashboard KPIs,Mid-Level Data Governance Analyst,3,1,1
3,Customer Insights Dashboard,Define dashboard KPIs,Data Governance Lead,4,1,1
4,Customer Insights Dashboard,Define dashboard KPIs,Senior Data Governance Analyst,5,1,1


In [15]:
# Write the employee/project/task assignment table.
# Unlike the other exports in this notebook, index=True also writes the row
# index as a leading column with an empty header.
# NOTE(review): confirm the index column is wanted here (e.g. as a row id);
# if so, consider naming it via index_label.
df_exploded.to_csv('data/emp_project_tasks.csv', index=True)