In [1]:
import numpy as np
import pandas as pd

def get_project_data(n_projects=1000000, seed=0, sample_frac=.00123):
    """Simulate an employee-training data set and return a stratified sample.

    A population of ``n_projects`` rows is generated with three binary
    attributes (``PastPerformance``, ``JobSatisfaction``, ``RecentlyHired``),
    each drawn with probability 0.5. Training attendance is then drawn with a
    probability that depends on those three attributes, and ``Performance`` is
    a deterministic score computed from the same three attributes.

    Parameters
    ----------
    n_projects : int, default 1000000
        Number of rows in the simulated population.
    seed : int, default 0
        Seed passed to ``np.random.seed``. NOTE: this reseeds NumPy's *global*
        random state, exactly as the original hard-coded ``seed(0)`` did.
    sample_frac : float, default 0.00123
        Fraction of rows kept from every
        (AttendedTraining, PastPerformance, JobSatisfaction, RecentlyHired)
        group.

    Returns
    -------
    pandas.DataFrame
        Columns ``AttendedTraining``, ``PastPerformance``, ``RecentlyHired``,
        ``JobSatisfaction`` and ``Performance``. Rows are shuffled and the
        index is the shuffled (non-sequential) positional index.
        ``Performance`` lies in (0, 1], with the best observed score equal
        to 1.
    """
    np.random.seed(seed)

    # Generate project attributes. The raw 0/1 draws are kept because they
    # double as indices into the treatment-probability table below.
    past_idx = np.random.binomial(1, 0.5, n_projects)
    job_idx = np.random.binomial(1, 0.5, n_projects)
    new_idx = np.random.binomial(1, 0.5, n_projects)
    past_p = np.array(['High', 'Low'])[past_idx]
    job_s = np.array(['High', 'Low'])[job_idx]
    new_e = np.array(['No', 'Yes'])[new_idx]

    # Assign treatment.
    # prob_table1[past, job, new] with index 0 = 'High' / 'No' and
    # index 1 = 'Low' / 'Yes'. A draw of 1 selects 'No' in the label array
    # below, so each entry is the probability of NOT attending the training.
    prob_table1 = np.array([[[0.1, 0.1],     # past High, job High: (new No, new Yes)
                             [0.9, 0.1]],    # past High, job Low
                            [[0.9, 0.1],     # past Low,  job High
                             [0.9, 0.9]]])   # past Low,  job Low
    # Vectorised lookup replaces a per-row Python loop over nested dicts.
    probs_b = prob_table1[past_idx, job_idx, new_idx]
    training_b = np.array(['Yes', 'No'])[np.random.binomial(1, probs_b)]

    # Build data frame
    population = pd.DataFrame({"AttendedTraining": training_b,
                               "PastPerformance": past_p,
                               "RecentlyHired": new_e,
                               "JobSatisfaction": job_s})

    # Stratified sample: the same fraction from every attribute/treatment cell.
    # GroupBy.sample keeps all columns; groupby.apply on the grouping columns
    # is deprecated in recent pandas releases.
    strata = ['AttendedTraining', 'PastPerformance',
              'JobSatisfaction', 'RecentlyHired']
    sampled = population.groupby(strata).sample(frac=sample_frac)
    # Drop the population index, then shuffle the row order.
    data = sampled.reset_index(drop=True).sample(frac=1)

    # Obtain performance: penalties of 4, 3 and 1 points for low past
    # performance, being recently hired and low job satisfaction.
    performance = ((data.PastPerformance == 'Low') * -4
                   + (data.RecentlyHired == 'Yes') * -3
                   + (data.JobSatisfaction == 'Low') * -1)
    # Shift so the worst observed score is 1, then scale so the best is 1.0.
    performance = performance - performance.min() + 1
    data['Performance'] = performance / performance.max()
    return data

# Simulate the data set once and keep it for the rest of the notebook.
df = get_project_data()
# Persist the sample so it can be reloaded without re-running the simulation.
df.to_csv("employee_training.csv", index=False)
print(df.shape)
# The preview must be the cell's final expression for Jupyter to render it;
# placed mid-cell its value was silently discarded.
df.head()

(1231, 5)


In [4]:
# Raw (unadjusted) mean Performance for each AttendedTraining group.
df.groupby("AttendedTraining")["Performance"].mean()

AttendedTraining
No     0.511255
Yes    0.600072
Name: Performance, dtype: float64