In [1]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
# Initialize Faker with French localization
fake = Faker('fr_FR')
Faker.seed(42)
np.random.seed(42)
random.seed(42)

In [4]:
# Configuration
num_records = 8950

In [5]:
# Regions & Cities in Senegal
regions_cities = {
    'Dakar': ['Dakar', 'Pikine', 'Guédiawaye'],
    'Thiès': ['Thiès', 'Mbour', 'Tivaouane'],
    'Saint-Louis': ['Saint-Louis', 'Podor', 'Dagana'],
    'Ziguinchor': ['Ziguinchor', 'Bignona', 'Oussouye'],
    'Kaolack': ['Kaolack', 'Nioro du Rip', 'Guinguinéo'],
    'Kolda': ['Kolda', 'Vélingara', 'Médina Yoro Foulah'],
    'Diourbel': ['Diourbel', 'Bambey', 'Mbacké'],
    'Fatick': ['Fatick', 'Foundiougne', 'Gossas'],
    'Matam': ['Matam', 'Kanel', 'Ranérou'],
    'Tambacounda': ['Tambacounda', 'Goudiry', 'Bakel'],
    'Kaffrine': ['Kaffrine', 'Koungheul', 'Malem Hodar'],
    'Kédougou': ['Kédougou', 'Salémata', 'Saraya'],
    'Sédhiou': ['Sédhiou', 'Bounkiling', 'Goudomp'],
    'Louga': ['Louga', 'Kébémer', 'Linguère']
}
regions = list(regions_cities.keys())
region_prob = [0.25, 0.12, 0.08, 0.05, 0.1, 0.05, 0.08, 0.04, 0.03, 0.05, 0.03, 0.02, 0.02, 0.08]
assigned_regions = np.random.choice(regions, size=num_records, p=region_prob)
assigned_cities = [np.random.choice(regions_cities[region]) for region in assigned_regions]

In [6]:
# Departments & Jobtitles
departments = ['Ressources Humaines', 'Informatique', 'Ventes', 'Marketing', 'Finance', 'Opérations', 'Service Clientèle']
departments_prob = [0.02, 0.15, 0.21, 0.08, 0.05, 0.30, 0.19]
jobtitles = {
    'Ressources Humaines': ['Responsable RH', 'Coordinateur RH', 'Recruteur', 'Assistant RH'],
    'Informatique': ['Responsable Informatique', 'Développeur Logiciel', 'Administrateur Système', 'Spécialiste Support Informatique'],
    'Ventes': ['Responsable des Ventes', 'Consultant Commercial', 'Spécialiste des Ventes', 'Représentant Commercial'],
    'Marketing': ['Responsable Marketing', 'Spécialiste SEO', 'Créateur de Contenu', 'Coordinateur Marketing'],
    'Finance': ['Responsable Financier', 'Comptable', 'Analyste Financier', 'Spécialiste des Comptes Fournisseurs'],
    'Opérations': ['Responsable des Opérations', 'Analyste des Opérations', 'Coordinateur Logistique', 'Spécialiste des Stocks'],
    'Service Clientèle': ['Responsable Service Clientèle', 'Représentant Service Clientèle', 'Spécialiste Support', 'Technicien Support']
}
jobtitles_prob = {
    'Ressources Humaines': [0.03, 0.3, 0.47, 0.2],  # Responsable RH, Coordinateur RH, Recruteur, Assistant RH
    'Informatique': [0.02, 0.47, 0.2, 0.31],  # Responsable Informatique, Développeur Logiciel, Administrateur Système, Spécialiste Support Informatique
    'Ventes': [0.03, 0.25, 0.32, 0.4],  # Responsable des Ventes, Consultant Commercial, Spécialiste des Ventes, Représentant Commercial
    'Marketing': [0.04, 0.25, 0.41, 0.3],  # Responsable Marketing, Spécialiste SEO, Créateur de Contenu, Coordinateur Marketing
    'Finance': [0.03, 0.37, 0.4, 0.2],  # Responsable Financier, Comptable, Analyste Financier, Spécialiste des Comptes Fournisseurs
    'Opérations': [0.02, 0.2, 0.4, 0.38],  # Responsable des Opérations, Analyste des Opérations, Coordinateur Logistique, Spécialiste des Stocks
    'Service Clientèle': [0.04, 0.3, 0.38, 0.28]  # Responsable Service Clientèle, Représentant Service Clientèle, Spécialiste Support, Technicien Support
}

In [7]:
# Educations
educations = ['Baccalauréat', "Licence", "Master", 'Doctorat']

education_mapping = {
    'Responsable RH': ["Master", "Doctorat"],
    'Coordinateur RH': ["Licence", "Master"],
    'Recruteur': ["Baccalauréat", "Licence"],
    'Assistant RH': ["Baccalauréat", "Licence"],
    'Responsable Informatique': ["Doctorat", "Master"],
    'Développeur Logiciel': ["Licence", "Master"],
    'Administrateur Système': ["Licence", "Master"],
    'Spécialiste Support Informatique': ["Baccalauréat", "Licence"],
    'Responsable des Ventes': ["Master","Doctorat"],
    'Consultant Commercial': ["Licence", "Master", "Doctorat"],
    'Spécialiste des Ventes': ["Licence", "Master", "Doctorat"],
    'Représentant Commercial': ["Licence"],
    'Responsable Marketing': ["Licence", "Master","Doctorat"],
    'Spécialiste SEO': ["Baccalauréat", "Licence"],
    'Créateur de Contenu': ["Baccalauréat", "Licence"],
    'Coordinateur Marketing': ["Licence"],
    'Responsable Financier': ["Master", "Doctorat"],
    'Comptable': ["Licence"],
    'Analyste Financier': ["Licence", "Master", "Doctorat"],
    'Spécialiste des Comptes Fournisseurs': ["Licence"],
    'Responsable des Opérations': ["Licence", "Master"],
    'Analyste des Opérations': ["Licence", "Master"],
    'Coordinateur Logistique': ["Licence"],
    'Spécialiste des Stocks': ["Baccalauréat", "Licence"],
    'Responsable Service Clientèle': ["Licence", "Master", "Doctorat"],
    'Représentant Service Clientèle': ["Baccalauréat", "Licence"],
    'Spécialiste Support': ["Baccalauréat", "Licence"],
    'Technicien Support': ["Baccalauréat", "Licence"]
}

In [8]:
# Hiring Date
# Define custom probability weights for each year
year_weights = {
    2015: 5,   # 15% probability
    2016: 8,   # 15% probability
    2017: 17,   # 20% probability
    2018: 9,  # 15% probability
    2019: 10,  # 10% probability
    2020: 11,  # 10% probability
    2021: 5,  # 8% probability
    2022: 12,  # 5% probability
    2023: 14,  # 2% probability
    2024: 9   # 2% probability
}


In [9]:
# Generate a random date based on custom probabilities
def generate_custom_date(year_weights):
    year = random.choices(list(year_weights.keys()), weights=list(year_weights.values()))[0]
    month = random.randint(1, 12)
    day = random.randint(1, 28)  # Assuming all months have 28 days for simplicity
    return fake.date_time_between(start_date=datetime(year, 1, 1), end_date=datetime(year, 12, 31))

In [10]:
def generate_salary(department, job_title):
    salary_dict = {
            'Ressources Humaines': {
                'Responsable RH': np.random.randint(2000000, 3000000),
                'Coordinateur RH': np.random.randint(1500000, 2000000),
                'Recruteur': np.random.randint(1500000, 2200000),
                'Assistant RH': np.random.randint(1500000, 2000000)
            },
            'Informatique': {
                'Responsable Informatique': np.random.randint(2500000, 4000000),
                'Développeur Logiciel': np.random.randint(2200000, 3000000),
                'Administrateur Système': np.random.randint(2000000, 3000000),
                'Spécialiste Support Informatique': np.random.randint(1500000, 2000000)
            },
            'Ventes': {
                'Responsable des Ventes': np.random.randint(2200000, 3500000),
                'Consultant Commercial': np.random.randint(2000000, 3000000),
                'Spécialiste des Ventes': np.random.randint(1500000, 2500000),
                'Représentant Commercial': np.random.randint(1500000, 2200000)
            },
            'Marketing': {
                'Responsable Marketing': np.random.randint(2200000, 3500000),
                'Spécialiste SEO': np.random.randint(1500000, 2500000),
                'Créateur de Contenu': np.random.randint(1500000, 2000000),
                'Coordinateur Marketing': np.random.randint(1500000, 2200000)
            },
            'Finance': {
                'Responsable Financier': np.random.randint(2500000, 4000000),
                'Comptable': np.random.randint(1500000, 2500000),
                'Analyste Financier': np.random.randint(2000000, 3000000),
                'Spécialiste des Comptes Fournisseurs': np.random.randint(1500000, 2000000)
            },
            'Opérations': {
                'Responsable des Opérations': np.random.randint(2200000, 3500000),
                'Analyste des Opérations': np.random.randint(1500000, 2500000),
                'Coordinateur Logistique': np.random.randint(1500000, 2000000),
                'Spécialiste des Stocks': np.random.randint(1500000, 2000000)
            },
            'Service Clientèle': {
                'Responsable Service Clientèle': np.random.randint(2000000, 3000000),
                'Représentant Service Clientèle': np.random.randint(1500000, 2000000),
                'Spécialiste Support': np.random.randint(1500000, 2000000),
                'Technicien Support': np.random.randint(1500000, 2500000)
            }
        }
    return salary_dict[department][job_title]

In [11]:
# Liste de prénoms et noms sénégalais
first_names = ['Mamadou', 'Awa', 'Fatou', 'Cheikh', 'Mariama', 'Ousmane', 'Adama', 'Abdoulaye', 'Coumba', 'Ibrahima']
last_names = ['Diop', 'Ba', 'Ndiaye', 'Sow', 'Diallo', 'Diouf', 'Gueye', 'Fall', 'Thiam', 'Faye']

In [12]:
# Generate the dataset
data = []

for _ in range(num_records):
    employee_id = f"00-{random.randint(10000000, 99999999)}"
    first_name = random.choice(first_names)
    last_name = random.choice(last_names)
    gender = np.random.choice(['Féminin', 'Masculin'], p=[0.46, 0.54])
    region = np.random.choice(regions, p=region_prob)
    city = np.random.choice(regions_cities[region])
    hiredate = generate_custom_date(year_weights)
    department = np.random.choice(departments, p=departments_prob)
    job_title = np.random.choice(jobtitles[department], p=jobtitles_prob[department])
    education_level = np.random.choice(education_mapping[job_title])
    performance_rating = np.random.choice(['Excellent', 'Bon', 'Satisfaisant', 'À améliorer'], p=[0.12, 0.5, 0.3, 0.08])
    overtime = np.random.choice(['Oui', 'Non'], p=[0.3, 0.7])
    salary = generate_salary(department, job_title)
    data.append([
        employee_id,
        first_name,
        last_name,
        gender,
        region,
        city,
        hiredate,
        department,
        job_title,
        education_level,
        salary,
        performance_rating,
        overtime
    ])

In [13]:
# Create DataFrame
columns = [
    'employee_id',
    'first_name',
    'last_name',
    'gender',
    'region',
    'city',
    'hiredate',
    'department',
    'job_title',
    'education_level',
    'salary',
    'performance_rating',
    'overtime'
]

df = pd.DataFrame(data, columns=columns)

In [14]:
# Add Birthdate
def generate_birthdate(row):
    age_distribution = {
        'under_25': 0.11,
        '25_34': 0.25,
        '35_44': 0.31,
        '45_54': 0.24,
        'over_55': 0.09
    }
    age_groups = list(age_distribution.keys())
    age_probs = list(age_distribution.values())
    age_group = np.random.choice(age_groups, p=age_probs)
    
    if any('Manager' in title for title in row['job_title']):
        age = np.random.randint(30, 65)
    elif row['education_level'] == 'PhD':
        age = np.random.randint(27, 65)
    elif age_group == 'under_25':
        age = np.random.randint(20, 25)
    elif age_group == '25_34':
        age = np.random.randint(25, 35)
    elif age_group == '35_44':
        age = np.random.randint(35, 45)
    elif age_group == '45_54':
        age = np.random.randint(45, 55)
    else:
        age = np.random.randint(56, 65)

    birthdate = fake.date_of_birth(minimum_age=age, maximum_age=age)
    return birthdate

In [15]:
# Apply the function to generate birthdates
df['birthdate'] = df.apply(generate_birthdate, axis=1)

In [16]:
# Terminations
# Define termination distribution
year_weights = {
    2015: 5,
    2016: 7,
    2017: 10,
    2018: 12,
    2019: 9,
    2020: 10,
    2021: 20,
    2022: 10,
    2023: 7,
    2024: 10
}

In [17]:
# Calculate the total number of terminated employees
total_employees = num_records
termination_percentage = 0.112  # 11.2%
total_terminated = int(total_employees * termination_percentage)

In [18]:
# Generate termination dates based on distribution
termination_dates = []
for year, weight in year_weights.items():
    num_terminations = int(total_terminated * (weight / 100))
    termination_dates.extend([year] * num_terminations)

In [19]:
# Randomly shuffle the termination dates
random.shuffle(termination_dates)

In [20]:
# Assign termination dates to terminated employees
terminated_indices = df.index[:total_terminated]
for i, year in enumerate(termination_dates[:total_terminated]):
    df.at[terminated_indices[i], 'termdate'] = datetime(year, 1, 1) + timedelta(days=random.randint(0, 365))

In [21]:
# Assign None to termdate for employees who are not terminated
df['termdate'] = df['termdate'].where(df['termdate'].notnull(), None)

In [22]:
# Ensure termdate is at least 6 months after hiredat
df['termdate'] = df.apply(lambda row: row['hiredate'] + timedelta(days=180) if row['termdate'] and row['termdate'] < row['hiredate'] + timedelta(days=180) else row['termdate'], axis=1)

In [23]:
education_multiplier = {
    'Baccalauréat': {'Homme': 1.03, 'Femme': 1.0},
    "Licence": {'Homme': 1.115, 'Femme': 1.0},
    "Master": {'Homme': 1.0, 'Femme': 1.07},
    'Doctorat': {'Homme': 1.0, 'Femme': 1.17}
}

In [24]:
# Function to calculate age from birthdate
def calculate_age(birthdate):
    today = pd.Timestamp('today')
    age = today.year - birthdate.year - ((today.month, today.day) < (birthdate.month, birthdate.day))
    return age

In [25]:
# Function to calculate the adjusted salary
def calculate_adjusted_salary(row):
    base_salary = row['salary']
    gender = row['gender']
    education = row['education_level']
    age = calculate_age(row['birthdate'])

    # Apply education multiplier
    multiplier = education_multiplier.get(education, {}).get(gender, 1.0)
    adjusted_salary = base_salary * multiplier

    # Apply age increment (between 0.1% and 0.3% per year of age)
    age_increment = 1 + np.random.uniform(0.001, 0.003) * age
    adjusted_salary *= age_increment

    # Ensure the adjusted salary is not lower than the base salary
    adjusted_salary = max(adjusted_salary, base_salary)

    # Round the adjusted salary to the nearest integer
    return round(adjusted_salary)

In [26]:
# Apply the function to the DataFrame
df['salary'] = df.apply(calculate_adjusted_salary, axis=1)

In [27]:
# Convert 'hiredate' and 'birthdate' to datetime
df['hiredate'] = pd.to_datetime(df['hiredate']).dt.date
df['birthdate'] = pd.to_datetime(df['birthdate']).dt.date
df['termdate'] = pd.to_datetime(df['termdate']).dt.date


In [28]:
df.head()

Unnamed: 0,employee_id,first_name,last_name,gender,region,city,hiredate,department,job_title,education_level,salary,performance_rating,overtime,birthdate,termdate
0,00-95822412,Awa,Diop,Féminin,Dakar,Pikine,2022-09-06,Service Clientèle,Technicien Support,Baccalauréat,2013846,Bon,Non,2000-10-05,2023-03-05
1,00-28728463,Awa,Thiam,Masculin,Kédougou,Saraya,2016-02-13,Service Clientèle,Spécialiste Support,Licence,1690477,Bon,Non,1978-12-18,2021-12-12
2,00-13999315,Awa,Sow,Masculin,Thiès,Thiès,2017-01-10,Opérations,Coordinateur Logistique,Licence,2089404,Bon,Non,1981-07-22,2018-01-27
3,00-85329037,Cheikh,Thiam,Féminin,Louga,Kébémer,2019-10-15,Ventes,Consultant Commercial,Doctorat,2249028,Bon,Non,2002-01-07,2022-02-08
4,00-47338124,Mamadou,Ndiaye,Masculin,Thiès,Mbour,2022-04-17,Opérations,Coordinateur Logistique,Licence,1691202,Satisfaisant,Non,1999-04-20,2022-11-05


In [29]:
# Save to CSV
df.to_csv('HumanResources_Senegal.csv', index=False)