# In[5]:
from faker import Faker
import pandas as pd
import random
import os
import numpy as np

# Generate a synthetic employee dataset with Faker, inject some missing and
# rare values, and write the result to disk as five CSV batches.

# Passing `seed=` to the Faker constructor does not seed anything (the keyword
# is accepted as generic config and ignored), so the generator is seeded
# through the documented `Faker.seed()` class method instead.
seed = 123
fake = Faker()
Faker.seed(seed)

# Dedicated, seeded RNG for choosing which rows to modify. Faker draws from
# its own random generator, so the row positions need their own seed to be
# reproducible; a private instance avoids touching the global `random` state.
rng = random.Random(seed)

num_rows = 10000

data = {
    "name": [fake.name() for _ in range(num_rows)],
    "email": [fake.email() for _ in range(num_rows)],
    "phone": [fake.phone_number() for _ in range(num_rows)],
    "address": [fake.address() for _ in range(num_rows)],
    "date_of_birth": [
        fake.date_of_birth(minimum_age=18, maximum_age=80).strftime("%Y-%m-%d")
        for _ in range(num_rows)
    ],
    "salary": [fake.random_int(min=30000, max=100000) for _ in range(num_rows)],
    "department": [
        fake.random_element(elements=("HR", "IT", "Finance", "Sales", "Marketing"))
        for _ in range(num_rows)
    ],
    "hire_date": [
        fake.date_this_century().strftime("%Y-%m-%d") for _ in range(num_rows)
    ],
    "employee_id": [fake.unique.random_number() for _ in range(num_rows)],
}

# Introduce null values in some columns. `sample` picks distinct rows, so each
# column gets exactly `num_nulls` missing entries (independent `randint` draws
# could hit the same row twice and silently produce fewer).
num_nulls = min(100, num_rows)
for column in ("email", "salary"):
    for row in rng.sample(range(num_rows), num_nulls):
        data[column][row] = None

# Replace some values in the "department" column with rare values, again on
# distinct rows.
num_rare = min(50, num_rows)
for row in rng.sample(range(num_rows), num_rare):
    data["department"][row] = fake.random_element(
        elements=("Legal", "R&D", "Customer Support")
    )

df = pd.DataFrame(data)  # Create a DataFrame from the generated data

output_folder = "Data/"
os.makedirs(output_folder, exist_ok=True)

# Split the DataFrame into 5 consecutive batches. The row *positions* are
# split with NumPy and selected via `.iloc`; calling `np.array_split` on the
# DataFrame itself goes through the deprecated `DataFrame.swapaxes`.
split_dfs = [df.iloc[rows] for rows in np.array_split(np.arange(len(df)), 5)]

for i, split_df in enumerate(split_dfs):
    file_name = os.path.join(output_folder, f"Example_batch_{i}.csv")
    split_df.to_csv(file_name, sep=",", index=False)
