In [3]:
import pandas as pd
import json
from faker import Faker
import random
from bcrypt import hashpw, gensalt

# Initialize Faker with the Italian ("it_IT") locale; every fake.* call in the
# generation loop below draws from this single instance.
# NOTE(review): nothing in this cell seeds Faker or `random`, so each run
# produces a different dataset — add seeding here if reproducible output is needed.
fake = Faker("it_IT")

# How many synthetic users to create
num_users = 1000

# Postal codes available for each city; consecutive runs are spelled as ranges
postal_codes = {
    "Pisa": list(range(56121, 56128)),
    "Firenze": [
        50100,
        *range(50121, 50128),
        50129,
        *range(50131, 50138),
        50139,
        *range(50141, 50146),
    ],
    "Lucca": [55100],
    "Livorno": [57100, *range(57121, 57129)],
}

# Account roles, and the e-mail domain given to ADMIN accounts
roles = ["USER", "ADMIN"]
admin_domain = "distribooked.org"

# Address cities and their sampling weights (both lists share the same order)
cities = ["Pisa", "Lucca", "Livorno", "Firenze"]
city_weights = [0.7] + [0.1] * 3  # 70% Pisa, 10% each for others

# Accumulators filled by the generation loop
users_data, login_credentials = [], []

# Generate user data.
#
# Each iteration builds one user record (with a bcrypt-hashed password) for
# users_data and the matching plaintext login for login_credentials.
#
# The username is both the login key and the local part of the e-mail address,
# but repeated fake.user_name() calls are not checked against each other, so
# two users could end up with the same username. Remember every username
# already issued and redraw on a collision so each login stays unambiguous.
seen_usernames = set()

for _ in range(num_users):
    role = random.choices(roles, weights=[95, 5])[0]  # 95% USERS, 5% ADMINS
    city = random.choices(cities, weights=city_weights)[0]
    # Admins always get the organisation domain; everyone else a free-mail one
    email_domain = admin_domain if role == "ADMIN" else fake.free_email_domain()

    # Get random postal code for the selected city (stored as a string)
    postal_code = str(random.choice(postal_codes[city]))

    # Generate plaintext and hashed password; only the hash goes into the user
    # record, the plaintext is kept separately for the login test file
    plaintext_password = fake.password(length=10, special_chars=False)
    hashed_password = hashpw(plaintext_password.encode(), gensalt()).decode()

    # Create user details
    address = {
        "street": fake.street_address(),
        "city": city,
        "postalCode": postal_code,
        "province": city,  # the city name is reused as the province value
        "country": "Italy"
    }

    # Draw a username that has not been issued yet in this run
    username = fake.user_name()
    while username in seen_usernames:
        username = fake.user_name()
    seen_usernames.add(username)

    user = {
        "username": username,
        "name": fake.first_name(),
        "surname": fake.last_name(),
        "dateOfBirth": fake.date_of_birth(minimum_age=18, maximum_age=100).strftime("%Y-%m-%d"),
        "password": hashed_password,
        "userType": role,
        "email": f"{username}@{email_domain}",
        "address": address
    }

    # Store login credentials for stress testing
    login_credentials.append({
        "username": username,
        "password": plaintext_password
    })

    users_data.append(user)

# Write both artefacts as indented UTF-8 JSON (non-ASCII characters kept as-is):
#   - the user records with hashed passwords (for the database)
#   - the plaintext login credentials (for testing)
output_users_file = "users_dataset_05.json"
output_credentials_file = "users_login_credentials_05.json"

for target_path, payload in (
    (output_users_file, users_data),
    (output_credentials_file, login_credentials),
):
    with open(target_path, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, indent=4, ensure_ascii=False)

# Report only after both files have been written successfully
print(f"Dataset saved to {output_users_file}!")
print(f"Login credentials saved to {output_credentials_file}!")


Dataset saved to users_dataset_05.json!
Login credentials saved to users_login_credentials_05.json!
