In [2]:
from google.colab import drive
drive.mount('/drive')
!pip install faker
!pip install bcrypt
!pip install bson

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).
Collecting faker
  Downloading Faker-36.1.1-py3-none-any.whl.metadata (15 kB)
Downloading Faker-36.1.1-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-36.1.1
Collecting bcrypt
  Downloading bcrypt-4.2.1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (9.8 kB)
Downloading bcrypt-4.2.1-cp39-abi3-manylinux_2_28_x86_64.whl (278 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.6/278.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bcrypt
Successfully installed bcrypt-4.2.1
Collecting bson
  Downloading bson-0.5.10.tar.gz (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: bson
  Building wheel for bson (setup.py) .

In [3]:
import os
import pandas as pd
import random
import json
from faker import Faker
from bcrypt import hashpw, gensalt
from bson import ObjectId

# Shared Faker instance. generate_buyers_from_properties draws first names,
# last names, passwords and phone numbers from it (email addresses are built
# from the generated names, not by Faker).
# It is not seeded here, so the generated values differ from run to run.
fake = Faker()

def hash_password(password: str, rounds: int = 4) -> str:
    """
    Hash a plain-text password with bcrypt and return the hash as text.

    Args:
        password (str): Plain-text password; it is UTF-8 encoded before hashing.
        rounds (int): bcrypt work factor forwarded to ``gensalt``. Defaults to 4,
            a deliberately low value that keeps bulk generation of fake
            accounts fast; pass a higher value for anything resembling real
            credentials.

    Returns:
        str: The bcrypt hash decoded from bytes to a UTF-8 string.
    """
    # A fresh random salt is generated on every call, so hashing the same
    # password twice yields different strings.
    salt = gensalt(rounds=rounds)
    return hashpw(password.encode('utf-8'), salt).decode('utf-8')

def _json_safe(value):
    """Return one CSV cell in a form that ``json.dumps`` emits as strict JSON."""
    # Depending on the pandas version a cell may still be a numpy scalar;
    # unwrap it so json.dumps does not raise TypeError.
    if hasattr(value, 'item'):
        value = value.item()
    # Empty CSV cells are read as NaN. json.dumps would write the bare token
    # NaN, which strict JSON parsers reject, so store None (-> null) instead.
    if pd.isna(value):
        return None
    return value


def _group_property_summaries_by_city(properties_df):
    """Map every city to the list of summary dicts of the properties in it."""
    summary_fields = ['_id', 'thumbnail', 'address', 'price', 'area']
    # One vectorised conversion instead of a per-row iterrows() loop.
    records = properties_df[summary_fields].to_dict('records')

    city_properties = {}
    for city, record in zip(properties_df['city'], records):
        summary = {field: _json_safe(cell) for field, cell in record.items()}
        city_properties.setdefault(city, []).append(summary)
    return city_properties


def generate_buyers_from_properties(properties_file: str, output_file: str, total_buyers: int = 10000, seed: 'int | None' = None):
    """
    Generates fake buyers whose favourite properties come from the given CSV file.

    Each buyer gets between 1 and 3 favourite properties (fewer if the city has
    fewer listings), all taken from one randomly chosen city. The result is
    written to ``output_file`` as CSV, with the ``favourites`` column holding a
    JSON array of property summaries (``_id``, ``thumbnail``, ``address``,
    ``price``, ``area``). Missing summary values are written as JSON ``null``.

    Args:
        properties_file (str): Path to the properties_on_sale CSV file. It must
            contain the columns ``city``, ``_id``, ``thumbnail``, ``address``,
            ``price`` and ``area``.
        output_file (str): Path to save the generated buyers CSV file.
        total_buyers (int): Total number of buyers to generate (default is 10,000).
        seed (int | None): When given, city/favourite selection uses a private
            ``random.Random(seed)`` and the shared ``fake`` instance is
            re-seeded, so names, phone numbers and favourites repeat between
            runs. ``_id`` values and password hashes are not controlled by the
            seed. When omitted, the global ``random`` module is used as before.

    Raises:
        KeyError: If a required column is missing from the properties file.
        ValueError: If buyers are requested but the properties file has no rows.
    """
    # Load the properties CSV and group the property summaries by city
    properties_df = pd.read_csv(properties_file)
    city_properties = _group_property_summaries_by_city(properties_df)
    available_cities = list(city_properties)

    # Fail early with a clear message instead of an IndexError from choice()
    if total_buyers > 0 and not available_cities:
        raise ValueError(
            f"No properties found in {properties_file}; cannot pick favourites for buyers"
        )

    # Pick the source of randomness: isolated and repeatable when seeded,
    # otherwise the module-level generator (unchanged default behaviour).
    if seed is not None:
        rng = random.Random(seed)
        fake.seed_instance(seed)
    else:
        rng = random

    buyers = []
    for _ in range(total_buyers):
        # All favourites of one buyer come from a single randomly chosen city
        properties_in_city = city_properties[rng.choice(available_cities)]

        fake_first_name = fake.first_name()
        fake_last_name = fake.last_name()

        # NOTE(review): the email is derived only from the two names, so
        # buyers that happen to share a name also share an address; confirm
        # whether uniqueness is required downstream.
        buyer = {
            '_id': str(ObjectId()),
            'name': fake_first_name,
            'surname': fake_last_name,
            'email': fake_first_name.lower() + fake_last_name.lower() + '@buyer.com',
            'password': hash_password(fake.password()),
            'phone_number': fake.phone_number(),
            'favourites': []
        }

        # Every city list holds at least one property, so 1..3 is always valid
        num_favs = rng.randint(1, min(3, len(properties_in_city)))
        buyer['favourites'] = rng.sample(properties_in_city, num_favs)

        buyers.append(buyer)

    # Explicit columns keep the frame well-formed even when no buyers were
    # generated (otherwise the 'favourites' lookup below raises KeyError).
    buyer_columns = ['_id', 'name', 'surname', 'email', 'password', 'phone_number', 'favourites']
    buyers_df = pd.DataFrame(buyers, columns=buyer_columns)
    # Serialise each favourites list to a JSON string for CSV export
    buyers_df['favourites'] = buyers_df['favourites'].apply(json.dumps)
    buyers_df.to_csv(output_file, index=False)
    print(f"Generated {total_buyers} buyers and saved to {output_file}")

if __name__ == "__main__":
    # Input: the final CSV of properties on sale (read from the mounted Drive)
    properties_file_path = '/drive/MyDrive/Intermediate Corporation/Dati/Zillow/Formatted/properties_on_sale.csv'
    # Output: where the generated buyers CSV is written
    buyers_output_file_path = '/drive/MyDrive/Intermediate Corporation/Dati/Zillow/Formatted/buyers_2.csv'

    # Build 10,000 fake buyers from the listed properties and save them
    generate_buyers_from_properties(
        properties_file=properties_file_path,
        output_file=buyers_output_file_path,
        total_buyers=10000,
    )


Generated 10000 buyers and saved to /drive/MyDrive/Intermediate Corporation/Dati/Zillow/Formatted/buyers_2.csv
