In [1]:
# Colab-only setup: mount the user's Google Drive at /drive so that the
# '/drive/MyDrive/...' CSV paths used by main() resolve.
from google.colab import drive
drive.mount('/drive')

Mounted at /drive


In [2]:
import csv
import json
import sys
from datetime import datetime, timedelta
from typing import Optional, Tuple

# Raise the csv module's per-field size limit to sys.maxsize so that very
# large cells do not make the reader fail with a field-size error
# (presumably needed for the JSON blobs stored in some columns).
# NOTE(review): sys.maxsize can overflow the underlying C long on Windows;
# confirm before running this outside a Linux/Colab runtime.
csv.field_size_limit(sys.maxsize)

def next_weekday(target_day: str) -> Optional[str]:
    """
    Return the next date (formatted as YYYY-MM-DD) on which the target day occurs.

    The weekday name is matched case-insensitively and surrounding whitespace is
    ignored. The result is always strictly in the future: if today already is the
    target weekday, the date one week from today is returned. For example, if
    today is Thursday and target_day is 'saturday', the date of the coming
    Saturday is returned.

    Args:
        target_day: English weekday name ('monday' ... 'sunday').

    Returns:
        The date as a 'YYYY-MM-DD' string, or None when target_day is not a
        string or is not a recognised weekday name.
    """
    # Callers feed values decoded from JSON, which may be numbers, lists or
    # null; report "no weekday" instead of failing on .lower().
    if not isinstance(target_day, str):
        return None

    days = {
        "monday": 0,
        "tuesday": 1,
        "wednesday": 2,
        "thursday": 3,
        "friday": 4,
        "saturday": 5,
        "sunday": 6,
    }
    target_weekday = days.get(target_day.strip().lower())
    if target_weekday is None:
        return None

    today = datetime.now()
    # Offset in the range 1..7: a zero offset (today is the target weekday)
    # is pushed to next week so the result is never today.
    days_ahead = (target_weekday - today.weekday()) % 7 or 7
    next_day = today + timedelta(days=days_ahead)
    return next_day.strftime("%Y-%m-%d")

def parse_disponibility(disponibility_str: str, fallback_date: str = '') -> Tuple[str, str, Optional[int]]:
    """
    Parse the JSON string stored in a 'disponibility' field.

    The JSON is expected to be an object with optional keys 'day', 'time' and
    'max_attendees'.

    Date resolution:
      - if 'day' is already a YYYY-MM-DD date it is returned unchanged;
      - otherwise 'day' is treated as a weekday name and converted with
        next_weekday();
      - if neither yields a date (missing, unrecognised or non-string 'day'),
        fallback_date is used when it is non-empty, else next_weekday("saturday").

    The same fallback date, an empty time and no attendee limit are returned
    when the input is empty, is not valid JSON, or does not decode to an object.

    Args:
        disponibility_str: Raw JSON text of the field (may be empty).
        fallback_date: Date to use when no usable day is found.

    Returns:
        A tuple (date, time, max_attendees) where time is the raw 'time' value
        ('' when absent) and max_attendees is an int, or None when absent or
        not convertible to an integer.
    """
    def default_date():
        # Evaluated lazily so next_weekday() is only called when really needed.
        return fallback_date if fallback_date else next_weekday("saturday")

    if not disponibility_str:
        return (default_date(), '', None)

    try:
        dispo = json.loads(disponibility_str)
    except (json.JSONDecodeError, TypeError):
        return (default_date(), '', None)

    # Valid JSON is not necessarily an object (e.g. '[]', '3', 'null');
    # anything else has no 'day'/'time' keys to read.
    if not isinstance(dispo, dict):
        return (default_date(), '', None)

    raw_day = dispo.get('day', '')
    time_val = dispo.get('time', '')
    max_attendees = dispo.get('max_attendees', None)

    calculated_date = None
    if isinstance(raw_day, str):
        try:
            # Validation only: a well-formed date is passed through verbatim.
            datetime.strptime(raw_day, '%Y-%m-%d')
            calculated_date = raw_day
        except ValueError:
            # Not a date, so assume a weekday name (None if unrecognised).
            calculated_date = next_weekday(raw_day)
    if not calculated_date:
        calculated_date = default_date()

    if max_attendees is not None:
        try:
            max_attendees = int(max_attendees)
        except (ValueError, TypeError, OverflowError):
            # Non-numeric text, containers, or infinities: treat as "no limit".
            max_attendees = None

    return calculated_date, time_val, max_attendees

def read_csv(filename: str):
    """
    Read a CSV file and return its rows as a list of dictionaries.

    The first row is used as the header; each following row becomes a dict
    keyed by those column names (csv.DictReader semantics).

    Args:
        filename: Path of the UTF-8 encoded CSV file to read.

    Returns:
        A list with one dict per data row, in file order.
    """
    # newline='' hands line-ending handling to the csv module, so line breaks
    # embedded in quoted fields are preserved instead of being rewritten by
    # universal-newline translation.
    with open(filename, mode='r', newline='', encoding='utf-8') as f:
        return list(csv.DictReader(f))

# Folder holding the input CSVs; the reservation CSVs are written next to them.
_DEFAULT_DATA_DIR = '/drive/MyDrive/Intermediate Corporation/Dati/Zillow/Formatted'


def _parse_id_objects(field):
    """
    Turn a CSV cell that references properties into a list of dicts.

    The cell is expected to hold a JSON array of objects (each with an '_id'),
    or a single JSON object. If it is not valid JSON it is treated as a
    ';'-separated list of ids, each wrapped as {'_id': <id>} so callers can
    read every entry the same way. Entries that are not JSON objects are
    dropped.
    """
    if not field:
        return []
    try:
        parsed = json.loads(field)
    except json.JSONDecodeError:
        return [{'_id': part.strip()} for part in field.split(';') if part.strip()]
    if not isinstance(parsed, list):
        parsed = [parsed]
    return [item for item in parsed if isinstance(item, dict)]


def _write_reservations(path, id_field, key_suffix, reservations):
    """
    Write one CSV row per owner id with columns redis_key, <id_field>, reservations.

    'reservations' holds the owner's reservation list serialised as JSON and
    'redis_key' is built as '<id_field>:<owner id>:<key_suffix>'.
    """
    with open(path, mode='w', newline='', encoding='utf-8') as out:
        writer = csv.DictWriter(out, fieldnames=['redis_key', id_field, 'reservations'])
        writer.writeheader()
        for owner_id, res_list in reservations.items():
            writer.writerow({
                'redis_key': f"{id_field}:{owner_id}:{key_suffix}",
                id_field: owner_id,
                'reservations': json.dumps(res_list),
            })


def main(data_dir: str = _DEFAULT_DATA_DIR):
    """
    Build buyer-side and seller-side reservation CSVs from the formatted data.

    Reads buyers.csv, properties_on_sale.csv and sellers.csv from data_dir.
    For every property a buyer lists in 'favourites' that exists in the
    properties CSV, a buyer reservation is created using the date/time parsed
    from the property's 'disponibility' (falling back to its
    'registration_date'). If the property is also referenced by a seller's
    'property_on_sale' field, a seller reservation with the buyer's contact
    details is created too. A property stops accepting reservations once its
    'max_attendees' (when present) is reached.

    Writes reservations_buyer.csv and reservations_seller.csv into data_dir and
    prints the number of properties and of seller-referenced property ids.

    Args:
        data_dir: Directory containing the input CSVs and receiving the output.
    """
    import os  # local import: the file's import block does not provide os

    buyers = read_csv(os.path.join(data_dir, 'buyers.csv'))
    properties = read_csv(os.path.join(data_dir, 'properties_on_sale.csv'))
    sellers = read_csv(os.path.join(data_dir, 'sellers.csv'))

    # Index properties by _id for O(1) lookup.
    properties_dict = {prop['_id']: prop for prop in properties}
    print(len(properties))

    # Ids of every property referenced by some seller.
    seller_property_ids = set()
    for seller in sellers:
        for entry in _parse_id_objects(seller.get('property_on_sale', '')):
            prop_id = entry.get('_id')
            if prop_id and isinstance(prop_id, str):
                seller_property_ids.add(prop_id.strip())
    print(len(seller_property_ids))

    reservations_buyer = {}       # buyer_id -> list of buyer-side reservations
    reservations_seller = {}      # property id -> list of seller-side reservations
    property_attendee_count = {}  # property id -> reservations accepted so far
    slot_cache = {}               # property id -> (date, time, max_attendees)

    for buyer in buyers:
        buyer_id = buyer['_id']
        for fav in _parse_id_objects(buyer.get('favourites', '')):
            prop_id = fav.get('_id')
            if not prop_id or not isinstance(prop_id, str):
                continue
            if prop_id not in properties_dict:
                continue
            prop = properties_dict[prop_id]

            # A property's slot is the same for every buyer: parse it once.
            if prop_id not in slot_cache:
                slot_cache[prop_id] = parse_disponibility(
                    prop.get('disponibility', ''),
                    fallback_date=prop.get('registration_date', ''),
                )
            date, time_val, max_attendees = slot_cache[prop_id]

            current_count = property_attendee_count.get(prop_id, 0)
            if max_attendees is not None and current_count >= max_attendees:
                continue  # property is fully booked

            # Prefer thumbnail/address from the favourite entry; otherwise use
            # the values stored on the property itself.
            reservations_buyer.setdefault(buyer_id, []).append({
                'property_on_sale_id': prop_id,
                'date': date,
                'time': time_val,
                'thumbnail': fav.get('thumbnail', prop.get('thumbnail', '')),
                'address': fav.get('address', prop.get('address', '')),
            })

            if prop_id in seller_property_ids:
                first_name = (buyer.get('name') or '').strip()
                last_name = (buyer.get('surname') or '').strip()
                reservations_seller.setdefault(prop_id, []).append({
                    'buyer_id': buyer_id,
                    'full_name': f"{first_name} {last_name}".strip(),
                    'email': buyer.get('email', ''),
                    'phone': buyer.get('phone_number', ''),
                })

            property_attendee_count[prop_id] = current_count + 1

    _write_reservations(
        os.path.join(data_dir, 'reservations_buyer.csv'),
        'buyer_id', 'reservations_buyer', reservations_buyer,
    )
    _write_reservations(
        os.path.join(data_dir, 'reservations_seller.csv'),
        'property_on_sale_id', 'reservations_seller', reservations_seller,
    )

# Run the reservation export when this file/cell is executed directly.
if __name__ == '__main__':
    main()


13151
13151
