# Project Python

## Part 1 - Web Scraper

### Presented by: Ester Carmiel & Talya Cuperman

#### Importing Libraries and Defining Variables:

In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import time
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
import re
import json
import numpy as np

In [12]:
# Neighborhoods whose listings are kept; listings elsewhere are skipped.
TARGET_NEIGHBORHOODS = [
    "שפירא",
    "בצרון",
]

# Google API key sent with every route request -- enter your own key here.
API_KEY = ""

#### Collecting Apartment Links:

In [3]:
def fetch_all_apartment_links(base_url, timeout=30):
    """Collect apartment page links by walking the paginated listing.

    Starting at ``base_url``, each listing page is downloaded, the first link
    inside every ``<div class="card overflow-hidden">`` card is collected, and
    the "next page" pagination link is followed. The walk stops when there is
    no next link, when a page URL repeats, or when a page fails to load.

    Args:
        base_url: URL of the first listing page; relative links are resolved
            against it.
        timeout: Seconds to wait for each HTTP response. Without a timeout a
            single stalled connection would block the crawl indefinitely.

    Returns:
        List of absolute apartment URLs, in first-seen order and without
        duplicates. Links gathered before a failure are still returned.
    """
    next_page_selector = 'nav.card-pagination ul.pagination.justify-content-between.justify-content-sm-center.flex-wrap li.page-item.mx-1.nextPage a.page-link.text-nowrap.px-3.py-2.rounded-pill'

    apartment_links = []
    seen_links = set()  # O(1) duplicate check; the list keeps the order
    visited_pages = set()
    current_page_url = base_url

    while current_page_url and current_page_url not in visited_pages:
        print(f"Fetching links from: {current_page_url}")
        visited_pages.add(current_page_url)
        try:
            response = requests.get(current_page_url, timeout=timeout)
            response.raise_for_status()
            page_content = BeautifulSoup(response.content, 'html.parser')

            for card in page_content.find_all('div', class_='card overflow-hidden'):
                link_element = card.find('a')
                if not (link_element and link_element.get('href')):
                    continue
                full_link = urljoin(base_url, link_element['href'])
                # The same listing can show up on more than one page; keep it
                # once so it is not scraped (and stored) twice.
                if full_link not in seen_links:
                    seen_links.add(full_link)
                    apartment_links.append(full_link)

            next_page_element = page_content.select_one(next_page_selector)
            if next_page_element and 'href' in next_page_element.attrs:
                current_page_url = urljoin(base_url, next_page_element['href'])
            else:
                current_page_url = None

        except Exception as e:
            # Best-effort crawl: report the failure and keep what was
            # collected so far instead of aborting the whole run.
            print(f"Error fetching links from {current_page_url}: {e}")
            break

        # Short pause between listing pages.
        time.sleep(0.1)

    print(f"\nCollected {len(apartment_links)} apartment links.\n")
    return apartment_links


In [4]:
def extract_icon_feature(features_dict, feature_name):
    """Look up ``feature_name`` in ``features_dict``.

    Returns the value stored for the feature, or ``None`` when the feature is
    not present in the mapping.
    """
    feature_value = features_dict.get(feature_name)
    return feature_value


#### Calculating Distance from City Center:

In [5]:
def calculate_distance(origin, destination, timeout=30):
    """Return the driving distance in meters between two addresses.

    Posts a ``computeRoutes`` request to the Google Routes API, authenticated
    with the module-level ``API_KEY``, and reads ``distanceMeters`` from the
    first route in the response.

    Args:
        origin: Address text of the starting point.
        destination: Address text of the end point.
        timeout: Seconds to wait for the API response before giving up.

    Returns:
        The ``distanceMeters`` value of the first route, or ``None`` when the
        request fails, the API answers with a non-200 status, the body is not
        valid JSON, or the response contains no route.
    """
    directions_url = "https://routes.googleapis.com/directions/v2:computeRoutes"
    headers = {
        'Content-Type': 'application/json',
        'X-Goog-Api-Key': API_KEY,
        'X-Goog-FieldMask': 'routes.distanceMeters,routes.duration'
    }
    directions_body = {
        "origin": {"address": origin},
        "destination": {"address": destination},
        "travelMode": "DRIVE",
        "routingPreference": "TRAFFIC_AWARE"
    }

    try:
        directions_response = requests.post(
            directions_url,
            headers=headers,
            data=json.dumps(directions_body),
            timeout=timeout,
        )
    except requests.exceptions.RequestException:
        print("Network error during API request.")
        return None

    if directions_response.status_code != 200:
        print(f"API Error: {directions_response.status_code}, {directions_response.text}")
        return None

    try:
        directions_data = directions_response.json()
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass; catching the base
        # class also covers decoders that raise their own ValueError subclass.
        print("Invalid JSON response from API.")
        return None

    try:
        # A 200 response is not guaranteed to carry a route; without this
        # guard a missing/empty 'routes' entry would raise out of the function
        # instead of yielding None like every other failure.
        return directions_data['routes'][0]['distanceMeters']
    except (KeyError, IndexError, TypeError):
        print("No route found in API response.")
        return None

#### Extracting Data from a Single Apartment:

In [6]:
def _parse_floor_info(floor_info):
    """Split the listing's floor text into ``(floor, total_floors)``.

    Both values are ``int`` or ``None``. Text containing 'קרקע' (ground) is
    treated as floor 0.
    """
    if not floor_info:
        return None, None

    floor_digits = re.findall(r'\d+', floor_info)

    if 'קרקע' in floor_info:
        if floor_digits:
            # Convert to int so the column has one type across all branches.
            total_floors = int(floor_digits[-1])
        elif floor_info.count('קרקע') == 2:
            # "ground out of ground": a single-storey building.
            total_floors = 0
        else:
            total_floors = None
        return 0, total_floors

    if floor_digits:
        floor_number = int(floor_digits[0])
        # When only one number is given, it is used for the total as well.
        total_floors = int(floor_digits[1]) if len(floor_digits) >= 2 else floor_number
        return floor_number, total_floors

    return None, None


def process_single_apartment(apartment_url, timeout=30):
    """Scrape one apartment page into a flat dictionary of features.

    The page's details table is read into a label -> value mapping; listings
    whose neighborhood does not contain any of ``TARGET_NEIGHBORHOODS`` are
    skipped. The price, description, image count, floor data, feature icons
    and the driving distance from Dizengoff Square (via
    ``calculate_distance``) are added to the result.

    Args:
        apartment_url: URL of the apartment page.
        timeout: Seconds to wait for the page download.

    Returns:
        A dict with one entry per output column, or ``None`` when the page has
        no details table, the neighborhood is not a target one, or any error
        occurs while processing (the error is printed, not raised).
        ``distance_from_center`` is NaN when the distance lookup fails, and
        ``area`` is NaN when the page does not list a built area.
    """
    try:
        response = requests.get(apartment_url, timeout=timeout)
        response.raise_for_status()
        apartment_page = BeautifulSoup(response.content, 'html.parser')

        details_table = apartment_page.find('table', class_='table')
        if not details_table:
            return None

        # Two-cell rows are "label | value" pairs; other rows are ignored.
        apartment_details = {}
        for row in details_table.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) == 2:
                apartment_details[cells[0].text.strip()] = cells[1].text.strip()

        neighborhood = apartment_details.get('שכונה', '')
        if not any(target in neighborhood for target in TARGET_NEIGHBORHOODS):
            return None

        description_element = apartment_page.find('p', class_='text-word-break')
        description_text = description_element.get_text(strip=True) if description_element else ''

        # The second 'card-title' heading holds the price, e.g. "8,700 ₪".
        price_text = apartment_page.find_all('h2', class_='card-title')[1].text
        price_value = float(price_text.replace('₪', '').replace(',', '').strip())

        number_of_images = len(apartment_page.find_all('div', class_='justify-content-center px-1'))

        floor_number_value, total_floors_value = _parse_floor_info(apartment_details.get('קומה', ''))

        # Each icon maps to 1 when it carries a check mark, otherwise 0.
        features_dictionary = {}
        features_section = apartment_page.find('div', class_='card-icons')
        if features_section:
            for icon_div in features_section.find_all('div', class_='card-icon'):
                label_element = icon_div.find('span')
                if label_element is None:
                    # An unlabeled icon cannot be mapped to a feature; skip it
                    # rather than discarding the whole apartment.
                    continue
                feature_name = label_element.get_text(strip=True)
                features_dictionary[feature_name] = int(icon_div.find('i', class_='fa-check') is not None)

        address_of_apartment = f"תל אביב,{apartment_details.get('כתובת', '')}"
        distance_meters = calculate_distance('תל אביב, כיכר דיזינגוף', address_of_apartment)
        if distance_meters is None:
            # The lookup reports failure with None; keep the listing and mark
            # the distance as missing instead of failing on None / 1000.
            distance_from_dizengoff = np.nan
        else:
            distance_from_dizengoff = float(f"{distance_meters / 1000:.2f}")

        # int(np.nan) raises, so a missing built area is stored as NaN.
        built_area = apartment_details.get('שטח בנוי')
        area_value = int(built_area) if built_area is not None else np.nan

        if 'תאריך כניסה' not in apartment_details:
            days_to_enter = np.nan
        elif apartment_details['תאריך כניסה'] == 'מיידית':
            days_to_enter = 0  # "immediate" entry
        else:
            days_to_enter = apartment_details['תאריך כניסה']

        # NOTE: the misspelled keys 'has_stotsge' and 'has_balcon' are kept
        # as-is because they become the output column names.
        apartment_data = {
            'property_type': apartment_details.get('פרטי הנכס', ''),
            'neighborhood': neighborhood,
            'address': apartment_details.get('כתובת', ''),
            'room_num': float(apartment_details.get('חדרים', np.nan)),
            'floor': floor_number_value,
            'area': area_value,
            'garden_area': int(apartment_details.get('שטח גינה', '0')),
            'days_to_enter': days_to_enter,
            'num_of_payments': int(apartment_details.get('תשלומים בשנה', '0')),
            'monthly_arnona': apartment_details.get('ארנונה בחודש', np.nan),
            'building_tax': int(apartment_details.get('ועד בית בחודש', '0')),
            'total_floors': total_floors_value,
            'description': description_text,
            'has_parking': extract_icon_feature(features_dictionary, 'חניה'),
            'has_stotsge': extract_icon_feature(features_dictionary, 'מחסן'),
            'elevator': extract_icon_feature(features_dictionary, 'מעלית'),
            'ac': extract_icon_feature(features_dictionary, 'מזגן'),
            'handicap': extract_icon_feature(features_dictionary, 'נגישות'),
            'has_bars': extract_icon_feature(features_dictionary, 'סורגים'),
            'has_safe_room': extract_icon_feature(features_dictionary, 'ממ"ד'),
            'has_balcon': extract_icon_feature(features_dictionary, 'מרפסת'),
            'is_furnished': extract_icon_feature(features_dictionary, 'מרוהטת'),
            'is_renovated': extract_icon_feature(features_dictionary, 'משופצת'),
            'price': price_value,
            'num_of_images': number_of_images,
            'distance_from_center': distance_from_dizengoff,
        }
        print(f"Apartment added: {apartment_data['address']} in {apartment_data['neighborhood']}")
        return apartment_data

    except Exception as e:
        # Best-effort scraping: one malformed page must not stop the batch.
        print(f"Error processing apartment {apartment_url}: {e}")
        return None


#### Processing All Apartments and Creating DataFrame:

In [7]:
def fetch_and_process_apartments(base_url, output_path='apartments_data.csv', max_workers=20):
    """Scrape every listed apartment and save the matching ones to CSV.

    Apartment links are gathered with ``fetch_all_apartment_links`` and each
    page is processed by ``process_single_apartment`` on a thread pool.
    Non-empty results are assembled into a DataFrame, which is written to
    ``output_path``.

    Args:
        base_url: URL of the first listing page.
        output_path: Destination of the CSV file.
        max_workers: Number of threads used to process apartment pages.

    Returns:
        The resulting ``pandas.DataFrame``, or ``None`` when no apartment
        produced a result (in which case no file is written).
    """
    all_apartment_data = []
    apartment_urls = fetch_all_apartment_links(base_url)
    print("\nStarting apartment data processing.\n")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        apartment_futures = [executor.submit(process_single_apartment, url) for url in apartment_urls]
        for completed_future in as_completed(apartment_futures):
            apartment_result = completed_future.result()
            if apartment_result:
                all_apartment_data.append(apartment_result)

    if not all_apartment_data:
        print("\nNo matching apartments found.")
        return None

    apartment_dataframe = pd.DataFrame(all_apartment_data)

    # Non-numeric text is coerced to <NA>.
    # NOTE(review): 'days_to_enter' holds the raw entry-date text for listings
    # that are not immediate, so those rows end up as <NA> here -- confirm
    # that this is intended.
    for column in ('days_to_enter', 'monthly_arnona'):
        numeric_values = pd.to_numeric(apartment_dataframe[column], errors='coerce')
        # Round first: casting a fractional float to the nullable 'Int64'
        # dtype raises instead of truncating.
        apartment_dataframe[column] = numeric_values.round().astype('Int64')

    # 'utf-8-sig' writes a BOM at the start of the file.
    apartment_dataframe.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"\n{output_path} created successfully.")
    return apartment_dataframe
  

#### Running the Code and Creating the Data File:

In [8]:
# Crawl the rental listings starting from the first results page and build the
# DataFrame (the result is None when no apartment matched).
apartment_data_frame = fetch_and_process_apartments("https://www.ad.co.il/nadlanrent")

Fetching links from: https://www.ad.co.il/nadlanrent
Fetching links from: https://www.ad.co.il/nadlanrent?pageindex=2
Fetching links from: https://www.ad.co.il/nadlanrent?pageindex=3
Fetching links from: https://www.ad.co.il/nadlanrent?pageindex=4
Fetching links from: https://www.ad.co.il/nadlanrent?pageindex=5
Fetching links from: https://www.ad.co.il/nadlanrent?pageindex=6
Fetching links from: https://www.ad.co.il/nadlanrent?pageindex=7
Fetching links from: https://www.ad.co.il/nadlanrent?pageindex=8
Fetching links from: https://www.ad.co.il/nadlanrent?pageindex=9
Fetching links from: https://www.ad.co.il/nadlanrent?pageindex=10
Fetching links from: https://www.ad.co.il/nadlanrent?pageindex=11
Fetching links from: https://www.ad.co.il/nadlanrent?pageindex=12
Fetching links from: https://www.ad.co.il/nadlanrent?pageindex=13
Fetching links from: https://www.ad.co.il/nadlanrent?pageindex=14
Fetching links from: https://www.ad.co.il/nadlanrent?pageindex=15
Fetching links from: https://ww

#### Displaying the Data:

In [9]:
# Show the dtype of every column in the scraped DataFrame.
apartment_data_frame.dtypes

Unnamed: 0,property_type,neighborhood,address,room_num,floor,area,garden_area,days_to_enter,num_of_payments,monthly_arnona,...,ac,handicap,has_bars,has_safe_room,has_balcon,is_furnished,is_renovated,price,num_of_images,distance_from_center
0,דירה,בצרון,מיטב 5,3.0,4,93,0,,12,,...,1,1,0,1,1,0,0,8700.0,6,3.28
1,דירה,שפירא,ישראל מסלנט 45,3.5,3,90,0,,12,300.0,...,1,1,0,1,1,1,1,6500.0,10,4.1
2,החלפת דירות,שפירא,הסדנה 7,2.5,4,80,0,,12,230.0,...,1,1,0,1,1,0,1,6650.0,2,5.32
3,דירה,בצרון,מיטב 5,4.0,5,108,0,,12,,...,1,1,0,1,1,0,0,10200.0,5,3.28
4,דירה,בצרון,שדרות ההשכלה 17,4.0,0,103,0,,12,556.0,...,1,0,0,1,1,0,0,11500.0,9,2.89
5,דירה,שפירא,טורי זהב 23,4.0,1,100,0,,12,275.0,...,1,1,0,1,1,1,1,7000.0,10,3.96
6,גג/ פנטהאוז,שפירא,ישראל מסלנט 55,3.0,5,90,0,0.0,12,,...,1,0,0,1,1,0,1,6000.0,7,4.02
7,גג/ פנטהאוז,שפירא,ישראל מסלנט 20,4.5,5,151,33,,11,452.0,...,1,1,0,1,1,1,0,10000.0,5,6.0
8,כללי,שפירא,העמל 3,2.0,0,56,8,0.0,12,555.0,...,0,1,0,0,0,0,1,4500.0,4,5.2
9,סטודיו/לופט,שפירא,העליה 3,1.0,1,40,0,0.0,12,350.0,...,1,0,0,0,0,0,1,4300.0,5,3.98


In [None]:
# Display the scraped DataFrame.
apartment_data_frame