In [2]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, date
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import calendar

In [3]:
# Listing URL for Nissan cars on the ad.co.il website.
# NOTE(review): `sp261=13903` is presumably the site's manufacturer filter for
# Nissan -- confirm against the site. get_posts() rebuilds this same URL with an
# extra `pageindex` query parameter.
url = 'https://www.ad.co.il/car?sp261=13903'
# Echo the URL in the cell output.
print("Nissan's cars page url is:", url)

Nissan's cars page url is: https://www.ad.co.il/car?sp261=13903


In [4]:
# Fetch the HTML content of the main Nissan page.
# The timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() fails right here with a clear HTTP error instead of later,
# when amount_of_pages() would look for an element inside an error page.
html = requests.get(url, timeout=30)
html.raise_for_status()
# Parsed once at module level; amount_of_pages() reads this soup.
nissan_page = BeautifulSoup(html.content, 'html.parser')

In [5]:
# Function to get posts from a specific page
def get_posts(page):
    """Fetch one results page and return its post links and image counts.

    Args:
        page: Value placed in the ``pageindex`` query parameter of the
            listing URL (the caller in this file starts at 1).

    Returns:
        A ``(links, image_counts)`` tuple of two equal-length lists: the
        absolute URL of each post card and the integer value of the card's
        ``data-images`` attribute (0 when the attribute is absent).
        ``(None, None)`` is returned, after printing a message, when the
        request fails, the status code is not 200, or the results container
        is missing from the page.
    """
    url = f"https://www.ad.co.il/car?sp261=13903&pageindex={page}"
    try:
        # Without a timeout a single stalled connection blocks its worker forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print("Error: request failed for page", page, "-", exc)
        return None, None
    if response.status_code != 200:
        print("Error: status code", response.status_code)
        return None, None

    page_soup = BeautifulSoup(response.content, 'html.parser')
    container = page_soup.find('div', {'class': "content mt-3"})
    if container is None:
        # find() returns None when the element is absent; report it the same
        # way as the other failures instead of raising AttributeError.
        print("Error: no results container on page", page)
        return None, None

    posts_links_list = []
    posts_image_num_list = []
    for post in container.find_all('div', {'class': 'card-body p-md-3'}):
        anchor = post.find('a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            # A card without a link cannot be scraped; skipping it here keeps
            # the two lists aligned index-by-index.
            continue
        posts_links_list.append("https://www.ad.co.il" + href)

        data_images = post.get("data-images")
        # Default value of 0 if no images are present.
        posts_image_num_list.append(int(data_images) if data_images is not None else 0)

    return posts_links_list, posts_image_num_list

In [6]:
# Function to get the total number of pages
def amount_of_pages():
    """Return the page count parsed from the module-level ``nissan_page`` soup.

    The text of the ``h6`` element with class ``ms-2 d-none d-sm-flex fs--1``
    is read, the part after its last space is taken, everything from the
    first ``)`` onward is dropped, and the remainder is converted to ``int``.
    NOTE(review): presumably the heading ends with something like ``... N)``
    -- confirm against the live page.

    Raises:
        AttributeError: if the element (or its ``.string``) is missing.
        ValueError: if the extracted token is not an integer.
    """
    heading = nissan_page.find('h6', {'class': "ms-2 d-none d-sm-flex fs--1"})
    # Text after the last space (the whole text when there is no space).
    trailing_token = heading.string.rpartition(' ')[2]
    # Keep only what precedes the first closing parenthesis.
    count_text = trailing_token.partition(')')[0]
    return int(count_text)

In [7]:
# Function to fetch all posts across all pages
def fetch_all_pages():
    """Collect post links and image counts from every results page.

    One ``get_posts`` task per page (1 through ``amount_of_pages()``) is
    submitted to a thread pool, and the results are consumed in submission
    order. A page is skipped when either of its two returned values is falsy
    (``None`` or an empty list).

    Returns:
        A ``(links, image_counts)`` tuple of lists accumulated over all pages.
    """
    collected_links, collected_image_counts = [], []
    with ThreadPoolExecutor() as pool:
        page_numbers = range(1, amount_of_pages() + 1)
        pending = [pool.submit(get_posts, number) for number in page_numbers]
        for task in pending:
            page_links, page_image_counts = task.result()
            if not (page_links and page_image_counts):
                continue
            collected_links += page_links
            collected_image_counts += page_image_counts
    return collected_links, collected_image_counts

In [8]:
# Function to fetch detailed information for a specific post
def fetch_post_details(link):
    """Download one post page and return its fields as a flat dictionary.

    Args:
        link: Absolute URL of the post page.

    Returns:
        A dict with the keys ``model``, ``Price``, ``Description``,
        ``Cre_date`` and ``Repub_date``, merged with the label/value pairs of
        the page's info table after ``update_or_add_keys`` has renamed them.

    Raises:
        AttributeError: if the info table or the header div is missing.
        IndexError: if the header div holds fewer than two ``<h2>`` elements.
        ValueError: if the price token is not an integer after comma removal.
        requests.RequestException: if the request fails or times out.
    """
    # Without a timeout a single stalled connection blocks its worker forever.
    post_page_response = requests.get(link, timeout=30)
    post_page = BeautifulSoup(post_page_response.content, 'html.parser')
    main_info = list(post_page.find('table', {'class': "table table-sm mb-4"}).find_all("td"))
    sub_info = post_page.find('div', {'class': "order-first order-sm-0"})

    # Look the header up once: its first <h2> is read as the model and its
    # second as the price.
    headings = post_page.find('div', {'class': "d-flex justify-content-between"}).find_all('h2')
    # Parse the dates once rather than once per dictionary key.
    cre_date, repub_date = cre_repub_dates(sub_info)

    details = {
        'model': str(headings[0].string),
        # Only the text before the first space of the second heading is parsed.
        'Price': replace_comma(str(headings[1].string).split(' ')[0]),
        'Description': desc(sub_info),
        'Cre_date': cre_date,
        'Repub_date': repub_date,
    }
    details.update(update_or_add_keys(info_to_dict(main_info)))
    return details

In [9]:
# Helper function to strip commas from a numeric string and parse it (used for prices)
def replace_comma(string):
    """Remove every comma from ``string`` and return the result as an ``int``.

    Raises:
        ValueError: if the text is not a valid integer once commas are removed.
    """
    without_commas = "".join(string.split(","))
    return int(without_commas)

In [10]:
# Function to convert list of info from the post to a dictionary
def info_to_dict(info):
    """Pair up alternating label/value cells into a dictionary.

    Args:
        info: Iterable of cell objects ordered label, value, label, value, ...
            (``fetch_post_details`` passes the ``<td>`` tags of the info table).

    Returns:
        A dict mapping each stripped label text to its stripped value text.
        An empty cell contributes an empty string. A trailing label without
        a value is dropped, and a repeated label keeps its last value.
    """
    texts = [_cell_text(cell) for cell in info]
    # Even positions are labels, odd positions are their values.
    return dict(zip(texts[0::2], texts[1::2]))


def _cell_text(cell):
    """Return the stripped text of one cell, or '' when the cell has no text."""
    text = cell.string
    if text is None:
        # BeautifulSoup's ``.string`` is None for an empty tag or a tag with
        # several children; get_text() still returns the combined text.
        text = cell.get_text()
    return str(text).strip()

In [11]:
# Function to update keys in the dictionary from Hebrew to English
def update_or_add_keys(dictionary):
    """Rename the known Hebrew keys of ``dictionary`` to English, in place.

    For each Hebrew/English pair, the English key receives the value stored
    under the Hebrew key (which is removed), or ``None`` when the Hebrew key
    is absent. Keys outside the mapping are left untouched.

    Returns:
        The same ``dictionary`` object that was passed in.
    """
    translations = {
        'שנה': 'Year',
        'יד': 'Hand',
        'ת. הילוכים': 'Gear',
        'נפח': 'capacity_Engine',
        'סוג מנוע': 'Engine_type',
        'ק"מ': 'Km',
        'טסט עד': 'Test',
        'צבע': 'Color',
        'בעלות קודמת': 'Prev_ownership',
        'בעלות נוכחית': 'Curr_ownership',
        'אזור': 'Area',
        'עיר': 'City',
    }
    for hebrew_label, english_label in translations.items():
        # pop() with a default covers both the rename and the "missing" case.
        dictionary[english_label] = dictionary.pop(hebrew_label, None)
    return dictionary

In [12]:
# Function to fetch sub-info details of a post
def link_to_info_sub(link):
    """Download the page at ``link`` and return its sub-info container.

    Returns:
        The first ``div`` with class ``order-first order-sm-0``, or ``None``
        when the page has no such element.
    """
    soup = BeautifulSoup(requests.get(link).content, 'html.parser')
    sub_info_container = soup.find('div', {'class': "order-first order-sm-0"})
    return sub_info_container

In [13]:
# Function to extract and clean the description from sub-info
def desc(sub_info):
    """Return the post description as a single line of text, or ``None``.

    Args:
        sub_info: The sub-info container of a post page, or ``None``.

    Returns:
        The text of the ``p.text-word-break`` element inside the ``div.p-3``
        wrapper, stripped and with newlines removed. ``None`` is returned
        when the container, the wrapper or the paragraph is missing.
    """
    if not sub_info:
        return None
    post_description = sub_info.find('div', {'class': 'p-3'})
    if not post_description:
        return None
    paragraph = post_description.find('p', {'class': 'text-word-break'})
    if paragraph is None:
        # The wrapper can exist without a description paragraph; treat that
        # like the other "no description" cases instead of raising.
        return None
    return paragraph.get_text().strip().replace('\n', '')

In [14]:
# Function to extract creation and re-publication dates
def cre_repub_dates(sub_info):
    """Return ``[creation_date, republication_date]`` for a post.

    Args:
        sub_info: The sub-info container of a post page, or ``None``.

    Returns:
        A two-element list of ``datetime`` objects parsed from the last
        space-separated token (``dd/mm/YYYY``) of the first and second
        ``div.px-3`` entries. An element is ``None`` when its entry is
        missing, has no text, or ends with an empty token.
        ``[None, None]`` is returned when ``sub_info`` or the dates container
        is missing.

    Raises:
        ValueError: if a non-empty token is not a ``dd/mm/YYYY`` date.
    """
    if not sub_info:
        return [None, None]
    dates_box = sub_info.find('div', {'class': 'd-flex flex-row align-items-center justify-content-center flex-wrap'})
    if dates_box is None:
        return [None, None]
    post_dates = dates_box.find_all('div', {'class': 'px-3'})
    return [_trailing_date(post_dates, 0), _trailing_date(post_dates, 1)]


def _trailing_date(entries, position):
    """Parse the dd/mm/YYYY token that ends entry ``position``; None when absent."""
    if position >= len(entries):
        # e.g. only one date entry is present on the page.
        return None
    text = entries[position].string
    if text is None:
        # str(None) would become the literal "None" and break strptime.
        return None
    token = str(text).split(" ")[-1]
    if not token:
        return None
    return datetime.strptime(token, "%d/%m/%Y")

In [15]:
# Function to calculate days until a given date (for the "Test" field)
def days_until(date_str):
    """Return the number of days from today to the end of a ``mm/YYYY`` month.

    Args:
        date_str: Month and year as ``"%m/%Y"`` text; a falsy value
            (``None`` or ``""``) yields ``None``.

    Returns:
        The day difference between the last calendar day of that month and
        ``date.today()``; negative when the month has already ended.

    Raises:
        ValueError: if ``date_str`` is non-empty but not in ``%m/%Y`` form.
    """
    if not date_str:
        return None
    parsed = datetime.strptime(date_str, "%m/%Y")
    # monthrange() returns (weekday of the 1st, number of days in the month).
    month_length = calendar.monthrange(parsed.year, parsed.month)[1]
    month_end = date(parsed.year, parsed.month, month_length)
    return (month_end - date.today()).days

In [16]:
# Function to correct data types in the dataframe
def type_correction(df):
    """Cast the scraped columns of ``df`` to their intended dtypes, in place.

    ``Year`` and ``Hand`` become ``int``, ``Price`` becomes ``float``, the
    descriptive columns become ``str`` and ``Test`` becomes nullable
    ``Int64``. ``capacity_Engine`` and ``Km`` have commas removed and are
    cast to ``int``. These two conversions are best-effort and independent:
    if one fails, that column is left as it was, a message is printed, and
    the other column is still converted.

    Args:
        df: DataFrame holding all of the columns named above.

    Returns:
        The same ``df`` object, modified in place.
    """
    df['Year'] = df['Year'].astype(int)
    df['Price'] = df['Price'].astype(float)
    df['Hand'] = df['Hand'].astype(int)
    for text_column in ('Gear', 'Color', 'Prev_ownership', 'Curr_ownership', 'Area', 'City'):
        df[text_column] = df[text_column].astype(str)
    df['Test'] = df['Test'].astype('Int64')

    # Each column gets its own try block so that a failure on one (for
    # example a missing value) does not silently skip the other.
    for numeric_column in ('capacity_Engine', 'Km'):
        try:
            converted = df[numeric_column].str.replace(',', '').astype(int)
        except Exception as exc:
            # Still best-effort, but say why the column kept its raw values.
            print(f"type_correction: leaving '{numeric_column}' unconverted ({exc})")
        else:
            df[numeric_column] = converted
    return df

In [17]:
# Main code
# Step 1: collect every post link and its image count from the listing pages.
all_links, all_images = fetch_all_pages()

# Step 2: fetch the post pages concurrently. map() yields results in the order
# of all_links, so row i of `data` lines up with all_images[i].
with ThreadPoolExecutor() as executor:
    data = list(executor.map(fetch_post_details, all_links))

# Step 3: assemble the table and add the per-post image count and manufacturer.
df = pd.DataFrame(data)
df['Pic_num'] = all_images
df.insert(0, 'manufactor', ['ניסאן'] * len(df))

# Step 4: replace the raw "Test" text with the days remaining, then fix dtypes.
df['Test'] = df['Test'].apply(days_until)
df = type_correction(df)
# errors='ignore': this column only exists when at least one scraped post had
# that table label, so dropping it must not fail when it is absent.
df = df.drop(columns=['תת דגם'], errors='ignore')
# utf-8-sig writes a byte-order mark at the start of the file.
df.to_csv('adNissanData.csv', index=False, encoding='utf-8-sig')