In [23]:
# Name (שם מלא): Gal Davidi (גל דוידי)
# ID (תעודת זהות): 206555112

# Github link:
# https://github.com/gdavidi/Projects/tree/main/Yad2%20cars

In [24]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, date
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import calendar

In [25]:
# URL for Nissan's car listings on the ad.co.il website
url = 'https://www.ad.co.il/car?sp261=13903'
print("Nissan's cars page url is:", url)

Nissan's cars page url is: https://www.ad.co.il/car?sp261=13903


In [26]:
# Fetch the HTML content of the main Nissan page
html = requests.get(url)
nissan_page = BeautifulSoup(html.content, 'html.parser')

In [27]:
# Function to get posts from a specific page
def get_posts(page):
    url = f"https://www.ad.co.il/car?sp261=13903&pageindex={page}"
    response = requests.get(url)
    if response.status_code != 200:
        print("Error: status code", response.status_code)
        return None, None
    
    page_soup = BeautifulSoup(response.content, 'html.parser')
    posts = page_soup.find('div', {'class': "content mt-3"})
    posts_link = posts.find_all('div', {'class': 'card-body p-md-3'})
    posts_pic = posts.find_all('div', {"data-images": True})
    posts_links_list = ["https://www.ad.co.il" + post.find('a').get('href') for post in posts_link]
    
    posts_image_num_list = []
    for post in posts_pic:
        data_images = post.get("data-images")
        if data_images is not None:
            posts_image_num_list.append(int(data_images))
        else:
            posts_image_num_list.append(0)  # Default value if no images are present

    return posts_links_list, posts_image_num_list

In [29]:
# Function to get the total number of pages
def amount_of_pages():
    pages_n = nissan_page.find('h6', {'class': "ms-2 d-none d-sm-flex fs--1"})
    return int(pages_n.string.split(' ')[-1].split(')')[0])

In [30]:
# Function to fetch all posts across all pages
def fetch_all_pages():
    all_links = []
    all_images = []
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(get_posts, page) for page in range(1, amount_of_pages() + 1)]
        for future in futures:
            links, images = future.result()
            if links and images:
                all_links.extend(links)
                all_images.extend(images)
    return all_links, all_images

In [31]:
# Helper function to replace commas in strings (used for prices)
def replace_comma(string):
    return int(string.replace(",", ""))

In [32]:
# Function to convert list of info from the post to a dictionary
def info_to_dict(info):
    keys_list = []
    values_list = []
    for index, value in enumerate(info):
        if index % 2 == 0:
            keys_list.append(str(value.string.strip()))
        else:
            values_list.append(str(value.string.strip()))
    return dict(zip(keys_list, values_list))

In [33]:
# Function to update keys in the dictionary from Hebrew to English
def update_or_add_keys(dictionary):
    heb_keys = ['שנה', 'יד', 'ת. הילוכים', 'נפח', 'סוג מנוע', 'ק"מ', 'טסט עד', 'צבע', 'בעלות קודמת', 'בעלות נוכחית', 'אזור', 'עיר']
    en_keys = ['Year', 'Hand', 'Gear', 'capacity_Engine', 'Engine_type', 'Km', 'Test', 'Color', 'Prev_ownership', 'Curr_ownership', 'Area', 'City']
    for heb_key, en_key in zip(heb_keys, en_keys):
        if heb_key in dictionary:
            dictionary[en_key] = dictionary.pop(heb_key)
        else:
            dictionary[en_key] = None
    return dictionary

In [34]:
# Function to extract and clean the description from sub-info
def desc(sub_info):
    if sub_info:
        post_description = sub_info.find('div', {'class': 'p-3'})
        if post_description:
            return post_description.find('p', {'class': 'text-word-break'}).get_text().strip().replace('\n', '')
    return None

In [35]:
# Function to extract creation and re-publication dates
def cre_repub_dates(sub_info):
    if sub_info:
        post_dates = sub_info.find('div', {'class': 'd-flex flex-row align-items-center justify-content-center flex-wrap'}).find_all('div', {'class': 'px-3'})
        cre_date_string = str(post_dates[0].string).split(" ")[-1]
        repub_date_string = str(post_dates[1].string).split(" ")[-1]
        dates_list = [datetime.strptime(cre_date_string, "%d/%m/%Y")]
        dates_list.append(datetime.strptime(repub_date_string, "%d/%m/%Y") if repub_date_string else None)
        return dates_list
    return [None, None]

In [36]:
# Function to calculate days until a given date (for the "Test" field)
def days_until(date_str):
    if date_str:
        given_date = datetime.strptime(date_str, "%m/%Y")
        _, last_day = calendar.monthrange(given_date.year, given_date.month)
        given_date = given_date.replace(day=last_day)
        current_date = date.today()
        return (given_date.date() - current_date).days
    return None

In [37]:
# Function to correct data types in the dataframe
def type_correction(df):
    df['Year'] = df['Year'].astype(int)
    df['Price'] = df['Price'].astype(float)
    df['Hand'] = df['Hand'].astype(int)
    df['Gear'] = df['Gear'].astype(str)
    df['Color'] = df['Color'].astype(str)
    df['Prev_ownership'] = df['Prev_ownership'].astype(str)
    df['Curr_ownership'] = df['Curr_ownership'].astype(str)
    df['Area'] = df['Area'].astype(str)
    df['City'] = df['City'].astype(str)
    df['Test'] = df['Test'].astype('Int64')
    try:
        df['capacity_Engine'] = df['capacity_Engine'].str.replace(',', '').astype(int)
        df['Km'] = df['Km'].str.replace(',', '').astype(int)
    except Exception:
        pass
    return df

In [38]:
# Function to fetch detailed information for a specific post
def fetch_post_details(link):
    post_page_response = requests.get(link)
    post_page = BeautifulSoup(post_page_response.content, 'html.parser')
    main_info = list(post_page.find('table', {'class': "table table-sm mb-4"}).find_all("td"))
    sub_info = post_page.find('div', {'class': "order-first order-sm-0"})
    
    details = {
        'model': str(post_page.find('div', {'class': "d-flex justify-content-between"}).find_all('h2')[0].string),
        'Price': replace_comma(str(post_page.find('div', {'class': "d-flex justify-content-between"}).find_all('h2')[1].string).split(' ')[0]),
        'Description': desc(sub_info),
        'Cre_date': cre_repub_dates(sub_info)[0],
        'Repub_date': cre_repub_dates(sub_info)[1]
    }
    info_dict = info_to_dict(main_info)
    updated_dict = update_or_add_keys(info_dict)
    details.update(updated_dict)
    return details

In [39]:
# Main code
all_links, all_images = fetch_all_pages()

data = []
with ThreadPoolExecutor() as executor:
    futures = [executor.submit(fetch_post_details, link) for link in all_links]
    for future in futures:
        data.append(future.result())

df = pd.DataFrame(data)
df['Pic_num'] = all_images
df.insert(0, 'manufactor', ['ניסאן'] * len(df))

df['Test'] = df['Test'].apply(days_until)
df = type_correction(df)
df = df.drop(columns=['תת דגם'])
df.to_csv('adNissanData.csv', index=False, encoding='utf-8-sig')

In [40]:
df

Unnamed: 0,manufactor,model,Price,Description,Cre_date,Repub_date,Year,Hand,Gear,capacity_Engine,Engine_type,Km,Test,Color,Prev_ownership,Curr_ownership,Area,City,Pic_num
0,ניסאן,ניסאן סנטרה,69900.0,למכירה: רכב ניסן סנטרה 2019 שמורה במצב מצוין! ...,2024-06-15,2024-06-15,2019,2,אוטומטית,1600,בנזין,69900,212.0,כסוף מטאלי,פרטית,פרטית,רמת גן - גבעתיים,רמת גן,4
1,ניסאן,ניסאן סנטרה,62000.0,רכב שמור ומטופל טסט לשנהרק 51000 ק״מ .נמצא בחי...,2024-06-21,2024-06-21,2019,2,אוטומטית,1800,בנזין,51000,,אפור עכבר,פרטית,פרטית,חיפה וחוף הכרמל,חיפה,3
2,ניסאן,ניסאן סנטרה,87000.0,ניסאן סנטרה 2021 צבע אפור עם גג שחור יד 2 ידיי...,2024-06-23,2024-06-25,2021,2,אוטומטית,2000,בנזין,33000,120.0,אפור מטאלי,פרטית,פרטית,"ראשל""צ והסביבה",בית דגן,5
3,ניסאן,ניסאן סנטרה,39500.0,רכב שמור טוב וחזק. רכב מהמם יפייפה שמור מאוד מ...,2022-10-28,2024-06-02,2017,2,אוטומטית,1800,בנזין,127000,,כסוף מטאלי,ליסינג,פרטית,ירושלים והסביבה,ירושלים,6
4,ניסאן,ניסאן מיקרה,24000.0,"מיקרה 2017 ידנית במצב חדששששש, יד שניה פרטית, ...",2024-05-28,2024-05-28,2017,2,ידנית,1200,בנזין,180000,240.0,אפור,פרטית,פרטית,כרמיאל והסביבה,כרמיאל,6
5,ניסאן,ניסאן מיקרה,52500.0,כל התוספות האפשריות,2024-04-26,2024-04-26,2020,2,ידנית,1000,בנזין,72200,301.0,לבן שנהב,פרטית,פרטית,חיפה וחוף הכרמל,חיפה,5
6,ניסאן,ניסאן פרימרה,62000.0,ניסאן דגם קשקאי רכב שמור ברמה גבוהה טיפולים בז...,2024-04-24,2024-04-24,2017,1,אוטומטית,1200,בנזין,90000,,לבן מטאלי,פרטית,פרטית,ירושלים והסביבה,ירושלים,6
7,ניסאן,ניסאן ג'וק JUKE,135000.0,ניסאן אקסל טרייל! (לא גוק - בעיה באתר)רמת גימו...,2024-04-12,2024-04-13,2021,1,אוטומטית,1600,בנזין,38000,89.0,אפור מטאלי,אחר,פרטית,פתח תקוה והסביבה,פתח תקווה,8
8,ניסאן,ניסאן מיקרה,19000.0,"רכב בבעלות עמותה. נהג יחיד. נקנה חדש 0 ק""מ. טס...",2024-02-19,2024-04-04,2018,2,ידנית,1200,בנזין,186825,,כסוף,ליסינג,חברה,פתח תקוה והסביבה,פתח תקווה,4
9,ניסאן,ניסאן מיקרה,52500.0,כל התוספות האפשריות דגם מפואר,2024-05-15,2024-05-15,2020,3,ידנית,1000,בנזין,73000,332.0,לבן,פרטית,פרטית,חיפה וחוף הכרמל,חיפה,0
