# Matala 1 - Web Scraping
## Group: Honda
### Students:

Eden Cohen 209056225

Marina Nezheslky 321859969
         

In [1]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import urllib.parse
from datetime import datetime

### 1. Get HTTP response

In [2]:
url = "https://www.ad.co.il/car?sp261=13890"
response = requests.get(url)
if response.status_code == 200:
    print("Response success")
soup = BeautifulSoup(response.content, 'html.parser')
card_blocks = soup.find_all('div', attrs={'class': 'card-block'})
if not card_blocks:
    print("No results found.")
else:
    print("Found blocks")

Response success
Response success
Found blocks
Found blocks


### 2. Functions

In [3]:
def get_max_page(soup): #return the maximun pages number in the website
    page_links = soup.find_all('a', class_='page-link text-nowrap px-3 py-2 rounded-pill')
    page_numbers = []
    for link in page_links:
        text = link.get_text(strip=True)
        if text.isdigit():
            page_numbers.append(int(text))
    if page_numbers:
        return max(page_numbers)
    return 1  # default 1 page

def get_car_links_from_page(url): #return list of cars ads links for each page
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    card_blocks = soup.find_all('div', class_='card-block')  
    car_links_page = []
    for card_block in card_blocks:
        a_tag = card_block.find('a')
        if a_tag:
            link = a_tag.get('href')
            car_links_page.append("https://www.ad.co.il" + link)
    return car_links_page

def get_Model(info): ## return the car's model name 
    try:
        Model = info.find('h2', class_='card-title').text.strip()
    except AttributeError:
        Model = ""
    return Model.split()

def get_Price(info): ## retrnes the car's price
    try:
        Price_h2 = info.find('h2', class_='card-title').find_next('h2', class_='card-title')
        if Price_h2:
            Price= Price_h2.text.strip()
        else:
            Price=""
    except AttributeError:
        Price = ""
    return Price

def get_Info_Table(info): #returns all the info in carws ad
    try:
        Info = info.find('table', class_='table table-sm mb-4')
    except AttributeError:
        Info = ""
    return Info

def contains_only_numbers(value): #checks if the string contains only numeric chars
    if str(value).isdigit():
        return value
    else:
        return "0"

import calendar

def last_day(test_date):  ## returns the las day number of ech month/Year
    month, year = test_date.split('/')
    month = int(month)
    year = int(year)
    last_day = calendar.monthrange(year, month)[1]
    return last_day

### 2. Get all the links to all the cars ads

In [4]:
base_url = "https://www.ad.co.il/car?sp261=13890&pageindex="
start_url = base_url + "1"
response = requests.get(start_url)
soup = BeautifulSoup(response.content, 'html.parser')
max_page = get_max_page(soup)

all_car_links = []
all_car_links.extend(get_car_links_from_page(start_url)) #add the links from 1'st page

for page in range(2, max_page + 1): ## loop over all the pages, that appaned to all_car_links list all the cars ads links
    url = base_url + str(page)
    response = requests.get(url)
    if response.status_code == 200:
        car_links = get_car_links_from_page(url)
        all_car_links.extend(car_links)

print(f"Total links found: {len(all_car_links)}")
for link in all_car_links:
    print(link)

Total links found: 169
https://www.ad.co.il/ad/16188117
https://www.ad.co.il/ad/16184016
https://www.ad.co.il/ad/16164463
https://www.ad.co.il/ad/16163262
https://www.ad.co.il/ad/16139904
https://www.ad.co.il/ad/16186170
https://www.ad.co.il/ad/16129968
https://www.ad.co.il/ad/16140392
https://www.ad.co.il/ad/16133528
https://www.ad.co.il/ad/16122774
https://www.ad.co.il/ad/16069748
https://www.ad.co.il/ad/16056698
https://www.ad.co.il/ad/16025641
https://www.ad.co.il/ad/15993500
https://www.ad.co.il/ad/15983893
https://www.ad.co.il/ad/15475681
https://www.ad.co.il/ad/15927247
https://www.ad.co.il/ad/15900953
https://www.ad.co.il/ad/16152126
https://www.ad.co.il/ad/15818255
https://www.ad.co.il/ad/15789829
https://www.ad.co.il/ad/15900950
https://www.ad.co.il/ad/15728164
https://www.ad.co.il/ad/15727827
https://www.ad.co.il/ad/15579797
https://www.ad.co.il/ad/15648987
https://www.ad.co.il/ad/15605812
https://www.ad.co.il/ad/15577292
https://www.ad.co.il/ad/15386721
https://www.ad.co.il

### 4. Create Data Frame

In [5]:
##Features dictonary that will conatin all the data from all the links
features = {"Manufactor": [],"Model": [],"Year": [],"Hand": [],"Gear": [],"Capacity_Engine": [],"Engine_Type": [],"Prev_Ownership": [],"Curr_Ownership": [],"Area": [],"City": [],"Price": [],"Pic Num": [],"Cre_Date": [],"Repub_Date": [],"Descreption": [],"Color": [],"Km": [],"Test": []}

for link in all_car_links: ## loop over all the links and append the data to the features dictonary 
    car_info_page = requests.get(link)
    car_info = BeautifulSoup(car_info_page.content, 'html.parser')
    features['Manufactor'].append(get_Model(car_info)[0])
    features['Model'].append(' '.join(get_Model(car_info)[1:])) 
    info_table = pd.read_html(link) ## get the table that contains all the relevent information
    info_df = pd.concat(info_table)
    info_df=info_df.transpose()
    info_df.columns = info_df.iloc[0]
    info_df.columns = info_df.columns.str.strip()
    info_df = info_df.iloc[1:].reset_index(drop=True)
    info_df = info_df.fillna('Unknown')
    info_df = info_df.rename(columns={'ק"מ': 'Km','שנה':'Year','יד':'Hand','ת. הילוכים': 'Gear','נפח': 'Capacity_Engine','סוג מנוע':'Engine_Type', 'טסט עד':'Test','צבע':'Color','בעלות קודמת':'Prev_Ownership','בעלות נוכחית':'Curr_Ownership','אזור':'Area','עיר':'City'})
    
    ## append the to the features dictonary all the information from info_df 

    if 'Year' in info_df.columns:
        features['Year'].append(info_df['Year'].values[0])
    else:
        features['Year'].append("")
    if 'Hand' in info_df.columns:
        features['Hand'].append(info_df['Hand'].values[0])
    else:
        features['Hand'].append("")
    if 'Gear' in info_df.columns:
        features['Gear'].append(info_df['Gear'].values[0])
    else:
        features['Gear'].append("")
    if 'Capacity_Engine' in info_df.columns:
        features['Capacity_Engine'].append(info_df['Capacity_Engine'].values[0])
    else:
        features['Capacity_Engine'].append("")
    if 'Engine_Type' in info_df.columns:
        features['Engine_Type'].append(info_df['Engine_Type'].values[0])
    else:
        features['Engine_Type'].append("")
    if 'Prev_Ownership' in info_df.columns:
        features['Prev_Ownership'].append(info_df['Prev_Ownership'].values[0])
    else:
        features['Prev_Ownership'].append("")
    if 'Curr_Ownership' in info_df.columns:
        features['Curr_Ownership'].append(info_df['Curr_Ownership'].values[0])
    else:
        features['Curr_Ownership'].append("")
    if 'Area' in info_df.columns:
        features['Area'].append(info_df['Area'].values[0])
    else:
        features['Area'].append("")
    if 'City' in info_df.columns:
        features['City'].append(info_df['City'].values[0])
    else:
        features['City'].append("")
    features['Price'].append(contains_only_numbers(get_Price(car_info).split()[0].replace(',',"")))
    picture_divs = car_info.find_all('div', class_='justify-content-center px-1')
    features['Pic Num'].append(len(picture_divs))
    create_dates_div = car_info.find('div',class_='d-flex flex-row align-items-center justify-content-center flex-wrap')
    if create_dates_div:
        creates_dates = create_dates_div.get_text(strip=True)
        creates_dates = creates_dates.replace("תאריך יצירה: ", "").replace("תאריך הקפצה אחרון: ", ",")
        dates = creates_dates.split(',')
        if len(dates) == 2:
            features['Cre_Date'].append(dates[0])
            features['Repub_Date'].append(dates[1])
    car_description = car_info.find('p', class_="text-word-break")
    if car_description:
        car_description = car_description.get_text()
        features['Descreption'].append(car_description)
    else:
        features['Descreption'].append("לא נמצא תיאור")
    if 'Color' in info_df.columns:
        features['Color'].append(info_df['Color'].values[0])
    else:
        features['Color'].append("")
    if 'Km' in info_df.columns:
        features['Km'].append(info_df['Km'].values[0])
    else:
        features['Km'].append("0")
    if 'Test' in info_df.columns:
        last_day_of_month = last_day(str(info_df['Test'].values[0]))
        test_date = datetime.strptime(f"{last_day_of_month}/{info_df['Test'].values[0]}", '%d/%m/%Y').date()
        time_delta = test_date - datetime.now().date()
        features['Test'].append(time_delta.days)
    else:
        features['Test'].append("0")

cars_df = pd.DataFrame.from_dict(features) ## convert the dictionary to data frame
cars_df = cars_df[cars_df['Manufactor'] == 'הונדה'] ## filter to only Honda  manufacture
cars_df

URLError: <urlopen error [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond>

URLError: <urlopen error [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond>

### 5. Data validation and types

In [None]:
cars_df.info()

In [None]:
cars_df = cars_df.reset_index(drop=True)

In [None]:
## Int types:
cars_df['Year']=cars_df['Year'].astype(int)
cars_df['Hand']=cars_df['Hand'].astype(int)
cars_df['Pic Num']=cars_df['Pic Num'].astype(int)
cars_df['Capacity_Engine']=cars_df['Capacity_Engine'].astype(int)

cars_df['Km'] = cars_df['Km'].astype(float)
cars_df['Test'] = cars_df['Test'].astype(float)
cars_df['Price'] = cars_df['Price'].astype(float)

# Replace 0 values with NaN
cars_df['Km'].replace(0, np.nan, inplace=True)
cars_df['Test'].replace(0, np.nan, inplace=True)
cars_df['Price'].replace(0, np.nan, inplace=True)

# Convert 'Km', 'Test', and 'Price' columns to 'Int64' after replacing 0 values with NaN
cars_df['Km'] = cars_df['Km'].astype('Int64')
cars_df['Test'] = cars_df['Test'].astype('Int64')
cars_df['Price'] = cars_df['Price'].astype('Int64')

#Categorial types:
list_of_Gears = ['אוטומטית','טיפטרוניק','ידנית']
for i in range(len(cars_df)):
    if cars_df.at[i, 'Gear'].replace(" ", "") not in list_of_Gears:
        cars_df.at[i, 'Gear'] = 'לא מוגדר'
cars_df['Gear'] = pd.Categorical(cars_df['Gear'], categories=list_of_Gears  + ['לא מוגדר'] ,ordered=True)

list_of_Engine_type = ['בנזין','היברידי','דיזל','חשמלי','גז']
for i in range(len(cars_df)):
    if cars_df.at[i, 'Engine_Type'].replace(" ", "") not in list_of_Engine_type:
        cars_df.at[i, 'Engine_Type'] = 'לא מוגדר'
cars_df['Engine_Type'] = pd.Categorical(cars_df['Engine_Type'], categories=list_of_Engine_type  + ['לא מוגדר'],ordered=True)
        
list_of_ownership = ['פרטית','ליסינג','אחר']
for i in range(len(cars_df)):
    if cars_df.at[i, 'Prev_Ownership'].replace(" ", "") not in list_of_ownership:
        cars_df.at[i, 'Prev_Ownership'] = 'לא מוגדר'
cars_df['Prev_Ownership'] = pd.Categorical(cars_df['Prev_Ownership'], categories=list_of_ownership  + ['לא מוגדר'],ordered=True)

for i in range(len(cars_df)):
    if cars_df.at[i, 'Curr_Ownership'].replace(" ", "") not in list_of_ownership:
        cars_df.at[i, 'Curr_Ownership'] = 'לא מוגדר'
cars_df['Curr_Ownership'] = pd.Categorical(cars_df['Curr_Ownership'], categories=list_of_ownership  + ['לא מוגדר'],ordered=True)

## Date types:
cars_df['Cre_Date'] = pd.to_datetime(cars_df['Cre_Date'], format="%d/%m/%Y")
cars_df['Repub_Date'] = pd.to_datetime(cars_df['Repub_Date'], format="%d/%m/%Y")

cars_df.info()

In [None]:
print("Final data frame:")
cars_df

In [None]:
cars_df.to_csv('car_details.csv', index=False, encoding='utf-8-sig')  # 'car_details.csv' is the name of the file to be created