In [None]:
# Install the scraper's third-party dependencies if needed.
# The command is wrapped in a string literal, so running this cell installs
# nothing; run the command in a shell to actually install the packages.
'''pip install requests beautifulsoup4 pandas'''

In [None]:
import re
import time
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

In [None]:
#SCRAPE DATA

# CSV file that accumulates the scraped listings across runs
csv_file = '2024_turkey_car_market.csv'

# Load earlier results so an interrupted run can resume. Start from an empty
# frame when the file is missing or exists but has no content at all.
try:
    df = pd.read_csv(csv_file)
    # Remove any rows in which every column is NaN
    df = df.dropna(how='all')
except (FileNotFoundError, pd.errors.EmptyDataError):
    df = pd.DataFrame()

# Make sure the 'Page' column exists and is numeric; non-numeric values become NaN
if 'Page' not in df.columns:
    df['Page'] = np.nan
df['Page'] = pd.to_numeric(df['Page'], errors='coerce')

# Highest page number already stored in the CSV; 0 when nothing is stored yet
last_scraped_page = df['Page'].max()
last_scraped_page = 0 if pd.isna(last_scraped_page) else int(last_scraped_page)

# Ad numbers already stored, used by scrape_car_details to skip duplicates.
# This set must exist before scraping starts (it was previously never defined).
# Values are normalised to strings because the scraper compares page text
# against it, while read_csv may have parsed the column as int or float
# (a float column would otherwise render as '123.0').
if 'İlan No' in df.columns:
    scraped_ad_numbers = set(
        df['İlan No']
        .dropna()
        .astype(str)
        .str.strip()
        .str.replace(r'\.0$', '', regex=True)
    )
else:
    scraped_ad_numbers = set()

print(f"Last scraped page: {last_scraped_page}")

# Site origin; relative listing paths found in the search pages are appended to it
base_url = 'https://www.arabam.com'

# Search URLs to scrape. The scraper adds the `page` query parameter itself,
# so entries must not carry one. (The Opel entry previously read `?page=50`,
# unlike every other parameterised entry, which uses `?take=50`.)
# NOTE(review): `take=50` presumably sets the number of results per page;
# several low-volume brands are listed without it — confirm that is intended.
search_urls = [
    'https://www.arabam.com/ikinci-el?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/alfa-romeo?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/audi?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/bentley',
    'https://www.arabam.com/ikinci-el/otomobil/bmw-1-serisi?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/bmw-3-serisi?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/bmw-5-serisi?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/chevrolet?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/citroen?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/dacia?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/ferrari',
    'https://www.arabam.com/ikinci-el/otomobil/fiat?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/fiat-egea?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/fiat-linea?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/fiat-palio?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/fiat-punto?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/ford?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/ford-fiesta?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/ford-focus?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/ford-mondeo?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/ford-mustang?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/honda?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/honda-civic?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/hyundai?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/hyundai-accent?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/hyundai-accent-blue?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/hyundai-i20?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/jaguar',
    'https://www.arabam.com/ikinci-el/otomobil/kia?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/lada',
    'https://www.arabam.com/ikinci-el/otomobil/lamborghini',
    'https://www.arabam.com/ikinci-el/otomobil/maserati',
    'https://www.arabam.com/ikinci-el/otomobil/mazda',
    'https://www.arabam.com/ikinci-el/otomobil/mercedes-benz?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/mini',
    'https://www.arabam.com/ikinci-el/otomobil/mitsubishi',
    'https://www.arabam.com/ikinci-el/otomobil/nissan',
    'https://www.arabam.com/ikinci-el/otomobil/opel?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/opel-astra?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/opel-corsa?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/opel-vectra?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/peugeot?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/peugeot-206?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/peugeot-307?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/porsche',
    'https://www.arabam.com/ikinci-el/otomobil/renault?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/renault-clio?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/renault-fluence?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/renault-laguna?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/renault-megane?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/renault-symbol?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/rolls-royce',
    'https://www.arabam.com/ikinci-el/otomobil/rover',
    'https://www.arabam.com/ikinci-el/otomobil/seat?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/skoda?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/subaru',
    'https://www.arabam.com/ikinci-el/otomobil/suzuki',
    'https://www.arabam.com/ikinci-el/otomobil/tata',
    'https://www.arabam.com/ikinci-el/otomobil/tofas?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/toyota?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/toyota-corolla?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/volkswagen?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/volkswagen-golf?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/volkswagen-jetta?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/volkswagen-passat?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/volkswagen-polo?take=50',
    'https://www.arabam.com/ikinci-el/otomobil/volvo?take=50',
]

# Shared session used for every request made by the scraper
http = requests.Session()

# Retry policy: at most 5 retries with a backoff factor of 1, triggered by the
# listed HTTP status codes and applied to HEAD, GET and OPTIONS requests
retry_strategy = Retry(
    allowed_methods=["HEAD", "GET", "OPTIONS"],
    status_forcelist=[429, 500, 502, 503, 504],
    backoff_factor=1,
    total=5,
)

# One adapter carrying the retry policy, mounted for both URL schemes
adapter = HTTPAdapter(max_retries=retry_strategy)
http.mount("https://", adapter)
http.mount("http://", adapter)

# Pull the listing links out of the inline JavaScript of a search page
def extract_links_from_js(script_text):
    """Return absolute listing URLs found in a search page's inline script.

    The script text is scanned for entries of the form
    ``"url": window.location.origin + "<path>"``; each captured path is
    prefixed with the module-level ``base_url``.

    Args:
        script_text: Text content of the script element.

    Returns:
        A list of URL strings in order of appearance; empty when nothing matches.
    """
    link_pattern = re.compile(r'"url":\s*window\.location\.origin\s*\+\s*"([^"]+)"')
    links = []
    for match in link_pattern.finditer(script_text):
        links.append(base_url + match.group(1))
    return links

                         
# Define Function to Scrape Search Page
def _build_page_url(search_url, page_number):
    """Return search_url with its `page` query parameter set to page_number."""
    parts = urlsplit(search_url)
    # Drop any `page` parameter already present so the URL never carries two
    query = [
        (key, value)
        for key, value in parse_qsl(parts.query, keep_blank_values=True)
        if key != 'page'
    ]
    query.append(('page', str(page_number)))
    return urlunsplit(parts._replace(query=urlencode(query)))


def scrape_search_page(page_number, search_url, timeout=30):
    """Fetch one page of a search listing and return the car links on it.

    The page URL is built by setting the ``page`` query parameter on
    ``search_url``. This works whether or not the URL already has a query
    string; blindly appending ``&page=N`` corrupted URLs that had none.

    Args:
        page_number: Page index to request.
        search_url: Search URL, with or without a query string.
        timeout: Seconds to wait for the server before the request is
            abandoned, so a stalled connection cannot hang the run.

    Returns:
        A list of absolute listing URLs. The list is empty when the request
        fails or when no link-bearing script is found on the page; a message
        is printed in both cases.
    """
    page_url = _build_page_url(search_url, page_number)
    try:
        response = http.get(page_url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for any HTTP error
        soup = BeautifulSoup(response.content, 'html.parser')

        # The listing links live inside an inline <script>, not in anchor tags
        script_tag = soup.find('script', string=re.compile(r'"url":\s*window\.location\.origin'))
        car_links = extract_links_from_js(script_tag.string) if script_tag else []

        if not car_links:
            print(f"Warning: No car links found on page {page_url}")
        return car_links
    except requests.exceptions.RequestException as e:
        print(f"Error fetching page {page_url}: {e}")
        return []

def scrape_car_details(car_link, timeout=30):
    """Fetch one listing page and return its attributes as a dict.

    Reads the key/value pairs in the page's ``property-item`` blocks plus the
    ``product-price`` element. A listing whose ad number ('İlan No') is
    already in the module-level ``scraped_ad_numbers`` set is skipped;
    otherwise the ad number is added to that set. A listing with no ad number
    is kept and never added to the set, so one such page cannot cause every
    later one to be treated as a duplicate.

    Args:
        car_link: Absolute URL of the listing page.
        timeout: Seconds to wait for the server before the request is
            abandoned, so a stalled connection cannot hang the run.

    Returns:
        A dict with the listing's fields, including 'Link' and 'Page' (the
        module-level ``current_page``); fields missing from the page are
        None. Returns None when the listing is a duplicate or the request
        fails.
    """
    try:
        response = http.get(car_link, timeout=timeout)
        response.raise_for_status()  # Raise an exception for any HTTP error
        soup = BeautifulSoup(response.content, 'html.parser')

        # Collect every key/value pair in a single pass over the page instead
        # of re-scanning all property blocks for each label. The first block
        # that has both a key and a value wins for any given key.
        properties = {}
        for item in soup.find_all('div', class_='property-item'):
            key = item.find('div', class_='property-key')
            value = item.find('div', class_='property-value')
            if key is not None and value is not None:
                properties.setdefault(key.get_text(strip=True), value.get_text(strip=True))

        def find_text_by_label(label):
            """Return the value shown for label, or None (with a warning) if absent."""
            if label in properties:
                return properties[label]
            print(f"Warning: Label '{label}' not found on page {car_link}")
            return None

        def find_price():
            """Return the text of the price element, or None (with a warning) if absent."""
            price_element = soup.find('div', class_='product-price')
            if price_element:
                return price_element.get_text(strip=True)
            print(f"Warning: Price not found on page {car_link}")
            return None

        # De-duplicate on the ad number, but only when the page has one:
        # None must never become a dedup key.
        ad_number = find_text_by_label('İlan No')
        if ad_number is not None:
            if ad_number in scraped_ad_numbers:
                print(f"Ad number {ad_number} already exists, skipping...")
                return None
            scraped_ad_numbers.add(ad_number)

        # Labels are looked up in this order, which is also the column order
        labels = (
            'İlan Tarihi', 'Marka', 'Seri', 'Model', 'Yıl', 'Kilometre',
            'Vites Tipi', 'Yakıt Tipi', 'Kasa Tipi', 'Renk', 'Motor Hacmi',
            'Motor Gücü', 'Çekiş', 'Araç Durumu', 'Ort. Yakıt Tüketimi',
            'Yakıt Deposu', 'Boya-değişen', 'Takasa Uygun', 'Kimden',
        )

        data = {
            "Link": car_link,  # Add Link column with the car link
            "İlan No": ad_number,
        }
        for label in labels:
            data[label] = find_text_by_label(label)
        data["Fiyat"] = find_price()
        data["Page"] = current_page  # Page number set by scrape_all_cars

        return data
    except requests.exceptions.RequestException as e:
        print(f"Error scraping car details from {car_link}: {e}")
        return None

def scrape_all_cars(start_page, end_page):
    """Scrape every search URL for each page from start_page to end_page inclusive.

    For each page number the module-level ``current_page`` is updated, every
    entry of ``search_urls`` is fetched through scrape_search_page, and each
    returned link is handed to scrape_car_details. Once all search URLs of a
    page are done, the rows gathered for that page are appended to the
    module-level ``df``, the whole frame is written to ``csv_file``, and the
    row buffer is emptied.

    Args:
        start_page: First page number to scrape.
        end_page: Last page number to scrape (inclusive).

    Returns:
        The row buffer (a list of dicts). It is cleared after every save.
    """
    global current_page, df

    pending_rows = []
    for page_number in range(start_page, end_page + 1):
        current_page = page_number
        for search_url in search_urls:
            print(f"Scraping page {page_number} for search URL: {search_url}...")
            for car_link in scrape_search_page(page_number, search_url):
                try:
                    details = scrape_car_details(car_link)
                except requests.exceptions.RequestException as e:
                    print(f"Request failed for {car_link}: {e}")
                    continue  # Skip this link and move to the next one
                if details:
                    pending_rows.append(details)
            time.sleep(3)  # Pause between search pages to avoid overwhelming the server

        # Nothing gathered for this page number: no save needed
        if not pending_rows:
            continue

        # Persist progress after each page number so a crash loses at most one page
        df = pd.concat([df, pd.DataFrame(pending_rows)], ignore_index=True)
        df.to_csv(csv_file, index=False)
        pending_rows.clear()
    return pending_rows

# Resume one page after the last page stored in the CSV and stop at end_page
end_page = 50
current_page = last_scraped_page + 1

car_data = scrape_all_cars(start_page=current_page, end_page=end_page)

# If the scraper handed back rows, append them to the existing DataFrame and
# write the CSV once more
if len(car_data) > 0:
    df_new = pd.DataFrame(car_data)
    df = pd.concat([df, df_new], ignore_index=True)
    df.to_csv(csv_file, index=False)

print('Scraping completed.')

In [None]:
# Disabled recovery snippet: the whole cell is one string literal, so running
# it does nothing. Paste its contents into a code cell to use it.
# NOTE(review): the snippet resets 'car_data.csv', while the scraping cell
# reads and writes '2024_turkey_car_market.csv', and its column list omits
# 'Çekiş', which the scraper records — align both before relying on it.
'''
# If the above scraping fails, use this to start fresh
import pandas as pd

# Specify your CSV file
csv_file = 'car_data.csv'

# Create an empty DataFrame with the columns you intend to use
empty_df = pd.DataFrame(columns=[
    'İlan Tarihi', 'Marka', 'Seri', 'Model', 'Yıl', 'Kilometre', 'Vites Tipi', 
    'Yakıt Tipi', 'Kasa Tipi', 'Renk', 'Motor Hacmi', 'Motor Gücü', 'Araç Durumu', 
    'Ort. Yakıt Tüketimi', 'Yakıt Deposu', 'Boya-değişen', 'Takasa Uygun', 
    'Kimden', 'Fiyat', 'Link', 'İlan No', 'Page'
])

# Save the empty DataFrame to the CSV file
empty_df.to_csv(csv_file, index=False)

print(f"The file '{csv_file}' has been reset to an empty state.")

# empty the set scraped_ad_numbers
scraped_ad_numbers.clear()
last_scraped_page = 0


'''