# In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Scraper for the emlak.az search results addressed by `base_url`: walks the
# result pages in order, opens the detail page of every listing found, and
# writes one CSV row per listing to 'scraped_data_team_11.csv'.

# Result table. The column order given here is the column order of the CSV.
scraped_df = pd.DataFrame(columns=['Link', 'Price', 'Date', 'Text',
                                   'Location', 'Area', 'Rooms', 'Located Floor',
                                   'Number of Floors', 'Repair', 'Document'])

base_url = 'https://www.emlak.az/elanlar/?ann_type=3&announce_type=18880&property_type=1&room_min=1&room_max=5&sort_type=0&page='
header = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Mobile Safari/537.36'}
link_basi = 'https://emlak.az'  # prefix prepended to every listing href

page_num = 1
max_retries = 3
page_limit = 200        # scraping stops before this page number
retry_delay = 5         # seconds to wait after a failed attempt
page_delay = 1          # seconds to wait between result pages
request_timeout = 30    # seconds; without it a stalled connection blocks forever

# Label text found inside a detail page's <dd> elements -> output column.
_DETAIL_LABELS = {
    'Sahə': 'Area',
    'Otaqların sayı': 'Rooms',
    'Yerləşdiyi mərtəbə': 'Located Floor',
    'Mərtəbə sayı': 'Number of Floors',
    'Təmiri': 'Repair',
    'Sənədin tipi': 'Document',
}


def _fetch_and_parse(url, label, error_context, parse=None):
    """Download *url* and return its parsed content, retrying on failure.

    Args:
        url: Address to request (sent with the module-level ``header``).
        label: How the page is named in the "Failed to retrieve ..." message.
        error_context: How the page is named in the "An error occurred ..."
            message.
        parse: Optional callable applied to the ``BeautifulSoup`` document; it
            runs inside the retry loop, so an exception it raises counts as a
            failed attempt.

    Returns:
        ``parse(soup)`` (or the soup itself when *parse* is ``None``), or
        ``None`` once ``max_retries`` attempts have failed.
    """
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, headers=header, timeout=request_timeout)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'lxml')
                return soup if parse is None else parse(soup)
            print(f"Failed to retrieve {label}. Status code: {response.status_code}. Attempt {attempt} of {max_retries}")
        except Exception as e:
            # Deliberately broad: the scraper is best-effort, so any failure
            # while fetching or parsing one page is logged and retried rather
            # than aborting the whole run.
            print(f"An error occurred {error_context}: {e}. Attempt {attempt} of {max_retries}")
        time.sleep(retry_delay)
    return None


def _parse_detail(detail_soup):
    """Extract every output column except 'Link' from a detail-page document.

    'Price', 'Text' and 'Location' are lists of strings (every match on the
    page); 'Date' is the text of the second <strong> tag, or ``None`` when the
    page has fewer than two; the columns in ``_DETAIL_LABELS`` are ``None``
    unless a <dd> containing the label is present.
    """
    strong_tags = detail_soup.find_all('strong')
    fields = {
        'Price': [span.get_text(strip=True)
                  for span in detail_soup.find_all('span', attrs={'class': 'm'})],
        'Date': strong_tags[1].get_text(strip=True) if len(strong_tags) > 1 else None,
        'Text': [p.get_text(strip=True)
                 for div in detail_soup.find_all('div', class_='desc')
                 for p in div.find_all('p')],
        'Location': [h4.get_text(strip=True)
                     for div in detail_soup.find_all('div', class_='map-address')
                     for h4 in div.find_all('h4')],
    }
    fields.update(dict.fromkeys(_DETAIL_LABELS.values()))

    for dd in detail_soup.find_all('dd'):
        dd_text = dd.text
        for label_text, column in _DETAIL_LABELS.items():
            if label_text in dd_text:
                # The value is whatever remains once the label is removed.
                fields[column] = dd_text.replace(label_text, '').strip()
    return fields


def _detail_links(houses):
    """Yield the full detail-page URL for each linked 'img' div in *houses*."""
    for house in houses:
        for img_div in house.find_all('div', attrs={'class': 'img'}):
            a_tag = img_div.find('a')
            if a_tag is not None and 'href' in a_tag.attrs:
                yield link_basi + a_tag['href']


# Rows are collected in a plain list and turned into a DataFrame once at the
# end; concatenating a one-row frame per listing copies the table every time.
rows = []

while page_num < page_limit:
    print(f'Scraping page {page_num}...')
    soup = _fetch_and_parse(f'{base_url}{page_num}',
                            f'page {page_num}',
                            f'on page {page_num}')
    if soup is None:
        print(f"Failed to retrieve page {page_num} after {max_retries} attempts. Skipping to next page.")
        page_num += 1
        continue

    ticket_list = soup.find('div', class_='ticket-list')
    if ticket_list is None or not ticket_list.find_all('div'):
        print(f'No ticket-list or inner divs found on page {page_num}. Stopping scraping.')
        break

    # Listing cards are taken to be the divs that carry the same class
    # attribute as the first div inside the ticket list.
    div_class = ' '.join(ticket_list.find('div').get('class', []))
    houses = ticket_list.find_all('div', class_=div_class)
    if not houses:
        # Must leave the page loop as well; otherwise the same page would be
        # requested again and again.
        print('No more data found in the houses divs. Ending scraping.')
        break

    # Detail pages are handled outside the listing-page retry, so a listing
    # page is never processed twice and cannot produce duplicate rows.
    for full_link in _detail_links(houses):
        fields = _fetch_and_parse(full_link,
                                  f'detail page {full_link}',
                                  f'when scraping detail page {full_link}',
                                  parse=_parse_detail)
        if fields is None:
            print(f"Failed to retrieve detail page {full_link} after {max_retries} attempts. Skipping.")
            continue
        rows.append({'Link': full_link, **fields})

    page_num += 1
    time.sleep(page_delay)

scraped_df = pd.DataFrame(rows, columns=scraped_df.columns)
scraped_df.to_csv('scraped_data_team_11.csv', index=False)
                                