In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
pd.options.plotting.backend = "plotly"
from time import sleep
import random
import panel as pn
import plotly.express as px

In [47]:
class Scraper:
    def __init__(self, citta, tipologia, num_pages):
        self.citta = citta
        self.tipologia = tipologia
        self.num_pages = num_pages
        self.user_agent_list = [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
        ]

    def scrape(self):
        items= []
        for page_num in range(self.num_pages):
            print(f'Pagina Numero = {page_num}')
            url = f'https://www.immobiliare.it/{self.tipologia}/{self.citta}/?criterio=rilevanza&pag={page_num}&noAste=1'
            user_agent = random.choice(self.user_agent_list)
            headers = {'User-Agent': user_agent}
            page = requests.get(url,headers=headers)
            soup = BeautifulSoup(page.content, "html.parser")
            results = soup.find_all('div',{'class':'in-card nd-mediaObject nd-mediaObject--colToRow in-realEstateCard in-realEstateCard--interactive in-realEstateListCard'})
            for result in results:
                descrizione = result.find('a',{'class':'in-card__title'}).text.split(',')[0]
                try:
                    quartiere = result.find('a',{'class':'in-card__title'}).text.split(',')[1].strip()
                except IndexError:
                    quartiere= float("NaN")
                try:
                    price= result.select_one('ul > li').text.replace('da','').replace('.','').replace('/mese','').split('€')[1].strip()
                except IndexError:
                    price= float("NaN")
                except AttributeError:
                    price= float("NaN")
                try:
                    locali = result.find('li',{'class':'nd-list__item in-feat__item'}).text
                except AttributeError:
                    locali= float("NaN")
                try:
                    superficie = result.find('li',{'class':'nd-list__item in-feat__item','aria-label':'superficie'}).text.replace('m²','')
                except AttributeError:
                    superficie= float("NaN")
                try:
                    piano = result.find('li',{'class':'nd-list__item in-feat__item','aria-label':'piano'}).text
                except AttributeError:
                    piano = float("NaN")

                items.append([descrizione,quartiere,price,locali,superficie,piano])
        sleep(2)
        df = pd.DataFrame(items,columns=['descrizione','quartiere','price','locali','superficie(m²)','piano'])
        df_ = pd.DataFrame(df.loc[~df['descrizione'].str.contains('nuova costruzione'),['descrizione','quartiere','price','locali','superficie(m²)','piano']])
        df_.loc[df['locali'].str.contains(' - ') | df['locali'].str.contains('m²') | df['piano'].str.contains(' - ') | df['quartiere'].str.contains('m²') | df['quartiere'].str.contains(r'\d',regex=True) | df['price'].str.contains('%')] = float("NaN")
        df_['descrizione']=df_['descrizione'].astype('string')
        df_['quartiere']=df_['quartiere'].astype('string')
        df_['price']=pd.to_numeric(df_['price'])
        df_['superficie(m²)']=pd.to_numeric(df_['superficie(m²)'])
        
        return df_

    def visualize(self,df_):
        locali_price = df_.groupby('locali').mean()['price'].sort_values(ascending=False).reset_index()
        fig_locali_price = locali_price.plot.bar(x='locali', y='price')
        
        quartire_prezzo = df_.groupby('quartiere').mean()['price'].sort_values(ascending=False).reset_index()
        fig_quartire_prezzo = quartire_prezzo.plot.bar(x='price', y='quartiere')

        piano_price = df_.groupby('piano').mean()['price'].sort_values(ascending=False).reset_index()
        fig_piano_price = piano_price.plot.bar(x='piano', y='price')

        piano_superficie = df_.groupby('piano').mean()['superficie(m²)'].sort_values(ascending=False).reset_index()
        fig_piano_superficie = piano_superficie.plot.bar(x='piano', y='superficie(m²)')

        quartire_superficie = df_.groupby('quartiere').mean()['superficie(m²)'].sort_values(ascending=False).reset_index()
        fig_quartire_superficie = quartire_superficie.plot.bar(x='superficie(m²)', y='quartiere')

        dashboard = pn.Column(
            pn.Row(fig_locali_price, fig_quartire_prezzo),
            pn.Row(fig_piano_price, fig_piano_superficie),
            pn.Row(fig_quartire_superficie)
        )
        
        case_dash = pn.template.BootstrapTemplate(
            title=f'Scraping Immobiliare.it nella città di {self.citta}',
            main=dashboard,
            accent_base_color="#88d8b0",
            header_background="#000000"
        )

        return case_dash.show({'displayModeBar': True})

    


In [49]:
#watch out for tipologia there are two option: affitto-case or vendita-case.
#usually pages are 40 but may be more or less. check the immobiliare.it website
scraper = Scraper(citta='roma', tipologia='affitto-case', num_pages=5)
df = scraper.scrape()
scraper.visualize(df)

Pagina Numero = 0
Pagina Numero = 1
Pagina Numero = 2
Pagina Numero = 3
Pagina Numero = 4



The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_

Launching server at http://localhost:61332


<panel.io.server.Server at 0x7fe973387cd0>