# Data Acquisition

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd 

In [None]:
# Search-results URL prefix; it ends with an open `pagina=` query parameter
# so that scrap() can append the page number directly.
base_URL = 'https://www.vivareal.com.br/aluguel/sp/sao-paulo/apartamento_residencial/?pagina='

In [None]:
# Collect the ads from several result pages and return them as one DataFrame.
def scrap(url, pages=2):
    '''Scrape search results from a real estate website.

    Result pages ``1`` through ``pages - 1`` are requested by appending the
    page number to ``url``. A page that cannot be downloaded is reported on
    stdout and skipped; the remaining pages are still processed.

    Args:
        url: Search URL prefix; the page number is appended to it verbatim.
        pages: Exclusive upper bound of the page numbers to fetch. Must be
            at least 2 so that at least page 1 is requested.

    Returns:
        A pandas DataFrame with one row per ad and the text columns
        ``address``, ``rent``, ``rooms``, ``bathroom``, ``parking`` and
        ``area``. The frame is empty (same columns) if no page could be
        fetched.

    Raises:
        ValueError: If ``pages`` is smaller than 2. pandas also raises
            ValueError when the fields found on one page differ in count.
    '''
    if pages < 2:
        raise ValueError('number of pages must be at least 2')

    columns = ['address', 'rent', 'rooms', 'bathroom', 'parking', 'area']
    page_frames = []  # one DataFrame per successfully scraped page

    for p_no in range(1, pages):
        search_url = url + str(p_no)
        try:
            # The timeout keeps one stalled connection from hanging the run.
            response = requests.get(search_url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f'Failed to gather data from {search_url}')
            print(e)
            # Skip this page: there is no (fresh) content to parse.
            continue
        print(search_url)

        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(response.content, 'html.parser')

        rent_tags = soup.find_all('div', {'class': 'property-card__price js-property-card-prices js-property-card__price-small'})
        rooms_tags = soup.find_all('li', {'class': 'property-card__detail-item property-card__detail-room js-property-detail-rooms'})
        address_tags = soup.find_all('span', {'class': 'property-card__address'})
        bathroom_tags = soup.find_all('li', {'class': 'property-card__detail-item property-card__detail-bathroom js-property-detail-bathroom'})
        parking_tags = soup.find_all('li', {'class': 'property-card__detail-item property-card__detail-garage js-property-detail-garages'})
        area_tags = soup.find_all('span', {'class': 'property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area'})

        # NOTE(review): each field is collected page-wide and the columns are
        # matched up by position. If an ad lacks one of the fields the lists
        # differ in length and pd.DataFrame raises ValueError.
        data = {
            'address': [tag.text for tag in address_tags],
            'rent': [tag.p.text for tag in rent_tags],
            'rooms': [tag.span.text for tag in rooms_tags],
            'bathroom': [tag.span.text for tag in bathroom_tags],
            'parking': [tag.span.text for tag in parking_tags],
            'area': [tag.text for tag in area_tags],
        }

        df = pd.DataFrame(data)
        print(df)
        page_frames.append(df)

    if not page_frames:
        # Every request failed; pd.concat would raise on an empty list.
        return pd.DataFrame(columns=columns)

    # ignore_index gives the combined frame unique row labels instead of
    # repeating 0..n for every page.
    return pd.concat(page_frames, ignore_index=True)

In [None]:
# Run the scraper over the search results; with pages=180 the loop inside
# scrap() requests pages 1 through 179.
final_data = scrap(base_URL, pages=180)

In [None]:
# Preview the first 72 scraped rows.
final_data.head(72)

In [None]:
# Derive the suburb from the address: take the piece after the first
# "<space>-" separator, then keep what precedes the next ", ".
# Presumably addresses look like "street - suburb, city - state"; verify
# against the scraped data.
# The pattern is a raw string so `\s` reaches the regex engine unchanged
# instead of being an invalid string escape.
final_data['suburb'] = (
    final_data.address
    .str.split(r'\s-').str[1]
    .str.split(', ').str[0]
    .str.strip()  # the separator pattern leaves the space after the dash
)

In [None]:
# Display the frame, which now includes the derived `suburb` column.
final_data

In [None]:
# Count the distinct values in each column.
final_data.nunique()

In [None]:
# Show the rows with exact duplicates removed. The result is a new frame
# and is not assigned, so `final_data` itself is left unchanged.
final_data.drop_duplicates()