### Imports

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

### Putting it all together

In [8]:
#collect one dict per listing and build the dataframe once after the loop
#(DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call)
listings=[]

#scrape 3 pages
#range(1,4) yields the page numbers 1, 2 and 3 (the stop value 4 is excluded)
for i in range(1,4):
    #website request; the timeout keeps a stalled connection from hanging the cell forever
    website = requests.get('https://www.trulia.com/NY/New_York/'+str(i)+'_p/', timeout=30)
    #fail loudly on an HTTP error instead of silently parsing an error page into zero rows
    website.raise_for_status()
    #create soup object
    soup=BeautifulSoup(website.content,'html.parser')
    #result items
    results=soup.find_all('li',{'class':'SearchResultsList__WideCell-b7y9ki-2'})
    #update results: keep only the list items that carry a data-testid attribute
    results_update=[r for r in results if r.has_attr('data-testid')]

    for result in results_update:
        #a card without a floor-space element gets the placeholder 'n/a'
        floorspace_tag=result.find('div',{'data-testid':'property-floorSpace'})
        floorspace='n/a' if floorspace_tag is None else floorspace_tag.get_text()
        #address, beds, baths and price are required: a card missing one of them
        #still raises AttributeError, exactly as before
        listings.append({
            'Address':result.find('div',{'data-testid':'property-address'}).get_text(),
            'Beds':result.find('div',{'data-testid':'property-beds'}).get_text(),
            'Baths':result.find('div',{'data-testid':'property-baths'}).get_text(),
            'Floorspace':floorspace,
            'Price':result.find('div',{'data-testid':'property-price'}).get_text(),
        })

#create dataframe; the explicit column list fixes the column order and
#still yields an empty frame with these columns when nothing was scraped
real_estate=pd.DataFrame(listings,columns=['Address','Baths','Beds','Price','Floorspace'])
        

In [9]:
#display the scraped listings dataframe
real_estate

Unnamed: 0,Address,Baths,Beds,Price,Floorspace
0,"432 Park Ave #PENTHOUSE, New York, NY 10022",9ba,6bd,"$169,000,000","8,255 sqft"
1,"8829 183rd St, Jamaica, NY 11423",2ba,3bd,"$250,000","1,632 sqft"
2,"303 E 57th St #32B, New York, NY 10022",3ba,2bd,"$329,000","1,800 sqft"
3,"9915 200th St, Jamaica, NY 11423",4ba,6bd,"$405,000","2,831 sqft"
4,"10727 155th St, Jamaica, NY 11433",3ba,6bd,"$300,000","3,120 sqft"
...,...,...,...,...,...
115,"118 Wooster St #4C5C, New York, NY 10012",3ba,3bd,"$5,250,000","2,476 sqft"
116,"1204 Franklin Ave, Bronx, NY 10456",2ba,5bd,"$125,000","2,695 sqft"
117,"995 5th Ave #16, New York, NY 10028",10ba,8bd,"$29,500,000","8,360 sqft"
118,"100 W 57th St #16K, New York, NY 10019",1ba,Studio,"$220,000",580 sqft


### Information about the DataFrame

In [10]:
#dataframe info: prints the index range, each column's non-null count and dtype, and the memory usage
real_estate.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Address     120 non-null    object
 1   Baths       120 non-null    object
 2   Beds        120 non-null    object
 3   Price       120 non-null    object
 4   Floorspace  120 non-null    object
dtypes: object(5)
memory usage: 4.8+ KB


In [11]:
#first five rows of the scraped listings (head() shows 5 rows by default)
real_estate.head()

Unnamed: 0,Address,Baths,Beds,Price,Floorspace
0,"432 Park Ave #PENTHOUSE, New York, NY 10022",9ba,6bd,"$169,000,000","8,255 sqft"
1,"8829 183rd St, Jamaica, NY 11423",2ba,3bd,"$250,000","1,632 sqft"
2,"303 E 57th St #32B, New York, NY 10022",3ba,2bd,"$329,000","1,800 sqft"
3,"9915 200th St, Jamaica, NY 11423",4ba,6bd,"$405,000","2,831 sqft"
4,"10727 155th St, Jamaica, NY 11433",3ba,6bd,"$300,000","3,120 sqft"


In [12]:
#last five rows of the scraped listings (tail() shows 5 rows by default)
real_estate.tail()

Unnamed: 0,Address,Baths,Beds,Price,Floorspace
115,"118 Wooster St #4C5C, New York, NY 10012",3ba,3bd,"$5,250,000","2,476 sqft"
116,"1204 Franklin Ave, Bronx, NY 10456",2ba,5bd,"$125,000","2,695 sqft"
117,"995 5th Ave #16, New York, NY 10028",10ba,8bd,"$29,500,000","8,360 sqft"
118,"100 W 57th St #16K, New York, NY 10019",1ba,Studio,"$220,000",580 sqft
119,"32 W 76th St, New York, NY 10023",9ba,6bd,"$27,500,000","10,635 sqft"


### Data Cleaning

In [14]:
#remove the literal trailing 'ba' unit from the bath counts, e.g. '9ba' -> '9'
#the pattern is anchored to the end of the string; str.strip('ba') instead treats 'ba' as a
#set of characters and trims them from both ends, which would turn a value like 'n/a' into 'n/'
#the vectorised .str accessor also passes missing values through instead of raising
real_estate['Baths']=real_estate['Baths'].str.replace(r'ba$','',regex=True)

In [15]:
#remove the literal trailing 'bd' unit from the bedroom counts, e.g. '6bd' -> '6'
#values without that suffix (such as 'Studio') are left unchanged
#the pattern is anchored to the end of the string; str.strip('bd') instead treats 'bd' as a
#set of characters and trims them from both ends of the value
real_estate['Beds']=real_estate['Beds'].str.replace(r'bd$','',regex=True)

In [17]:
#updated dataframe: Baths and Beds no longer carry their 'ba' / 'bd' suffixes
real_estate

Unnamed: 0,Address,Baths,Beds,Price,Floorspace
0,"432 Park Ave #PENTHOUSE, New York, NY 10022",9,6,"$169,000,000","8,255 sqft"
1,"8829 183rd St, Jamaica, NY 11423",2,3,"$250,000","1,632 sqft"
2,"303 E 57th St #32B, New York, NY 10022",3,2,"$329,000","1,800 sqft"
3,"9915 200th St, Jamaica, NY 11423",4,6,"$405,000","2,831 sqft"
4,"10727 155th St, Jamaica, NY 11433",3,6,"$300,000","3,120 sqft"
...,...,...,...,...,...
115,"118 Wooster St #4C5C, New York, NY 10012",3,3,"$5,250,000","2,476 sqft"
116,"1204 Franklin Ave, Bronx, NY 10456",2,5,"$125,000","2,695 sqft"
117,"995 5th Ave #16, New York, NY 10028",10,8,"$29,500,000","8,360 sqft"
118,"100 W 57th St #16K, New York, NY 10019",1,Studio,"$220,000",580 sqft


### Save in Excel

In [18]:
#write the dataframe to an Excel workbook at a relative path; index=False leaves out the row index column
real_estate.to_excel('real_estate_multiple_pages.xlsx', index=False)