In [1]:
"""
Module to scrape Segunda Mano DF apartments
and store the data in local storage as CSV.
"""
import datetime as dt
import os
from pprint import pprint as pp

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Vars
# URL template for the listing pages; `{}` receives the page number.
_base_url = (
    "https://www.segundamano.mx/anuncios/ciudad-de-mexico/"
    "venta-inmuebles?page={}"
)
# Browser-like User-Agent header sent with every request.
user_agent = (
    "Mozilla/5.0 (X11; Linux x86_64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/67.0.3396.62 Safari/537.36"
)
# Root folder (with trailing slash) under which output files are written.
ddir = 'data/'

In [3]:
def save(depts, data_dir=None):
    """ Append page data

        Writes the listings to ``<data_dir>/<YYYY-MM-DD>/segundamano.csv``
        (``~``-separated, indexed by ``name`` and ``location``). If today's
        file already exists its rows are kept and the new ones are added
        after them. Failures are reported with ``print`` instead of being
        raised, so a failed save does not interrupt the caller.

        Params:
        -----
        depts : list
            List of Departments. Each item must provide the ``name`` and
            ``location`` fields used as the CSV index.
        data_dir : str, optional
            Root output folder. Defaults to the module-level ``ddir``.
    """
    # Nothing to write: avoid building a frame that lacks the index columns
    if depts is None or len(depts) == 0:
        print('No departments to save.')
        return
    root = ddir if data_dir is None else data_dir
    folder = os.path.join(root, dt.date.today().isoformat())
    _fname = os.path.join(folder, 'segundamano.csv')
    # Append data
    depdf = pd.DataFrame(depts)
    print(depdf.head(1).to_dict())
    try:
        # makedirs also creates a missing root folder and is a no-op when
        # today's folder is already there
        os.makedirs(folder, exist_ok=True)
        if os.path.exists(_fname):
            try:
                previous = pd.read_csv(_fname, delimiter='~')
            except pd.errors.EmptyDataError:
                # A zero-byte file holds no rows to keep
                previous = pd.DataFrame()
            # Any other read error propagates to the outer handler so an
            # unreadable file is never silently overwritten
            if not previous.empty:
                depdf = pd.concat([previous, depdf], ignore_index=True)
        depdf.set_index(['name', 'location']).to_csv(_fname, sep='~')
        print('Correctly saved file: {}'.format(_fname))
    except Exception as e:
        print(e)
        print('Could not save file: {}'.format(_fname))

In [4]:
def scrape(content):
    """ Scrape all departments per page

        Parses the page, writes a prettified copy of it to
        ``ddir + 'segundamano.html'`` and prints the first element whose
        CSS classes include ``ad``.

        Params:
        -----
        content : str or bytes
            Raw HTML of one listing page (``paginate`` passes ``r.content``).

        Returns:
        -----
        data : list
            Extracted departments. As written, nothing is ever appended,
            so the list is always empty.
    """
    data = []
    # Generate soup
    soup = BeautifulSoup(content, 'html.parser')
    # Explicit UTF-8: the platform default encoding may be unable to encode
    # non-ASCII characters present in the page
    with open(ddir+'segundamano.html', 'w', encoding='utf-8') as _F:
        _F.write(soup.prettify())
    # Get Characteristics
    for d in soup.find_all(class_="ad"):
        print('----')
        try:
            print(d)
        except Exception as e:
            print(e)
            continue
        # NOTE(review): only the first match is printed and nothing is added
        # to `data`; field extraction still has to be implemented here
        break
    print('Found {} depts'.format(len(data)))
    return data

In [5]:
def paginate(timeout=30):
    """ Loop over pages to retrieve all info available

        Requests listing pages starting at page 1 and passes each response
        body to ``scrape``. The loop stops on a non-200 status, on any
        request error (including a timeout) or when ``scrape`` returns
        nothing.

        Params:
        -----
        timeout : int or float, optional
            Seconds to wait for the server before giving up on a request.

        Returns:
        -----
        pg_nums : int
            Page counter when the loop stopped: the number of the page that
            failed or came back empty, otherwise one past the last page
            scraped.
    """
    pg_nums = 1
    while True:
        url = _base_url.format(pg_nums)
        print(url)
        try:
            # Without a timeout a stalled connection would block forever
            r = requests.get(url,
                headers={'user-agent': user_agent},
                timeout=timeout)
            if r.status_code != 200:
                raise Exception("Wrong Response")
            depts = scrape(r.content)
            if not depts:
                raise Exception("No more departments")
        except Exception as e:
            print(e)
            print('Finishing to retrieve info.')
            break
        # Store values
        #save(depts)
        pg_nums += 1
        # NOTE(review): this unconditional break limits the crawl to a single
        # page and the save() call above is disabled; both look like
        # debugging leftovers - confirm before removing
        break ###
    return pg_nums


In [7]:
# Fetch the first listing page by hand and show the response object.
pg_nums = 1
page_url = _base_url.format(pg_nums)
print(page_url)
r = requests.get(page_url, headers={'user-agent': user_agent})
r

https://www.segundamano.mx/anuncios/ciudad-de-mexico/venta-inmuebles?page=1


<Response [200]>

In [21]:
# Inline version of the scrape() steps, run on the response `r` fetched above.
data = []
content = r.content
# Generate soup
soup = BeautifulSoup(content, 'html.parser')
# Forward slashes resolve on every OS; the previous backslash literal relied
# on invalid escape sequences and only worked as a path on Windows.
# UTF-8 is explicit because the page contains accented letters and emoji.
with open('../data/segundamano.html', 'w', encoding='utf-8') as _F:
    _F.write(soup.prettify())
# Get Characteristics
# NOTE(review): a multi-word class_ value only matches elements whose class
# attribute is exactly this string - confirm against the saved HTML
for d in soup.find_all(class_="card grid"):
    print('----')
    try:
        print(d)
    except Exception as e:
        print(e)
        continue
    # Only the first match is printed; nothing is appended to `data` yet
    break
print('Found {} depts'.format(len(data)))

Found 0 depts


In [30]:
# Display the parsed page (presumably to inspect the listing markup by eye).
soup

<!DOCTYPE html>
<html data-locale="es_MX" lang="es-MX"><head><meta charset="utf-8"/><meta content="width=device-width,initial-scale=1" name="viewport"/><meta content="app-id=934844916" name="apple-itunes-app"/><link href="/manifest-android.json" rel="manifest"/><link href="/anuncios/static/favicon.ico" rel="shortcut icon" type="image/x-icon"/><link href="https://www.google-analytics.com/" rel="dns-prefetch"/><link crossorigin="" href="https://www.google-analytics.com/" rel="preconnect"/><link href="https://ib.adnxs.com" rel="dns-prefetch"/><link crossorigin="" href="https://ib.adnxs.com" rel="preconnect"/><meta content="index, follow" name="robots"/><title>Venta de Inmuebles en Ciudad de México | Segundamano.mx</title><meta content="53797 anuncios de venta inmuebles   🏘️ en venta de particulares y profesionales de segundamano y ocasión en Ciudad de México ✔️. Entra, revisa y encuentra 🔎 en Segundamano.mx." name="description"/><meta content="product.group" property="og:type"/><meta cont

In [27]:
# A bare string passed to find_all() is treated as a tag name, so the earlier
# call searched for tags named "card-data grid-data" and could never match.
# Select by CSS class instead: this matches elements that carry both classes,
# in any order and alongside any other classes.
soup.select(".card-data.grid-data")

[]