### Datasets

* [Craigslist Apartments](https://sacramento.craigslist.org/d/apts-housing-for-rent/search/apa)

## Web Scraping

### 1. Package Imports

In [1]:
# Our usual data science tools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp # other science tools
# statsmodels -- "traditional" statistical models
# scikit-learn -- machine learning models
import seaborn as sns
#from plotnine import *

%matplotlib inline

# Web scraping tools
import lxml.html as lx
import requests
import requests_cache

# Cache every `requests` call under the name "../craigslist" so that re-running
# the notebook reuses stored responses instead of hitting the site again.
requests_cache.install_cache("../craigslist")

### 2. Page Scraping

In [2]:
# First page of the Sacramento apartments / housing-for-rent search results.
start_url = "https://sacramento.craigslist.org/d/apts-housing-for-rent/search/apa"

def scrape_front_page(url):
    """Download one search-results page and extract its listing and pagination links.

    Parameters
    ----------
    url : str
        Address of the search-results page to download.

    Returns
    -------
    next_page : str or None
        Absolute URL taken from the first ``<a>`` whose class contains
        ``"next"``, or ``None`` when the page has no such link (for example
        on the last page of results).
    links : list of str
        Absolute URLs of every ``<a>`` whose class contains
        ``"result-title"``, in document order.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    response = requests.get(url)
    response.raise_for_status()

    html = lx.fromstring(response.text)
    # Resolve relative hrefs against the page URL so the returned links can be
    # requested directly.
    html.make_links_absolute(url)

    links = html.xpath("//a[contains(@class, 'result-title')]/@href")

    # Indexing [0] blindly raised IndexError on pages without a "next" link;
    # report that case as None so callers can stop paginating cleanly.
    next_hrefs = html.xpath("//a[contains(@class, 'next')]/@href")
    next_page = next_hrefs[0] if next_hrefs else None

    return next_page, links

# Scrape the first results page: keep the listing links and the URL of the next page.
next_page, links = scrape_front_page(start_url)

In [3]:
# Number of listing links found on the first results page.
len(links)

120

In [20]:
def _first_or_none(nodes):
    """Return the first item of ``nodes``, or None when ``nodes`` is empty."""
    return nodes[0] if nodes else None


def _text_or_none(node):
    """Return ``node.text_content()``, or None when ``node`` is None."""
    return None if node is None else node.text_content()


def scrape_one_post(link):
    """Download a single posting and pull out the fields of interest.

    Parameters
    ----------
    link : str
        URL of the posting to download.

    Returns
    -------
    dict
        A record with the keys ``"text"``, ``"attribs"``, ``"lat"``,
        ``"lon"``, ``"time"``, ``"title"`` and ``"price"``.  ``"attribs"`` is
        a list holding the text of each ``<span>`` inside a
        ``<p class="attrgroup">``; every other value is a string, or ``None``
        when the corresponding element is missing from the page.  When the
        download fails, a message is printed and a record with every field
        empty is returned.
    """
    response = requests.get(link)
    try:
        response.raise_for_status()
    except requests.HTTPError:
        print("The url couldn't be downloaded!")
        # Do not try to parse an error body (it may even be empty, which the
        # HTML parser rejects); hand back a record of the usual shape instead.
        return {"text": None, "attribs": [], "lat": None, "lon": None,
                "time": None, "title": None, "price": None}

    html = lx.fromstring(response.text)

    # A deleted post carries an element with class "removed"; it has no
    # meaningful price. Previously this branch was overwritten unconditionally
    # by the lookup that followed it.
    if html.cssselect(".removed"):
        price = None
    else:
        # Alternative using CSS selectors: html.cssselect(".price")
        price = _text_or_none(
            _first_or_none(html.xpath("//*[contains(@class,'price')]"))
        )

    title = _text_or_none(_first_or_none(html.cssselect("#titletextonly")))

    # Alternative using CSS selectors: html.cssselect("p.attrgroup span")
    attribs = [
        span.text_content()
        for span in html.xpath("//p[contains(@class, 'attrgroup')]/span")
    ]

    # Body text of the posting
    text = _text_or_none(_first_or_none(html.cssselect("#postingbody")))

    # Coordinates are stored as data attributes on the map element
    map_node = _first_or_none(html.cssselect("#map"))
    if map_node is None:
        lon = None
        lat = None
    else:
        lon = map_node.attrib.get("data-longitude")
        lat = map_node.attrib.get("data-latitude")

    # Time posted, taken from the "datetime" attribute of the <time> element
    time_node = _first_or_none(html.cssselect("time.timeago"))
    posted = None if time_node is None else time_node.attrib.get("datetime")

    return {"text": text, "attribs": attribs, "lat": lat, "lon": lon,
            "time": posted, "title": title, "price": price}

In [18]:
# Scrape every listing found on the first results page, one record per link.
posts = list(map(scrape_one_post, links))

The url couldn't be downloaded!
The url couldn't be downloaded!


In [19]:
# Preview a single scraped post as a DataFrame.
# NOTE(review): "attribs" is a list while the other fields are scalars, so the
# frame gets one row per attribute with the remaining columns repeated; wrap
# the dict in a list if one row per post is wanted.
pd.DataFrame(data=scrape_one_post(links[10]))

Unnamed: 0,text,attribs,lat,lon,time,title,price
0,\n \n QR Code Link to This P...,2BR / 2Ba,38.676728,-121.323392,2020-03-20T17:17:33-0700,DUAL MASTER WITH VINYL PLANK FLOORING AVAIL IMMED,$1549
1,\n \n QR Code Link to This P...,920ft2,38.676728,-121.323392,2020-03-20T17:17:33-0700,DUAL MASTER WITH VINYL PLANK FLOORING AVAIL IMMED,$1549
2,\n \n QR Code Link to This P...,available mar 20,38.676728,-121.323392,2020-03-20T17:17:33-0700,DUAL MASTER WITH VINYL PLANK FLOORING AVAIL IMMED,$1549
3,\n \n QR Code Link to This P...,application fee details: 52.46,38.676728,-121.323392,2020-03-20T17:17:33-0700,DUAL MASTER WITH VINYL PLANK FLOORING AVAIL IMMED,$1549
4,\n \n QR Code Link to This P...,cats are OK - purrr,38.676728,-121.323392,2020-03-20T17:17:33-0700,DUAL MASTER WITH VINYL PLANK FLOORING AVAIL IMMED,$1549
5,\n \n QR Code Link to This P...,dogs are OK - wooof,38.676728,-121.323392,2020-03-20T17:17:33-0700,DUAL MASTER WITH VINYL PLANK FLOORING AVAIL IMMED,$1549
6,\n \n QR Code Link to This P...,apartment,38.676728,-121.323392,2020-03-20T17:17:33-0700,DUAL MASTER WITH VINYL PLANK FLOORING AVAIL IMMED,$1549
7,\n \n QR Code Link to This P...,w/d in unit,38.676728,-121.323392,2020-03-20T17:17:33-0700,DUAL MASTER WITH VINYL PLANK FLOORING AVAIL IMMED,$1549
8,\n \n QR Code Link to This P...,carport,38.676728,-121.323392,2020-03-20T17:17:33-0700,DUAL MASTER WITH VINYL PLANK FLOORING AVAIL IMMED,$1549
