import random
from google.colab import drive
drive.mount('/content/drive')

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import datetime
import re

Defining basic methods,
Building the scraping engine. 


1.   Gets each page of listings
2.   For each page of listings, gets attributes and headline descriptions, sqft, rent and a url for the posting.



In [2]:
# Open a browser instance.
def start_browser(headless = True):
    chrome_options = webdriver.ChromeOptions()
    if headless:
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        webbrowser = webdriver.Chrome("chromedriver", options=chrome_options)
    if not headless:
        webbrowser.set_window_size(48, 32)
    return webbrowser


#Generates a list of valid page urls from a given starting point.
def get_pages(start_url, browser, results=-1, cooldown=1):
    browser.get(start_url)
    end = ''
    while end=='':
        soup = BeautifulSoup(browser.page_source)
        end = soup.find('span', class_='cl-page-number').text

    m = re.search("of "+'(.+%?)',end)

    if m is not None:
        end = int(m.group(1).replace(",", ""))

    if results !=-1:
        end = min([end, results])

    pages = end//120
    pagelist = []
    for n in range(0, pages):
        pagelist.append((start_url+'#search=1~list~'+str(n)))

    return pagelist

def get_listings(pagelist, browser=start_browser()):
    links = []
    for p in pages:
        browser.get(p)
        while browser.page_source is None:
            sleep(1)
        source = BeautifulSoup(browser.page_source).ol.find_all('li')
        for s in source:
            links.append(s.a.get('href'))
    return links

#utility to pull variables out of xml data using leading and following tag.
#I refusue to learn regex and this is my workaround for that.
def xml_getter(text, before="", after=""):
    if text is not None:
        text = str(text)
        m = re.search(before+'(.+?)'+after, text)
        if m:
            found = m.group(1).strip()
            return(found)
    else: return None


In [3]:
#Listing object holds all information relevant to the post. Non normalized, Has redundancy.

class Listing:

    def __init__(self, url, browser):
        self.url = url
        browser.get(url)
        self.ID = xml_getter(url, before = "/", after=".html")[-10::]

        try:
            self.html = BeautifulSoup(browser.page_source)
        except:
            try:
                time.sleep(1)
                browser.get(url)
                time.sleep(1)
                self.html=BeautifulSoup(browser.page_source)
            except:
                pass

            
    def parse(self):

        def get_price(self):
            try:
                price = self.html.find('span', class_='price').text
                price = re.sub(r'[^\d.]', '', str(price))
                return(int(price))
            except:
                return(None)

        def get_beds(self):
            housing = self.html.find("span", class_='housing')
            if housing is not None:
                beds = xml_getter(housing.text.lower(), before=' ', after="br")
            else:
                beds = 0
            return(beds)

        def get_sqft(self):
            housing = self.html.find('span', class_='housing')
            if housing is not None:
                sqft =xml_getter(housing.text.lower(), before=' - ', after="ft2")
                if sqft is not None:
                    sqft = int(sqft)
                else: sqft = None
            else: sqft = None
            return(sqft)

        def get_park(self):
            park = xml_getter(self.html, before='>', after="parking")
            return(park)

        def get_baths(self):
            bedbath = self.html.find('span', class_='shared-line-bubble')
            if bedbath is not None:
                baths = xml_getter(bedbath.text.lower(), before="/ ", after='ba')
                return(baths)
            else: return None

        def get_body(self):
            body = self.html.find('section', id='postingbody')
            if body is not None:
                body = body.text
                body = body.replace("\n\nQR Code Link to This Post\n\n\n", "")
                body = body.replace("\n", " ")
                return(body)
            else: return " "

        def get_address(self):
            address = self.html.find('div', class_='mapaddress')
            if address is not None:
                location = address.text
            else: location = None
            return(location)

        def get_lat_lon(self):
            try:  
                lat = self.html.find('div', id='map').get('data-latitude')
                lon = self.html.find('div', id='map').get('data-longitude')
                lat = float(lat)
                lon = float(lon)
            except:
                lat=None
                lon=None
            return(lat, lon)

        def get_attrgroup(self):
            attrgroup = self.html.find('div', class_='mapAndAttrs')
            attrlist = ['cats are OK - purrr', 'dogs are OK - wooof', 'air conditioning', 
            'furnished', 'w/d in unit', 'laundry on site', 'laundry in bldg', 
            'no laundry on site', 'no parking', 'street parking', 'off-street parking', 
            'detached garage']
            if attrgroup is not None:
                attrgroup.find_all('p', class_='attrgroup')[1]
                attrgroup = set(attrgroup.text.split("\n"))
                attrvals = []
                for attr in attrlist:
                    if attr in attrgroup:
                        attrvals.append(1)
                    else:
                        attrvals.append(0)

                return(attrvals)
            else: return [0]*(len(attrlist))


        def get_date(self):
            return xml_getter(self.html.find('time'), before='title="', after = '"')
        
        self.price = get_price(self)  
        self.beds = get_beds(self)
        self.sqft = get_sqft(self)
        self.park = get_park(self)
        self.baths = get_baths(self)
        self.body = get_body(self)
        self.address = get_address(self)
        self.lat, self.lon = get_lat_lon(self)
        self.attrgroup = get_attrgroup(self)
        self.date = get_date(self)


    def get_attributes(self):
        self.parse()
        attrs = [self.url, self.price, self.beds, self.sqft, self.park, self.baths, 
         self.body, self.address, self.lat, self.lon, self.date]+self.attrgroup
        attrdict = {self.ID: attrs}
        return attrdict

# Generating page list:

In [4]:
browser=start_browser()
pages = get_pages(start_url = 'https://chicago.craigslist.org/search/apa',
                  browser=browser,
                  results=-1,
                  cooldown=2)

In [6]:
listings = pd.Series(get_listings(pages, browser=browser))
len(listings)

4680

browser=start_browser()
test = Listing(listings[0], browser)
test.get_attributes()

In [7]:
import warnings
warnings.simplefilter('ignore')
cooldown=0
results = {}
browser=start_browser()
for l in listings:
    try:
        entry = Listing(l, browser)
    except:
        try:
            browser=start_browser()
            entry = Listing(l, browser)
        except:
            entry=None
    
    if entry is not None:
        results.update(entry.get_attributes())
        percent = (100*len(results))/len(listings)
        if percent%1==0:
            print(str(percent)+"% scraped")
            
listingdf = pd.DataFrame.from_dict(results, orient='index')
listingdf.columns = ['url', 'price', 'beds', 'sqft', 'parking', 'baths', 'descript', 'adress', 'lat', 'lon', 'date', 
                     'cats are OK - purrr', 'dogs are OK - wooof', 'air conditioning', 
              'furnished', 'w/d in unit', 'laundry on site', 'laundry in bldg', 
              'no laundry on site', 'no parking', 'street parking', 'off-street parking', 
              'detached garage']

#Cleaning up listings that are improbably cheap or expensive
listingdf = listingdf[listingdf['price']<10000]
listingdf = listingdf[listingdf['price']>100]

5.0% scraped
10.0% scraped
15.0% scraped
20.0% scraped
25.0% scraped
30.0% scraped
35.0% scraped
40.0% scraped
45.0% scraped
50.0% scraped
55.0% scraped
60.0% scraped
65.0% scraped
70.0% scraped
75.0% scraped
80.0% scraped
85.0% scraped
90.0% scraped
95.0% scraped


In [8]:
listingdf.shape

(4544, 23)

In [9]:
listingdf.to_csv("./CLScraped.csv")

# Analysis of Search Results:

In [13]:
listingdf = pd.read_csv("CLScraped.csv")

In [14]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})
pal = sns.cubehelix_palette(10, rot=-.25, light=.7)
g = sns.FacetGrid(listingdf, row="beds", hue="beds", aspect=15, height=.5, palette=pal)
g.map(sns.kdeplot, "price",
      bw_adjust=.5, clip_on=False,
      fill=True, alpha=1, linewidth=1.5)

g.map(sns.kdeplot, "price", clip_on=False, color="w", lw=2, bw_adjust=.5)

g.refline(y=0, linewidth=2, linestyle="-", color=None, clip_on=False)

def label(x, color, label):
    ax = plt.gca()
    ax.text(0, .2, label, fontweight="bold", color=color,
            ha="left", va="center", transform=ax.transAxes)

g.map(label, "price")
g.figure.subplots_adjust(hspace=-.25)
g.set_titles("")
g.set(yticks=[], ylabel="")
g.despine(bottom=True, left=True)

AttributeError: 'DataFrame' object has no attribute 'iteritems'

In [18]:
from scipy.stats import linregress
model = listingdf[listingdf['sqft']<3000]
'''
f = sns.jointplot(x=model['sqft'], 
              y=model['price'], 
              #hue = model['beds'],
              ratio=2,
              kind='reg',
              )
'''
slope, intercept, r, p, se = linregress(x=listingdf[listingdf['sqft']>0]['sqft'], y=listingdf[listingdf['sqft']>0]['price'])
print(str(slope.round(4))+"*X+"+str(intercept.round(4))+"\nr="+str(r.round(4))+"\np="+str(p.round(4)))

0.0267*X+1894.0461
r=0.1001
p=0.0002


In [19]:
from sklearn import linear_model
y = model['price']
x = model[['beds', 'sqft']]
multi_model = linear_model.LinearRegression()
multi_model.fit(x,y)
sns.kdeplot(multi_model.predict(x)-y, shade=True)
print('R^2 Score: '+str(round(multi_model.score(x,y), 4)))
print("Coefs: "+str(multi_model.coef_))
print("Standard Error: "+str(round((multi_model.predict(x)-y).std(),4)))

OptionError: "No such keys(s): 'mode.use_inf_as_null'"

In [10]:
y = model['price']
x = model[['sqft']]
multi_model = linear_model.LinearRegression()
multi_model.fit(x,y)
sns.kdeplot(multi_model.predict(x)-y, shade=True)
print('R^2 Score: '+str(multi_model.score(x,y).round(4)))
print("Coefs: "+str(multi_model.coef_))
print("Standard Error: "+str(round((multi_model.predict(x)-y).std(),4)))

NameError: name 'model' is not defined

Number of bedrooms is not a significant factor over square-footage. Knowing the number of beds does not improve predictions of value, implying no meaningful relationship apart from communicating square-footage.

In [None]:
model['rent']=model['price'].map(int)
model['ppsf'] = model['price']/model['sqft']


In [None]:
sns.histplot(model, x='ppsf', hue='beds')

In [None]:
#Fill in the blanks:
def fetch_blank_listings(listingdf):
  browser = start_browser()
  browser.get(listingdf['url'][0])
  tester = browser.page_source
  if len(tester)>1000:
    nans = listingdf[listingdf['html'].isna()]['url'].apply(lambda x: pull_html(x, cooldown=2))
    return(nans)
  else:
    print("failed to fetch first listing")
    print(BeautifulSoup(browser.page_source))

In [None]:
#Pull listings html:
listingdf['html']=listingdf['url'][0:100].apply(lambda x: pull_html(x, cooldown=5))

In [None]:

parsed = listingdf.apply(lambda x: parse_listing_html(x['html']), axis='columns', result_type='expand')
parsed.columns = ['baths', 'body', 'lat', 'lon', 'parking', 'dogs', 'cats', 'laundry']
listingdf = listingdf.join(parsed)
listingdf['Dpsf']=listingdf['rent']/listingdf['sqft']

def clean(heading):
    heading = heading.replace('QR Code Link to This Post\n\n\n', "")
    heading = heading.replace("\n", "")
    heading = re.sub('[^0-9a-zA-Z]+', " ", heading)
    heading = heading.lower()
    return(heading)

    listingdf['heading'].map(clean)

listingdf

In [None]:
listingdf.dropna()

In [None]:
# importing all necessary modules
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
 
comment_words = ''
stopwords = set(STOPWORDS)
 
# iterate through the csv file
for val in listingdf.dropna().body:
     
    # typecaste each val to string
    val = str(val)
 
    # split the value
    tokens = val.split()
     
    # Converts each token into lowercase
    for i in range(len(tokens)):
        tokens[i] = tokens[i].lower()
     
    comment_words += " ".join(tokens)+" "
 
wordcloud = WordCloud(width = 800, height = 800,
                background_color ='white',
                stopwords = stopwords,
                min_font_size = 10).generate(comment_words)
 
# plot the WordCloud image                      
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
 
plt.show()

In [None]:
text = " ".join(clean(str(i)) for i in listingdf['body'].dropna())
wordcloud = WordCloud().generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")

# lower max_font_size
wordcloud = WordCloud(max_font_size=40).generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()


In [None]:
#df=pd.read_csv('./CLScraped.csv')
dummy = listingdf[['rent','sqft', 'beds', 'baths', 'parking', 'lat', 'lon', 'dogs', 'cats', 'laundry']].dropna()
dummy['rent'] = dummy['rent'].apply(lambda x: (x-min(dummy['rent']))/(max(dummy['rent'])))
dummy['beds']=dummy['beds']/max(dummy['beds'])
dummy['dogs'].replace("None", 0, inplace=True)
dummy['dogs'].replace("dogs are OK - wooof", 1, inplace=True)
dummy['cats'].replace("None", 0, inplace=True)
dummy['cats'].replace('cats are OK - purrr', 1, inplace = True)
dummy['laundry'].replace('laundry in bldg', 1, inplace=True)
dummy["laundry"].replace('laundry on site', 1, inplace=True)
dummy['laundry'].replace("None", 0, inplace=True)
dummy[dummy['parking'].notnull()]['parking']=1
dummy['parking'].fillna(0, inplace=True)
dummy['sqft'].replace(0, None)
dummy['lat'] = dummy['lat'].map(float)
dummy['lon'] = dummy['lon'].map(float)
#########delete this later
dummy['baths']=dummy['baths'].apply(lambda x: float(x))
#########

dummy['baths']=dummy['baths']/max(dummy['baths'])

dummy = dummy.drop(columns=['sqft', 'parking'])