In [1]:
import os
import sys
import json
import requests
import re
import argparse
from pprint import pprint
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import unicodedata
import math

In [2]:
class Item(object):
    def __init__(self, price, location, datetime):
        self.price = price
        self.location = location
        self.datetime = datetime

In [3]:
def convert_to_int(price_str):
    print 'price to be converted: ', price_str
    price_str = re.search(r'\d+', price_str)
    try:
        return int(price_str.group())
    except Exception:
        return -9999

In [4]:
def convert_to_string(item):
    return unicodedata.normalize('NFKD',item).encode('ascii', 'ignore')

In [5]:
# let's convert these into graceful functions
def get_price(item):
    try:
        price = item.find('div', {'class':'listed-item-price'}).get_text()
    except Exception:
        return None
    
    price = convert_to_int(convert_to_string(price))
    return price

def get_location(item):
    try:
        location = item.find('address',{'class':'listed-item-place'}).get_text()
    except Exception:
        return None
    
    location = convert_to_string(location)
    return location.strip()

def get_date(item):
    try:
        date = item.find('time',{'class':'listed-item-date'}).get_text()
    except Exception:
        return None
    date = convert_to_string(date)
    return date.strip()

In [6]:
def parse_item(item):    
    item = Item(get_price(item), get_location(item), get_date(item))
    return item

In [7]:
# these are results from the 1st page - we would like results from all X pages
parsed_items = []
offset_jump = 35
for i in range(0, 10):
    offset = int(offset_jump*i)
    url_path = "https://www.2ememain.be/v%C3%A9los/v%C3%A9los/2/velo/?offset={}".format(offset)
    raw = requests.get(url_path).text
    soup = BeautifulSoup(raw,'lxml')
    
    items = soup.findAll('article')
    for item in items:
        parsed_items.append(parse_item(item))
    time.sleep(5)

price to be converted:   56,99
price to be converted:   400,00
price to be converted:  Ne s'applique pas
price to be converted:  Prix a disc.
price to be converted:   400,00
price to be converted:  Prix a disc.
price to be converted:   5,00
price to be converted:   25,00
price to be converted:   195,00
price to be converted:   70,00
price to be converted:   280,00
price to be converted:   125,00
price to be converted:   1.950,00
price to be converted:   15,00
price to be converted:   10,00
price to be converted:   95,00
price to be converted:   30,00
price to be converted:   79,00
price to be converted:   275,00
price to be converted:   20,00
price to be converted:   100,00
price to be converted:   60,00
price to be converted:  Prix a disc.
price to be converted:   200,00
price to be converted:   550,00
price to be converted:  Prix a disc.
price to be converted:  Prix a disc.
price to be converted:   50,00
price to be converted:   20,00
price to be converted:   200,00
price to be conve

price to be converted:   4.900,00
price to be converted:   369,00
price to be converted:  Prix a disc.
price to be converted:   1.600,00
price to be converted:   165,00
price to be converted:   50,00
price to be converted:   130,00
price to be converted:   1.000,00
price to be converted:   35,00
price to be converted:   120,00
price to be converted:   70,00
price to be converted:   55,00
price to be converted:   70,00
price to be converted:   250,00
price to be converted:   140,00
price to be converted:   70,00
price to be converted:   50,00
price to be converted:   68,00
price to be converted:  Prix a disc.
price to be converted:   67,00
price to be converted:   15,00
price to be converted:  Prix a disc.
price to be converted:   50,00
price to be converted:   2.800,00
price to be converted:   1.090,00
price to be converted:   100,00
price to be converted:   75,00
price to be converted:   95,00
price to be converted:   45,00
price to be converted:   40,00
price to be converted:   20,00

In [104]:
len(parsed_items)

380

In [105]:
df = pd.DataFrame(data = [x.__dict__ for x in parsed_items],
                  columns = parsed_items[0].__dict__.keys())

In [113]:
# let's remove the -9999 values in price

In [116]:
df = df[df.price != -9999]

In [119]:
df.price.describe()

count    330.000000
mean     136.763636
std      180.087260
min        1.000000
25%       35.000000
50%       75.000000
75%      150.000000
max      960.000000
Name: price, dtype: float64

In [124]:
# strange to see a bike for 1 euro - let's set a threshold to remove these prices

In [122]:
df = df[df.price > 30]

In [123]:
df.price.describe()

count    249.000000
mean     177.016064
std      190.686767
min       35.000000
25%       65.000000
50%      100.000000
75%      200.000000
max      960.000000
Name: price, dtype: float64