# bike-reloot data set
## assembling a dat set for a prototype 

This notebook has the purpose of making a data set for the _bike-reloot_ project. 

Steps are the following: 
- scrape will haben bike pictures
- rename them to "stolen" and "not_stolen" with consecutive IDs

---

Inside of a loop through a list of willhaben URLs with different "&pages=1": 

Structure of the "a-tag" I want to scrap, which is one entry of a bike.

```
<a 
    href="/iad/kaufen-und-verkaufen/d/bike-discounter-angebot-kross-trans-4-0-s-28-zoll-0401-578744109/" 
    id="search-result-entry-header-578744109" 
    aria-labelledby="search-result-entry-header-578744109" 
    class="sc-ca51e2d8-0 lkgIJQ sc-deecb898-1 imWndm" 
    data-testid="search-result-entry-header-578744109">
</a>
```

Then retreive the href and save it to a dataframe, for n number of pages which lead to n number of entries (say, 600 entries).


In [1]:
import requests
from bs4 import BeautifulSoup

# DEFINE BASE URL 
# = "https://www.willhaben.at/iad/kaufen-und-verkaufen/marktplatz/fahrraeder-4552&rows=1"
url_fahrräder = "https://www.willhaben.at/iad/kaufen-und-verkaufen/marktplatz/fahrraeder-radsport/fahrraeder-4552"
# and &page=9 &rows=1

# MAKE REQUEST ON BASE URL 
page = requests.get(url_fahrräder)
soup = BeautifulSoup(page.content, "html.parser")

# Define a function to filter elements based on their IDs within the div with ID "skip-to-resultlist"
def find_by_id_custom(tag):
    return (
        tag.name == 'a' and 
        tag.get('id', '').startswith('search-result-entry-header') and 
        tag.has_attr('id')
    )

# FIND ALL a tags within ID
links = soup.find(id="skip-to-resultlist").find_all(find_by_id_custom)
print(len(links))
print(links[0].get('href'))



5
/iad/kaufen-und-verkaufen/d/flexispot-tischfahrrad-769562406/


In [2]:
import pandas as pd

list_id = [ link.get('id') for link in links]
list_href = [ link.get('href') for link in links]

df = pd.DataFrame({"id": list_id,
                   "href": list_href})

# split ID into all parts and extract the unique ID 
df['id_int'] = df['id'].str.split('-',expand=True)[4]
df['links'] = "https://www.willhaben.at"+df['href']
df['links'][1]
df

Unnamed: 0,id,href,id_int,links
0,search-result-entry-header-769562406,/iad/kaufen-und-verkaufen/d/flexispot-tischfah...,769562406,https://www.willhaben.at/iad/kaufen-und-verkau...
1,search-result-entry-header-769562315,/iad/kaufen-und-verkaufen/d/bmc-amp-one-db-car...,769562315,https://www.willhaben.at/iad/kaufen-und-verkau...
2,search-result-entry-header-674176928,/iad/kaufen-und-verkaufen/d/basso-diamant-mit-...,674176928,https://www.willhaben.at/iad/kaufen-und-verkau...
3,search-result-entry-header-769562012,/iad/kaufen-und-verkaufen/d/alan-record-rennra...,769562012,https://www.willhaben.at/iad/kaufen-und-verkau...
4,search-result-entry-header-769562671,/iad/kaufen-und-verkaufen/d/damenfahrad-steger...,769562671,https://www.willhaben.at/iad/kaufen-und-verkau...


# Extracting URL links of images of bikes

In [3]:
import requests
from bs4 import BeautifulSoup
from PIL import Image

In [4]:
import requests
from bs4 import BeautifulSoup
import time

df['links'][1]


# TEST THE SINGLE ENTRY IMG EXTRACTION
single_page = requests.get(df['links'][1])
single_soup = BeautifulSoup(single_page.content, "html.parser")

def find_by_data_test_id_custom(tag):
    return (
        tag.name == 'img' and 
        tag.get('data-testid', '').startswith('image') and 
        tag.has_attr('alt')
    )

print(len(single_soup.find_all(find_by_data_test_id_custom)))
img_links = single_soup.find_all(find_by_data_test_id_custom)

# print(single_soup.find_all('img'))

print([ link.get('src') if link.has_attr('src') 
       else link.get('data-flickity-lazyload') 
       for link in img_links ])

print([ link.get('alt')  for link in img_links ])
#img_links



11
['https://cache.willhaben.at/mmo/5/769/562/315_-235536717.jpg', 'https://cache.willhaben.at/mmo/5/769/562/315_-491453809.jpg', 'https://cache.willhaben.at/mmo/5/769/562/315_-851510819.jpg', 'https://cache.willhaben.at/mmo/5/769/562/315_-1475545040.jpg', 'https://cache.willhaben.at/mmo/5/769/562/315_-2053380139.jpg', 'https://cache.willhaben.at/mmo/5/769/562/315_-131533182.jpg', 'https://cache.willhaben.at/mmo/5/769/562/315_926155059.jpg', 'https://cache.willhaben.at/mmo/5/769/562/315_-461249392.jpg', 'https://cache.willhaben.at/mmo/5/769/562/315_-913702457.jpg', 'https://cache.willhaben.at/mmo/5/769/562/315_-1228474320.jpg', 'https://cache.willhaben.at/mmo/5/769/562/315_-1605503307.jpg']
['Bild 1 von 11', 'Bild 2 von 11', 'Bild 3 von 11', 'Bild 4 von 11', 'Bild 5 von 11', 'Bild 6 von 11', 'Bild 7 von 11', 'Bild 8 von 11', 'Bild 9 von 11', 'Bild 10 von 11', 'Bild 11 von 11']


In [5]:

# PERFORM IMAGE EXTRACTION ON WHOLE DF
img_list = {}
for i in df.index:
    time.sleep(0.1)
    #requests.get(entry)
    pass_soup = BeautifulSoup(requests.get(df['links'][i]).content, "html.parser")
    pass_image_links = pass_soup.find_all(find_by_data_test_id_custom)
    
    pass_image_src = [ link.get('src') if link.has_attr('src') 
       else link.get('data-flickity-lazyload') 
       for link in img_links ]

    pass_image_alt = [ link.get('alt') for link in pass_image_links]
    
    if len(pass_image_links) > 1: 
        img_list[df['id_int'][i]] =  {'src':pass_image_src} 
        
img_list


{'769562406': {'src': ['https://cache.willhaben.at/mmo/5/769/562/315_-235536717.jpg',
   'https://cache.willhaben.at/mmo/5/769/562/315_-491453809.jpg',
   'https://cache.willhaben.at/mmo/5/769/562/315_-851510819.jpg',
   'https://cache.willhaben.at/mmo/5/769/562/315_-1475545040.jpg',
   'https://cache.willhaben.at/mmo/5/769/562/315_-2053380139.jpg',
   'https://cache.willhaben.at/mmo/5/769/562/315_-131533182.jpg',
   'https://cache.willhaben.at/mmo/5/769/562/315_926155059.jpg',
   'https://cache.willhaben.at/mmo/5/769/562/315_-461249392.jpg',
   'https://cache.willhaben.at/mmo/5/769/562/315_-913702457.jpg',
   'https://cache.willhaben.at/mmo/5/769/562/315_-1228474320.jpg',
   'https://cache.willhaben.at/mmo/5/769/562/315_-1605503307.jpg']},
 '769562315': {'src': ['https://cache.willhaben.at/mmo/5/769/562/315_-235536717.jpg',
   'https://cache.willhaben.at/mmo/5/769/562/315_-491453809.jpg',
   'https://cache.willhaben.at/mmo/5/769/562/315_-851510819.jpg',
   'https://cache.willhaben.at/

In [6]:
# TAKE DICT AND MAKE IT TO DF
df_src = pd.DataFrame(img_list).T.explode('src').reset_index(names='id')
df_src

Unnamed: 0,id,src
0,769562406,https://cache.willhaben.at/mmo/5/769/562/315_-...
1,769562406,https://cache.willhaben.at/mmo/5/769/562/315_-...
2,769562406,https://cache.willhaben.at/mmo/5/769/562/315_-...
3,769562406,https://cache.willhaben.at/mmo/5/769/562/315_-...
4,769562406,https://cache.willhaben.at/mmo/5/769/562/315_-...
5,769562406,https://cache.willhaben.at/mmo/5/769/562/315_-...
6,769562406,https://cache.willhaben.at/mmo/5/769/562/315_9...
7,769562406,https://cache.willhaben.at/mmo/5/769/562/315_-...
8,769562406,https://cache.willhaben.at/mmo/5/769/562/315_-...
9,769562406,https://cache.willhaben.at/mmo/5/769/562/315_-...


In [7]:
# GENERATE A COLUMN WITH A PREFIX
import numpy as np

df_src['label'] = df_src.groupby('id')['id'].transform(
    lambda group: np.random.choice(['stolen', 'not_stolen'], size=len(group)) 
    )
df_src

Unnamed: 0,id,src,label
0,769562406,https://cache.willhaben.at/mmo/5/769/562/315_-...,not_stolen
1,769562406,https://cache.willhaben.at/mmo/5/769/562/315_-...,stolen
2,769562406,https://cache.willhaben.at/mmo/5/769/562/315_-...,stolen
3,769562406,https://cache.willhaben.at/mmo/5/769/562/315_-...,not_stolen
4,769562406,https://cache.willhaben.at/mmo/5/769/562/315_-...,stolen
5,769562406,https://cache.willhaben.at/mmo/5/769/562/315_-...,stolen
6,769562406,https://cache.willhaben.at/mmo/5/769/562/315_9...,not_stolen
7,769562406,https://cache.willhaben.at/mmo/5/769/562/315_-...,not_stolen
8,769562406,https://cache.willhaben.at/mmo/5/769/562/315_-...,stolen
9,769562406,https://cache.willhaben.at/mmo/5/769/562/315_-...,not_stolen


In [8]:
# generate a column with a unique ID, not group ID

from pathlib import Path

df_src['src_id'] = [ Path(df_src['src'][entry]).stem.split('-')[-1] for entry in df_src.index ]
df_src.head()

Unnamed: 0,id,src,label,src_id
0,769562406,https://cache.willhaben.at/mmo/5/769/562/315_-...,not_stolen,235536717
1,769562406,https://cache.willhaben.at/mmo/5/769/562/315_-...,stolen,491453809
2,769562406,https://cache.willhaben.at/mmo/5/769/562/315_-...,stolen,851510819
3,769562406,https://cache.willhaben.at/mmo/5/769/562/315_-...,not_stolen,1475545040
4,769562406,https://cache.willhaben.at/mmo/5/769/562/315_-...,stolen,2053380139


# Download all the pictures
Now, it is essential to find a way to rename the figures like: "stolen_78999345" or "not_stolen_5643345"

In [9]:
import io
import requests
from PIL import Image
from pathlib import Path

# Create a byte object out of image_content and store it in the variable image_file
image_file = io.BytesIO(requests.get(df_src['src'][0]).content)

image = Image.open(image_file).convert("RGB")

# assuming there is a folder "figs" in the current working directory
file_path = Path("figs",df_src['label'][0] + df_src['id'][0] + ".png").absolute()
print(file_path)
image.save(file_path, "PNG", quality=80)


C:\Users\georg\Documents\GIT\fastbook\bike-reloot\figs\not_stolen769562406.png


In [12]:
# Now iterate through the list 

for j in df_src.index:
    time.sleep(0.1)
    pass_image_file = io.BytesIO(requests.get(df_src['src'][j]).content)
    pass_image = Image.open(pass_image_file).convert("RGB")
    file_path = Path("figs",df_src['label'][j] +"-"+ df_src['src_id'][j] +"-"+ df_src['src_id'][j] + ".png").absolute()
    image.save(file_path, "PNG", quality=80)
    print("image with id "+df_src['src_id'][j]+" saved")
    print("URL: "+ df_src['src'][j])

image with id 235536717 saved
URL: https://cache.willhaben.at/mmo/5/769/562/315_-235536717.jpg
image with id 491453809 saved
URL: https://cache.willhaben.at/mmo/5/769/562/315_-491453809.jpg
image with id 851510819 saved
URL: https://cache.willhaben.at/mmo/5/769/562/315_-851510819.jpg
image with id 1475545040 saved
URL: https://cache.willhaben.at/mmo/5/769/562/315_-1475545040.jpg
image with id 2053380139 saved
URL: https://cache.willhaben.at/mmo/5/769/562/315_-2053380139.jpg
image with id 131533182 saved
URL: https://cache.willhaben.at/mmo/5/769/562/315_-131533182.jpg
image with id 315_926155059 saved
URL: https://cache.willhaben.at/mmo/5/769/562/315_926155059.jpg
image with id 461249392 saved
URL: https://cache.willhaben.at/mmo/5/769/562/315_-461249392.jpg
image with id 913702457 saved
URL: https://cache.willhaben.at/mmo/5/769/562/315_-913702457.jpg
image with id 1228474320 saved
URL: https://cache.willhaben.at/mmo/5/769/562/315_-1228474320.jpg
image with id 1605503307 saved
URL: https

In [None]:
# Start the CNN

In [27]:
from fastai.vision.all import *

def is_stolen(x): return x.startswith('stolen')

print(is_stolen("stolen_asdf"))

path = Path("figs").absolute()

print(path)

dls = ImageDataLoaders.from_name_func(
    path, get_image_files(path), valid_pct=0.2, seed=42,
    label_func=is_stolen, item_tfms=Resize(224))
dls
learn = vision_learner(dls, resnet34, metrics=error_rate)
learn.fine_tune(3)

True
C:\Users\georg\Documents\GIT\fastbook\bike-reloot\figs


epoch,train_loss,valid_loss,error_rate,time
0,,3.961857,0.5,00:02


epoch,train_loss,valid_loss,error_rate,time
0,,3.961857,0.5,00:02
1,,3.961857,0.5,00:02
2,,3.961857,0.5,00:02
