# Building a custom data set and training a classifier for pedestrians, cars and cyclists

**Steps:**

1. Open your Mozilla Firefox browser,
2. enter your search term, e.g. "cyclists",
3. click "images",
4. open your web console using F12,
5. enter into the console tab: 
```javascript 
// For every result thumbnail prefer data-iurl and fall back to data-src; yields undefined when neither is set.
urls = Array.from(document.querySelectorAll('.rg_i'), (thumb) => {
  const { iurl, src } = thumb.dataset;
  if (typeof iurl != "undefined") { return iurl; }
  if (typeof src != "undefined") { return src; }
  return undefined;
});
``` 
(this should collect the URLs of all images in the results list - you can verify that `.rg_i` really is the correct class name by inspecting the page's HTML in the Inspector tab of your web console); then drop the entries for which no URL was found:
```javascript
// Keep only the entries for which a URL was actually found.
urls_filtered = urls.filter((url) => typeof url != "undefined");
```
6. enter into the console tab:
```javascript 
// encodeURIComponent (unlike the deprecated escape) produces valid UTF-8 percent-encoding,
// so URLs containing non-ASCII characters are written to the CSV unchanged.
window.open('data:text/csv;charset=utf-8,' + encodeURIComponent(urls_filtered.join('\n')));
``` 
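(this opens a dialog to save the contents of the `urls_filtered` variable defined above as a CSV file in your local downloads directory - you need to allow the popup to open for the download to start; save or rename the file as `<search term>.csv`, e.g. `cyclists.csv`, and put it into the base directory used below, because that is where the download cell looks for it)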
7. repeat steps 2-6 for every class (cyclists, cars, pedestrians), then download and store the images, create a data bunch, and train and evaluate the model using the cells below

In [None]:
from fastai import *
from fastai.vision import *
import shutil

## Download and store images from the Google search above

In [None]:
# Root folder that holds the URL lists exported from the browser; the images
# of each class are downloaded into a sub-folder next to them.
base_dir = pathlib.Path(r"D:\fastai\google_images")

# Per class: the CSV with the image URLs (<class>.csv) and the folder the
# images are stored in (<class>/).
data_paths = {
    label: {
        "urls_csv_path": base_dir/(label + ".csv"),
        "image_dir": base_dir/label,
    }
    for label in ("cyclists", "cars", "pedestrians")
}

data_paths

In [None]:
# Download the images of every class, then weed out files that cannot be
# opened as images.
for label in sorted(data_paths):

    print("Processing", label)
    paths = data_paths[label]
    download_images(paths["urls_csv_path"], paths["image_dir"])
    # Image-search results routinely contain dead links, HTML error pages or
    # truncated files; remove them now so they do not break data loading or
    # training later on.
    verify_images(paths["image_dir"], delete=True)

## Copy images into one dir and rename them adding their class to the name

In [None]:
# Single folder that will receive the images of all classes.
target_dir = base_dir.joinpath("pedestrians_cyclists_and_cars")

In [None]:
# Create the merged-image folder on first use. `parents=True` also creates
# missing parent folders instead of failing with FileNotFoundError.
if not target_dir.exists():
    print("Creating {}".format(target_dir))
    target_dir.mkdir(parents=True, exist_ok=True)

target_dir

In [None]:
# Pair every downloaded image with its class-prefixed destination inside
# target_dir and remember the class label of each pair.
stuff = []
for label in data_paths:
    image_dir = data_paths[label]["image_dir"]
    stuff += [
        (f, target_dir/"{}_{}".format(label, f.name), label)
        for f in image_dir.iterdir()
        if f.is_file()  # shutil.copyfile cannot copy sub-directories
    ]

if not stuff:
    # Fail with a readable message instead of a bare tuple-unpacking error.
    raise ValueError("No image files found in the image directories - run the download cell first")

# Transpose the (source, target, label) triples into three parallel tuples.
source_files, target_files, labels = zip(*stuff)
print("source: {},\ntarget: {},\nlabel: {}".format(source_files[:3], target_files[:3], labels[:3]))

In [None]:
# Copy each image to its class-prefixed name in the merged folder.
for src, dst in zip(source_files, target_files):
    shutil.copyfile(src, dst)

## Create an `ImageDataBunch` object and inspect

In [None]:
import numpy as np

# Fix the RNG so the random 80/20 train/validation split is the same on every
# run; otherwise results from different runs are measured on different images.
np.random.seed(42)

tfms = get_transforms()
# The resnet34 used below is created with ImageNet-pretrained weights, so the
# inputs are normalised with the ImageNet statistics instead of statistics
# estimated from one batch of this data set.
data = ImageDataBunch.from_lists(
    path=target_dir, fnames=target_files, labels=labels,
    valid_pct=.2, bs=32, ds_tfms=tfms, size=224,
).normalize(imagenet_stats)

In [None]:
# Display a few samples (rows=3) to check by eye that images and labels match.
data.show_batch(rows=3)

In [None]:
# Show the class names found and `data.c` (expected to be the number of classes).
data.classes, data.c

## Setup and train an image model

In [None]:
# Build a CNN learner on a resnet34 backbone that reports the error rate as metric.
learn = cnn_learner(data, models.resnet34, metrics=error_rate)

In [None]:
# Run the learning-rate finder; its recorded results are plotted in the next cell.
learn.lr_find()

In [None]:
# Plot what the recorder captured during lr_find to choose a learning-rate range.
learn.recorder.plot()

In [None]:
# Learning-rate range intended for fit_one_cycle(max_lr=s).
# NOTE(review): only the commented-out training call below uses `s`, so it
# currently has no effect - confirm whether it should be passed to fit_one_cycle.
s = slice(1e-6, 1e-4) # depends on the optimal range identified using learn.lr_find

In [None]:
# Unfreeze the model so that training is not limited to the newly added head.
# NOTE(review): the usual fastai recipe trains with the backbone frozen first and
# unfreezes afterwards - confirm that training everything from the start is intended.
learn.unfreeze()

In [None]:
# Train for 4 epochs with the one-cycle policy and the default learning rate.
# Alternative that uses the hand-picked range `s` defined above:
# learn.fit_one_cycle(4, max_lr=s)
learn.fit_one_cycle(4)

In [None]:
# Save the trained model state under the name "stage-1-34" so it can be reloaded later.
learn.save("stage-1-34")

## Inspect results

In [None]:
# Show model results for a few samples (rows=2) of the training set.
learn.show_results(ds_type=DatasetType.Train, rows=2)

In [None]:
# Show model results for a few samples (rows=2) of the validation set.
learn.show_results(ds_type=DatasetType.Valid, rows=2)

In [None]:
# Build the interpretation object used by the loss and confusion analyses below.
interp = ClassificationInterpretation.from_learner(learn)

In [None]:
# Plot the 6 samples with the highest loss.
interp.plot_top_losses(6)

In [None]:
# Plot the confusion matrix of the predictions.
interp.plot_confusion_matrix()

In [None]:
# List the class pairs that were confused with each other at least once (min_val=1).
interp.most_confused(min_val=1)