# Download training images from Google search


In [None]:
from bs4 import BeautifulSoup
import requests
import os
import pandas as pd
import os
import random
import shutil

Create a list of criteria to download images

In [None]:
# Furniture styles to search for; each one is combined with every entry
# of `furnitures` to form a search criterion.
styles = [
    'henri iv',
    'louis xiii',
    'louis xiv',
    'louis xv',
    'louis xvi',
    'french empire',
    'art nouveau',
    'art deco',
]

# Keywords appended to a style name to vary the search results.
furnitures = [
    'chair',
    'table',
    'furniture',
    'antique',
    'desk',
    'bookcase',
    'sofa',
    'meuble',
    'style',
    'armchair',
    'antiquite',
    'commode',
    'gueridon',
    'armoire',
    'upholstery',
]


Download a list of unique image urls from Google via requests and Beautiful Soup.

In [None]:
def build_url(criterion):
    """
    create the Google Images search url for a criterion

    criterion: free-text search string, e.g. 'louis xv chair'
    returns: the complete search url as a string; the criterion is
             percent-encoded so that reserved characters ('&', '#', '+',
             accented letters, ...) cannot corrupt the query string
    """
    # Local import so the function does not depend on the notebook's import cell.
    from urllib.parse import quote_plus

    start = 'https://www.google.com/search?q='
    # quote_plus still turns each space into '+', and additionally escapes
    # every character that is not safe inside a query-string value.
    mid = quote_plus(criterion)
    end = '&source=lnms&tbm=isch&sa=X&ved=0ahUKEwiF3qqzpYzcAhUHwVQKHX6PB5EQ_AUICygC&biw=1242&bih=715'
    return start + mid + end

def extract_img_src(data):
    """
    generates a list of urls
    each url being the ``src`` of an <img> tag found in data
    data being the results from the Google Image query

    data: response object whose ``content`` attribute holds the HTML page
    returns: list of ``src`` attribute values, in document order;
             <img> tags without a ``src`` attribute are skipped
    """
    soup = BeautifulSoup(data.content, "html.parser")
    # src=True keeps only tags that actually carry a src attribute;
    # indexing image['src'] on a tag without one raises KeyError.
    images = soup.find_all('img', src=True)
    return [image['src'] for image in images]


In [None]:
# Query Google Images once per (style, furniture) pair and record every
# returned image url together with the style it was searched for.
criterion_dfs = []
for style in styles:
    for furniture in furnitures:
        criterion = style + ' ' + furniture
        url = build_url(criterion)
        data = requests.get(url)
        img_src = extract_img_src(data)
        criterion_df = pd.DataFrame({'img_src': img_src})
        criterion_df['style'] = style
        criterion_dfs.append(criterion_df)

# Concatenate once at the end: calling pd.concat inside the loop copies the
# whole accumulated frame on every iteration.
# The empty fallback keeps the result a DataFrame when there is nothing to query.
df = pd.concat(criterion_dfs, axis=0) if criterion_dfs else pd.DataFrame()

df = df.reset_index(drop=True)
df.head()

Let's clean the dataset a bit. First, 'henri iv' and 'louis xiii' can be grouped into one style, as can 'louis xiv' and 'louis xv', and 'louis xvi' and 'french empire'.

Also some images may appear in different search results.
- if it's for the same style search, the image is simply very representative of the style. It must be deduplicated.
- if it's for several style searches, it's probably a mistake. The image must be removed completely

In [None]:
# Snapshot of the raw search results, taken before df is modified.
sav = df.copy(deep=True)
sav.tail()

In [None]:
# group styles: relabel each searched style with the class it belongs to
style_groups = {
    'henri iv': 'louis xiii',
    'louis xiii': 'louis xiii',
    'louis xiv': 'baroque',
    'louis xv': 'baroque',
    'louis xvi': 'neoclassical',
    'french empire': 'neoclassical',
}
for searched_style, grouped_style in style_groups.items():
    df.loc[df['style'] == searched_style, 'style'] = grouped_style

# deduplicate image sources within a style
styles = df['style'].unique()
style_dfs = []
for style_name in styles:
    rows_of_style = df[df['style'] == style_name]
    style_dfs.append(rows_of_style.drop_duplicates(keep='first'))
df = pd.concat(style_dfs)

# remove duplicates across styles: keep=False drops every copy of an
# image source that still appears more than once
df = df.drop_duplicates(subset='img_src', keep=False)
df.shape

Download the images in different folders

In [None]:
def download_img(url, path, timeout=30):
    """
    download an image locally

    url: address of the image to fetch
    path: file path the downloaded bytes are written to
    timeout: seconds passed to requests.get as its timeout; without one a
             stalled server would block the download loop indefinitely
    returns: True if the image was written, False if the server answered
             with an error status (nothing is written in that case)
    raises: whatever requests.get raises (connection errors, timeouts, ...)
    """
    response = requests.get(url, timeout=timeout)
    # The body of an error response is not the requested image, so it must
    # not be saved under an image file name.
    if not response.ok:
        return False
    with open(path, 'wb') as f:
        f.write(response.content)
    return True

styles = df['style'].unique()
# exist_ok=True tolerates an already existing folder without the former
# bare `except: pass`, which also hid unrelated errors (e.g. permissions).
os.makedirs('data', exist_ok=True)

for style in styles:
    # one sub-folder per style, created only if it is missing
    os.makedirs(os.path.join('data', style), exist_ok=True)
    img_urls = df.loc[df['style'] == style, 'img_src']
    # images are numbered from 0 within each style folder
    for i, img_url in enumerate(img_urls):
        filename = str(i) + '.jpg'
        path = os.path.join('data', style, filename)
        download_img(img_url, path)

Split the images between a train and a validation folder (the directory layout required to train Keras models).

In [None]:
# reclass images: copy each style's images into data/train/<style> and
# data/validation/<style>
for style in styles:
    style_dir = os.path.join('data', style)
    train_dir = os.path.join('data', 'train', style)
    validation_dir = os.path.join('data', 'validation', style)
    for split_dir in (train_dir, validation_dir):
        if not os.path.exists(split_dir):
            os.makedirs(split_dir)

    imgs = os.listdir(style_dir)
    print(style)
    print(len(imgs))

    # random assignment: after shuffling, the first 140 files go to train
    # and the next 62 (indices 140-201) to validation; the rest is unused
    random.shuffle(imgs)
    train, validation = imgs[:140], imgs[140:202]
    for split_dir, split_imgs in ((train_dir, train), (validation_dir, validation)):
        for img in split_imgs:
            source = os.path.join(style_dir, img)
            target = os.path.join(split_dir, img)
            shutil.copyfile(source, target)