## Data scraping and download

In [2]:
import time
import petpy
import os
import json
import pandas as pd
import urllib.request
import urllib.error
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool

In [3]:
#key = os.getenv('PETFINDER_KEY')
pf = petpy.Petfinder('key', 'secret')

For my image search I decided to start in Texas. I used Brady, Tx as the location since it is the center of Texas and set the distance to 500 to get the whole state.
I am scraping 10k dog images.

In [None]:
breeds = pf.animals(location=76825, animal_type='dog', distance=500, pages=100, results_per_page=100, return_df=True)

In [45]:
breeds.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9813 entries, 0 to 9999
Data columns (total 24 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   url                           9813 non-null   object 
 1   type                          9813 non-null   object 
 2   species                       9813 non-null   object 
 3   age                           9813 non-null   object 
 4   gender                        9813 non-null   object 
 5   size                          9813 non-null   object 
 6   coat                          2855 non-null   object 
 7   name                          9813 non-null   object 
 8   description                   7318 non-null   object 
 9   photos                        9813 non-null   object 
 10  videos                        9813 non-null   object 
 11  breeds.primary                9813 non-null   object 
 12  breeds.secondary              3734 non-null   object 
 13  bre

In [41]:
breeds.head()

Unnamed: 0,url,type,species,age,gender,size,coat,name,description,photos,...,breeds.unknown,colors.primary,colors.secondary,colors.tertiary,primary_photo_cropped.small,primary_photo_cropped.medium,primary_photo_cropped.large,primary_photo_cropped.full,animal_type,primary_photo_cropped
0,https://www.petfinder.com/dog/bud-49113000/tx/...,Dog,Dog,Baby,Male,Medium,Short,BUD,"Hi! I’m Bud, a 6-month-old Cur/Collie mix. I’m...",[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,...,False,Black,White / Cream,,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,dog,
1,https://www.petfinder.com/dog/daisy-49113009/t...,Dog,Dog,Young,Female,Medium,,Daisy,,[],...,False,Yellow / Tan / Blond / Fawn,White / Cream,,,,,,dog,
2,https://www.petfinder.com/dog/phillip-20-09-09...,Dog,Dog,Senior,Male,Large,,Phillip 20-09-095,,[],...,False,Yellow / Tan / Blond / Fawn,,,,,,,dog,
3,https://www.petfinder.com/dog/sophie-reutter-2...,Dog,Dog,Young,Female,Medium,,Sophie Reutter 20-09-094,,[],...,False,Yellow / Tan / Blond / Fawn,,,,,,,dog,
4,https://www.petfinder.com/dog/ramona-49112958/...,Dog,Dog,Baby,Female,Small,Short,Ramona,These adorably tiny chi mixes will make your l...,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,...,False,Black,Yellow / Tan / Blond / Fawn,,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,dog,


Here I drop many of the columns I don't need to clean up my dataframe.

In [None]:
breeds.drop(columns=['attributes.spayed_neutered','attributes.house_trained','attributes.declawed',
                    'attributes.special_needs','attributes.shots_current','environment.children', 
                     'environment.dogs', 'environment.cats','contact.email', 'contact.phone', 
                     'contact.address.address1','contact.address.address2', 'contact.address.city',
                     'contact.address.state', 'contact.address.postcode',
                     'contact.address.country','organization_id','status', 'tags','status_changed_at', 
                     'published_at', 'distance', 'animal_id', 'organization_animal_id'], inplace=True)

I am again going to export my dataframe to a CSV so I will not have to re-scrape any data.

In [38]:
#breeds.to_csv('./data/10k_mostlyraw_9-16-20.csv')

In [78]:
#breeds=pd.read_csv('./data/10k_mostlyraw_9-16-20.csv')

In [84]:
breeds.columns

Index(['id', 'url', 'type', 'species', 'age', 'gender', 'size', 'coat', 'name',
       'description', 'photos', 'videos', 'breeds.primary', 'breeds.secondary',
       'breeds.mixed', 'breeds.unknown', 'colors.primary', 'colors.secondary',
       'colors.tertiary', 'primary_photo_cropped.small',
       'primary_photo_cropped.medium', 'primary_photo_cropped.large',
       'primary_photo_cropped.full', 'animal_type', 'primary_photo_cropped'],
      dtype='object')

In [42]:
pf.animal_types('dog')

{'type': {'name': 'Dog',
  'coats': ['Hairless', 'Short', 'Medium', 'Long', 'Wire', 'Curly'],
  'colors': ['Apricot / Beige',
   'Bicolor',
   'Black',
   'Brindle',
   'Brown / Chocolate',
   'Golden',
   'Gray / Blue / Silver',
   'Harlequin',
   'Merle (Blue)',
   'Merle (Red)',
   'Red / Chestnut / Orange',
   'Sable',
   'Tricolor (Brown, Black, & White)',
   'White / Cream',
   'Yellow / Tan / Blond / Fawn'],
  'genders': ['Male', 'Female'],
  '_links': {'self': {'href': '/v2/types/dog'},
   'breeds': {'href': '/v2/types/dog/breeds'}}}}

I am dropping all rows that have null values for all the photo columns.

In [85]:
breeds = breeds[breeds['primary_photo_cropped.small'].notna()]
breeds.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9813 entries, 0 to 9999
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            9813 non-null   int64  
 1   url                           9813 non-null   object 
 2   type                          9813 non-null   object 
 3   species                       9813 non-null   object 
 4   age                           9813 non-null   object 
 5   gender                        9813 non-null   object 
 6   size                          9813 non-null   object 
 7   coat                          2855 non-null   object 
 8   name                          9813 non-null   object 
 9   description                   7318 non-null   object 
 10  photos                        9813 non-null   object 
 11  videos                        9813 non-null   object 
 12  breeds.primary                9813 non-null   object 
 13  bre

I am saving another version of the dataframe to csv.

In [86]:
#breeds.to_csv('./data/10k_mostlyraw_9-16-20CLEANED.csv')

In [16]:
#breeds=pd.read_csv('./data/10k_mostlyraw_9-16-20CLEANED.csv')

Now I remove all columns except ones that contain id, breed and photo.

In [17]:
photos = breeds[breeds.columns[breeds.columns.str.contains('id|breed|photo')]]

In [102]:
photos

Unnamed: 0,id,photos,videos,breeds.primary,breeds.secondary,breeds.mixed,breeds.unknown,primary_photo_cropped.small,primary_photo_cropped.medium,primary_photo_cropped.large,primary_photo_cropped.full,primary_photo_cropped,image_width
0,49113000,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Black Mouth Cur,Collie,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600
4,49112958,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Chihuahua,,False,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600
6,49112972,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Shepherd,,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600
7,49112968,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],German Shepherd Dog,,False,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600
8,49112969,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Mixed Breed,,False,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,45801263,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Labrador Retriever,Mixed Breed,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600
9996,45795879,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Shepherd,,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600
9997,45795880,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Labrador Retriever,Cattle Dog,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600
9998,45795882,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Shepherd,,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600


I am getting the size of the largest photos because those are the ones I will download.

In [18]:
photos['image_width'] = photos['primary_photo_cropped.large'].str.split('width=', 1).str[1].str.split('&', 0).str[0].astype(int)
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


I create a column called image_width to display the size of the large photos.

In [19]:
photos['image_width'].unique()

array([600])

I set the index to be the dog id column.

In [20]:
photos.set_index('id')

Unnamed: 0_level_0,photos,videos,breeds.primary,breeds.secondary,breeds.mixed,breeds.unknown,primary_photo_cropped.small,primary_photo_cropped.medium,primary_photo_cropped.large,primary_photo_cropped.full,primary_photo_cropped,image_width
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
49113000,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Black Mouth Cur,Collie,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600
49112958,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Chihuahua,,False,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600
49112972,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Shepherd,,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600
49112968,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],German Shepherd Dog,,False,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600
49112969,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Mixed Breed,,False,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600
...,...,...,...,...,...,...,...,...,...,...,...,...
45801263,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Labrador Retriever,Mixed Breed,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600
45795879,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Shepherd,,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600
45795880,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Labrador Retriever,Cattle Dog,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600
45795882,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Shepherd,,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600


I am only keeping the images that have a width of 600.

In [21]:
largephotos = photos.groupby('id').apply(lambda x: x[x['image_width'] == 600])

In [106]:
largephotos.head()

Unnamed: 0,id,photos,videos,breeds.primary,breeds.secondary,breeds.mixed,breeds.unknown,primary_photo_cropped.small,primary_photo_cropped.medium,primary_photo_cropped.large,primary_photo_cropped.full,primary_photo_cropped,image_width
0,45795879,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Shepherd,,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600
1,45795880,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Labrador Retriever,Cattle Dog,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600
2,45795882,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Shepherd,,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600
3,45795884,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Cattle Dog,,False,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600
4,45801263,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Labrador Retriever,Mixed Breed,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600


Now my dataset has a few extra columns so I delete those and reset the index to be the dog id.

In [22]:
del largephotos['id']
largephotos.reset_index(inplace=True)
del largephotos['level_1']

For the 'breeds.primary' column I replace spaces with _ and / with nothing.

In [23]:
largephotos['breeds.primary'] = largephotos['breeds.primary'].str.replace(' ', '_')
largephotos['breeds.primary'] = largephotos['breeds.primary'].str.replace('/', '')

I check my value counts to see most popular breed in my dataset.

In [12]:
largephotos['breeds.primary'].value_counts()

Pit Bull Terrier       1524
Labrador Retriever     1347
Chihuahua               572
Mixed Breed             495
German Shepherd Dog     473
                       ... 
Swedish Vallhund          1
Pyrenean Shepherd         1
Maltipoo                  1
Schipperke                1
Standard Poodle           1
Name: breeds.primary, Length: 159, dtype: int64

In [14]:
largephotos = largephotos.groupby('breeds.primary').head()
largephotos.head()

Unnamed: 0,id,photos,videos,breeds.primary,breeds.secondary,breeds.mixed,breeds.unknown,primary_photo_cropped.small,primary_photo_cropped.medium,primary_photo_cropped.large,primary_photo_cropped.full,primary_photo_cropped,image_width
0,45795879,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Shepherd,,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600
1,45795880,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Labrador Retriever,Cattle Dog,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600
2,45795882,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Shepherd,,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600
3,45795884,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Cattle Dog,,False,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600
4,45801263,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Labrador Retriever,Mixed Breed,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,,600


In [15]:
urls, breed, index = largephotos['primary_photo_cropped.large'].tolist(), largephotos['breeds.primary'].tolist(), largephotos.index.tolist()

I turn that into a list of lists

In [16]:
breed_list=[index, breed, urls]

In [17]:
breed_list_new = []
for i in range(0, len(breed_list[0])):
    breed_list_new.append([breed_list[0][i], breed_list[1][i], breed_list[2][i]])

Create breed directories for the images.

In [18]:
breed_dirs = list(largephotos['breeds.primary'].unique())

Create a directory, dog_breeds, that will contain all the breed directories.

In [None]:
for i in breed_dirs:
    os.makedirs('dog_breeds/' + str(i))

Next, I have a function to download the images.

In [158]:
def download_breed_images(breed_img):
    try:
        urllib.request.urlretrieve(breed_img[2], 
                                   os.path.join('dog_breeds/', 
                                                str(breed_img[1]), str(breed_img[1]) + str(breed_img[0]) + '.jpg'))
    except urllib.error.HTTPError as err:
        print(err.code)

In [159]:
pool = ThreadPool(processes=20)

In [None]:
pool.map(download_breed_images, breed_list_new)
pool.close()
pool.join()