## Data scraping and download (Round 2)

In [2]:
import time
import petpy
import os
import json
import pandas as pd
import urllib.request
import urllib.error
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool

In [3]:
key = os.getenv('PETFINDER_KEY')
pf = petpy.Petfinder('key', 'secret')

Since this is my second round of scraping dog images, I have decided to use Nashville as the location (since it's far enough from the center of Texas) and set the distance to 500 to cast a far net.
I was able to get another 10k dogs

In [5]:
breeds = pf.animals(location=37011, animal_type='dog', distance=500, pages=100, results_per_page=100, return_df=True)

In [6]:
breeds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 50 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            10000 non-null  int64  
 1   organization_id               10000 non-null  object 
 2   url                           10000 non-null  object 
 3   type                          10000 non-null  object 
 4   species                       10000 non-null  object 
 5   age                           10000 non-null  object 
 6   gender                        10000 non-null  object 
 7   size                          10000 non-null  object 
 8   coat                          3239 non-null   object 
 9   tags                          10000 non-null  object 
 10  name                          10000 non-null  object 
 11  description                   7815 non-null   object 
 12  organization_animal_id        6742 non-null   object 
 13  ph

In [7]:
breeds.head()

Unnamed: 0,id,organization_id,url,type,species,age,gender,size,coat,tags,...,contact.address.state,contact.address.postcode,contact.address.country,animal_id,animal_type,organization_id.1,primary_photo_cropped.small,primary_photo_cropped.medium,primary_photo_cropped.large,primary_photo_cropped.full
0,49398423,OH82,https://www.petfinder.com/dog/eden-49398423/oh...,Dog,Dog,Young,Female,Medium,,[],...,OH,45458,US,49398423,dog,oh82,,,,
1,49398414,OH82,https://www.petfinder.com/dog/valerie-49398414...,Dog,Dog,Adult,Female,Medium,,[],...,OH,45458,US,49398414,dog,oh82,,,,
2,49398415,OH82,https://www.petfinder.com/dog/halen-49398415/o...,Dog,Dog,Adult,Male,Medium,,[],...,OH,45458,US,49398415,dog,oh82,,,,
3,49398421,OH82,https://www.petfinder.com/dog/nate-49398421/oh...,Dog,Dog,Young,Male,Small,,[],...,OH,45458,US,49398421,dog,oh82,,,,
4,49398412,OH82,https://www.petfinder.com/dog/ace-49398412/oh/...,Dog,Dog,Adult,Male,Large,,[],...,OH,45458,US,49398412,dog,oh82,,,,


Here I drop many of the columns I don't need to clean up my dataframe.

In [8]:
# i am dropping columns i dont need
breeds.drop(columns=['attributes.spayed_neutered','attributes.house_trained','attributes.declawed',
                    'attributes.special_needs','attributes.shots_current','environment.children', 
                     'environment.dogs', 'environment.cats','contact.email', 'contact.phone', 
                     'contact.address.address1','contact.address.address2', 'contact.address.city',
                     'contact.address.state', 'contact.address.postcode',
                     'contact.address.country','organization_id','status', 'tags',
                     'status_changed_at', 'published_at', 'distance', 'animal_id',
                     'organization_animal_id'], inplace=True)

I am again going to export my dataframe to a CSV so I will not have to re-scrape any data.

In [9]:
#breeds.to_csv('./data/10k_mostlyraw_nashville_10-09-20.csv')

In [78]:
#breeds=pd.read_csv('./data/10k_mostlyraw_9-16-20.csv')

In [10]:
breeds.columns

Index(['id', 'url', 'type', 'species', 'age', 'gender', 'size', 'coat', 'name',
       'description', 'photos', 'primary_photo_cropped', 'videos',
       'breeds.primary', 'breeds.secondary', 'breeds.mixed', 'breeds.unknown',
       'colors.primary', 'colors.secondary', 'colors.tertiary', 'animal_type',
       'primary_photo_cropped.small', 'primary_photo_cropped.medium',
       'primary_photo_cropped.large', 'primary_photo_cropped.full'],
      dtype='object')

In [11]:
pf.animal_types('dog')

{'type': {'name': 'Dog',
  'coats': ['Hairless', 'Short', 'Medium', 'Long', 'Wire', 'Curly'],
  'colors': ['Apricot / Beige',
   'Bicolor',
   'Black',
   'Brindle',
   'Brown / Chocolate',
   'Golden',
   'Gray / Blue / Silver',
   'Harlequin',
   'Merle (Blue)',
   'Merle (Red)',
   'Red / Chestnut / Orange',
   'Sable',
   'Tricolor (Brown, Black, & White)',
   'White / Cream',
   'Yellow / Tan / Blond / Fawn'],
  'genders': ['Male', 'Female'],
  '_links': {'self': {'href': '/v2/types/dog'},
   'breeds': {'href': '/v2/types/dog/breeds'}}}}

I am dropping all rows that have null values for all the photo columns.

In [13]:
breeds = breeds[breeds['primary_photo_cropped.small'].notna()]
breeds.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9713 entries, 5 to 9999
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            9713 non-null   int64  
 1   url                           9713 non-null   object 
 2   type                          9713 non-null   object 
 3   species                       9713 non-null   object 
 4   age                           9713 non-null   object 
 5   gender                        9713 non-null   object 
 6   size                          9713 non-null   object 
 7   coat                          3208 non-null   object 
 8   name                          9713 non-null   object 
 9   description                   7703 non-null   object 
 10  photos                        9713 non-null   object 
 11  primary_photo_cropped         0 non-null      float64
 12  videos                        9713 non-null   object 
 13  bre

I am saving another version of the dataframe to csv.

In [14]:
#breeds.to_csv('./data/10k_mostlyraw_nashville_10-09-20CLEANED.csv')

In [16]:
#breeds=pd.read_csv('./data/10k_mostlyraw_9-16-20CLEANED.csv')

Now I remove all columns except ones that contain id, breed and photo.

In [16]:
photos = breeds[breeds.columns[breeds.columns.str.contains('id|breed|photo')]]

In [17]:
photos

Unnamed: 0,id,photos,primary_photo_cropped,videos,breeds.primary,breeds.secondary,breeds.mixed,breeds.unknown,primary_photo_cropped.small,primary_photo_cropped.medium,primary_photo_cropped.large,primary_photo_cropped.full
5,49398401,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,,[],Mixed Breed,,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...
6,49398290,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,,[],Australian Cattle Dog / Blue Heeler,German Shepherd Dog,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...
7,49393253,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,,[],Terrier,,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...
8,49398318,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,,[],Shepherd,,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...
10,49398317,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,,[],Shepherd,,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,49088756,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,,[],American Bulldog,,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...
9996,49088535,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,,[],Miniature Pinscher,,False,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...
9997,49088674,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,,[],Border Collie,Mixed Breed,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...
9998,49088617,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,,[],Golden Retriever,,False,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...


I am getting the size of the largest photos because those are the ones I will download.

In [18]:
# i am getting the size of the largest photo
photos['image_width'] = photos['primary_photo_cropped.large'].str.split('width=', 1).str[1].str.split('&', 0).str[0].astype(int)
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


I create a column called image_width to display the size of the large photos.

In [19]:
photos['image_width'].unique()

array([600])

I set the index to be the dog id column.

In [20]:
photos.set_index('id')

Unnamed: 0_level_0,photos,primary_photo_cropped,videos,breeds.primary,breeds.secondary,breeds.mixed,breeds.unknown,primary_photo_cropped.small,primary_photo_cropped.medium,primary_photo_cropped.large,primary_photo_cropped.full,image_width
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
49398401,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,,[],Mixed Breed,,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,600
49398290,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,,[],Australian Cattle Dog / Blue Heeler,German Shepherd Dog,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,600
49393253,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,,[],Terrier,,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,600
49398318,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,,[],Shepherd,,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,600
49398317,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,,[],Shepherd,,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,600
...,...,...,...,...,...,...,...,...,...,...,...,...
49088756,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,,[],American Bulldog,,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,600
49088535,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,,[],Miniature Pinscher,,False,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,600
49088674,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,,[],Border Collie,Mixed Breed,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,600
49088617,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,,[],Golden Retriever,,False,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,600


I need to drop the primary_photo_cropped column since it is empty.

In [21]:
photos.drop(columns=['primary_photo_cropped'], inplace=True)
photos.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,id,photos,videos,breeds.primary,breeds.secondary,breeds.mixed,breeds.unknown,primary_photo_cropped.small,primary_photo_cropped.medium,primary_photo_cropped.large,primary_photo_cropped.full,image_width
5,49398401,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Mixed Breed,,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,600
6,49398290,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Australian Cattle Dog / Blue Heeler,German Shepherd Dog,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,600
7,49393253,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Terrier,,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,600
8,49398318,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Shepherd,,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,600
10,49398317,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Shepherd,,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,600


I have decided to keep the photos that have a width of 600

In [39]:
lrgphotos = photos.groupby('id').apply(lambda x: x[x['image_width'] == 600])

In [24]:
lrgphotos.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,id,photos,videos,breeds.primary,breeds.secondary,breeds.mixed,breeds.unknown,primary_photo_cropped.small,primary_photo_cropped.medium,primary_photo_cropped.large,primary_photo_cropped.full,image_width
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
46140200,9091,46140200,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Pit Bull Terrier,,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,600
48849178,6426,48849178,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Weimaraner,,False,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,600
49010594,7835,49010594,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Dachshund,,False,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,600
49029975,8737,49029975,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Pit Bull Terrier,,False,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,600
49052909,7399,49052909,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[],Mountain Cur,Pit Bull Terrier,True,False,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,https://dl5zpyw5k3jeb.cloudfront.net/photos/pe...,600


Now my dataset has a few extra columns so I delete those and reset the index to be the dog id.

In [40]:
del lrgphotos['id']
lrgphotos.reset_index(inplace=True)
del lrgphotos['level_1']

For the 'breeds.primary' column I replace spaces with _ and / with nothing.

In [28]:
lrgphotos['breeds.primary'] = lrgphotos['breeds.primary'].str.replace(' ', '_')
lrgphotos['breeds.primary'] = lrgphotos['breeds.primary'].str.replace('/', '')

I check my value counts to see most popular breed in my dataset. Just like my last scrape, the Pit Bull Terrier has the highest count.

In [29]:
lrgphotos['breeds.primary'].value_counts()

Pit Bull Terrier                      1344
Labrador Retriever                    1039
Mixed Breed                            936
Terrier                                509
Hound                                  462
                                      ... 
McNab                                    1
Akbash                                   1
Nova Scotia Duck Tolling Retriever       1
Pyrenean Shepherd                        1
Pembroke Welsh Corgi                     1
Name: breeds.primary, Length: 172, dtype: int64

In [30]:
urls, breed, index = lrgphotos['primary_photo_cropped.large'].tolist(), lrgphotos['breeds.primary'].tolist(), lrgphotos.index.tolist()

In [31]:
breed_list=[index, breed, urls]

In [32]:
breed_list_new = []
for i in range(0, len(breed_list[0])):
    breed_list_new.append([breed_list[0][i], breed_list[1][i], breed_list[2][i]])

In [33]:
len(breed_list_new)

9713

Create breed directories for the images.

In [34]:
breed_dirs = list(lrgphotos['breeds.primary'].unique())

Create a directory, dog_breeds, that will contain all the breed directories.

In [35]:
for i in breed_dirs:
    os.makedirs('dog_breeds/' + str(i))

A function to download the images.

In [36]:
def download_breed_images(breed_img):
    try:
        urllib.request.urlretrieve(breed_img[2], 
                                   os.path.join('dog_breeds/', 
                                                str(breed_img[1]), str(breed_img[1]) + str(breed_img[0]) + '.jpg'))
    except urllib.error.HTTPError as err:
        print(err.code)

In [37]:
pool = ThreadPool(processes=20)

In [38]:
pool.map(download_breed_images, breed_list_new)
pool.close()
pool.join()

415
