In [90]:
import pandas as pd
import numpy as np
import scipy.stats as scs
import matplotlib.pyplot as plt
%matplotlib inline
import keras
from keras.models import Sequential, Model, Input
from keras.layers import Dense, Dropout, Activation
from sklearn.model_selection import train_test_split
import tensorflow as tf
import requests 
from bs4 import BeautifulSoup
import json
from IPython.display import display, Image
import urllib.request
from PIL.ExifTags import TAGS
import PIL.Image

In [91]:
def extract_image_url(pd_series):
    '''
    Extracts image URLs from the pictures column in the RescuePets database.

    INPUT: Pandas Series (or any iterable) where each item is a list of
           dictionaries, each dictionary carrying a 'largeUrl' key.
           Items that are not lists/tuples (e.g. None or NaN for animals
           without pictures) are skipped instead of raising a TypeError.
    OUTPUT: Pandas dataframe with columns animalID and ImageUrl, one row per
            picture. animalID is the second-to-last '/'-separated segment of
            the URL, kept as a string.
    '''
    # Flatten every per-animal picture list into one list of URLs.
    large_image_urls = [
        picture['largeUrl']
        for pictures in pd_series
        if isinstance(pictures, (list, tuple))
        for picture in pictures
    ]
    # The ID is read from the URL path: .../<animalID>/<file name>
    animalIDs = [url.split('/')[-2] for url in large_image_urls]

    return pd.DataFrame({'animalID': animalIDs, 'ImageUrl': large_image_urls})

In [109]:
def extract_df(filepath):
    '''
    Extracts orgID, animalID, name, breed and animalLocation from a
    RescueGroups JSON-lines file and adds an ImageUrl column.

    INPUT: JSON filepath, string (one JSON record per line)
    OUTPUT: tuple (result, images)
        result - one row per animal with the five columns above plus
                 ImageUrl, the 'largeUrl' of that animal's first picture
                 (None when the animal has no pictures).
        images - the full per-picture dataframe from extract_image_url,
                 so the additional pictures are not lost.
    '''
    df = pd.read_json(filepath, lines=True)
    images = extract_image_url(df.pictures)
    df1 = df[['orgID', 'animalID', 'name', 'breed', 'animalLocation']]

    def first_large_url(pictures):
        # Rows without a picture list (None / NaN / empty) get no URL.
        if isinstance(pictures, (list, tuple)) and pictures:
            return pictures[0]['largeUrl']
        return None

    # The previous positional concat (using the since-removed `join_axes`
    # argument) attached the i-th picture overall to the i-th animal, which
    # pairs animals with other animals' pictures whenever an animal has more
    # than one. Take the URL from each row's own picture list instead.
    result = df1.assign(ImageUrl=df.pictures.map(first_large_url))

    # Return combined dataframe and original image source dataframe
    return result, images

In [110]:
def download_images(urls, limit=25, dest_dir='data', timeout=30):
    '''
    Downloads images from the Rescue Pets S3 bucket.

    INPUT:
        urls     - Pandas Series (or any iterable) of image URLs
        limit    - maximum number of URLs to download, taken from the start
                   of `urls`; None downloads all of them (default 25)
        dest_dir - directory the files are written to; created if missing
                   (default 'data')
        timeout  - seconds to wait on each request before giving up
    OUTPUT: Images stored in dest_dir, named after the last path segment of
            each URL. URLs that return an HTTP error status are skipped.
    '''
    import os  # local import so the function does not depend on another cell

    os.makedirs(dest_dir, exist_ok=True)

    selected = list(urls)
    if limit is not None:
        selected = selected[:limit]

    for image_url in selected:
        image_name = image_url.split('/')[-1]
        r = requests.get(image_url, allow_redirects=True, timeout=timeout)
        if not r.ok:
            # Do not save an error page under an image file name.
            print('Skipping', image_url, '- HTTP status', r.status_code)
            continue
        # Context manager guarantees the file handle is flushed and closed.
        with open(os.path.join(dest_dir, image_name), 'wb') as image_file:
            image_file.write(r.content)

In [111]:
### Still working on this function
def rotate_image(file):
    '''
    Rotates images uploaded by user's smartphone via exif data.
    Images need to be rotated to proper orientation prior to preprocessing step.

    INPUT: path of the image file
    OUTPUT: PIL image. When the EXIF Orientation value is 3, 6 or 8 the
            rotated image is written back to `file` and returned; otherwise
            the image is returned as opened and the file is left untouched.
    '''
    # `Image` in this notebook is IPython.display.Image, so the PIL module is
    # referenced by its full name.
    image = PIL.Image.open(file)

    # Numeric EXIF tag id whose name is 'Orientation'.
    orientation_tag = next(
        (tag for tag, tag_name in TAGS.items() if tag_name == 'Orientation'),
        None,
    )

    # Not every image format exposes _getexif, and it returns None when the
    # file carries no EXIF block.
    read_exif = getattr(image, '_getexif', None)
    exif = read_exif() if read_exif is not None else None
    orientation = exif.get(orientation_tag) if exif else None

    # EXIF orientation value -> degrees passed to Image.rotate
    degrees = {3: 180, 6: 270, 8: 90}.get(orientation)
    if degrees is None:
        # Load pixel data now so the returned image does not depend on the
        # file staying open.
        image.load()
        return image

    print('Rotate %d degrees!' % degrees)
    rotated = image.rotate(degrees, expand=True)
    image.close()
    rotated.save(file)
    return rotated

In [None]:
#Function to extract exif data from smartphone image and view in nice format

from PIL.ExifTags import TAGS

def extract_image_data(file):
    '''
    Prints the EXIF data of an image, one "tag value" pair per line.

    INPUT: path of the image file
    OUTPUT: None. Numeric tag ids are shown by name when they appear in
            PIL.ExifTags.TAGS and as the raw id otherwise. Nothing is printed
            when the image has no EXIF data.
    '''
    with PIL.Image.open(file) as im:
        # Not every image format exposes _getexif.
        read_exif = getattr(im, '_getexif', None)
        exifdict = read_exif() if read_exif is not None else None

    if not exifdict:
        return

    for k, value in exifdict.items():
        print(TAGS.get(k, k), value)

In [112]:
#new_pets_df = pd.read_json('data/h9DH7711_newpets_1.json', lines=True)
#pets1_df = pd.read_json('data/h9DH7711_pets_1.json', lines=True)
#pets2_df = pd.read_json('data/h9DH7711_pets_2.json', lines=True)
#pets3_df = pd.read_json('data/h9DH7711_pets_3.json', lines=True)
#pets4_df = pd.read_json('data/h9DH7711_pets_4.json', lines=True)
#pets5_df = pd.read_json('data/h9DH7711_pets_5.json', lines=True)

#import pdb 
#pdb.set_trace()

In [113]:
# Parse each RescueGroups export in order; every call yields an
# (animal dataframe, image dataframe) pair.
(
    (df0, image0),
    (df1, image1),
    (df2, image2),
    (df3, image3),
    (df4, image4),
    (df5, image5),
) = map(extract_df, (
    'data/h9DH7711_newpets_1.json',
    'data/h9DH7711_pets_1.json',
    'data/h9DH7711_pets_2.json',
    'data/h9DH7711_pets_3.json',
    'data/h9DH7711_pets_4.json',
    'data/h9DH7711_pets_5.json',
))

In [114]:
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# stacks the frames and renumbers the rows 0..n-1 in a single step.
combined_df = pd.concat([df0, df1, df2, df3, df4, df5], ignore_index=True)
combined_imgs = pd.concat(
    [image0, image1, image2, image3, image4, image5], ignore_index=True
)

In [115]:
# Row counts per source file, for the animal frames and the image frames.
total_records = [len(frame) for frame in (df0, df1, df2, df3, df4, df5)]
image_records = [
    len(frame) for frame in (image0, image1, image2, image3, image4, image5)
]
print('Total Records: ', sum(total_records))
print('Total Images: ', sum(image_records))

Total Records:  49755
Total Images:  143562


In [128]:
#combined_df.columns
#combined_imgs.columns
# Only the last expression of a cell is rendered, so the first preview has to
# be displayed explicitly or it is silently discarded.
display(combined_imgs.head())
combined_df.tail(50)

Unnamed: 0,orgID,animalID,name,breed,animalLocation,ImageUrl
49705,939,13402679,HARMONY,Labrador Retriever (medium coat),30318,https://s3.amazonaws.com/filestore.rescuegroup...
49706,939,13402680,ROSERY,Labrador Retriever (medium coat),30318,https://s3.amazonaws.com/filestore.rescuegroup...
49707,939,13402681,CHARLIE,Labrador Retriever (medium coat),30318,https://s3.amazonaws.com/filestore.rescuegroup...
49708,939,13402682,BARLEY,Labrador Retriever (medium coat),30318,https://s3.amazonaws.com/filestore.rescuegroup...
49709,939,13402683,MARLEY,Labrador Retriever (medium coat),30318,https://s3.amazonaws.com/filestore.rescuegroup...
49710,939,13402684,TALIA,Labrador Retriever / Mixed (medium coat),30318,https://s3.amazonaws.com/filestore.rescuegroup...
49711,939,13402685,SPIKE,American Pit Bull Terrier (medium coat),30318,https://s3.amazonaws.com/filestore.rescuegroup...
49712,1460,13402687,molly,Staffordshire Bull Terrier (short coat),85354,https://s3.amazonaws.com/filestore.rescuegroup...
49713,4141,13402688,Chicco *,Boxer / Mixed (short coat),33256,https://s3.amazonaws.com/filestore.rescuegroup...
49714,6744,13402689,Cocoa JuM,Terrier / Mixed (short coat),15672,https://s3.amazonaws.com/filestore.rescuegroup...


0

In [None]:
#Columns to drop?: 'petUrl','drools' 
#Columns to rename?: animalLocation -> zipcode; use GeoPy to look up city and state?
#Activity Level change to scale from 1 to 4?
#Age change to numeric scale from 1 to 4 for Baby, Young, Adult, Senior? '' = UNK?
#'apartment', 'cratetrained', 'declawed' change to numeric 0,1=yes
#'birthdate' to get exact age? lot of nulls
#'breed','color', 'descriptionPlain'...tokenize with NLP? Combine them?
#'eventempered',eagerToPlease','cats','dogs' good with cats or dogs?, change to numeric? 0,1,2=UNK
#'coatLength' Med, Short, Long, ''=UNK
#'contactEmail' & 'contactCellPhone'=='contactHomePhone' drop? only 1 seen in new Dog JSON...only drop after merging all JSONs!!
#'contactName' foster? drop?
#'exerciseNeeds' Low, Moderate, High, Not Required, ''=UNK...combine with activity level?
#'description' == 'trackerImageUrl'
#'eyeColor' various string descriptions lot of blanks.. merge to description?
#'fence'..Not Required, Any Type, 3ft or 6ft
#Need more info on following columns: 'altered','courtesy','lastUpdated','mediaLastUpdated','MessagePet'

#df[df['eyeColor']==''].count()
#df.activityLevel.isnull().count()
#df[df['earType']==''].count()
#type(df.breed[10])
#df.shape
#df.eagerToPlease.unique()
#df.pictures[698]
# `new_pets_df` is only assigned in a commented-out line earlier in the
# notebook, so load it here to keep this cell runnable on a fresh kernel.
new_pets_df = pd.read_json('data/h9DH7711_newpets_1.json', lines=True)
new_pets_df.pictures[1]
#df.name[df.name == 'Atlas']
#df.animalLocation[df.animalLocation =="90018"]

In [None]:
# scatter_matrix is no longer available as pd.scatter_matrix; it lives in
# pandas.plotting. The trailing ';' hides the array-of-axes repr.
# NOTE(review): `df` is not assigned at notebook scope in the visible cells -
# confirm which dataframe is meant before running.
pd.plotting.scatter_matrix(df, alpha=0.2, diagonal='kde', figsize=(12,12));

In [None]:
import requests
import pymongo

# Page cache used by retrieve_site / scrape_site.
# MongoClient() is called with no arguments, so pymongo's default connection
# settings are used.
mc = pymongo.MongoClient()
# 'scraper' database -> 'sites' collection; documents inserted below have the
# form {'url': ..., 'data': ...}.
scraper_db = mc['scraper']
sites = scraper_db['sites']
# An empty filter matches every document: this empties the collection each
# time the cell is run, so previously cached pages are discarded.
sites.delete_many({})


def retrieve_site(url:str) -> bytes:
    '''
    Looks up a previously stored page in the `sites` collection.

    INPUT: url, string
    OUTPUT: the stored 'data' value of the first document whose 'url' equals
            `url`, or None when no such document exists.
    '''
    # Let MongoDB do the matching instead of transferring every stored
    # document (including its page content) and comparing URLs in Python.
    site = sites.find_one({'url': url})
    if site is None:
        return None
    return site['data']

        
def scrape_site(url:str) -> bytes:
    '''
    Returns the content of `url`, downloading it only when it is not already
    stored in the `sites` collection.

    INPUT: url, string
    OUTPUT: page content as bytes
    RAISES: requests.HTTPError when the server answers with an error status;
            such responses are not stored.
    '''
    data = retrieve_site(url)
    # Compare against None: a stored empty body (b'') is falsy, and a plain
    # truthiness test would download and insert it again on every call.
    if data is not None:
        return data
    response = requests.get(url)
    # Fail before inserting so an error page is never stored and then served
    # for every later call.
    response.raise_for_status()
    data = response.content
    sites.insert_one({'url': url,'data': data})
    return data