### Starter Section
- [x] set up imports
- [x] API - set-up, call, display
- [x] verify at least 100 records

In [1]:
# set-up basic imports
import os
from pprint import pprint

import pandas as pd
import requests

# set-up imports for creating SQL tables
from itertools import chain

import numpy as np

In [2]:
# API KEY - though it's not needed?
# Read the key from the DOG_API_KEY environment variable when it is set, so the
# notebook can be run with a different key without editing this cell.
# NOTE(review): the literal fallback is kept only so existing runs behave exactly
# as before; a key committed in a notebook should be rotated and the fallback removed.
api_key = os.environ.get('DOG_API_KEY', 'eeec1e46-9491-4fc5-aada-f8cd1955cf46')

In [3]:
# API URL - The Dog API v1 "breeds" endpoint (requested in the next cell)
url = 'https://api.thedogapi.com/v1/breeds'

In [4]:
# get API results
# timeout: stop the cell from hanging indefinitely on a stalled connection.
# raise_for_status(): fail loudly on an HTTP error instead of parsing an error
# payload as if it were the list of breeds.
response = requests.get(url, timeout=30)
response.raise_for_status()
doggos = response.json()

In [5]:
# display raw json

# doggos

In [6]:
# checking that records meet 100 minimum requirement
# (the bare expression below is displayed as the cell output: the number of records in doggos)

len(doggos)

172

### Data Cleaning
- [x] get list of all attributes used and count to evaluate columns to be used
- [x] reference image id VS image id (resolved - deleted image id)

In [7]:
# collect every attribute name that appears in any record, keeping first-seen order,
# so the candidate dataframe columns can be evaluated
# (dict.fromkeys de-duplicates while preserving insertion order)
all_attributes = list(dict.fromkeys(key for record in doggos for key in record))
# all_attributes

In [8]:
# count occurrences of attributes: for each attribute, the number of records that contain it
att_count = {}
for attribute in all_attributes:
    hits = sum(1 for record in doggos if attribute in record)
    # only attributes found in at least one record get an entry
    if hits:
        att_count[attribute] = hits

att_count

{'weight': 172,
 'height': 172,
 'id': 172,
 'name': 172,
 'bred_for': 151,
 'breed_group': 156,
 'life_span': 172,
 'temperament': 168,
 'origin': 5,
 'reference_image_id': 172,
 'image': 172,
 'country_code': 12,
 'description': 1,
 'history': 2}

In [9]:
# check reference image id VS image id to see if either are missing
# RESULT: none were missing (counter = 0), so deleted image id 

# OLD CODE taken out of dogs list  --  img_id = puppers['image'].get('id','none')

# counter = 0
# for index, row in dogs_df.iterrows():
#     if row[9] != row[10]:
#         counter += 1
# print(counter)

### Create Main Dataframe
- [x] set-up: for loops to create list of lists
- [x] dataframe: assign columns
- [x] save dataframe as csv

In [10]:
# Set-up Dog Dataframe (list of lists)
# Each inner list holds one breed's fields in this order: name, id, breed group,
# temperament, weight, height, bred for, life span, origin, country code,
# reference image id. Missing fields fall back to the string 'none'.
dogs = []

for puppers in doggos:
    dogs.append([
        puppers.get('name', 'none'),
        puppers.get('id', 'none'),
        puppers.get('breed_group', 'none'),
        puppers.get('temperament', 'none'),
        # weight/height are nested dicts; defaulting to an empty dict means a record
        # without the key yields 'none' instead of raising KeyError
        puppers.get('weight', {}).get('imperial', 'none'),
        puppers.get('height', {}).get('imperial', 'none'),
        puppers.get('bred_for', 'none'),
        puppers.get('life_span', 'none'),
        puppers.get('origin', 'none'),
        # unlike the other fields there is no 'none' default here:
        # a missing country code stays None
        puppers.get('country_code'),
        puppers.get('reference_image_id', 'none'),
    ])





# THESE are attributes I have not included
#  'description'
#  'history'
# 'image id'

In [11]:
# create dataframe: one row per breed, column names listed in the same order
# as the values inside each entry of dogs
dog_columns = [
    'Dog Breed',
    'ID Code',
    'Breed Group',
    'Temperament',
    'Weight',
    'Height',
    'Bred For',
    'Life Span',
    'Origin',
    'Country Code',
    'Reference Image ID',
]
dogs_df = pd.DataFrame(dogs, columns=dog_columns)
dogs_df.head(20)

Unnamed: 0,Dog Breed,ID Code,Breed Group,Temperament,Weight,Height,Bred For,Life Span,Origin,Country Code,Reference Image ID
0,Affenpinscher,1,Toy,"Stubborn, Curious, Playful, Adventurous, Activ...",6 - 13,9 - 11.5,"Small rodent hunting, lapdog",10 - 12 years,"Germany, France",,BJa4kxc4X
1,Afghan Hound,2,Hound,"Aloof, Clownish, Dignified, Independent, Happy",50 - 60,25 - 27,Coursing and hunting,10 - 13 years,"Afghanistan, Iran, Pakistan",AG,hMyT4CDXR
2,African Hunting Dog,3,none,"Wild, Hardworking, Dutiful",44 - 66,30,A wild pack animal,11 years,,,rkiByec47
3,Airedale Terrier,4,Terrier,"Outgoing, Friendly, Alert, Confident, Intellig...",40 - 65,21 - 23,"Badger, otter hunting",10 - 13 years,"United Kingdom, England",,1-7cgoZSh
4,Akbash Dog,5,Working,"Loyal, Independent, Intelligent, Brave",90 - 120,28 - 34,Sheep guarding,10 - 12 years,,,26pHT3Qk7
5,Akita,6,Working,"Docile, Alert, Responsive, Dignified, Composed...",65 - 115,24 - 28,Hunting bears,10 - 14 years,none,,BFRYBufpm
6,Alapaha Blue Blood Bulldog,7,Mixed,"Loving, Protective, Trainable, Dutiful, Respon...",55 - 90,18 - 24,Guarding,12 - 13 years,none,,33mJ-V3RX
7,Alaskan Husky,8,Mixed,"Friendly, Energetic, Loyal, Gentle, Confident",38 - 50,23 - 26,Sled pulling,10 - 13 years,none,,-HgpNnGXl
8,Alaskan Malamute,9,Working,"Friendly, Affectionate, Devoted, Loyal, Dignif...",65 - 100,23 - 25,"Hauling heavy freight, Sled pulling",12 - 15 years,none,,dW5UucTIW
9,American Bulldog,10,Working,"Friendly, Assertive, Energetic, Loyal, Gentle,...",60 - 120,22 - 27,none,10 - 12 years,none,,pk1AAdloG


In [12]:
# save dogs dataframe to csv file
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell only
# runs where this folder layout exists - consider a path relative to the notebook.
# index=False is not passed, so the row index is written as the first column.
dogs_df.to_csv(r'C:\Users\chels\OneDrive\Desktop\UNC Boot Camp\PROJECTS\Project-2\Input\dogs_df.csv')

### Temperaments
- [x] create list of all temperaments (resolved: 124 unique temperaments used)
- [x] count the occurrence of each temperament in dataset
- [x] sort occurrences from greatest occurrence to least
- [x] create temperament (columns) V dog breeds (rows) table

In [95]:
# create list of all temperaments (listed once), in the order they are first seen
temp_list = []
for breed in doggos:
    # records without a 'temperament' key contribute nothing
    if 'temperament' not in breed:
        continue
    # the field is one comma-separated string; strip the space that follows each comma
    for trait in (piece.strip() for piece in breed['temperament'].split(',')):
        if trait not in temp_list:
            temp_list.append(trait)

# print(len(temp_list))
# pprint(temp_list)

In [96]:
# creating temperament id && name table:
# sort the names alphabetically, renumber the rows from 0, then expose that
# row number as a regular 'temp_id' column placed before 'temp_name'
temps_df = (
    pd.DataFrame({'temp_name': temp_list})
    .sort_values(by='temp_name')
    .reset_index(drop=True)
    .rename_axis('temp_id')
    .reset_index()
)


Unnamed: 0,temp_id,temp_name
0,0,Active
1,1,Adaptable
2,2,Adventurous
3,3,Affectionate
4,4,Aggressive
...,...,...
119,119,Vigilant
120,120,Vocal
121,121,Watchful
122,122,Wild


In [98]:
# temp_id && temp_name - save to csv for Brandy
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell only
# runs where this folder layout exists - consider a path relative to the notebook.
# index=False is not passed, so the row index is written as an extra first column
# in addition to temp_id.
temps_df.to_csv(r'C:\Users\chels\OneDrive\Desktop\UNC Boot Camp\PROJECTS\Project-2\Input\SQL\temp_id.csv')

In [14]:
# count number of times temperaments appear in dataset (number of breeds listing each one)
# Split every breed's temperament string into its individual, stripped names once,
# then test membership against that set. Searching the raw string instead would be a
# substring match, so a name contained inside a longer one would be over-counted.
breed_traits = [
    {piece.strip() for piece in dog['temperament'].split(',')}
    for dog in doggos
    if 'temperament' in dog
]

temp_counts = {}
for temperament in temp_list:
    hits = sum(1 for traits in breed_traits if temperament in traits)
    # only temperaments found in at least one breed get an entry
    if hits:
        temp_counts[temperament] = hits

# temp_counts

In [59]:
# sort temp_counts from greatest to least
# one row per temperament (the dict keys become the index), one count column
temp_count_table = pd.DataFrame.from_dict(
    temp_counts,
    orient='index',
    columns=['Temperament Count'],
)
temp_count_sort = temp_count_table.sort_values(by='Temperament Count', ascending=False)
# temp_count_sort.head(30)

In [57]:
## create dataframe of temperaments, each dog as rows temps as columns
# list of lists: dog name as first value, then the string 'True'/'False' for each
# temperament in temp_list
all_dogs = []
for dog in doggos:
    dog_list = [dog['name']]
    if 'temperament' in dog:
        # compare against the dog's individual (comma-separated, stripped) temperaments;
        # searching the raw string would be a substring match and would wrongly flag a
        # temperament whose name is contained inside a longer one
        dog_traits = {piece.strip() for piece in dog['temperament'].split(',')}
        dog_list.extend(
            'True' if temperament in dog_traits else 'False'
            for temperament in temp_list
        )
    # a dog without a 'temperament' key contributes only its name
    all_dogs.append(dog_list)

# create table with list of lists, add 'Name' to temp_list to accommodate first list value
column_names = ['Name'] + temp_list
temp_dogs_table = pd.DataFrame(all_dogs, columns=column_names)
# temp_dogs_table

In [61]:
# add dog_breed ID to dogs_df
# reset_index() turns the existing row index into a regular column named 'index';
# that column is what the breed/temperament table below uses as breed_id
dogs_df_id = dogs_df.reset_index()
dogs_df_id

Unnamed: 0,index,Dog Breed,ID Code,Breed Group,Temperament,Weight,Height,Bred For,Life Span,Origin,Country Code,Reference Image ID
0,0,Affenpinscher,1,Toy,"Stubborn, Curious, Playful, Adventurous, Activ...",6 - 13,9 - 11.5,"Small rodent hunting, lapdog",10 - 12 years,"Germany, France",,BJa4kxc4X
1,1,Afghan Hound,2,Hound,"Aloof, Clownish, Dignified, Independent, Happy",50 - 60,25 - 27,Coursing and hunting,10 - 13 years,"Afghanistan, Iran, Pakistan",AG,hMyT4CDXR
2,2,African Hunting Dog,3,none,"Wild, Hardworking, Dutiful",44 - 66,30,A wild pack animal,11 years,,,rkiByec47
3,3,Airedale Terrier,4,Terrier,"Outgoing, Friendly, Alert, Confident, Intellig...",40 - 65,21 - 23,"Badger, otter hunting",10 - 13 years,"United Kingdom, England",,1-7cgoZSh
4,4,Akbash Dog,5,Working,"Loyal, Independent, Intelligent, Brave",90 - 120,28 - 34,Sheep guarding,10 - 12 years,,,26pHT3Qk7
...,...,...,...,...,...,...,...,...,...,...,...,...
167,167,Wire Fox Terrier,259,none,"Fearless, Friendly, Bold, Keen, Alert, Quick",15 - 19,13 - 16,"Vermin hunting, fox bolting",13 – 14 years,none,,SJ6f2g9EQ
168,168,Wirehaired Pointing Griffon,260,Sporting,"Loyal, Gentle, Vigilant, Trainable, Proud",45 - 70,20 - 24,"Gundog, ""swamp-tromping"", Flushing, pointing, ...",12 - 14 years,none,,Bkam2l9Vm
169,169,Wirehaired Vizsla,261,Sporting,none,45 - 65,21.5 - 25,none,12 - 14 years,none,,r1I4hl5Em
170,170,Xoloitzcuintli,262,Non-Sporting,"Cheerful, Alert, Companionable, Intelligent, P...",9 - 31,10 - 23,none,12 - 14 years,none,,HkNS3gqEm


In [69]:
# create breed V temperament table, use https://stackoverflow.com/questions/50731229/split-cell-into-multiple-rows-in-pandas-dataframe
# use 'lookup' to replace breed with id - BOOM there's middle table (reference 62 from ETL project)

# return list from series of comma-separated strings
def chaining(s):
    """Flatten a Series of comma-separated strings into one flat list of items.

    Each string is split on ',' and surrounding whitespace is stripped from every
    piece, so 'Stubborn, Curious' yields 'Stubborn' and 'Curious' rather than
    'Stubborn' and ' Curious'. The number of items per row is unchanged, so the
    result still lines up with per-row counts taken from ``s.str.split(',')``.

    Parameters
    ----------
    s : pandas.Series of str

    Returns
    -------
    list of str
        All items, in row order.
    """
    return [item.strip() for item in chain.from_iterable(s.str.split(','))]

# number of comma-separated temperament entries for each breed
temps = dogs_df_id['Temperament'].str.split(',').map(len)

# long-format table: each breed id repeated once per temperament entry,
# paired with the flattened list of temperament names
breed_ids = np.repeat(dogs_df_id['index'], temps)
temp_names = chaining(dogs_df_id['Temperament'])
dogs_temps = pd.DataFrame({'breed_id': breed_ids, 'temp_name': temp_names}).reset_index(drop=True)
dogs_temps

## BRANDY'S ETL CODE: FOR REFERENCE ONLY, DO NOT USE
# review_final = review_final.rename(columns={'index': 'review_id','Title': 'book_fk','source': 'source_fk', 'Rating': 'rating'})
# review_final.head()

Unnamed: 0,breed_id,temp_name
0,0,Stubborn
1,0,Curious
2,0,Playful
3,0,Adventurous
4,0,Active
...,...,...
1027,171,Bold
1028,171,Independent
1029,171,Confident
1030,171,Intelligent


### Group-Bys
- [x]  group by breed group
- [ ]  group by weight
    - split string into min/max, store as min max vals (if 1 value then min/max same, if 0 then 'no weight' or NaN)
    - create list of weights based on lowest highest 
    - create new table with weights as columns V names as rows, t/f values 
    - count weights 
- [ ]  group by height
    - split string into min/max, store as min max vals (if 1 value then min/max same, if 0 then 'no height' or NaN)
    - create list of heights based on lowest + highest
    - create new table with heights as columns V names as rows, t/f values
    - count heights 

In [57]:
# Breed group: number of breeds in each breed group
gb_breed_group = dogs_df.groupby('Breed Group')
breed_counts = gb_breed_group['Dog Breed'].count()
# NOTE(review): drop([0]) removes the row labelled 0 after reset_index, i.e. whichever
# group sorts first - confirm which group this is meant to exclude
breed_groups = breed_counts.to_frame().reset_index().drop([0])

# breed_groups