### Starter Section
- [x] set up imports
- [x] API - set-up, call, display
- [x] verify at least 100 records

In [1]:
# set-up basic imports (standard library)
import os
from itertools import chain
from pprint import pprint

# third-party: API access, dataframes, and array helpers for creating SQL tables
import numpy as np
import pandas as pd
import requests

# set-up imports for transferring table data to SQL (postgres)
from sqlalchemy import create_engine

In [2]:
# API KEY - read from the environment so the secret is never committed with the notebook.
# The breeds request in this notebook is sent without the key, so a missing
# DOG_API_KEY (api_key is None) does not affect the rest of the notebook.
api_key = os.environ.get('DOG_API_KEY')

In [3]:
# API URL: the v1 "breeds" endpoint of The Dog API, fetched once in the next cell
url = 'https://api.thedogapi.com/v1/breeds'

In [4]:
# get API results
# timeout: stop waiting instead of hanging the kernel if the API never answers
response = requests.get(url, timeout=30)
# fail loudly on 4xx/5xx instead of parsing an error payload as breed data
response.raise_for_status()
doggos = response.json()

In [5]:
# display raw json

# doggos

In [6]:
# checking that records meet 100 minimum requirement
# enforce the requirement so a short API response stops the run here
assert len(doggos) >= 100, f'expected at least 100 breed records, got {len(doggos)}'

len(doggos)

172

### Data Cleaning
- [x] get list of all attributes used and count to evaluate columns to be used
- [x] reference image id VS image id (resolved - deleted image id)

In [7]:
# collect every attribute (key) used by any breed record, in first-seen order,
# to evaluate which columns belong in the dataframe
all_attributes = list(dict.fromkeys(key for record in doggos for key in record))

# count occurrences of attributes: number of breed records that contain each key
att_count = {
    key: sum(key in record for record in doggos)
    for key in all_attributes
}
            
# att_count

In [8]:
# check reference image id VS image id to see if either are missing
# RESULT: none were missing (counter = 0), so deleted image id 

# OLD CODE taken out of dogs list  --  img_id = puppers['image'].get('id','none')

# counter = 0
# for index, row in dogs_df.iterrows():
#     if row[9] != row[10]:
#         counter += 1
# print(counter)

### Create Main Dataframe
- [x] set-up: for loops to create list of lists
- [x] dataframe: assign columns
- [x] save dataframe as csv

In [9]:
# Set-up Dog Dataframe (list of lists)
# Each row holds, in order: name, id, breed group, temperament, weight, height,
# bred for, life span, origin, country code, reference image id.
dogs = []

for puppers in doggos:
    # 'weight' and 'height' are nested dicts; fall back to an empty dict so a
    # record missing either one yields 'none' like the other fields instead of
    # raising KeyError / AttributeError
    weight = (puppers.get('weight') or {}).get('imperial', 'none')
    height = (puppers.get('height') or {}).get('imperial', 'none')

    dogs.append([
        puppers.get('name', 'none'),
        puppers.get('id', 'none'),
        puppers.get('breed_group', 'none'),
        puppers.get('temperament', 'none'),
        weight,
        height,
        puppers.get('bred_for', 'none'),
        puppers.get('life_span', 'none'),
        puppers.get('origin', 'none'),
        # NOTE: unlike the other fields, a missing country code stays None (not 'none')
        puppers.get('country_code'),
        puppers.get('reference_image_id', 'none'),
    ])





# THESE are attributes I have not included
#  'description'
#  'history'
# 'image id'

In [10]:
# create dataframe: one row per breed, column labels in the same order as each row in `dogs`
dogs_df = pd.DataFrame(
    data=dogs,
    columns=[
        'Dog Breed', 'ID Code', 'Breed Group', 'Temperament',
        'Weight', 'Height', 'Bred For', 'Life Span',
        'Origin', 'Country Code', 'Reference Image ID',
    ],
)
# dogs_df.head(20)

In [11]:
# write the main dogs dataframe out as a csv file
dogs_df.to_csv(
    path_or_buf=r'C:\Users\chels\OneDrive\Desktop\UNC Boot Camp\PROJECTS\Project-2\Input\dogs_df.csv'
)

### Temperaments
- [x] create list of all temperaments (resolved: 124 unique temperaments used)
- [x] count the occurrences of each temperament in dataset
- [x] sort occurrences from most frequent to least frequent
- [x] create temperament (columns) V dog breeds (rows) table

In [12]:
# build the list of distinct temperament names (each listed once, first-seen order);
# every comma-separated entry is trimmed before it is compared
temp_list = list(dict.fromkeys(
    piece.strip()
    for record in doggos
    if 'temperament' in record
    for piece in record['temperament'].split(',')
))
                
# print(len(temp_list))
# pprint(temp_list)

In [18]:
# temperament lookup table: names sorted alphabetically, with the resulting
# 0-based row position exposed as the 'temperament_id' column
temps_df = (
    pd.DataFrame({'temperament_name': temp_list})
    .sort_values(by='temperament_name')
    .reset_index(drop=True)
    .rename_axis('temperament_id')
    .reset_index()
)

# temps_df

# temp_id && temp_name - save to csv for Brandy
# temps_df.to_csv(r'C:\Users\chels\OneDrive\Desktop\UNC Boot Camp\PROJECTS\Project-2\Input\SQL\temp_id.csv')

In [36]:
# add a breed id to dogs_df: the existing row index becomes the 'Breed ID' column
dogs_df_id = dogs_df.rename_axis('Breed ID').reset_index()

dogs_df_id.head()

## TO DO : create "breed" table with matching attributes
# TO DO : add min/max heights and weights use "1" as filler


Unnamed: 0,Breed ID,Dog Breed,ID Code,Breed Group,Temperament,Weight,Height,Bred For,Life Span,Origin,Country Code,Reference Image ID
0,0,Affenpinscher,1,Toy,"Stubborn, Curious, Playful, Adventurous, Activ...",6 - 13,9 - 11.5,"Small rodent hunting, lapdog",10 - 12 years,"Germany, France",,BJa4kxc4X
1,1,Afghan Hound,2,Hound,"Aloof, Clownish, Dignified, Independent, Happy",50 - 60,25 - 27,Coursing and hunting,10 - 13 years,"Afghanistan, Iran, Pakistan",AG,hMyT4CDXR
2,2,African Hunting Dog,3,none,"Wild, Hardworking, Dutiful",44 - 66,30,A wild pack animal,11 years,,,rkiByec47
3,3,Airedale Terrier,4,Terrier,"Outgoing, Friendly, Alert, Confident, Intellig...",40 - 65,21 - 23,"Badger, otter hunting",10 - 13 years,"United Kingdom, England",,1-7cgoZSh
4,4,Akbash Dog,5,Working,"Loyal, Independent, Intelligent, Brave",90 - 120,28 - 34,Sheep guarding,10 - 12 years,,,26pHT3Qk7


In [37]:
# columns for the "breed" table; the min/max height, weight and life values are
# all filled with the placeholder 1 for now
breeds = {
    'breed_id': dogs_df_id['Breed ID'],
    'breed_name': dogs_df_id['Dog Breed'],
    **dict.fromkeys(
        ('min_height', 'max_height', 'min_weight', 'max_weight', 'min_life', 'max_life'),
        1,
    ),
    'group': dogs_df_id['Breed Group'],
}
breed_df = pd.DataFrame(data=breeds)
breed_df
# "breed_id" int   NOT NULL,
#     "breed_name" varchar(250)   NOT NULL,
#     "min_height" int,
#     "max_height" int,
#     "min_weight" int,
#     "max_weight" int,
#     "min_life" int,
#     "max_life" int,
#     "group" int,

Unnamed: 0,breed_id,breed_name,min_height,max_height,min_weight,max_weight,min_life,max_life,group
0,0,Affenpinscher,1,1,1,1,1,1,Toy
1,1,Afghan Hound,1,1,1,1,1,1,Hound
2,2,African Hunting Dog,1,1,1,1,1,1,none
3,3,Airedale Terrier,1,1,1,1,1,1,Terrier
4,4,Akbash Dog,1,1,1,1,1,1,Working
...,...,...,...,...,...,...,...,...,...
167,167,Wire Fox Terrier,1,1,1,1,1,1,none
168,168,Wirehaired Pointing Griffon,1,1,1,1,1,1,Sporting
169,169,Wirehaired Vizsla,1,1,1,1,1,1,Sporting
170,170,Xoloitzcuintli,1,1,1,1,1,1,Non-Sporting


In [None]:
# write the breed-id dataframe to the SQL input folder
dogs_df_id.to_csv(
    path_or_buf=r'C:\Users\chels\OneDrive\Desktop\UNC Boot Camp\PROJECTS\Project-2\Input\SQL\main.csv'
)

In [28]:
# create breed V temperament table, use https://stackoverflow.com/questions/50731229/split-cell-into-multiple-rows-in-pandas-dataframe
# use 'lookup' to replace breed with id - BOOM there's middle table (reference 62 from ETL project)

# return list from series of comma-separated strings
def chaining(s):
    """Flatten a Series of comma-separated strings into one list of trimmed items.

    Each string is split on ',' and the pieces are concatenated in row order.
    Surrounding whitespace is stripped from every piece, so "Stubborn, Curious"
    yields "Curious" rather than " Curious"; without this the pieces do not
    match the trimmed temperament names they are later merged against.

    Parameters
    ----------
    s : pandas.Series of str

    Returns
    -------
    list of str
        One entry per comma-separated piece; the number of entries per row is
        unchanged by the trimming.
    """
    return [piece.strip() for piece in chain.from_iterable(s.str.split(','))]

# number of comma-separated temperament entries per breed
temps = dogs_df_id['Temperament'].str.split(',').apply(len)

# long-format table: each breed id is repeated once per temperament entry and
# paired with the flattened temperament names, then re-indexed from 0
dogs_temps = pd.DataFrame({
    'breed_id': np.repeat(dogs_df_id['Breed ID'], temps),
    'temperament_name': chaining(dogs_df_id['Temperament']),
}).reset_index(drop=True)

# dogs_temps

In [29]:
# attach temperament ids by exact name match (inner join: unmatched names are dropped),
# then keep only the two id columns for the breed/temperament link table
merged = pd.merge(dogs_temps, temps_df, on='temperament_name', how='inner')
breed_temp_ids = merged.loc[:, ['breed_id', 'temperament_id']]

# breed_temp_ids

In [None]:
# write the breed_id / temperament_id link table to csv
breed_temp_ids.to_csv(
    path_or_buf=r'C:\Users\chels\OneDrive\Desktop\UNC Boot Camp\PROJECTS\Project-2\Input\SQL\breed_temp_id.csv'
)

### Download tables to SQL


In [32]:
# postgres connection in the form user:password@host:port/database.
# Set PROJECT2_DB_CONNECTION to supply real credentials without editing the
# notebook; the fallback is the local default this notebook was written against.
rds_connection_string = os.environ.get(
    'PROJECT2_DB_CONNECTION',
    'postgres:postgres@localhost:5432/Project_2',
)
engine = create_engine(f'postgresql://{rds_connection_string}')

In [38]:
# Append data to the existing (empty) tables.
### Brandy will give tables, use QuickDBD-export.sql in SQL folder
# Only the breed table is loaded here; the other two loads are left disabled.
# temps_df.to_sql(name='temperament', con=engine, if_exists='append', index=False)
breed_df.to_sql('breed', engine, if_exists='append', index=False)
# breed_temp_ids.to_sql(name='breed_temperament', con=engine, if_exists='append', index=False)

### EXTRAS: Group-Bys && Counts
- [x]  group by breed group
- [ ]  group by weight
    - split string into min/max, store as min max vals (if 1 value min/max same, if 0 'no age' or NaN)
    - create list of weights based on lowest highest 
    - create new table with weights as columns V names as rows, t/f values 
    - count weights 
- [ ]  group by height
    - split string into min/max, store as min max vals (if 1 value then min/max same, if 0 then 'no age' or NaN)
    - create list of heights based on lowest + highest
    - create new table with heights as columns V names as rows, t/f values
    - count heights 
- [x] count temperament occurrences
    - create a count of all temperaments, store in dictionary
    - create a dataframe
    

In [None]:
# Breed group: number of breeds in each group
gb_breed_group = dogs_df.groupby('Breed Group')
# NOTE(review): drop(index=[0]) removes whichever group sorts first after the
# reset; presumably a blank/placeholder group - confirm against the data
breed_groups = (
    gb_breed_group['Dog Breed']
    .count()
    .to_frame()
    .reset_index()
    .drop(index=[0])
)

# breed_groups

In [None]:
# count number of dogs that list each temperament
temp_counts = {}

# Split every dog's temperament string once into a set of exact, trimmed names.
# Matching against these sets (not the raw string) avoids substring hits, where
# one temperament name is contained inside a longer one.
dog_traits = [
    {piece.strip() for piece in dog['temperament'].split(',')}
    for dog in doggos
    if 'temperament' in dog
]

for temperament in temp_list:
    counter = sum(temperament in traits for traits in dog_traits)
    # only temperaments that occur at least once get an entry
    if counter:
        temp_counts[temperament] = counter

# create df: one row per temperament with its dog count
temp_counts_df = pd.DataFrame(temp_counts.items(), columns=['Temperament', 'Count'])
# temp_counts_df

# save to csv --
# temp_counts_df.to_csv(r'C:\Users\chels\OneDrive\Desktop\UNC Boot Camp\PROJECTS\Project-2\Input\temperament_counts.csv')

## OPTIONAL : sorted temperaments by highest occurances
# sort temp_counts from greatest to least

# temp_count_sort = pd.DataFrame.from_dict(temp_counts, 
#                        orient='index', 
#                        columns=['Temperament Count']).sort_values(by='Temperament Count', 
#                                                                   ascending=False)
# temp_count_sort.head(30)

# save to csv --
# temp_count_sort.to_csv(r'C:\Users\chels\OneDrive\Desktop\UNC Boot Camp\PROJECTS\Project-2\Input\temperament_counts_sorted.csv')

In [None]:
## EXTRAS?

## create dataframe of temperaments, each dog as rows temps as columns
# list of lists, dog name as first list value, 'True'/'False' string for each temperament
all_dogs = []
for dog in doggos:
    dog_list = [dog['name']]
    if 'temperament' in dog:
        # compare exact, trimmed names rather than substrings of the raw string,
        # so a temperament is not flagged because a longer name contains it
        traits = {piece.strip() for piece in dog['temperament'].split(',')}
        dog_list.extend(
            'True' if temperament in traits else 'False'
            for temperament in temp_list
        )
    # dogs without a temperament contribute a name-only row
    all_dogs.append(dog_list)

# create table with list of lists, add 'Name' to temp_list to accommodate first list value
column_names = ['Name'] + temp_list
temp_dogs_table = pd.DataFrame(all_dogs, columns=column_names)
# temp_dogs_table