# Pandas - Basics (Review), Reading Data from File / URL, Cleaning Incoming Data


In [104]:
import pandas as pd
d = [["$229.2", 2017, 123000, "$1100", "Cupertino, US"],
     ["$211.9", 2017, 320671, "$284", "Suwon, South Korea"],
     ["$177.8", 2017, 566000, "$985",  "Seattle, US"],
     ["$154.7", 2017, 1300000, "$66", "New Taipei City, Taiwan"],
     ["$110.8", 2017, 80110, "$834", "Mountain View, US"]]

comps = ["apple", "samsung", "amazon", "foxconn", "alphabet"]
cols = ["revenue", "fy", "employees", "mcap", "location"]

c = pd.DataFrame(d, index=comps, columns=cols)

In [105]:
c

Unnamed: 0,revenue,fy,employees,mcap,location
apple,$229.2,2017,123000,$1100,"Cupertino, US"
samsung,$211.9,2017,320671,$284,"Suwon, South Korea"
amazon,$177.8,2017,566000,$985,"Seattle, US"
foxconn,$154.7,2017,1300000,$66,"New Taipei City, Taiwan"
alphabet,$110.8,2017,80110,$834,"Mountain View, US"


## Dropping Columns

In [106]:
del c['mcap']

In [107]:
c

Unnamed: 0,revenue,fy,employees,location
apple,$229.2,2017,123000,"Cupertino, US"
samsung,$211.9,2017,320671,"Suwon, South Korea"
amazon,$177.8,2017,566000,"Seattle, US"
foxconn,$154.7,2017,1300000,"New Taipei City, Taiwan"
alphabet,$110.8,2017,80110,"Mountain View, US"


In [108]:
# specify axis=1 or axis="columns"
c.drop('fy', axis=1)

Unnamed: 0,revenue,employees,location
apple,$229.2,123000,"Cupertino, US"
samsung,$211.9,320671,"Suwon, South Korea"
amazon,$177.8,566000,"Seattle, US"
foxconn,$154.7,1300000,"New Taipei City, Taiwan"
alphabet,$110.8,80110,"Mountain View, US"


In [109]:
c

Unnamed: 0,revenue,fy,employees,location
apple,$229.2,2017,123000,"Cupertino, US"
samsung,$211.9,2017,320671,"Suwon, South Korea"
amazon,$177.8,2017,566000,"Seattle, US"
foxconn,$154.7,2017,1300000,"New Taipei City, Taiwan"
alphabet,$110.8,2017,80110,"Mountain View, US"


In [110]:
# use inplace=True to change DataFrame that drop was called on
c.drop('fy', axis='columns', inplace=True)

In [111]:
c

Unnamed: 0,revenue,employees,location
apple,$229.2,123000,"Cupertino, US"
samsung,$211.9,320671,"Suwon, South Korea"
amazon,$177.8,566000,"Seattle, US"
foxconn,$154.7,1300000,"New Taipei City, Taiwan"
alphabet,$110.8,80110,"Mountain View, US"


## Retrieving Values

In [112]:
c['employees']

apple        123000
samsung      320671
amazon       566000
foxconn     1300000
alphabet      80110
Name: employees, dtype: int64

In [113]:
c

Unnamed: 0,revenue,employees,location
apple,$229.2,123000,"Cupertino, US"
samsung,$211.9,320671,"Suwon, South Korea"
amazon,$177.8,566000,"Seattle, US"
foxconn,$154.7,1300000,"New Taipei City, Taiwan"
alphabet,$110.8,80110,"Mountain View, US"


In [114]:
c['revenue']

apple       $229.2
samsung     $211.9
amazon      $177.8
foxconn     $154.7
alphabet    $110.8
Name: revenue, dtype: object

## Three Ways to Retrieve a Specific Value

In [115]:
c['revenue']['amazon']

'$177.8'

In [116]:
c.loc['amazon', 'revenue']

'$177.8'

In [117]:
c.iloc[2, 0]

'$177.8'

## Working with Rows

In [118]:
c.head(3)

Unnamed: 0,revenue,employees,location
apple,$229.2,123000,"Cupertino, US"
samsung,$211.9,320671,"Suwon, South Korea"
amazon,$177.8,566000,"Seattle, US"


In [119]:
c[:3]

Unnamed: 0,revenue,employees,location
apple,$229.2,123000,"Cupertino, US"
samsung,$211.9,320671,"Suwon, South Korea"
amazon,$177.8,566000,"Seattle, US"


In [120]:
c['apple':'amazon'] # 2nd index is inclusive!

Unnamed: 0,revenue,employees,location
apple,$229.2,123000,"Cupertino, US"
samsung,$211.9,320671,"Suwon, South Korea"
amazon,$177.8,566000,"Seattle, US"


In [121]:
c['apple':'amazon'][['revenue', 'location']]

Unnamed: 0,revenue,location
apple,$229.2,"Cupertino, US"
samsung,$211.9,"Suwon, South Korea"
amazon,$177.8,"Seattle, US"


In [122]:
c.index # show only row labels

Index(['apple', 'samsung', 'amazon', 'foxconn', 'alphabet'], dtype='object')

In [123]:
c

Unnamed: 0,revenue,employees,location
apple,$229.2,123000,"Cupertino, US"
samsung,$211.9,320671,"Suwon, South Korea"
amazon,$177.8,566000,"Seattle, US"
foxconn,$154.7,1300000,"New Taipei City, Taiwan"
alphabet,$110.8,80110,"Mountain View, US"


## Add a Column (Align by Labels, Fill in Missing with NaN)

In [124]:
c['state'] = pd.Series({'apple':'CA', 'amazon':'WA'})

In [125]:
c

Unnamed: 0,revenue,employees,location,state
apple,$229.2,123000,"Cupertino, US",CA
samsung,$211.9,320671,"Suwon, South Korea",
amazon,$177.8,566000,"Seattle, US",WA
foxconn,$154.7,1300000,"New Taipei City, Taiwan",
alphabet,$110.8,80110,"Mountain View, US",


In [126]:
c['employees'] / 100000 # vectorized across column

apple        1.23000
samsung      3.20671
amazon       5.66000
foxconn     13.00000
alphabet     0.80110
Name: employees, dtype: float64

In [127]:
c[c['employees'] < 100000]

Unnamed: 0,revenue,employees,location,state
alphabet,$110.8,80110,"Mountain View, US",


In [128]:
c

Unnamed: 0,revenue,employees,location,state
apple,$229.2,123000,"Cupertino, US",CA
samsung,$211.9,320671,"Suwon, South Korea",
amazon,$177.8,566000,"Seattle, US",WA
foxconn,$154.7,1300000,"New Taipei City, Taiwan",
alphabet,$110.8,80110,"Mountain View, US",


## Handling NaN / Missing Data / NA

In [129]:
c[c['state'].isnull()] # only find rows where state is missing

Unnamed: 0,revenue,employees,location,state
samsung,$211.9,320671,"Suwon, South Korea",
foxconn,$154.7,1300000,"New Taipei City, Taiwan",
alphabet,$110.8,80110,"Mountain View, US",


In [131]:
c['state'] = c['state'].fillna('') # fill in missing with ''

In [36]:
c

Unnamed: 0,revenue,employees,location,state
apple,$229.2,123000,"Cupertino, US",CA
samsung,$211.9,320671,"Suwon, South Korea",
amazon,$177.8,566000,"Seattle, US",WA
foxconn,$154.7,1300000,"New Taipei City, Taiwan",
alphabet,$110.8,80110,"Mountain View, US",


In [132]:
# c.fillna?? # uncomment to see docs

## String Transformation

In [133]:
c['location'] = c['location'].str.upper()

In [134]:
c['country'] = c['location'].str.split(',').str[1]

In [135]:
c

Unnamed: 0,revenue,employees,location,state,country
apple,$229.2,123000,"CUPERTINO, US",CA,US
samsung,$211.9,320671,"SUWON, SOUTH KOREA",,SOUTH KOREA
amazon,$177.8,566000,"SEATTLE, US",WA,US
foxconn,$154.7,1300000,"NEW TAIPEI CITY, TAIWAN",,TAIWAN
alphabet,$110.8,80110,"MOUNTAIN VIEW, US",,US


In [136]:
c.reindex(index=[*(list(c.index)),'microsoft'])

Unnamed: 0,revenue,employees,location,state,country
apple,$229.2,123000.0,"CUPERTINO, US",CA,US
samsung,$211.9,320671.0,"SUWON, SOUTH KOREA",,SOUTH KOREA
amazon,$177.8,566000.0,"SEATTLE, US",WA,US
foxconn,$154.7,1300000.0,"NEW TAIPEI CITY, TAIWAN",,TAIWAN
alphabet,$110.8,80110.0,"MOUNTAIN VIEW, US",,US
microsoft,,,,,


In [137]:
animals = pd.Series(['ant', 'bat', 'cat'])

In [138]:
animals.map(lambda animal: f'{animal}s')

0    ants
1    bats
2    cats
dtype: object

## Working with Map, Apply, ApplyTransform

In [139]:
rain = pd.DataFrame([[3.50, 4.53, 4.13, 3.98],
                     [7.91, 5.98, 6.10, 5.12],
                     [3.94, 5.28, 3.90, 4.49],
                     [1.42, 0.63, 0.75, 1.65]],
    index=['New York', 'New Orleans', 'Atlanta', 'Seattle'],
    columns=['Jun', 'Jul', 'Aug', 'Sept'])

In [140]:
rain

Unnamed: 0,Jun,Jul,Aug,Sept
New York,3.5,4.53,4.13,3.98
New Orleans,7.91,5.98,6.1,5.12
Atlanta,3.94,5.28,3.9,4.49
Seattle,1.42,0.63,0.75,1.65


In [141]:
rain.apply(lambda cities: cities.sum())

Jun     16.77
Jul     16.42
Aug     14.88
Sept    15.24
dtype: float64

In [57]:
rain.apply(lambda months: months.sum(), axis=1)

New York       16.14
New Orleans    25.11
Atlanta        17.61
Seattle         4.45
dtype: float64

In [143]:
rain.sum(axis=1) # can do same as above with .sum method

New York       16.14
New Orleans    25.11
Atlanta        17.61
Seattle         4.45
dtype: float64

In [144]:
rain.mean(axis=1)

New York       4.0350
New Orleans    6.2775
Atlanta        4.4025
Seattle        1.1125
dtype: float64

In [61]:
# find how big the difference is between the rainiest and the least "rainy" month
rain.max()

Jun     7.91
Jul     5.98
Aug     6.10
Sept    5.12
dtype: float64

In [146]:
rain.max(axis=1) - rain.min(axis=1)

New York       1.03
New Orleans    2.79
Atlanta        1.38
Seattle        1.02
dtype: float64

In [147]:
rain.apply(lambda s: max(s) - min(s), axis=1)

New York       1.03
New Orleans    2.79
Atlanta        1.38
Seattle        1.02
dtype: float64

In [148]:
def factorial(n):
    return 1 if n == 0 else n * factorial(n - 1)

In [149]:
import numpy as np
nums = pd.DataFrame(np.arange(9).reshape((3,3)))

In [150]:
nums

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8


In [151]:
nums.applymap(factorial)

Unnamed: 0,0,1,2
0,1,1,2
1,6,24,120
2,720,5040,40320


## Sorting, Unique Values, Counts, etc.

In [153]:
c

Unnamed: 0,revenue,employees,location,state,country
apple,$229.2,123000,"CUPERTINO, US",CA,US
samsung,$211.9,320671,"SUWON, SOUTH KOREA",,SOUTH KOREA
amazon,$177.8,566000,"SEATTLE, US",WA,US
foxconn,$154.7,1300000,"NEW TAIPEI CITY, TAIWAN",,TAIWAN
alphabet,$110.8,80110,"MOUNTAIN VIEW, US",,US


In [154]:
c.sort_index() # sort by row label

Unnamed: 0,revenue,employees,location,state,country
alphabet,$110.8,80110,"MOUNTAIN VIEW, US",,US
amazon,$177.8,566000,"SEATTLE, US",WA,US
apple,$229.2,123000,"CUPERTINO, US",CA,US
foxconn,$154.7,1300000,"NEW TAIPEI CITY, TAIWAN",,TAIWAN
samsung,$211.9,320671,"SUWON, SOUTH KOREA",,SOUTH KOREA


In [155]:
c.sort_values(by='revenue', ascending=False) # sort by revenue desc

Unnamed: 0,revenue,employees,location,state,country
apple,$229.2,123000,"CUPERTINO, US",CA,US
samsung,$211.9,320671,"SUWON, SOUTH KOREA",,SOUTH KOREA
amazon,$177.8,566000,"SEATTLE, US",WA,US
foxconn,$154.7,1300000,"NEW TAIPEI CITY, TAIWAN",,TAIWAN
alphabet,$110.8,80110,"MOUNTAIN VIEW, US",,US


In [156]:
c['country'].unique() # only unique values in country

array([' US', ' SOUTH KOREA', ' TAIWAN'], dtype=object)

In [157]:
c['country'].value_counts() # show counts for values in column

 US             3
 TAIWAN         1
 SOUTH KOREA    1
Name: country, dtype: int64

In [75]:
rain.describe()

Unnamed: 0,Jun,Jul,Aug,Sept
count,4.0,4.0,4.0,4.0
mean,4.1925,4.105,3.72,3.81
std,2.711081,2.391129,2.212525,1.513605
min,1.42,0.63,0.75,1.65
25%,2.98,3.555,3.1125,3.3975
50%,3.72,4.905,4.015,4.235
75%,4.9325,5.455,4.6225,4.6475
max,7.91,5.98,6.1,5.12


## Reading Data from a CSV 

* Can be done from file in filesystem OR from url!
* Many additional params (use pd.read_csv?)
  


In [77]:
dogs = pd.read_csv('https://data.cityofnewyork.us/api/views/rsgh-akpg/rows.csv?accessType=DOWNLOAD')

In [78]:
dogs

Unnamed: 0,UniqueID,DateOfBite,Species,Breed,Age,Gender,SpayNeuter,Borough,ZipCode
0,8140,December 02 2015,DOG,Pug,7,F,True,Staten Island,
1,1,January 27 2015,DOG,Jack Russ,11,M,False,Brooklyn,11217
2,2,January 25 2015,DOG,"Mastiff, Bull",,U,False,Brooklyn,
3,3,January 20 2015,DOG,PIT BULL/GOLDEN RETRIVE X,,U,False,Brooklyn,11236
4,4,December 26 2014,DOG,Doberman Pinscher,,M,False,Brooklyn,11204
5,5,January 29 2015,DOG,Pit Bull,,U,False,Brooklyn,
6,6,January 31 2015,DOG,Chihuahua Crossbreed,3,M,False,Brooklyn,
7,7,January 31 2015,DOG,German Shepherd Crossbreed,,F,False,Brooklyn,11220
8,8,January 23 2015,DOG,German Shepherd,4,M,False,Brooklyn,11229
9,9,February 04 2015,DOG,Pomeranian,1,F,False,Brooklyn,11216


## data cleaning / scrubbing / transformation

* pandas inserted a bunch of NaN ... do we want those to be missing? should there be a default value, etc.
    * Age columns has NaN
* data normalization / tranformation has to occur
    * Breed column has inconsistent casing (Pit Bull vs PIT BULL), permutations of same breed (Poodle vs Poodle Standard)
    * normalize casing, remove leading and trailing whitespace, date formats
* fix incorrect types, adjust precision; for example... disallow mixed types for Age (only floats?)
* superfluous columns (for example, drop Species)
* alternatively, do columns have to be added / calculated from other columns

## Let's Try Cleaning Up Breed So That we Can Have a Somewhat Accurate Count of Breed Most _Likely_ to bit you!

In [79]:
dogs['Breed'].value_counts()

Pit Bull                                 1919
Shih Tzu                                  358
American Pit Bull Mix / Pit Bull Mix      347
American Pit Bull Terrier/Pit Bull        343
Chihuahua                                 340
German Shepherd                           274
Mixed/Other                               267
Yorkshire Terrier                         230
UNKNOWN                                   188
Maltese                                   187
Rottweiler                                162
Labrador Retriever                        161
Bull dog                                  119
Siberian Husky                            117
Jack Russ                                 108
Poodle, Standard                          108
Cocker Spaniel                             97
Labrador Retriever Crossbreed              82
American Staffordshire Terrier             80
TERRIER                                    79
Beagle                                     73
Boxer                             

In [80]:
dogs['Breed'] = dogs['Breed'].str.upper()

In [81]:
dogs['Breed'].value_counts()

PIT BULL                                1924
SHIH TZU                                 359
AMERICAN PIT BULL MIX / PIT BULL MIX     347
AMERICAN PIT BULL TERRIER/PIT BULL       343
CHIHUAHUA                                341
GERMAN SHEPHERD                          276
MIXED/OTHER                              267
YORKSHIRE TERRIER                        230
MALTESE                                  189
UNKNOWN                                  188
ROTTWEILER                               162
LABRADOR RETRIEVER                       161
BULL DOG                                 119
SIBERIAN HUSKY                           117
JACK RUSS                                108
POODLE, STANDARD                         108
COCKER SPANIEL                            97
LABRADOR RETRIEVER CROSSBREED             82
AMERICAN STAFFORDSHIRE TERRIER            80
TERRIER                                   79
BEAGLE                                    75
BOXER                                     65
GOLDEN RET

## regex

matching ... use characters and special symbols to match a specific series of characters

can match:

* classes of characters.... numeric, any character, word characters)
* specify quantity
* where character occurs (anchor at beginning or end)

character classes
---
* `.` - any character
* `\s` - space character
* `\w` - word character
* `\d` - digit
* `[a-z]` - lower letters
* `[a-zA-Z]` - only letters, any casing

quantifiers
---
* `*` - 0 or more times
* `?` - 0 or exactly 1
* `+` - 1 or more
* `{2,4}` - exact range or quantity

anchors
---
* `^` - match at beginning
* `$` - match at end

`C.*$` - match "a Cat","Cello"

'^.*PIT BULL.*$'

In [88]:
# let's try ^^^^
dogs['Breed'] = dogs['Breed'].str.replace('^.*PIT BULL.*$', 'PIT BULL', regex=True)

In [90]:
dogs['Breed'].value_counts()

PIT BULL                            2799
SHIH TZU                             359
CHIHUAHUA                            341
GERMAN SHEPHERD                      276
MIXED/OTHER                          267
YORKSHIRE TERRIER                    230
MALTESE                              189
UNKNOWN                              188
ROTTWEILER                           162
LABRADOR RETRIEVER                   161
BULL DOG                             119
SIBERIAN HUSKY                       117
POODLE, STANDARD                     108
JACK RUSS                            108
COCKER SPANIEL                        97
LABRADOR RETRIEVER CROSSBREED         82
AMERICAN STAFFORDSHIRE TERRIER        80
TERRIER                               79
BEAGLE                                75
BOXER                                 65
GOLDEN RETRIEVER                      59
POMERANIAN                            57
MIXED                                 54
SHIBA INU                             53
BULL DOG, FRENCH

# API

Application Programmer Interface ?????

as the programmer what is the interface that you're using for this particular library or toolset

The API for pands:

* Series ... you can call x, y an z methods
* DataFrame

## WEB API

A website has offered a service to you for retrieving or manipulatin data

outline how to work with those service:

what HTTP method do I use?`

* GET - reading
* POST - creating (update)
* PUT - update
* DELETE

what path do I use?

/api/characters
/api/characters?page=2
/api/characters/rick

Authorization required?

how? api token... something else (session based?)

What is the response like... is it json, xml?

Dictionary that says what each field is

Terms of use....  is it ok to save all data? is it ok for commercial use? how many requests can i make and how often?

* usually service with no auth limits usage




















In [158]:
# manually retrieve json instead of use read_json
import requests


In [93]:
r = requests.get('https://rickandmortyapi.com/api/character/')

In [95]:
r.json()

{'info': {'count': 493,
  'pages': 25,
  'next': 'https://rickandmortyapi.com/api/character/?page=2',
  'prev': ''},
 'results': [{'id': 1,
   'name': 'Rick Sanchez',
   'status': 'Alive',
   'species': 'Human',
   'type': '',
   'gender': 'Male',
   'origin': {'name': 'Earth (C-137)',
    'url': 'https://rickandmortyapi.com/api/location/1'},
   'location': {'name': 'Earth (Replacement Dimension)',
    'url': 'https://rickandmortyapi.com/api/location/20'},
   'image': 'https://rickandmortyapi.com/api/character/avatar/1.jpeg',
   'episode': ['https://rickandmortyapi.com/api/episode/1',
    'https://rickandmortyapi.com/api/episode/2',
    'https://rickandmortyapi.com/api/episode/3',
    'https://rickandmortyapi.com/api/episode/4',
    'https://rickandmortyapi.com/api/episode/5',
    'https://rickandmortyapi.com/api/episode/6',
    'https://rickandmortyapi.com/api/episode/7',
    'https://rickandmortyapi.com/api/episode/8',
    'https://rickandmortyapi.com/api/episode/9',
    'https://ric