In [19]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
print(range.__doc__)

range(stop) -> range object
range(start, stop[, step]) -> range object

Return an object that produces a sequence of integers from start (inclusive)
to stop (exclusive) by step.  range(i, j) produces i, i+1, i+2, ..., j-1.
start defaults to 0, and stop is omitted!  range(4) produces 0, 1, 2, 3.
These are exactly the valid indices for a list of 4 elements.
When step is given, it specifies the increment (or decrement).


In [3]:
range?

You can find out what variables are declared in the local or global namespace, by using the dictionaries locals() and globals().

In [4]:
x = 12
locals()['x']

12

In [None]:
del x

In [10]:
locals().get('x','not here')

'not here'

In [12]:
heights = [21.6, 22.5, 19.8, 20.5]
heights [0] = 25
h = heights
h

[25, 22.5, 19.8, 20.5]

In [20]:
b = h
b[2] = 99
h[2]
id(b)
id(h)

99

4573819016

4573819016

In [14]:
b = h[:]
b

[25, 22.5, 99, 20.5]

In [15]:
b[1] = 99
h

[25, 22.5, 99, 20.5]

In [23]:
b = list(h) #alterantive creates a copy
c = h.copy()
c is h

False

In [24]:
years = range(1996, 2013)
years

range(1996, 2013)

In [25]:
years()  #shift tab tab

TypeError: 'range' object is not callable

### Method to read a file

In [28]:
# open using a context manager --closes file automatically; csv_file is the file handle
with open("data/health_inspection_chi_sample.csv") as csv_file:
    for line in csv_file:
        pass

In [27]:
line #prints the last line

' | 41. PREMISES MAINTAINED FREE OF LITTER, UNNECESSARY ARTICLES, CLEANING  EQUIPMENT PROPERLY STORED - Comments: CORRECTED. | 41. PREMISES MAINTAINED FREE OF LITTER, UNNECESSARY ARTICLES, CLEANING  EQUIPMENT PROPERLY STORED - Comments: OBSERVED STANDING WASTE WATER INSIDE OF THE MOP BUCKET IN THE REAR PREP AREA AND A WET MOP ON THE FLOOR; INSTRUCTED TO DISPOSE OF STANDING WASTE WATER WHEN DONE USING AND TO HANG WET MOPS UPWARD TO DRY.",60651.0\n'

In [29]:
json_file = open('data/health_inspection_chi_sample.json')

for line in json_file:
    pass
print(json_file.tell())

1775568


In [30]:
json_file.seek(0) #seek means go back to the beginning of the iterable
for line in json_file:
    pass

0

### another method to read a file

In [31]:
import csv
from pprint import pprint
csv_file = open("data/health_inspection_chi_sample.csv")
reader = csv.reader(csv_file)

In [32]:
headers = next(reader)
pprint(headers)

['address',
 'aka_name',
 'city',
 'dba_name',
 'facility_type',
 'inspection_date',
 'inspection_id',
 'inspection_type',
 'latitude',
 'license_',
 'location',
 'longitude',
 'results',
 'risk',
 'state',
 'violations',
 'zip']


In [33]:
line = next(reader)
pprint(line)

['5255 W MADISON ST ',
 'RED SNAPPER FISH CHICKEN & PIZZA',
 'CHICAGO',
 'RED SNAPPER FISH CHICKEN & PIZZA',
 'Restaurant',
 '2016-09-26T00:00:00.000',
 '1965287',
 'Canvass',
 '41.880236543865834',
 '1991820.0',
 "{'type': 'Point', 'coordinates': [-87.757220392117, 41.880236543866]}",
 '-87.7572203921175',
 'Pass w/ Conditions',
 'Risk 1 (High)',
 'IL',
 '35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTRUCTED PER CODE: GOOD REPAIR, '
 'SURFACES CLEAN AND DUST-LESS CLEANING METHODS - Comments: MUST CLEAN THE '
 'WALLS AT WALL BASE NEAR THE MIXER IN REAR OF PREMISES AND THE PREP AREA OF '
 'FOOD SPILLS AND CLEAN THE WALL VENT IN PREP AREA ,INSTRUCTED TO CLEAN AND '
 'MAINTAIN AREA | 33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSILS CLEAN, FREE '
 'OF ABRASIVE DETERGENTS - Comments: MUST CLEAN THE INTERIOR PANEL OF THE ICE '
 'MACHINE IN REAR OF PREMISES | 34. FLOORS: CONSTRUCTED PER CODE, CLEANED, '
 'GOOD REPAIR, COVING INSTALLED, DUST-LESS CLEANING METHODS USED - Comments: '
 'MUST CLEAN 

### List Comprehensions

In [34]:
list1 = [i for i in range(1, 6)]
list1

[1, 2, 3, 4, 5]

In [35]:
x = ['a', 'b', 'c', 'd', '_e', '_f']
[i for i in x if not i.startswith('_')]

['a', 'b', 'c', 'd']

In [36]:
[i if not i.startswith('_') else 'skipped' for i in x]

['a', 'b', 'c', 'd', 'skipped', 'skipped']

In [37]:
matrix = [
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12],
]

In [38]:
[element for row in matrix for element in row]  #getting too complex

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [39]:
#dictionary comprehension
pairs = [
    ("first_initial", "J"), 
    ("last_name", "Doe"), 
    ("address", "1234 Main Street")
]
{key: value for key, value in pairs}

{'first_initial': 'J', 'last_name': 'Doe', 'address': '1234 Main Street'}

### Reading Data: web data
Of course, pandas can also load data directly from a URL, but I encourage you to reach for requests as often as you need it.

In [51]:
import requests
#note the addition of parens to split the url into two lines
response = requests.get(
    "https://data.cityofchicago.org/"
    "resource/cwig-ma7x.json", 
    params="$limit=10"
)

In [52]:
response
response.ok

<Response [200]>

True

- libraries like Beautiful Soup, lxml, and mechanize can be helpful.
- There's also a read_html function in pandas that will quickly scrape HTML tables for you and put them into a DataFrame.

In [70]:
import pandas as pd
dta = pd.read_json(response.content, orient='records')
dta.head(1)

Unnamed: 0,:@computed_region_43wa_7qmu,:@computed_region_6mkv_f3dw,:@computed_region_awaf_s7ux,:@computed_region_bdys_3d7i,:@computed_region_vrxf_vc4k,address,aka_name,city,dba_name,facility_type,...,inspection_type,latitude,license_,location,longitude,results,risk,state,violations,zip
0,42,4447,33,264,73,3323 W 111TH ST,WONDERBURGER,CHICAGO,WONDERBURGER,Restaurant,...,License,41.691404,2626727,"{'type': 'Point', 'coordinates': [-87.70428241...",-87.704282,Fail,Risk 3 (Low),IL,10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...,60655.0


In [146]:
dta.shape
dta.describe()
dta.info()

(25000, 16)

Unnamed: 0,latitude,license_,longitude
count,24865.0,24996.0,24865.0
mean,41.879734,1475028.0,-87.67589
std,0.081968,891267.9,0.058372
min,41.64467,0.0,-87.914428
25%,41.828059,1000632.0,-87.707209
50%,41.891528,1884173.0,-87.666377
75%,41.939792,2134724.0,-87.634533
max,42.021064,8700606.0,-87.525125


<class 'pandas.core.frame.DataFrame'>
Int64Index: 25000 entries, 1965287 to 413268
Data columns (total 16 columns):
address            25000 non-null object
aka_name           24709 non-null object
city               24981 non-null object
dba_name           25000 non-null object
facility_type      24787 non-null category
inspection_date    25000 non-null datetime64[ns]
inspection_type    25000 non-null category
latitude           24865 non-null float64
license_           24996 non-null float64
location           24865 non-null object
longitude          24865 non-null float64
results            25000 non-null category
risk               24995 non-null category
state              24994 non-null object
violations         23908 non-null object
zip                24990 non-null object
dtypes: category(4), datetime64[ns](1), float64(3), object(8)
memory usage: 3.2+ MB


In [147]:
dta.select_dtypes(['category']).describe()

Unnamed: 0,facility_type,inspection_type,results,risk
count,24787,25000,25000,24995
unique,226,40,7,4
top,Restaurant,Canvass,Pass,Risk 1 (High)
freq,16954,12487,15915,18249


In [None]:
dta = dta.set_index('inspection_id')

In [78]:
dta['address'].head() #w/o loc we get a column

inspection_id
2279757           3323 W 111TH ST 
2279808          1515 W DEVON AVE 
2279779    811 W FULTON MARKET ST 
2279758        1610 W CHICAGO AVE 
2279754        4521 N SHERIDAN RD 
Name: address, dtype: object

In [79]:
# or pass it a list to preserve the df
dta[['address']].head(3)

Unnamed: 0_level_0,address
inspection_id,Unnamed: 1_level_1
2279757,3323 W 111TH ST
2279808,1515 W DEVON AVE
2279779,811 W FULTON MARKET ST


In [80]:
dta.loc[[2279808, 2279779]]

Unnamed: 0_level_0,:@computed_region_43wa_7qmu,:@computed_region_6mkv_f3dw,:@computed_region_awaf_s7ux,:@computed_region_bdys_3d7i,:@computed_region_vrxf_vc4k,address,aka_name,city,dba_name,facility_type,...,inspection_type,latitude,license_,location,longitude,results,risk,state,violations,zip
inspection_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2279808,16,4451,46,536,76,1515 W DEVON AVE,LA UNICA FOOD MART,CHICAGO,"LA UNICA FOOD MART, INC.",RESTAURANT/GROCERY STORE,...,Complaint,41.997993,491,"{'type': 'Point', 'coordinates': [-87.66842788...",-87.668428,Fail,Risk 1 (High),IL,2. CITY OF CHICAGO FOOD SERVICE SANITATION CER...,60660.0
2279779,46,14917,41,63,29,811 W FULTON MARKET ST,THE WING FULTON,,THE WING FULTON,Restaurant,...,License,41.886684,2642303,"{'type': 'Point', 'coordinates': [-87.64800067...",-87.648001,Fail,Risk 2 (Medium),IL,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",


In [81]:
dta.iloc[1:3,5:]

Unnamed: 0_level_0,address,aka_name,city,dba_name,facility_type,inspection_date,inspection_type,latitude,license_,location,longitude,results,risk,state,violations,zip
inspection_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2279808,1515 W DEVON AVE,LA UNICA FOOD MART,CHICAGO,"LA UNICA FOOD MART, INC.",RESTAURANT/GROCERY STORE,2019-03-21T00:00:00.000,Complaint,41.997993,491,"{'type': 'Point', 'coordinates': [-87.66842788...",-87.668428,Fail,Risk 1 (High),IL,2. CITY OF CHICAGO FOOD SERVICE SANITATION CER...,60660.0
2279779,811 W FULTON MARKET ST,THE WING FULTON,,THE WING FULTON,Restaurant,2019-03-21T00:00:00.000,License,41.886684,2642303,"{'type': 'Point', 'coordinates': [-87.64800067...",-87.648001,Fail,Risk 2 (Medium),IL,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",


In [82]:
dta.loc[:2279779, ["address", "inspection_date"]].head(4)

Unnamed: 0_level_0,address,inspection_date
inspection_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2279757,3323 W 111TH ST,2019-03-21T00:00:00.000
2279808,1515 W DEVON AVE,2019-03-21T00:00:00.000
2279779,811 W FULTON MARKET ST,2019-03-21T00:00:00.000


In [None]:
dta.inspection_date = dta.inspection_date.apply(pd.to_datetime)  # probably easiest way to convert to datetime
# using apply to a series to convert it ...

In [84]:
dta.inspection_date.head(3)

inspection_id
2279757   2019-03-21
2279808   2019-03-21
2279779   2019-03-21
Name: inspection_date, dtype: datetime64[ns]

In [88]:
# SEE BELOW on use of dtype = 'str', so that we don't need this

import numpy as np
def float_to_zip(zip_code):
    if np.isnan(zip_code):
        return np.nan
    
    # 0 makes sure to left-pad with zero
    # zip codes have 5 digits
    # .0 means, we don't want anything after the decimal
    # f is for float
    zip_code = "{:05.0f}".format(zip_code)
    # {:  0: 0 pad if there are 5 digits; nothing after '.' is accepted}
    return zip_code

In [89]:
dta.zip = dta.zip.apply(float_to_zip)  # note use of apply again

In [90]:
dta[['zip']].head()

Unnamed: 0_level_0,zip
inspection_id,Unnamed: 1_level_1
2279757,60655.0
2279808,60660.0
2279779,
2279758,60622.0
2279754,60640.0


In [91]:
del dta['location']
dta.head(3)

Unnamed: 0_level_0,:@computed_region_43wa_7qmu,:@computed_region_6mkv_f3dw,:@computed_region_awaf_s7ux,:@computed_region_bdys_3d7i,:@computed_region_vrxf_vc4k,address,aka_name,city,dba_name,facility_type,inspection_date,inspection_type,latitude,license_,longitude,results,risk,state,violations,zip
inspection_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2279757,42,4447,33,264,73,3323 W 111TH ST,WONDERBURGER,CHICAGO,WONDERBURGER,Restaurant,2019-03-21,License,41.691404,2626727,-87.704282,Fail,Risk 3 (Low),IL,10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...,60655.0
2279808,16,4451,46,536,76,1515 W DEVON AVE,LA UNICA FOOD MART,CHICAGO,"LA UNICA FOOD MART, INC.",RESTAURANT/GROCERY STORE,2019-03-21,Complaint,41.997993,491,-87.668428,Fail,Risk 1 (High),IL,2. CITY OF CHICAGO FOOD SERVICE SANITATION CER...,60660.0
2279779,46,14917,41,63,29,811 W FULTON MARKET ST,THE WING FULTON,,THE WING FULTON,Restaurant,2019-03-21,License,41.886684,2642303,-87.648001,Fail,Risk 2 (Medium),IL,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",


In [92]:
#Using pandas to read csv directly 

dta = pd.read_csv(
    "data/health_inspection_chi.csv", 
    index_col="inspection_id",
    parse_dates=["inspection_date"]
)
# makes inspection_date a datetime object
# sets up index to be more useful

In [None]:
# use of converters during the read....
# apply the function float_to_zip as we read it in
dta = pd.read_csv(
    "data/health_inspection_chi.csv",
    converters={
        'zip': float_to_zip
    },
)

In [None]:
# use of usecols
dta = pd.read_csv(
    "data/health_inspection_chi.csv",
    usecols=lambda col: col != 'location'
)

In [254]:
dta = pd.read_csv(
    "data/health_inspection_chi.csv",
    index_col="inspection_id",
    parse_dates=["inspection_date"],
    dtype={
        'results': 'category',
        'risk': 'category',
        'inspection_type': 'category',
        'facility_type': 'category',
        'zip': 'str' #zipcodes can start with a 0 so cast as a string
    }
)

In [255]:
dta.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25000 entries, 1965287 to 413268
Data columns (total 16 columns):
address            25000 non-null object
aka_name           24709 non-null object
city               24981 non-null object
dba_name           25000 non-null object
facility_type      24787 non-null category
inspection_date    25000 non-null datetime64[ns]
inspection_type    25000 non-null category
latitude           24865 non-null float64
license_           24996 non-null float64
location           24865 non-null object
longitude          24865 non-null float64
results            25000 non-null category
risk               24995 non-null category
state              24994 non-null object
violations         23908 non-null object
zip                24990 non-null object
dtypes: category(4), datetime64[ns](1), float64(3), object(8)
memory usage: 2.6+ MB


In [256]:
dta.risk.head()
dta.facility_type.head()

inspection_id
1965287      Risk 1 (High)
1329698      Risk 1 (High)
470787       Risk 1 (High)
68091      Risk 2 (Medium)
1335320       Risk 3 (Low)
Name: risk, dtype: category
Categories (4, object): [All, Risk 1 (High), Risk 2 (Medium), Risk 3 (Low)]

inspection_id
1965287               Restaurant
1329698               Restaurant
470787                Restaurant
68091                 Restaurant
1335320    Mobile Food Dispenser
Name: facility_type, dtype: category
Categories (226, object): [1023 CHILDERN'S SERVICES FACILITY, 1023 CHILDREN'S SERVICES FACILITY, 1023-CHILDREN'S SERVICES FACILITY, A-Not-For-Profit Chef Training Program, ..., tavern/restaurant, theater, warehouse, weight loss program]

In [181]:
dta.loc[dta.violations.isnull()].head() #does not seem to be a pattern

Unnamed: 0_level_0,address,aka_name,city,dba_name,facility_type,inspection_date,inspection_type,latitude,license_,location,longitude,results,risk,state,violations,zip
inspection_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
68091,2804 N CLARK ST,Wells Street Popcorn,CHICAGO,Wells Street Popcorn,Restaurant,2010-02-01,Canvass,41.932921,1954774.0,"{'type': 'Point', 'coordinates': [-87.64515454...",-87.645155,Pass,Risk 2 (Medium),IL,,60657.0
233722,3121 W CERMAK RD,TAQUERIA EL PALMAR,CHICAGO,TAQUERIA EL PALMAR,,2010-05-11,Canvass,41.851674,1243326.0,"{'type': 'Point', 'coordinates': [-87.70378752...",-87.703788,Fail,Risk 1 (High),IL,,60623.0
284278,3813-3815 W CHICAGO AVE,SUGA RAY'S SPORTS GRILL,CHICAGO,SUGA RAY'S SPORTS GRILL,Restaurant,2010-08-12,Canvass,41.895297,1922230.0,"{'type': 'Point', 'coordinates': [-87.72179977...",-87.7218,Out of Business,Risk 2 (Medium),IL,,60651.0
231272,1204 W 36TH PL,MOBILE TRUCK #13,CHICAGO,THUNDERBIRD CATERING,Mobile Food Dispenser,2010-03-22,License,41.828094,1476473.0,"{'type': 'Point', 'coordinates': [-87.65585369...",-87.655854,Pass,Risk 3 (Low),IL,,60609.0
277874,2826 N LINCOLN AVE,MGM Catering,CHICAGO,MGM Catering,Catering,2010-08-02,License,41.933101,2037141.0,"{'type': 'Point', 'coordinates': [-87.65968295...",-87.659683,Pass,Risk 1 (High),IL,,60657.0


In [305]:
with pd.option_context("display.max_colwidth", 500):
    print(dta.violations.head(2))

inspection_id
1965287    35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTRUCTED PER CODE: GOOD REPAIR, SURFACES CLEAN AND DUST-LESS CLEANING METHODS - Comments: MUST CLEAN THE WALLS AT WALL BASE NEAR THE MIXER IN REAR OF PREMISES AND THE PREP AREA OF FOOD SPILLS AND CLEAN THE WALL VENT IN PREP AREA ,INSTRUCTED TO CLEAN AND MAINTAIN AREA | 33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSILS CLEAN, FREE OF ABRASIVE DETERGENTS - Comments: MUST CLEAN THE INTERIOR PANEL OF THE ICE MACHINE IN REAR OF PREMISES | 34. FLOORS: CONS...
1329698    33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSILS CLEAN, FREE OF ABRASIVE DETERGENTS - Comments: Non food contact surfaces of ice machine not clean, needs cleaning. \nNon food contact surfaces of cooler shelving/racks not clean, need cleaning. \nPrep table lower shelving not clean, need detailed cleaning(crevices). | 34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOOD REPAIR, COVING INSTALLED, DUST-LESS CLEANING METHODS USED - Comments: Floors under heavy equipment

In [306]:
violations = dta.violations.str.split("\|", expand=True)
violations.head()
#19 columns because that was the maximum violations at one site

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
inspection_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1965287,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENS...,"34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GO...",36. LIGHTING: REQUIRED MINIMUM FOOT-CANDLES O...,22. DISH MACHINES: PROVIDED WITH ACCURATE THE...,40. REFRIGERATION AND METAL STEM THERMOMETERS...,,,,,,,,,,,,,
1329698,33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...,"34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GO...",36. LIGHTING: REQUIRED MINIMUM FOOT-CANDLES O...,38. VENTILATION: ROOMS AND EQUIPMENT VENTED A...,,,,,,,,,,,,,,,
470787,"6. HANDS WASHED AND CLEANED, GOOD HYGIENIC PRA...","11. ADEQUATE NUMBER, CONVENIENT, ACCESSIBLE, ...",19. OUTSIDE GARBAGE WASTE GREASE AND STORAGE ...,36. LIGHTING: REQUIRED MINIMUM FOOT-CANDLES O...,3. POTENTIALLY HAZARDOUS FOOD MEETS TEMPERATU...,33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENS...,32. FOOD AND NON-FOOD CONTACT SURFACES PROPER...,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONST...",40. REFRIGERATION AND METAL STEM THERMOMETERS...,,,,,,,,,,
68091,,,,,,,,,,,,,,,,,,,
1335320,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,38. VENTILATION: ROOMS AND EQUIPMENT VENTED A...,"9. WATER SOURCE: SAFE, HOT & COLD UNDER CITY ...",12. HAND WASHING FACILITIES: WITH SOAP AND SA...,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPE...,"30. FOOD IN ORIGINAL CONTAINER, PROPERLY LABE...",,,,,,,,,,,,,


In [307]:
(dta.violations.str.count("\|") + 1).max()
(dta.violations.str.count("\|") + 1).mean()
(dta.violations.str.count("\|") + 1).median()

19.0

4.924000334616028

4.0

In [308]:
violations.stack().head(15)

inspection_id   
1965287        0    35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...
               1     33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENS...
               2     34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GO...
               3     36. LIGHTING: REQUIRED MINIMUM FOOT-CANDLES O...
               4     22. DISH MACHINES: PROVIDED WITH ACCURATE THE...
               5     40. REFRIGERATION AND METAL STEM THERMOMETERS...
1329698        0    33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...
               1     34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GO...
               2     36. LIGHTING: REQUIRED MINIMUM FOOT-CANDLES O...
               3     38. VENTILATION: ROOMS AND EQUIPMENT VENTED A...
470787         0    6. HANDS WASHED AND CLEANED, GOOD HYGIENIC PRA...
               1     11. ADEQUATE NUMBER, CONVENIENT, ACCESSIBLE, ...
               2     19. OUTSIDE GARBAGE WASTE GREASE AND STORAGE ...
               3     36. LIGHTING: REQUIRED MINIMUM FOOT-CANDLES O...
   

In [309]:
violations.sort_index()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
inspection_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
44249,"41. PREMISES MAINTAINED FREE OF LITTER, UNNECE...",18. NO EVIDENCE OF RODENT OR INSECT OUTER OPE...,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONST...",33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENS...,"34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GO...",,,,,,,,,,,,,,
44259,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENS...,,,,,,,,,,,,,,,,,
44262,33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...,"34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GO...","35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONST...",,,,,,,,,,,,,,,,
48225,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,38. VENTILATION: ROOMS AND EQUIPMENT VENTED A...,"34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GO...",,,,,,,,,,,,,,,,
52237,"16. FOOD PROTECTED DURING STORAGE, PREPARATION...","26. ADEQUATE NUMBER, CONVENIENT, ACCESSIBLE, ...",32. FOOD AND NON-FOOD CONTACT SURFACES PROPER...,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONST...",38. VENTILATION: ROOMS AND EQUIPMENT VENTED A...,,,,,,,,,,,,,,
52241,19. OUTSIDE GARBAGE WASTE GREASE AND STORAGE A...,24. DISH WASHING FACILITIES: PROPERLY DESIGNE...,36. LIGHTING: REQUIRED MINIMUM FOOT-CANDLES O...,,,,,,,,,,,,,,,,
52245,3. POTENTIALLY HAZARDOUS FOOD MEETS TEMPERATUR...,17. POTENTIALLY HAZARDOUS FOOD PROPERLY THAWE...,29. PREVIOUS MINOR VIOLATION(S) CORRECTED 7-4...,32. FOOD AND NON-FOOD CONTACT SURFACES PROPER...,38. VENTILATION: ROOMS AND EQUIPMENT VENTED A...,,,,,,,,,,,,,,
52246,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,"34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GO...",36. LIGHTING: REQUIRED MINIMUM FOOT-CANDLES O...,,,,,,,,,,,,,,,,
52247,12. HAND WASHING FACILITIES: WITH SOAP AND SAN...,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPE...,32. FOOD AND NON-FOOD CONTACT SURFACES PROPER...,33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENS...,"34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GO...",38. VENTILATION: ROOMS AND EQUIPMENT VENTED A...,,,,,,,,,,,,,
52250,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...","34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GO...",32. FOOD AND NON-FOOD CONTACT SURFACES PROPER...,19. OUTSIDE GARBAGE WASTE GREASE AND STORAGE ...,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPE...,,,,,,,,,,,,,,


In [310]:
violations = violations.stack().dropna()
violations.head(15)

inspection_id   
1965287        0    35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...
               1     33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENS...
               2     34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GO...
               3     36. LIGHTING: REQUIRED MINIMUM FOOT-CANDLES O...
               4     22. DISH MACHINES: PROVIDED WITH ACCURATE THE...
               5     40. REFRIGERATION AND METAL STEM THERMOMETERS...
1329698        0    33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...
               1     34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GO...
               2     36. LIGHTING: REQUIRED MINIMUM FOOT-CANDLES O...
               3     38. VENTILATION: ROOMS AND EQUIPMENT VENTED A...
470787         0    6. HANDS WASHED AND CLEANED, GOOD HYGIENIC PRA...
               1     11. ADEQUATE NUMBER, CONVENIENT, ACCESSIBLE, ...
               2     19. OUTSIDE GARBAGE WASTE GREASE AND STORAGE ...
               3     36. LIGHTING: REQUIRED MINIMUM FOOT-CANDLES O...
   

In [311]:
violations.reset_index(level=1, drop=True, inplace=True)

In [312]:
violations.head()

inspection_id
1965287    35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...
1965287     33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENS...
1965287     34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GO...
1965287     36. LIGHTING: REQUIRED MINIMUM FOOT-CANDLES O...
1965287     22. DISH MACHINES: PROVIDED WITH ACCURATE THE...
dtype: object

In [313]:
violations = violations.str.strip().head()
violations.head()

inspection_id
1965287    35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...
1965287    33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...
1965287    34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOO...
1965287    36. LIGHTING: REQUIRED MINIMUM FOOT-CANDLES OF...
1965287    22. DISH MACHINES: PROVIDED WITH ACCURATE THER...
dtype: object

In [341]:
dta[dta.dba_name.str.contains('MCD')].results.value_counts() #note all McDonalds grouped under MCD

Pass                    198
Fail                     61
Pass w/ Conditions       33
Out of Business           0
Not Ready                 0
No Entry                  0
Business Not Located      0
Name: results, dtype: int64

In [314]:
dta.results.value_counts()

Pass                    15915
Fail                     5760
Pass w/ Conditions       2931
Out of Business           246
No Entry                  107
Not Ready                  40
Business Not Located        1
Name: results, dtype: int64

In [315]:
g = dta.groupby(dta.results)
g

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x1284cc160>

<font color="blue">**You can access the variables on this object, the same as a DataFrame, and any code called will execute within the groups.**</font> 

In [316]:
g.size()

results
Business Not Located        1
Fail                     5760
No Entry                  107
Not Ready                  40
Out of Business           246
Pass                    15915
Pass w/ Conditions       2931
dtype: int64

In [317]:
result = g.dba_name.value_counts()
result
#series with a multi-index

results               dba_name                              
Business Not Located  CITGO SUPER WASH & GAS                     1
Fail                  SUBWAY                                    42
                      DUNKIN DONUTS                             33
                      MCDONALD'S                                22
                      HAROLD'S CHICKEN SHACK                    14
                      CHIPOTLE MEXICAN GRILL                    11
                      DUNKIN DONUTS / BASKIN ROBBINS            10
                      MCDONALDS                                 10
                      7-ELEVEN                                   9
                      POTBELLY SANDWICH WORKS                    9
                      POTBELLY SANDWICH WORKS LLC                9
                      JIMMY JOHNS                                8
                      LAS ISLAS MARIAS                           8
                      KFC                                        7
 

In [318]:
type(result)
result.index.names #what are the index names...

pandas.core.series.Series

FrozenList(['results', 'dba_name'])

In [319]:
with pd.option_context('max.rows', 15):
    print(result["Fail"].sort_values(ascending=False))

dba_name
SUBWAY                            42
DUNKIN DONUTS                     33
MCDONALD'S                        22
HAROLD'S CHICKEN SHACK            14
CHIPOTLE MEXICAN GRILL            11
DUNKIN DONUTS / BASKIN ROBBINS    10
MCDONALDS                         10
                                  ..
MERCY'S TOBACCO                    1
MERCY HOME FOR BOYS AND GIRLS      1
MERCER 113                         1
MELLOS PEANUT COMPANY              1
MEHRAB SUPER MARKET                1
MEERATH KABAB                      1
HALSTED FOOD                       1
Name: dba_name, Length: 4422, dtype: int64


In [320]:
with pd.option_context('max.rows', 15):
    print(result["Pass"].sort_values(ascending=False))

dba_name
SUBWAY                                     219
DUNKIN DONUTS                              118
MCDONALD'S                                  61
7-ELEVEN                                    38
MCDONALDS                                   35
AU BON PAIN                                 29
POTBELLY SANDWICH WORKS LLC                 29
                                          ... 
KATAKANA & FIESTA & KOKO GRILL               1
KASEY'S TAVERN                               1
KARYN'S ON GREEN                             1
KARLA'S KITCHEN INC.                         1
KARLA'S KITCHEN                              1
KARE INC                                     1
CITY GARDEN EARLY CHILDHOOD CENTER,INC.      1
Name: dba_name, Length: 9325, dtype: int64


In [266]:
dta.head(3)

Unnamed: 0_level_0,address,aka_name,city,dba_name,facility_type,inspection_date,inspection_type,latitude,license_,location,longitude,results,risk,state,violations,zip
inspection_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1965287,5255 W MADISON ST,RED SNAPPER FISH CHICKEN & PIZZA,CHICAGO,RED SNAPPER FISH CHICKEN & PIZZA,Restaurant,2016-09-26,Canvass,41.880237,1991820.0,"{'type': 'Point', 'coordinates': [-87.75722039...",-87.75722,Pass w/ Conditions,Risk 1 (High),IL,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",60644.0
1329698,5958 W DIVERSEY AVE,TAQUERIA MORELOS,CHICAGO,TAQUERIA MORELOS,Restaurant,2014-02-06,Canvass,41.93125,2099479.0,"{'type': 'Point', 'coordinates': [-87.77590699...",-87.775907,Pass,Risk 1 (High),IL,33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...,60639.0
470787,5400-5402 N CLARK ST,HAMBURGER MARY'S/MARY'S REC ROOM,CHICAGO,HAMBURGER MARY'S CHICAGO/MARY'S REC ROOM,Restaurant,2010-12-03,SFP,41.979884,1933748.0,"{'type': 'Point', 'coordinates': [-87.66842948...",-87.668429,Fail,Risk 1 (High),IL,"6. HANDS WASHED AND CLEANED, GOOD HYGIENIC PRA...",60640.0


In [286]:
(dta[dta.dba_name.str.startswith('mcd')]).dba_name.str.replace('mcd','MCD')

inspection_id
176504    MCDonalds
Name: dba_name, dtype: object

In [339]:
dta.dba_name = dta.dba_name.str.replace('Mc D','MCD') #STEP 2 replace problem names with proper name "MCD"

In [330]:
dta[dta.dba_name.str.startswith('McD')].count()      #STEP 1 Find the problem childern (.contains is better)  
                                                     #STEP 3 recheck 
    #mcd, McD, MC D

address            0
aka_name           0
city               0
dba_name           0
facility_type      0
inspection_date    0
inspection_type    0
latitude           0
license_           0
location           0
longitude          0
results            0
risk               0
state              0
violations         0
zip                0
dtype: int64

In [328]:
dta[dta.dba_name.str.startswith('MCD')].count() #NOTE the difference between startswith and contains

address            287
aka_name           286
city               287
dba_name           287
facility_type      284
inspection_date    287
inspection_type    287
latitude           287
license_           287
location           287
longitude          287
results            287
risk               287
state              287
violations         267
zip                287
dtype: int64

In [340]:
dta[dta.dba_name.str.contains('MCD')].count()

address            292
aka_name           291
city               292
dba_name           292
facility_type      289
inspection_date    292
inspection_type    292
latitude           292
license_           292
location           292
longitude          292
results            292
risk               292
state              292
violations         272
zip                292
dtype: int64

### This is the end of this section - new topic below

In [118]:
# ASIDE on Quantiles
import pandas as pd
import random

A = [ random.randint(0,100) for i in range(10) ]
B = [ random.randint(0,100) for i in range(10) ]

df = pd.DataFrame({ 'field_A': A, 'field_B': B })
df

Unnamed: 0,field_A,field_B
0,54,47
1,65,0
2,98,15
3,3,23
4,55,21
5,93,59
6,72,42
7,72,92
8,3,79
9,84,78


In [120]:
df.mean()
df.median()

field_A    59.9
field_B    45.6
dtype: float64

field_A    68.5
field_B    44.5
dtype: float64

In [121]:
df.field_A.quantile(0.1)
df.field_A.quantile(0.5) # this is the same as median
df.field_A.quantile(0.9)  # 90% of the numbers are below this value

3.0

68.5

93.5