In [2]:
import requests
url = 'https://www.rottentomatoes.com/m/et_the_extraterrestrial'
response = requests.get(url)

In [3]:
# Save HTML to file
# with open ('et_the_extraterrestrial.html', mode='wb') as file:
#     file.write(response.content)

In [4]:
# Work with HTML in memory
from bs4 import BeautifulSoup
import pandas as pd
import os
soup = BeautifulSoup(response.content, 'lxml')

In [5]:
with open('rt_html/et_the_extraterrestrial.html') as file:
    soup = BeautifulSoup(file, 'lxml')

In [6]:
soup.find('title').contents[0][:-len(' - Rotten Tomatoes')]

u'E.T. The Extra-Terrestrial\xa0(1982)'

In [7]:
df_list = []
folder = 'rt_html'
for movie_html in os.listdir(folder):
    with open(os.path.join(folder, movie_html)) as file:
        soup = BeautifulSoup(file, 'lxml')
        title = soup.find('title').contents[0][:-len(' - Rotten Tomatoes')]
        audience_score = soup.find('div', class_='audience-score meter').find('span').contents[0][:-1]
        num_audience_ratings = soup.find('div', class_='audience-info hidden-xs superPageFontColor')
        num_audience_ratings = num_audience_ratings.find_all('div')[1].contents[2].strip().replace(',','')
        df_list.append({'title': title,
                       'audience_score': int(audience_score),
                       'number_of_audience_ratings': int(audience_score)})
df = pd.DataFrame(df_list, columns=['title', 'audience_score', 'number_of_audience_ratings'])
df.head()

Unnamed: 0,title,audience_score,number_of_audience_ratings
0,A Hard Day's Night (1964),89,89
1,"Nosferatu, a Symphony of Horror (Nosferatu, ei...",87,87
2,Rosemary's Baby (1968),87,87
3,Taxi Driver (1976),93,93
4,The Treasure of the Sierra Madre (1948),93,93


# Download files from internet

In [8]:
import requests
import os

In [9]:
# Make directory if it doesn't already exist
folder_name = 'ebert_reviews'
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

In [10]:
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9900_1-the-wizard-of-oz-1939-film/1-the-wizard-of-oz-1939-film.txt'
response = requests.get(url)
with open(os.path.join(folder_name,
                      url.split('/')[-1]), mode='wb') as file:
    file.write(response.content)

In [11]:
os.listdir(folder_name)

['68-touch-of-evil.txt',
 '31-sunset-boulevard-film.txt',
 '60-a-streetcar-named-desire-1951-film.txt',
 '61-the-night-of-the-hunter-film.txt',
 '63-manchester-by-the-sea-film.txt',
 '16-casablanca-film.txt',
 '10-metropolis-1927-film.txt',
 '44-zootopia.txt',
 '86-la-la-land-film.txt',
 '82-tokyo-story.txt',
 '62-star-wars-the-force-awakens.txt',
 '17-moonlight-2016-film.txt',
 '80-skyfall.txt',
 '33-spotlight-film.txt',
 '55-logan-film.txt',
 '99-the-godfather-part-ii.txt',
 '57-army-of-shadows.txt',
 '21-snow-white-and-the-seven-dwarfs-1937-film.txt',
 '58-arrival-film.txt',
 '22-a-hard-day27s-night-film.txt',
 '59-baby-driver.txt',
 '8-inside-out-2015-film.txt',
 '45-m-1931-film.txt',
 '41-toy-story-2.txt',
 '84-pinocchio-1940-film.txt',
 '34-the-adventures-of-robin-hood.txt',
 '39-toy-story-3.txt',
 '53-12-angry-men-1957-film.txt',
 '79-the-good-the-bad-and-the-ugly.txt',
 '90-on-the-waterfront.txt',
 '7-all-about-eve.txt',
 '48-alien-film.txt',
 '23-la-grande-illusion.txt',
 '100

In [12]:
# Dowload a many files
ebert_review_urls = ['https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9900_1-the-wizard-of-oz-1939-film/1-the-wizard-of-oz-1939-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9901_2-citizen-kane/2-citizen-kane.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9901_3-the-third-man/3-the-third-man.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9902_4-get-out-film/4-get-out-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9902_5-mad-max-fury-road/5-mad-max-fury-road.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9902_6-the-cabinet-of-dr.-caligari/6-the-cabinet-of-dr.-caligari.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9903_7-all-about-eve/7-all-about-eve.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9903_8-inside-out-2015-film/8-inside-out-2015-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9903_9-the-godfather/9-the-godfather.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9904_10-metropolis-1927-film/10-metropolis-1927-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9904_11-e.t.-the-extra-terrestrial/11-e.t.-the-extra-terrestrial.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9904_12-modern-times-film/12-modern-times-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9904_14-singin-in-the-rain/14-singin-in-the-rain.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9905_15-boyhood-film/15-boyhood-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9905_16-casablanca-film/16-casablanca-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9905_17-moonlight-2016-film/17-moonlight-2016-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9906_18-psycho-1960-film/18-psycho-1960-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9906_19-laura-1944-film/19-laura-1944-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9906_20-nosferatu/20-nosferatu.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9907_21-snow-white-and-the-seven-dwarfs-1937-film/21-snow-white-and-the-seven-dwarfs-1937-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9907_22-a-hard-day27s-night-film/22-a-hard-day27s-night-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9907_23-la-grande-illusion/23-la-grande-illusion.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9908_25-the-battle-of-algiers/25-the-battle-of-algiers.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9908_26-dunkirk-2017-film/26-dunkirk-2017-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9908_27-the-maltese-falcon-1941-film/27-the-maltese-falcon-1941-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9909_29-12-years-a-slave-film/29-12-years-a-slave-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9909_30-gravity-2013-film/30-gravity-2013-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9909_31-sunset-boulevard-film/31-sunset-boulevard-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990a_32-king-kong-1933-film/32-king-kong-1933-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990a_33-spotlight-film/33-spotlight-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990a_34-the-adventures-of-robin-hood/34-the-adventures-of-robin-hood.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990b_35-rashomon/35-rashomon.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990b_36-rear-window/36-rear-window.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990b_37-selma-film/37-selma-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990c_38-taxi-driver/38-taxi-driver.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990c_39-toy-story-3/39-toy-story-3.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990c_40-argo-2012-film/40-argo-2012-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990d_41-toy-story-2/41-toy-story-2.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990d_42-the-big-sick/42-the-big-sick.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990d_43-bride-of-frankenstein/43-bride-of-frankenstein.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990d_44-zootopia/44-zootopia.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990e_45-m-1931-film/45-m-1931-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990e_46-wonder-woman-2017-film/46-wonder-woman-2017-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990e_48-alien-film/48-alien-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990f_49-bicycle-thieves/49-bicycle-thieves.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990f_50-seven-samurai/50-seven-samurai.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990f_51-the-treasure-of-the-sierra-madre-film/51-the-treasure-of-the-sierra-madre-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9910_52-up-2009-film/52-up-2009-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9910_53-12-angry-men-1957-film/53-12-angry-men-1957-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9910_54-the-400-blows/54-the-400-blows.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9911_55-logan-film/55-logan-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9911_57-army-of-shadows/57-army-of-shadows.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9912_58-arrival-film/58-arrival-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9912_59-baby-driver/59-baby-driver.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9913_60-a-streetcar-named-desire-1951-film/60-a-streetcar-named-desire-1951-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9913_61-the-night-of-the-hunter-film/61-the-night-of-the-hunter-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9913_62-star-wars-the-force-awakens/62-star-wars-the-force-awakens.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9913_63-manchester-by-the-sea-film/63-manchester-by-the-sea-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9914_64-dr.-strangelove/64-dr.-strangelove.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9914_66-vertigo-film/66-vertigo-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9914_67-the-dark-knight-film/67-the-dark-knight-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9915_68-touch-of-evil/68-touch-of-evil.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9915_69-the-babadook/69-the-babadook.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9915_72-rosemary27s-baby-film/72-rosemary27s-baby-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9916_73-finding-nemo/73-finding-nemo.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9916_74-brooklyn-film/74-brooklyn-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9917_75-the-wrestler-2008-film/75-the-wrestler-2008-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9917_77-l.a.-confidential-film/77-l.a.-confidential-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9918_78-gone-with-the-wind-film/78-gone-with-the-wind-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9918_79-the-good-the-bad-and-the-ugly/79-the-good-the-bad-and-the-ugly.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9918_80-skyfall/80-skyfall.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9919_82-tokyo-story/82-tokyo-story.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9919_83-hell-or-high-water-film/83-hell-or-high-water-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9919_84-pinocchio-1940-film/84-pinocchio-1940-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9919_85-the-jungle-book-2016-film/85-the-jungle-book-2016-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991a_86-la-la-land-film/86-la-la-land-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991b_87-star-trek-film/87-star-trek-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991b_89-apocalypse-now/89-apocalypse-now.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991c_90-on-the-waterfront/90-on-the-waterfront.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991c_91-the-wages-of-fear/91-the-wages-of-fear.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991c_92-the-last-picture-show/92-the-last-picture-show.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991d_93-harry-potter-and-the-deathly-hallows-part-2/93-harry-potter-and-the-deathly-hallows-part-2.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991d_94-the-grapes-of-wrath-film/94-the-grapes-of-wrath-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991d_96-man-on-wire/96-man-on-wire.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991e_97-jaws-film/97-jaws-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991e_98-toy-story/98-toy-story.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991e_99-the-godfather-part-ii/99-the-godfather-part-ii.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991e_100-battleship-potemkin/100-battleship-potemkin.txt']

In [13]:
for url in ebert_review_urls:
    response = requests.get(url)
    with open(os.path.join(folder_name, url.split('/')[-1]), mode='wb') as file:
        file.write(response.content)

In [14]:
os.listdir(folder_name)

['68-touch-of-evil.txt',
 '31-sunset-boulevard-film.txt',
 '60-a-streetcar-named-desire-1951-film.txt',
 '61-the-night-of-the-hunter-film.txt',
 '63-manchester-by-the-sea-film.txt',
 '16-casablanca-film.txt',
 '10-metropolis-1927-film.txt',
 '44-zootopia.txt',
 '86-la-la-land-film.txt',
 '82-tokyo-story.txt',
 '62-star-wars-the-force-awakens.txt',
 '17-moonlight-2016-film.txt',
 '80-skyfall.txt',
 '33-spotlight-film.txt',
 '55-logan-film.txt',
 '99-the-godfather-part-ii.txt',
 '57-army-of-shadows.txt',
 '21-snow-white-and-the-seven-dwarfs-1937-film.txt',
 '58-arrival-film.txt',
 '22-a-hard-day27s-night-film.txt',
 '59-baby-driver.txt',
 '8-inside-out-2015-film.txt',
 '45-m-1931-film.txt',
 '41-toy-story-2.txt',
 '84-pinocchio-1940-film.txt',
 '34-the-adventures-of-robin-hood.txt',
 '39-toy-story-3.txt',
 '53-12-angry-men-1957-film.txt',
 '79-the-good-the-bad-and-the-ugly.txt',
 '90-on-the-waterfront.txt',
 '7-all-about-eve.txt',
 '48-alien-film.txt',
 '23-la-grande-illusion.txt',
 '100

In [15]:
len(os.listdir(folder_name))

88

In [16]:
import wptools

In [17]:
page = wptools.page('E.T._the_Extra-Terrestrial').get()

en.wikipedia.org (query) E.T._the_Extra-Terrestrial
en.wikipedia.org (parse) 73441
www.wikidata.org (wikidata) Q11621
www.wikidata.org (labels) Q952914|Q464522|Q102427|P921|P2465|Q186...
www.wikidata.org (labels) Q461742|P2047|P1552|Q30|P18|Q5280675|Q6...
www.wikidata.org (labels) P3110|P3141|Q720068|P3844|Q281939|P364|...
www.wikidata.org (labels) Q443775|Q181508|Q457893|Q787131|P4632|P...
en.wikipedia.org (restbase) /page/summary/E.T._the_Extra-Terrestrial
en.wikipedia.org (imageinfo) File:E t the extra terrestrial ver3....
E.T. the Extra-Terrestrial (en) data
{
  aliases: <list(2)> E.T., ET
  assessments: <dict(4)> United States, Science Fiction, Film, Lib...
  claims: <dict(87)> P646, P5786, P2508, P2509, P3995, P373, P921,...
  description: <str(63)> 1982 American science fiction film direct...
  exhtml: <str(572)> <p><i><b>E.T. the Extra-Terrestrial</b></i> i...
  exrest: <str(551)> E.T. the Extra-Terrestrial is a 1982 American...
  extext: <str(1788)> _**E.T. the Extra-Terrestri

In [18]:
page.data['image']

[{u'descriptionshorturl': u'https://en.wikipedia.org/w/index.php?curid=7419503',
  u'descriptionurl': u'https://en.wikipedia.org/wiki/File:E_t_the_extra_terrestrial_ver3.jpg',
  'file': 'File:E t the extra terrestrial ver3.jpg',
  u'height': 394,
  'kind': 'parse-image',
  'metadata': {u'Assessments': {u'hidden': u'',
    u'source': u'commons-categories',
    u'value': u''},
   u'Categories': {u'hidden': u'',
    u'source': u'commons-categories',
    u'value': u'All non-free media|E.T. the Extra-Terrestrial|Fair use images of movie posters|Files with no machine-readable author|Files with no machine-readable description|Files with no machine-readable license|Files with no machine-readable source|Noindexed pages|Non-free images for NFUR review|Non-free posters'},
   u'CommonsMetadataExtension': {u'hidden': u'',
    u'source': u'extension',
    u'value': 1.2},
   u'DateTime': {u'hidden': u'',
    u'source': u'mediawiki-metadata',
    u'value': u'2016-06-04 10:30:46'},
   u'NonFree': {u'hi

In [19]:
page.data['image'][0]

{u'descriptionshorturl': u'https://en.wikipedia.org/w/index.php?curid=7419503',
 u'descriptionurl': u'https://en.wikipedia.org/wiki/File:E_t_the_extra_terrestrial_ver3.jpg',
 'file': 'File:E t the extra terrestrial ver3.jpg',
 u'height': 394,
 'kind': 'parse-image',
 'metadata': {u'Assessments': {u'hidden': u'',
   u'source': u'commons-categories',
   u'value': u''},
  u'Categories': {u'hidden': u'',
   u'source': u'commons-categories',
   u'value': u'All non-free media|E.T. the Extra-Terrestrial|Fair use images of movie posters|Files with no machine-readable author|Files with no machine-readable description|Files with no machine-readable license|Files with no machine-readable source|Noindexed pages|Non-free images for NFUR review|Non-free posters'},
  u'CommonsMetadataExtension': {u'hidden': u'',
   u'source': u'extension',
   u'value': 1.2},
  u'DateTime': {u'hidden': u'',
   u'source': u'mediawiki-metadata',
   u'value': u'2016-06-04 10:30:46'},
  u'NonFree': {u'hidden': u'',
   u's

In [20]:
page.data['infobox']['director']

'[[Steven Spielberg]]'

# Baixando arquivos de imagem

In [22]:
import requests
from PIL import Image
from io import BytesIO
r = requests.get(url)
filename = 'https://upload.wikimedia.org/wikipedia/en/6/66/E_t_the_extra_terrestrial_ver3.jpg'
with open(folder_name + '/' + filename, 'wb') as f:
    f.write(r.content)

IOError: [Errno 2] No such file or directory: 'ebert_reviews/https://upload.wikimedia.org/wikipedia/en/6/66/E_t_the_extra_terrestrial_ver3.jpg'

# API + JSON

In [1]:
import pandas as pd
import wptools
import os
import requests
from PIL import Image
from io import BytesIO

In [2]:
title_list = [
 'The_Wizard_of_Oz_(1939_film)',
 'Citizen_Kane',
 'The_Third_Man',
 'Get_Out_(film)',
 'Mad_Max:_Fury_Road',
 'The_Cabinet_of_Dr._Caligari',
 'All_About_Eve',
 'Inside_Out_(2015_film)',
 'The_Godfather',
 'Metropolis_(1927_film)',
 'E.T._the_Extra-Terrestrial',
 'Modern_Times_(film)',
 'It_Happened_One_Night',
 "Singin'_in_the_Rain",
 'Boyhood_(film)',
 'Casablanca_(film)',
 'Moonlight_(2016_film)',
 'Psycho_(1960_film)',
 'Laura_(1944_film)',
 'Nosferatu',
 'Snow_White_and_the_Seven_Dwarfs_(1937_film)',
 "A_Hard_Day%27s_Night_(film)",
 'La_Grande_Illusion',
 'North_by_Northwest',
 'The_Battle_of_Algiers',
 'Dunkirk_(2017_film)',
 'The_Maltese_Falcon_(1941_film)',
 'Repulsion_(film)',
 '12_Years_a_Slave_(film)',
 'Gravity_(2013_film)',
 'Sunset_Boulevard_(film)',
 'King_Kong_(1933_film)',
 'Spotlight_(film)',
 'The_Adventures_of_Robin_Hood',
 'Rashomon',
 'Rear_Window',
 'Selma_(film)',
 'Taxi_Driver',
 'Toy_Story_3',
 'Argo_(2012_film)',
 'Toy_Story_2',
 'The_Big_Sick',
 'Bride_of_Frankenstein',
 'Zootopia',
 'M_(1931_film)',
 'Wonder_Woman_(2017_film)',
 'The_Philadelphia_Story_(film)',
 'Alien_(film)',
 'Bicycle_Thieves',
 'Seven_Samurai',
 'The_Treasure_of_the_Sierra_Madre_(film)',
 'Up_(2009_film)',
 '12_Angry_Men_(1957_film)',
 'The_400_Blows',
 'Logan_(film)',
 'All_Quiet_on_the_Western_Front_(1930_film)',
 'Army_of_Shadows',
 'Arrival_(film)',
 'Baby_Driver',
 'A_Streetcar_Named_Desire_(1951_film)',
 'The_Night_of_the_Hunter_(film)',
 'Star_Wars:_The_Force_Awakens',
 'Manchester_by_the_Sea_(film)',
 'Dr._Strangelove',
 'Frankenstein_(1931_film)',
 'Vertigo_(film)',
 'The_Dark_Knight_(film)',
 'Touch_of_Evil',
 'The_Babadook',
 'The_Conformist_(film)',
 'Rebecca_(1940_film)',
 "Rosemary%27s_Baby_(film)",
 'Finding_Nemo',
 'Brooklyn_(film)',
 'The_Wrestler_(2008_film)',
 'The_39_Steps_(1935_film)',
 'L.A._Confidential_(film)',
 'Gone_with_the_Wind_(film)',
 'The_Good,_the_Bad_and_the_Ugly',
 'Skyfall',
 'Rome,_Open_City',
 'Tokyo_Story',
 'Hell_or_High_Water_(film)',
 'Pinocchio_(1940_film)',
 'The_Jungle_Book_(2016_film)',
 'La_La_Land_(film)',
 'Star_Trek_(film)',
 'High_Noon',
 'Apocalypse_Now',
 'On_the_Waterfront',
 'The_Wages_of_Fear',
 'The_Last_Picture_Show',
 'Harry_Potter_and_the_Deathly_Hallows_–_Part_2',
 'The_Grapes_of_Wrath_(film)',
 'Roman_Holiday',
 'Man_on_Wire',
 'Jaws_(film)',
 'Toy_Story',
 'The_Godfather_Part_II',
 'Battleship_Potemkin'
]

In [3]:
folder_name = 'bestofrt_posters'
# Make directory if it doesn't already exist
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

In [8]:
# List of dictionaries to build and convert to a DataFrame later
df_list = []
image_errors = {}
for title in title_list:
    try:
        # This cell is slow so print ranking to gauge time remaining
        ranking = title_list.index(title) + 1
        page = wptools.page(title, silent=True)
        # Your code here (three lines)
        images = page.get().data['image']
        # First image is usually the poster
        first_image_url = images[0]['url']
        print(first_image_url)
        r = requests.get(first_image_url)
        # Download movie poster image
        i = Image.open(BytesIO(r.content))
        image_file_format = first_image_url.split('.')[-1]
        i.save(folder_name + "/" + str(ranking) + "_" + title + '.' + image_file_format)
        # Append to list of dictionaries
        df_list.append({'ranking': int(ranking),
                        'title': title,
                        'poster_url': first_image_url})
    
    # Not best practice to catch all exceptions but fine for this short script
    except Exception as e:
        print(str(ranking) + "_" + title + ": " + str(e))
        image_errors[str(ranking) + "_" + title] = images

https://upload.wikimedia.org/wikipedia/commons/6/69/Wizard_of_oz_movie_poster.jpg
https://upload.wikimedia.org/wikipedia/en/c/ce/Citizenkane.jpg
https://upload.wikimedia.org/wikipedia/en/2/21/ThirdManUSPoster.jpg
https://upload.wikimedia.org/wikipedia/en/a/a3/Get_Out_poster.png
https://upload.wikimedia.org/wikipedia/en/6/6e/Mad_Max_Fury_Road.jpg
https://upload.wikimedia.org/wikipedia/commons/5/52/Das_Cabinet_des_Dr._Caligari.JPG
https://upload.wikimedia.org/wikipedia/en/2/22/AllAboutEve.jpeg
https://upload.wikimedia.org/wikipedia/en/0/0a/Inside_Out_%282015_film%29_poster.jpg
https://upload.wikimedia.org/wikipedia/en/1/1c/Godfather_ver1.jpg
https://upload.wikimedia.org/wikipedia/en/0/06/Metropolisposter.jpg
https://upload.wikimedia.org/wikipedia/en/6/66/E_t_the_extra_terrestrial_ver3.jpg
https://upload.wikimedia.org/wikipedia/commons/3/36/Modern_Times_poster.jpg
https://upload.wikimedia.org/wikipedia/commons/d/dc/It-happened-one-night-poster.jpg
https://upload.wikimedia.org/wikipedia/en

API error: {u'info': u'Bad title "A_Hard_Day%27s_Night_(film)".', u'code': u'invalidtitle', u'docref': u'See https://en.wikipedia.org/w/api.php for API usage. Subscribe to the mediawiki-api-announce mailing list at &lt;https://lists.wikimedia.org/mailman/listinfo/mediawiki-api-announce&gt; for notice of API deprecations and breaking changes.'}


22_A_Hard_Day%27s_Night_(film): https://en.wikipedia.org/w/api.php?action=parse&formatversion=2&contentmodel=text&disableeditsection=&disablelimitreport=&disabletoc=&prop=text|iwlinks|parsetree|wikitext|displaytitle|properties&redirects&page=A_Hard_Day%2527s_Night_%28film%29
https://upload.wikimedia.org/wikipedia/en/3/33/GrandeIllusion.jpg
https://upload.wikimedia.org/wikipedia/commons/8/83/Northbynorthwest1.jpg
https://upload.wikimedia.org/wikipedia/en/a/aa/The_Battle_of_Algiers_poster.jpg
https://upload.wikimedia.org/wikipedia/en/1/15/Dunkirk_Film_poster.jpg
https://upload.wikimedia.org/wikipedia/en/9/99/Falconm.JPG
https://upload.wikimedia.org/wikipedia/en/8/89/Repulsion_%281965_film_poster%29.jpg
https://upload.wikimedia.org/wikipedia/en/5/5c/12_Years_a_Slave_film_poster.jpg
https://upload.wikimedia.org/wikipedia/en/f/f6/Gravity_Poster.jpg
https://upload.wikimedia.org/wikipedia/en/0/0a/SunsetBoulevardfilmposter.jpg
https://upload.wikimedia.org/wikipedia/commons/f/f3/Kingkongposter.

API error: {u'info': u'Bad title "Rosemary%27s_Baby_(film)".', u'code': u'invalidtitle', u'docref': u'See https://en.wikipedia.org/w/api.php for API usage. Subscribe to the mediawiki-api-announce mailing list at &lt;https://lists.wikimedia.org/mailman/listinfo/mediawiki-api-announce&gt; for notice of API deprecations and breaking changes.'}


72_Rosemary%27s_Baby_(film): https://en.wikipedia.org/w/api.php?action=parse&formatversion=2&contentmodel=text&disableeditsection=&disablelimitreport=&disabletoc=&prop=text|iwlinks|parsetree|wikitext|displaytitle|properties&redirects&page=Rosemary%2527s_Baby_%28film%29
https://upload.wikimedia.org/wikipedia/en/2/29/Finding_Nemo.jpg
https://upload.wikimedia.org/wikipedia/en/5/5b/Brooklyn_FilmPoster.jpg
https://upload.wikimedia.org/wikipedia/en/3/3e/The_Wrestler_poster.jpg
https://upload.wikimedia.org/wikipedia/en/9/95/The_39_Steps_1935_British_poster.jpg
https://upload.wikimedia.org/wikipedia/en/d/d8/La_confidential.jpg
https://upload.wikimedia.org/wikipedia/commons/2/27/Poster_-_Gone_With_the_Wind_01.jpg
https://upload.wikimedia.org/wikipedia/en/4/45/Good_the_bad_and_the_ugly_poster.jpg
https://upload.wikimedia.org/wikipedia/en/a/a7/Skyfall_poster.jpg
https://upload.wikimedia.org/wikipedia/en/1/19/Open_City_DVD.jpg
https://upload.wikimedia.org/wikipedia/en/5/5f/Tokyo_Story_poster.jpg
h

In [9]:
for key in image_errors.keys():
    print(key)

64_Dr._Strangelove
53_12_Angry_Men_(1957_film)
55_Logan_(film)
93_Harry_Potter_and_the_Deathly_Hallows_–_Part_2
22_A_Hard_Day%27s_Night_(film)
72_Rosemary%27s_Baby_(film)


In [14]:
# Inspect unidentifiable images and download them individually
for rank_title, images in image_errors.items():
    if rank_title == '22_A_Hard_Day%27s_Night_(film)':
        url = 'https://upload.wikimedia.org/wikipedia/en/4/47/A_Hard_Days_night_movieposter.jpg'
    elif rank_title == '53_12_Angry_Men_(1957_film)':
        url = 'https://upload.wikimedia.org/wikipedia/en/9/91/12_angry_men.jpg'
    elif rank_title == '72_Rosemary%27s_Baby_(film)':
        url = 'https://upload.wikimedia.org/wikipedia/en/e/ef/Rosemarys_baby_poster.jpg'
    elif rank_title == '93_Harry_Potter_and_the_Deathly_Hallows_–_Part_2':
        url = 'https://upload.wikimedia.org/wikipedia/en/d/df/Harry_Potter_and_the_Deathly_Hallows_%E2%80%93_Part_2.jpg'
    else:
        print "%s without image", rank_title
        break
    title = rank_title[3:]
    df_list.append({'ranking': int(title_list.index(title) + 1),
                    'title': title,
                    'poster_url': url})
    r = requests.get(url)
    # Download movie poster image
    i = Image.open(BytesIO(r.content))
    image_file_format = url.split('.')[-1]
    i.save(folder_name + "/" + rank_title + '.' + image_file_format)

('%s without image', '64_Dr._Strangelove')


In [5]:
# Create DataFrame from list of dictionaries
df = pd.DataFrame(df_list, columns = ['ranking', 'title', 'poster_url'])
df = df.sort_values('ranking').reset_index(drop=True)
df
df.to_csv('dataset.csv', index=False)

NameError: name 'pd' is not defined

In [37]:
import pandas as pd
df = pd.read_csv('bestofrt_master.csv')
df.head(3)

Unnamed: 0,ranking,title,critic_score,number_of_critic_ratings,audience_score,number_of_audience_ratings,review_url,review_text,poster_url
0,1,The Wizard of Oz (1939),99,110,89,874425,http://www.rogerebert.com/reviews/great-movie-...,As a child I simply did not notice whether a m...,https://upload.wikimedia.org/wikipedia/commons...
1,2,Citizen Kane (1941),100,75,90,157274,http://www.rogerebert.com/reviews/great-movie-...,“I don't think any word can explain a man's li...,https://upload.wikimedia.org/wikipedia/en/c/ce...
2,3,The Third Man (1949),100,77,93,53081,http://www.rogerebert.com/reviews/great-movie-...,Has there ever been a film where the music mor...,https://upload.wikimedia.org/wikipedia/en/2/21...


# SQL and Python

### 1. Connect to a database

In [43]:

from sqlalchemy import create_engine

In [44]:
# Create SQLAlchemy Engine and empty bestofrt database
# bestofrt.db will not show up in the Jupyter Notebook dashboard yet
engine = create_engine('sqlite:///bestofrt.db')

### 2. Store pandas DataFrame in database
Store the data in the cleaned master dataset (bestofrt_master) in that database.

In [45]:
# Store cleaned master DataFrame ('df') in a table called master in bestofrt.db
# bestofrt.db will be visible now in the Jupyter Notebook dashboard
df.to_sql('master', engine, index=False)

ProgrammingError: (sqlite3.ProgrammingError) You must not use 8-bit bytestrings unless you use a text_factory that can interpret 8-bit bytestrings (like text_factory = str). It is highly recommended that you instead just switch your application to Unicode strings. [SQL: u'INSERT INTO master (ranking, title, critic_score, number_of_critic_ratings, audience_score, number_of_audience_ratings, review_url, review_text, poster_url) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)'] [parameters: ((1, 'The Wizard of Oz (1939)', 99, 110, 89, 874425, 'http://www.rogerebert.com/reviews/great-movie-the-wizard-of-oz-1939', 'As a child I simply did not notice whether a movie was in color or not. The movies themselves were such an overwhelming mystery that if they wanted t ... (7993 characters truncated) ... to be stuck in,\xe2\x80\x9d decided young Terry McMillan, discontented with her life in Michigan. \xe2\x80\x9cIt beat the farm in Kansas.\xe2\x80\x9d', 'https://upload.wikimedia.org/wikipedia/commons/c/ca/WIZARD_OF_OZ_ORIGINAL_POSTER_1939.jpg'), (2, 'Citizen Kane (1941)', 100, 75, 90, 157274, 'http://www.rogerebert.com/reviews/great-movie-citizen-kane-1941', "\xe2\x80\x9cI don't think any word can explain a man's life,\xe2\x80\x9d says one of the searchers through the warehouse of treasures left behind by  ... (8065 characters truncated) ... as he walks toward us, his stature grows again. A man always seems the same size to himself, because he does not stand where we stand to look at him.", 'https://upload.wikimedia.org/wikipedia/en/c/ce/Citizenkane.jpg'), (3, 'The Third Man (1949)', 100, 77, 93, 53081, 'http://www.rogerebert.com/reviews/great-movie-the-third-man-1949', 'Has there ever been a film where the music more perfectly suited the action than in Carol Reed\'s "The Third Man"? The score was performed on a zithe ... (8000 characters truncated) ... ey plan to remake "The Third Man." Do you think Anna will cave in to Holly--or will she remain true to her bitter cynicism and unspeakable knowledge?', 'https://upload.wikimedia.org/wikipedia/en/2/21/ThirdManUSPoster.jpg'), (4, 'Get Out (2017)', 99, 282, 87, 63837, 'http://www.rogerebert.com/reviews/get-out-2017', 'With the ambitious and challenging \xe2\x80\x9cGet Out,\xe2\x80\x9d which premiered in a secret screening at the 2017 Sundance Film Festival, Jordan  ... (6795 characters truncated) ... st deserve credit for trying something so daring; he should have producers knocking down his door to see what else he\xe2\x80\x99s never seen before.', 'https://upload.wikimedia.org/wikipedia/en/e/eb/Teaser_poster_for_2017_film_Get_Out.png'), (5, 'Mad Max: Fury Road (2015)', 97, 370, 86, 123937, 'http://www.rogerebert.com/reviews/mad-max-fury-road-2015', 'George Miller\xe2\x80\x99s \xe2\x80\x9cMad Max\xe2\x80\x9d films didn\xe2\x80\x99t just make Mel Gibson a star\xe2\x80\x94they completely transformed ... (8500 characters truncated) ...  filmmakers, urging them to follow its audacious path into the genre\xe2\x80\x99s future and, like Miller, try their hardest to create something new.', 'https://upload.wikimedia.org/wikipedia/en/6/6e/Mad_Max_Fury_Road.jpg'), (6, 'The Cabinet of Dr. Caligari (Das Cabinet des Dr. Caligari) (1920)', 100, 49, 89, 27163, 'http://www.rogerebert.com/reviews/great-movie-the-cabinet-of-dr-caligari-1920', 'The first thing everyone notices and best remembers about "The Cabinet of Dr. Caligari" (1920) is the film\'s bizarre look. The actors inhabit a jagg ... (7557 characters truncated) ... titles are also in my Great Movies Collection.)\n\nNote:"Caligari" is part of Kino\'s excellent German Expressionism set and is available separately.', 'https://upload.wikimedia.org/wikipedia/commons/5/52/Das_Cabinet_des_Dr._Caligari.JPG'), (7, 'All About Eve (1950)', 100, 64, 94, 44564, 'http://www.rogerebert.com/reviews/great-movie-all-about-eve-1950', 'Growing older was a smart career move for Bette Davis, whose personality was adult, hard-edged and knowing. Never entirely comfortable as an ingenue, ... (7616 characters truncated) ... d he had no complaints. The reason they have the "no refunds" sign in the theater ticket window, he said, is to keep the rubes from calling the cops.', 'https://upload.wikimedia.org/wikipedia/en/2/22/AllAboutEve.jpeg'), (8, 'Inside Out (2015)', 98, 324, 89, 133558, 'http://www.rogerebert.com/reviews/inside-out-2015', '"Inside Out," a comedy-adventure set inside the mind of an 11-year old girl, is the kind of classic that lingers in the mind after you\'ve seen it, s ... (9298 characters truncated) ... st being childish, or that she wouldn\'t be taking everything so seriously if she were older. We feel for her, and with her. She contains multitudes.', 'https://upload.wikimedia.org/wikipedia/en/0/0a/Inside_Out_%282015_film%29_poster.jpg')  ... displaying 10 of 89 total bound parameter sets ...  (99, 'The Godfather, Part II (1974)', 97, 72, 97, 409574, 'http://www.rogerebert.com/reviews/great-movie-the-godfather-part-ii-1974', 'The musical score plays an even greater role in \xe2\x80\x9cThe Godfather: Part II\xe2\x80\x9d than it did in the original film. Nostalgic, mournful, ... (8491 characters truncated) ...  Bernard Hermann\xe2\x80\x99s score for \xe2\x80\x9cCitizen Kane,\xe2\x80\x9d another film about a man who got everything he wanted and then lost it.', 'https://upload.wikimedia.org/wikipedia/en/0/03/Godfather_part_ii.jpg'), (100, 'Battleship Potemkin (1925)', 100, 45, 85, 18709, 'http://www.rogerebert.com/reviews/great-movie-the-battleship-potemkin-1925', '"The Battleship Potemkin\xe2\x80\x9d has been so famous for so long that it is almost impossible to come to it with a fresh eye. It is one of the fun ... (8272 characters truncated) ... n film history, and the other night in that small-town parking lot I got a sense, a stirring, of the buried power it still contains, awaiting a call.', 'https://upload.wikimedia.org/wikipedia/commons/8/85/Vintage_Potemkin.jpg'))] (Background on this error at: http://sqlalche.me/e/f405)

### 3. Read database data into a pandas DataFrame
Read the brand new data in that database back into a pandas DataFrame.

In [46]:
df_gather = pd.read_sql('SELECT * FROM master', engine)

In [47]:
df_gather.head(3)

Unnamed: 0,ranking,title,critic_score,number_of_critic_ratings,audience_score,number_of_audience_ratings,review_url,review_text,poster_url


## Visual Assessment: Acquaint Yourself
This Auralin Phase II clinical trial dataset comes in three tables: `patients`, `treatments`, and `adverse_reactions`. Acquaint yourself with them through visual assessment below.

### Gather

In [2]:
import pandas as pd

In [3]:
patients = pd.read_csv('patients.csv')
treatments = pd.read_csv('treatments.csv')
adverse_reactions = pd.read_csv('adverse_reactions.csv')

### Assess
In the cells below, each column of each table in this clinical trial dataset is described. To see the table that goes hand in hand with these descriptions, display each table in its entirety by displaying the pandas DataFrame that it was gathered into. This task is the mechanical part of visual assessment in pandas.

# Display the patients table

In [4]:
patients

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,contact,birthdate,weight,height,bmi
0,1,female,Zoe,Wellish,576 Brown Bear Drive,Rancho California,California,92390.0,United States,951-719-9170ZoeWellish@superrito.com,7/10/1976,121.7,66,19.6
1,2,female,Pamela,Hill,2370 University Hill Road,Armstrong,Illinois,61812.0,United States,PamelaSHill@cuvox.de+1 (217) 569-3204,4/3/1967,118.8,66,19.2
2,3,male,Jae,Debord,1493 Poling Farm Road,York,Nebraska,68467.0,United States,402-363-6804JaeMDebord@gustr.com,2/19/1980,177.8,71,24.8
3,4,male,Liêm,Phan,2335 Webster Street,Woodbridge,NJ,7095.0,United States,PhanBaLiem@jourrapide.com+1 (732) 636-8246,7/26/1951,220.9,70,31.7
4,5,male,Tim,Neudorf,1428 Turkey Pen Lane,Dothan,AL,36303.0,United States,334-515-7487TimNeudorf@cuvox.de,2/18/1928,192.3,27,26.1
5,6,male,Rafael,Costa,1140 Willis Avenue,Daytona Beach,Florida,32114.0,United States,386-334-5237RafaelCardosoCosta@gustr.com,8/31/1931,183.9,70,26.4
6,7,female,Mary,Adams,3145 Sheila Lane,Burbank,NV,84728.0,United States,775-533-5933MaryBAdams@einrot.com,11/19/1969,146.3,65,24.3
7,8,female,Xiuxiu,Chang,2687 Black Oak Hollow Road,Morgan Hill,CA,95037.0,United States,XiuxiuChang@einrot.com1 408 778 3236,8/13/1958,158.0,60,30.9
8,9,male,Dsvid,Gustafsson,1790 Nutter Street,Kansas City,MO,64105.0,United States,816-265-9578DavidGustafsson@armyspy.com,3/6/1937,163.9,66,26.5
9,10,female,Sophie,Cabrera,3303 Anmoore Road,New York,New York,10011.0,United States,SophieCabreraIbarra@teleworm.us1 718 795 9124,12/3/1930,194.7,64,33.4


`patients` columns:
- **patient_id**: the unique identifier for each patient in the [Master Patient Index](https://en.wikipedia.org/wiki/Enterprise_master_patient_index) (i.e. patient database) of the pharmaceutical company that is producing Auralin
- **assigned_sex**: the assigned sex of each patient at birth (male or female)
- **given_name**: the given name (i.e. first name) of each patient
- **surname**: the surname (i.e. last name) of each patient
- **address**: the main address for each patient
- **city**: the corresponding city for the main address of each patient
- **state**: the corresponding state for the main address of each patient
- **zip_code**: the corresponding zip code for the main address of each patient
- **country**: the corresponding country for the main address of each patient (all United states for this clinical trial)
- **contact**: phone number and email information for each patient
- **birthdate**: the date of birth of each patient (month/day/year). The [inclusion criteria](https://en.wikipedia.org/wiki/Inclusion_and_exclusion_criteria) for this clinical trial is  age >= 18 *(there is no maximum age because diabetes is a [growing problem](http://www.diabetes.co.uk/diabetes-and-the-elderly.html) among the elderly population)*
- **weight**: the weight of each patient in pounds (lbs)
- **height**: the height of each patient in inches (in)
- **bmi**: the Body Mass Index (BMI) of each patient. BMI is a simple calculation using a person's height and weight. The formula is BMI = kg/m<sup>2</sup> where kg is a person's weight in kilograms and m<sup>2</sup> is their height in metres squared. A BMI of 25.0 or more is overweight, while the healthy range is 18.5 to 24.9. *The [inclusion criteria](https://en.wikipedia.org/wiki/Inclusion_and_exclusion_criteria) for this clinical trial is 16 >= BMI >= 38.*

# Display the treatments table


In [5]:
treatments

Unnamed: 0,given_name,surname,auralin,novodra,hba1c_start,hba1c_end,hba1c_change
0,veronika,jindrová,41u - 48u,-,7.63,7.20,
1,elliot,richardson,-,40u - 45u,7.56,7.09,0.97
2,yukitaka,takenaka,-,39u - 36u,7.68,7.25,
3,skye,gormanston,33u - 36u,-,7.97,7.62,0.35
4,alissa,montez,-,33u - 29u,7.78,7.46,0.32
5,jasmine,sykes,-,42u - 44u,7.56,7.18,0.38
6,sophia,haugen,37u - 42u,-,7.65,7.27,0.38
7,eddie,archer,31u - 38u,-,7.89,7.55,0.34
8,saber,ménard,-,54u - 54u,8.08,7.70,
9,asia,woźniak,30u - 36u,-,7.76,7.37,


350 patients participated in this clinical trial. None of the patients were using Novodra (a popular injectable insulin) or Auralin (the oral insulin being researched) as their primary source of insulin before.  All were experiencing elevated HbA1c levels.

All 350 patients were treated with Novodra to establish a baseline HbA1c level and insulin dose. After four weeks, which isn’t enough time to capture all the change in HbA1c that can be attributed by the switch to Auralin or Novodra:
- 175 patients switched to Auralin for 24 weeks
- 175 patients continued using Novodra for 24 weeks

`treatments` columns:
- **given_name**: the given name of each patient in the Master Patient Index that took part in the clinical trial
- **surname**: the surname of each patient in the Master Patient Index that took part in the clinical trial
- **auralin**: the baseline median daily dose of insulin from the week prior to switching to Auralin (the number before the dash) *and* the ending median daily dose of insulin at the end of the 24 weeks of treatment measured over the 24th week of treatment (the number after the dash). Both are measured in units (shortform 'u'), which is the [international unit](https://en.wikipedia.org/wiki/International_unit) of measurement and the standard measurement for insulin.
- **novodra**: same as above, except for patients that continued treatment with Novodra
- **hba1c_start**: the patient's HbA1c level at the beginning of the first week of treatment. HbA1c stands for Hemoglobin A1c. The [HbA1c test](https://depts.washington.edu/uwcoe/healthtopics/diabetes.html) measures what the average blood sugar has been over the past three months. It is thus a powerful way to get an overall sense of how well diabetes has been controlled. Everyone with diabetes should have this test 2 to 4 times per year. Measured in %.
- **hba1c_end**: the patient's HbA1c level at the end of the last week of treatment
- **hba1c_change**: the change in the patient's HbA1c level from the start of treatment to the end, i.e., `hba1c_start` - `hba1c_end`. For Auralin to be deemed effective, it must be "noninferior" to Novodra, the current standard for insulin. This "noninferiority" is statistically defined as the upper bound of the 95% confidence interval being less than 0.4% for the difference between the mean HbA1c changes for Novodra and Auralin (i.e. Novodra minus Auralin).

# Display the adverse_reactions table


In [6]:
adverse_reactions

Unnamed: 0,given_name,surname,adverse_reaction
0,berta,napolitani,injection site discomfort
1,lena,baer,hypoglycemia
2,joseph,day,hypoglycemia
3,flavia,fiorentino,cough
4,manouck,wubbels,throat irritation
5,jasmine,sykes,hypoglycemia
6,louise,johnson,hypoglycemia
7,albinca,komavec,hypoglycemia
8,noe,aranda,hypoglycemia
9,sofia,hermansen,injection site discomfort


`adverse_reactions` columns:
- **given_name**: the given name of each patient in the Master Patient Index that took part in the clinical trial and had an adverse reaction (includes both patients treated Auralin and Novodra)
- **surname**: the surname of each patient in the Master Patient Index that took part in the clinical trial and had an adverse reaction (includes both patients treated Auralin and Novodra)
- **adverse_reaction**: the adverse reaction reported by the patient

Additional useful information:
- [Insulin resistance varies person to person](http://www.tudiabetes.org/forum/t/how-much-insulin-is-too-much-on-a-daily-basis/9804/5), which is why both starting median daily dose and ending median daily dose are required, i.e., to calculate change in dose.
- It is important to test drugs and medical products in the people they are meant to help. People of different age, race, sex, and ethnic group must be included in clinical trials. This [diversity](https://www.clinicalleader.com/doc/an-fda-perspective-on-patient-diversity-in-clinical-trials-0001) is reflected in the `patients` table.
- Ensuring column names are descriptive enough is an important step in acquainting yourself with the data. 'Descriptive enough' is subjective. Ideally you want short column names (so they are easier to type and read in code form) but also fully descriptive. Length vs. descriptiveness is a tradeoff and common debate (a [similar debate](https://softwareengineering.stackexchange.com/questions/176582/is-there-an-excuse-for-short-variable-names) exists for variable names). The *auralin* and *novodra* column names are probably not descriptive enough, but you'll address that later so don't worry about that for now.

In [7]:
703 * 192.3 / (27 * 27)

185.44156378600823

In [8]:
patients.ix[patients['height'] == 27].index

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Int64Index([4], dtype='int64')

# Data Quality Dimensions

## Completeness

* Do we have all of the records that we should?
* Do we have missing records or not? 
* Are there specific rows, columns, or cells missing?
* Ex.: Missing Hba1c change variable

## Validity

* We've the records but they are not valid. More technically, it doesn't conform to a defined schema (defined set of rules for data).

## Accuracy

* Inaccurate data is wrong data that is valid. It adheres to the defined schema, but it's still incorrect.

## Consistency

* Inconsistent data is both valid and accurate.
* Ex.: State abreviations

## Quality

### patients table
* zip code is a float and not a string;
* zip code has four digits sometimes;
* Tim Neudorf height is 27 instead of 72 in;
* full state names sometimes, abbreviations other times;
* Dsvid Gustafsson;
* missing demographic information (address - contact columns);
* erroneous dataypes (assigned sex, state, zip_code, and birthdate columns);
* multiple phone number formats

### treatments table
* missing HbA1c changes;
* the letter u in starting and ending doses for Auralin and Novodra;
* lowercase given names and surnames;
* missing records (280 instead of 350);
* erroneous datatypes (auralin and novodra columns);
* inaccurate HbA1c changes (4 s mistaken as 9 s)
### adverse reactions table
* lower case given names and surnames

In [4]:
patients.head()

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,contact,birthdate,weight,height,bmi
0,1,female,Zoe,Wellish,576 Brown Bear Drive,Rancho California,California,92390.0,United States,951-719-9170ZoeWellish@superrito.com,7/10/1976,121.7,66,19.6
1,2,female,Pamela,Hill,2370 University Hill Road,Armstrong,Illinois,61812.0,United States,PamelaSHill@cuvox.de+1 (217) 569-3204,4/3/1967,118.8,66,19.2
2,3,male,Jae,Debord,1493 Poling Farm Road,York,Nebraska,68467.0,United States,402-363-6804JaeMDebord@gustr.com,2/19/1980,177.8,71,24.8
3,4,male,Liêm,Phan,2335 Webster Street,Woodbridge,NJ,7095.0,United States,PhanBaLiem@jourrapide.com+1 (732) 636-8246,7/26/1951,220.9,70,31.7
4,5,male,Tim,Neudorf,1428 Turkey Pen Lane,Dothan,AL,36303.0,United States,334-515-7487TimNeudorf@cuvox.de,2/18/1928,192.3,27,26.1


In [5]:
patients.tail()

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,contact,birthdate,weight,height,bmi
498,499,male,Mustafa,Lindström,2530 Victoria Court,Milton Mills,ME,3852.0,United States,207-477-0579MustafaLindstrom@jourrapide.com,4/10/1959,181.1,72,24.6
499,500,male,Ruman,Bisliev,494 Clarksburg Park Road,Sedona,AZ,86341.0,United States,928-284-4492RumanBisliev@gustr.com,3/26/1948,239.6,70,34.4
500,501,female,Jinke,de Keizer,649 Nutter Street,Overland Park,MO,64110.0,United States,816-223-6007JinkedeKeizer@teleworm.us,1/13/1971,171.2,67,26.8
501,502,female,Chidalu,Onyekaozulu,3652 Boone Crockett Lane,Seattle,WA,98109.0,United States,ChidaluOnyekaozulu@jourrapide.com1 360 443 2060,2/13/1952,176.9,67,27.7
502,503,male,Pat,Gersten,2778 North Avenue,Burr,Nebraska,68324.0,United States,PatrickGersten@rhyta.com402-848-4923,5/3/1954,138.2,71,19.3


In [6]:
treatments.sample()

Unnamed: 0,given_name,surname,auralin,novodra,hba1c_start,hba1c_end,hba1c_change
184,chân,bùi,31u - 42u,-,7.53,7.18,0.35


In [7]:
treatments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 280 entries, 0 to 279
Data columns (total 7 columns):
given_name      280 non-null object
surname         280 non-null object
auralin         280 non-null object
novodra         280 non-null object
hba1c_start     280 non-null float64
hba1c_end       280 non-null float64
hba1c_change    171 non-null float64
dtypes: float64(3), object(4)
memory usage: 15.4+ KB


In [8]:
patients.describe()

Unnamed: 0,patient_id,zip_code,weight,height,bmi
count,503.0,491.0,503.0,503.0,503.0
mean,252.0,49084.118126,173.43499,66.634195,27.483897
std,145.347859,30265.807442,33.916741,4.411297,5.276438
min,1.0,1002.0,48.8,27.0,17.1
25%,126.5,21920.5,149.3,63.0,23.3
50%,252.0,48057.0,175.3,67.0,27.2
75%,377.5,75679.0,199.5,70.0,31.75
max,503.0,99701.0,255.9,79.0,37.7


In [9]:
adverse_reactions['adverse_reaction'].value_counts

<bound method Series.value_counts of 0     injection site discomfort
1                  hypoglycemia
2                  hypoglycemia
3                         cough
4             throat irritation
5                  hypoglycemia
6                  hypoglycemia
7                  hypoglycemia
8                  hypoglycemia
9     injection site discomfort
10                     headache
11                        cough
12                 hypoglycemia
13    injection site discomfort
14                 hypoglycemia
15                       nausea
16                 hypoglycemia
17                       nausea
18                 hypoglycemia
19                     headache
20                 hypoglycemia
21    injection site discomfort
22    injection site discomfort
23                 hypoglycemia
24    injection site discomfort
25                 hypoglycemia
26            throat irritation
27                 hypoglycemia
28                 hypoglycemia
29                 hypoglycemia
30 

In [10]:
patients[patients['city'] == 'New York']

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,contact,birthdate,weight,height,bmi
9,10,female,Sophie,Cabrera,3303 Anmoore Road,New York,New York,10011.0,United States,SophieCabreraIbarra@teleworm.us1 718 795 9124,12/3/1930,194.7,64,33.4
35,36,female,Kamila,Pecinová,3558 Longview Avenue,New York,New York,10004.0,United States,718-501-0503KamilaPecinova@dayrep.com,12/23/1985,198.9,62,36.4
84,85,female,Nương,Vũ,465 Southern Street,New York,NY,10001.0,United States,VuCamNuong@fleckens.hu516-720-5094,2/1/1981,138.2,63,24.5
129,130,female,Rebecca,Jephcott,989 Wayback Lane,New York,NY,10004.0,United States,631-370-7406RebeccaJephcott@armyspy.com,8/1/1966,203.3,65,33.8
142,143,male,Finley,Chandler,2754 Westwood Avenue,New York,New York,10001.0,United States,516-740-5280FinleyChandler@dayrep.com,10/25/1936,150.9,70,21.6
152,153,male,Christopher,Woodward,3450 Southern Street,New York,NY,10004.0,United States,ChristopherWoodward@jourrapide.com+1 (516) 630...,9/4/1984,212.2,66,34.2
188,189,male,Søren,Sørensen,2397 Bell Street,New York,NY,10011.0,United States,SrenSrensen@superrito.com1 212 201 3108,12/31/1942,157.1,67,24.6
213,214,female,Onyemaechi,Onwughara,685 Duncan Avenue,New York,NY,10013.0,United States,917-622-9142OnyemaechiOnwughara@einrot.com,3/8/1989,131.1,69,19.4
215,216,male,John,Doe,123 Main Street,New York,NY,12345.0,United States,johndoe@email.com1234567890,1/1/1975,180.0,72,24.4
229,230,male,John,Doe,123 Main Street,New York,NY,12345.0,United States,johndoe@email.com1234567890,1/1/1975,180.0,72,24.4


In [11]:
len(patients[patients['city'] == 'New York'])

18