# Data and How To Source It

# What is an API?

API stands for Application Programming Interface

# How do we make HTTPS requests in Python

In [1]:
import requests

In [2]:
# Issues a get request to the API Endpoint
r = requests.get('https://api.github.com/events')

In [3]:
#Attribute that gets me the raw text as a string
r.text



In [4]:
r.json()[0]

{'id': '20962391598',
 'type': 'PushEvent',
 'actor': {'id': 25180681,
  'login': 'renovate-bot',
  'display_login': 'renovate-bot',
  'gravatar_id': '',
  'url': 'https://api.github.com/users/renovate-bot',
  'avatar_url': 'https://avatars.githubusercontent.com/u/25180681?'},
 'repo': {'id': 287383519,
  'name': 'renovate-bot/python-docs-samples-1',
  'url': 'https://api.github.com/repos/renovate-bot/python-docs-samples-1'},
 'payload': {'push_id': 9462562735,
  'size': 0,
  'distinct_size': 0,
  'ref': 'refs/heads/main',
  'head': 'fe8dd4ce6953f40afa7e6b5b797b22e7f4daf546',
  'before': 'fe8dd4ce6953f40afa7e6b5b797b22e7f4daf546',
  'commits': []},
 'public': True,
 'created_at': '2022-03-28T00:19:50Z'}

In [5]:
# It's as simple as that. Sometimes, the encoding isn't right. Requests tries to infer
# What encoding to use, so you can change it manually with
# r.encoding = 'encoding_to_use'
# We can see that the returned object is what we are operating on.
#UTF-8

# Let's work through an example

In [6]:
r = requests.get('https://data.cityofnewyork.us/resource/f9bf-2cp4.json')

In [7]:
r.text

'[{"dbn":"01M292","school_name":"HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES","num_of_sat_test_takers":"29","sat_critical_reading_avg_score":"355","sat_math_avg_score":"404","sat_writing_avg_score":"363"}\n,{"dbn":"01M448","school_name":"UNIVERSITY NEIGHBORHOOD HIGH SCHOOL","num_of_sat_test_takers":"91","sat_critical_reading_avg_score":"383","sat_math_avg_score":"423","sat_writing_avg_score":"366"}\n,{"dbn":"01M450","school_name":"EAST SIDE COMMUNITY SCHOOL","num_of_sat_test_takers":"70","sat_critical_reading_avg_score":"377","sat_math_avg_score":"402","sat_writing_avg_score":"370"}\n,{"dbn":"01M458","school_name":"FORSYTH SATELLITE ACADEMY","num_of_sat_test_takers":"7","sat_critical_reading_avg_score":"414","sat_math_avg_score":"401","sat_writing_avg_score":"359"}\n,{"dbn":"01M509","school_name":"MARTA VALLE HIGH SCHOOL","num_of_sat_test_takers":"44","sat_critical_reading_avg_score":"390","sat_math_avg_score":"433","sat_writing_avg_score":"384"}\n,{"dbn":"01M515","school_name":"LOWE

In [8]:
r.json()

[{'dbn': '01M292',
  'school_name': 'HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES',
  'num_of_sat_test_takers': '29',
  'sat_critical_reading_avg_score': '355',
  'sat_math_avg_score': '404',
  'sat_writing_avg_score': '363'},
 {'dbn': '01M448',
  'school_name': 'UNIVERSITY NEIGHBORHOOD HIGH SCHOOL',
  'num_of_sat_test_takers': '91',
  'sat_critical_reading_avg_score': '383',
  'sat_math_avg_score': '423',
  'sat_writing_avg_score': '366'},
 {'dbn': '01M450',
  'school_name': 'EAST SIDE COMMUNITY SCHOOL',
  'num_of_sat_test_takers': '70',
  'sat_critical_reading_avg_score': '377',
  'sat_math_avg_score': '402',
  'sat_writing_avg_score': '370'},
 {'dbn': '01M458',
  'school_name': 'FORSYTH SATELLITE ACADEMY',
  'num_of_sat_test_takers': '7',
  'sat_critical_reading_avg_score': '414',
  'sat_math_avg_score': '401',
  'sat_writing_avg_score': '359'},
 {'dbn': '01M509',
  'school_name': 'MARTA VALLE HIGH SCHOOL',
  'num_of_sat_test_takers': '44',
  'sat_critical_reading_avg_score': '390'

In [9]:
import pandas as pd

In [10]:
df = pd.read_json(r.text)

In [11]:
df


Unnamed: 0,dbn,school_name,num_of_sat_test_takers,sat_critical_reading_avg_score,sat_math_avg_score,sat_writing_avg_score
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384
...,...,...,...,...,...,...
473,75X012,P.S. X012 LEWIS AND CLARK SCHOOL,s,s,s,s
474,75X754,J. M. RAPPORT SCHOOL CAREER DEVELOPMENT,s,s,s,s
475,79M645,SCHOOL FOR COOPERATIVE TECHNICAL EDUCATION,s,s,s,s
476,79Q950,GED PLUS s CITYWIDE,8,496,400,426


In [12]:
payload = {'$limit':10,"$$app_token":"tq19sOJfOJFRot5OrORkOEbsS"}
r = requests.get('https://data.cityofnewyork.us/resource/f9bf-2cp4.json', params=payload)

In [13]:
r.text

'{\n  "code" : "permission_denied",\n  "error" : true,\n  "message" : "Invalid app_token specified"\n}\n'

In [14]:
r.json()

{'code': 'permission_denied',
 'error': True,
 'message': 'Invalid app_token specified'}

In [15]:
# form a get to grab the websites HTML
r = requests.get('https://basketball-reference.com/teams/PHI/2022.html')

In [16]:
from bs4 import BeautifulSoup

In [17]:
# turn the raw HTML into a soup object
soup = BeautifulSoup(r.text,'html.parser')

In [18]:
#One way of finding the table that contains the data
earnest_attempt = soup.findAll('div',attrs={'id':'div_roster'})

In [19]:
#Another way of finding the table that contains the data.
found_table = soup.findAll('table')

In [20]:
#I know the roster table is the first table that appears, so I'm grabbing it
roster_table = found_table[0]

In [21]:
#Grab all the table rows from the table.
player_rows = roster_table('tr')

In [22]:
player_rows

[<tr>
 <th aria-label="No." class="poptip sort_default_asc center" data-stat="number" data-tip="Uniform Number" scope="col">No.</th>
 <th aria-label="Player" class="poptip sort_default_asc center" data-stat="player" scope="col">Player</th>
 <th aria-label="Pos" class="poptip sort_default_asc center" data-stat="pos" data-tip="Position" scope="col">Pos</th>
 <th aria-label="Ht" class="poptip sort_default_asc center" data-stat="height" data-tip="Height" scope="col">Ht</th>
 <th aria-label="Wt" class="poptip sort_default_asc center" data-stat="weight" data-tip="Weight" scope="col">Wt</th>
 <th aria-label="Birth Date" class="poptip sort_default_asc center" data-stat="birth_date" scope="col">Birth Date</th>
 <th aria-label=" " class="poptip center" data-stat="birth_country" scope="col"> </th>
 <th aria-label="Exp" class="poptip sort_default_asc center" data-stat="years_experience" data-tip="Years experience in NBA/ABA (prior to this season)" scope="col">Exp</th>
 <th aria-label="College" cla

In [23]:
#I know that I don't need the first row, so I'm deleting it
del player_rows[0]

In [24]:
player_rows

[<tr><th class="center" data-stat="number" scope="row">20</th><td class="left" csk="Niang,Georges" data-stat="player"><a href="/players/n/niangge01.html">Georges Niang</a></td><td class="center" csk="4" data-stat="pos">PF</td><td class="right" csk="79.0" data-stat="height">6-7</td><td class="right" data-stat="weight">230</td><td class="left" csk="19930617" data-stat="birth_date">June 17, 1993</td><td class="right" data-stat="birth_country"><span class="f-i f-us" style="">us</span></td><td class="right" csk="5" data-stat="years_experience">5</td><td class="left" data-stat="college"><a href="/friv/colleges.fcgi?college=iowast">Iowa State</a></td></tr>,
 <tr><th class="center iz" data-stat="number" scope="row">0</th><td class="left" csk="Maxey,Tyrese" data-stat="player"><a href="/players/m/maxeyty01.html">Tyrese Maxey</a></td><td class="center" csk="1" data-stat="pos">PG</td><td class="right" csk="74.0" data-stat="height">6-2</td><td class="right" data-stat="weight">200</td><td class="lef

In [25]:
#I am selecting all the Table Data from the first row
player_rows[0].find_all('td')

[<td class="left" csk="Niang,Georges" data-stat="player"><a href="/players/n/niangge01.html">Georges Niang</a></td>,
 <td class="center" csk="4" data-stat="pos">PF</td>,
 <td class="right" csk="79.0" data-stat="height">6-7</td>,
 <td class="right" data-stat="weight">230</td>,
 <td class="left" csk="19930617" data-stat="birth_date">June 17, 1993</td>,
 <td class="right" data-stat="birth_country"><span class="f-i f-us" style="">us</span></td>,
 <td class="right" csk="5" data-stat="years_experience">5</td>,
 <td class="left" data-stat="college"><a href="/friv/colleges.fcgi?college=iowast">Iowa State</a></td>]

In [26]:
#List comprehension to extract all the text data from each TD element.
extracted_text = [ele.text for ele in player_rows[0].find_all('td') ]

In [27]:
extracted_text 

['Georges Niang', 'PF', '6-7', '230', 'June 17, 1993', 'us', '5', 'Iowa State']

In [28]:
#This is going to serve as the keys in my k,v pairings
keys = ['Player_Name','Position','Height','Weight','DOB','COB','Experience','Alma_Matter']

In [29]:
#initialize a dict to hold the result of my for loop
player_dict = []


#Iterate over the list of rows in my table
#apply the same logic as before.
#Locate all the td elements
#extract all the text from the td elements
#zip them with my keys
for player in player_rows:
    found_rows = player.find_all('td')
    stripped_text = [ele.text for ele in found_rows]
    player_dict.append(dict(zip(keys,stripped_text))) 

In [30]:
player_dict

[{'Player_Name': 'Georges Niang',
  'Position': 'PF',
  'Height': '6-7',
  'Weight': '230',
  'DOB': 'June 17, 1993',
  'COB': 'us',
  'Experience': '5',
  'Alma_Matter': 'Iowa State'},
 {'Player_Name': 'Tyrese Maxey',
  'Position': 'PG',
  'Height': '6-2',
  'Weight': '200',
  'DOB': 'November 4, 2000',
  'COB': 'us',
  'Experience': '1',
  'Alma_Matter': 'Kentucky'},
 {'Player_Name': 'Tobias Harris',
  'Position': 'PF',
  'Height': '6-8',
  'Weight': '226',
  'DOB': 'July 15, 1992',
  'COB': 'us',
  'Experience': '10',
  'Alma_Matter': 'Tennessee'},
 {'Player_Name': 'Furkan Korkmaz',
  'Position': 'SG',
  'Height': '6-7',
  'Weight': '202',
  'DOB': 'July 24, 1997',
  'COB': 'tr',
  'Experience': '4',
  'Alma_Matter': ''},
 {'Player_Name': 'Joel Embiid',
  'Position': 'C',
  'Height': '7-0',
  'Weight': '280',
  'DOB': 'March 16, 1994',
  'COB': 'cm',
  'Experience': '5',
  'Alma_Matter': 'Kansas'},
 {'Player_Name': 'Matisse Thybulle',
  'Position': 'SG',
  'Height': '6-5',
  'Weight

In [31]:
#create a new df from my list of Dicts
df = pd.DataFrame(player_dict)

In [32]:
df

Unnamed: 0,Player_Name,Position,Height,Weight,DOB,COB,Experience,Alma_Matter
0,Georges Niang,PF,6-7,230,"June 17, 1993",us,5,Iowa State
1,Tyrese Maxey,PG,6-2,200,"November 4, 2000",us,1,Kentucky
2,Tobias Harris,PF,6-8,226,"July 15, 1992",us,10,Tennessee
3,Furkan Korkmaz,SG,6-7,202,"July 24, 1997",tr,4,
4,Joel Embiid,C,7-0,280,"March 16, 1994",cm,5,Kansas
5,Matisse Thybulle,SG,6-5,201,"March 4, 1997",us,2,Washington
6,Danny Green,SF,6-6,215,"June 22, 1987",us,12,UNC
7,Isaiah Joe,SG,6-4,165,"July 2, 1999",us,1,Arkansas
8,Shake Milton,PG,6-5,205,"September 26, 1996",us,3,SMU
9,Paul Reed,C,6-9,210,"June 14, 1999",us,1,DePaul


In [33]:
#Method number 2 using lists
player_dict = []
for player in player_rows:
    found_rows = player.find_all('td')
    stripped_text = [ele.text for ele in found_rows]
    player_dict.append(stripped_text)

In [34]:
#Initialize my DF using list of lists
df = pd.DataFrame(player_dict,columns=['Player_Name','Position','Height','Weight','DOB','COB','Experience','Alma_Matter'])


In [35]:
df

Unnamed: 0,Player_Name,Position,Height,Weight,DOB,COB,Experience,Alma_Matter
0,Georges Niang,PF,6-7,230,"June 17, 1993",us,5,Iowa State
1,Tyrese Maxey,PG,6-2,200,"November 4, 2000",us,1,Kentucky
2,Tobias Harris,PF,6-8,226,"July 15, 1992",us,10,Tennessee
3,Furkan Korkmaz,SG,6-7,202,"July 24, 1997",tr,4,
4,Joel Embiid,C,7-0,280,"March 16, 1994",cm,5,Kansas
5,Matisse Thybulle,SG,6-5,201,"March 4, 1997",us,2,Washington
6,Danny Green,SF,6-6,215,"June 22, 1987",us,12,UNC
7,Isaiah Joe,SG,6-4,165,"July 2, 1999",us,1,Arkansas
8,Shake Milton,PG,6-5,205,"September 26, 1996",us,3,SMU
9,Paul Reed,C,6-9,210,"June 14, 1999",us,1,DePaul


In [36]:
soup.findAll('table')

[<table class="sortable stats_table" data-cols-to-freeze=",2" id="roster">
 <caption>Roster Table</caption>
 <colgroup><col/><col/><col/><col/><col/><col/><col/><col/><col/></colgroup>
 <thead>
 <tr>
 <th aria-label="No." class="poptip sort_default_asc center" data-stat="number" data-tip="Uniform Number" scope="col">No.</th>
 <th aria-label="Player" class="poptip sort_default_asc center" data-stat="player" scope="col">Player</th>
 <th aria-label="Pos" class="poptip sort_default_asc center" data-stat="pos" data-tip="Position" scope="col">Pos</th>
 <th aria-label="Ht" class="poptip sort_default_asc center" data-stat="height" data-tip="Height" scope="col">Ht</th>
 <th aria-label="Wt" class="poptip sort_default_asc center" data-stat="weight" data-tip="Weight" scope="col">Wt</th>
 <th aria-label="Birth Date" class="poptip sort_default_asc center" data-stat="birth_date" scope="col">Birth Date</th>
 <th aria-label=" " class="poptip center" data-stat="birth_country" scope="col"> </th>
 <th ari

In [37]:
import selenium

In [38]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys


In [39]:
#driver = webdriver.Chrome(executable_path='C:/path/to/chromedriver.exe')

In [40]:
#Code to initialize the chrome driver
driver = webdriver.Chrome()

#have the chrome browser issue a get request to googl
driver.get("http://www.google.com")

WebDriverException: Message: 'chromedriver' executable needs to be in PATH. Please see https://chromedriver.chromium.org/home


In [None]:
search_box = driver.find_element(By.NAME, "q")

In [None]:
search_button = driver.find_element(By.NAME, "btnK")

In [None]:
search_box.send_keys("Selenium")

In [None]:
search_button.click()

In [None]:
driver.find_element(By.NAME, "q").get_attribute("value")

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys



#driver = webdriver.Chrome(executable_path='C:/path/to/chromedriver.exe')
#This is for you want the driver in a static place

driver = webdriver.Chrome()
#Initialize the chrome instance

driver.get("http://www.google.com")
#Issue a get request to the website


search_box = driver.find_element(By.NAME, "q")
#Search the webpage for an element called a, set it equal to search_box


driver.implicitly_wait(2)
#Tell the driver to not do anything for 2 seconds ( Why? It's because we need the search button to load in after some text is being put into the search box)

search_button = driver.find_element(By.NAME, "btnK")
#Find the element on the webpage called btnL


search_box.send_keys("Selenium")
#Mimic the keystrokes S E L E N I U M to the search box


search_button.click()
#Mimic a click of the button on the search_button element



driver.find_element(By.NAME, "q").get_attribute("value") # => "Selenium"
#Find the element on the webpage called q, and get the attribute value.


In [None]:
driver = webdriver.Chrome()
driver.implicitly_wait(2)



driver.get("https://www.basketball-reference.com/teams/PHI/2022.html")
roster_table = driver.find_element(By.ID, "roster")
raw_html = roster_table.get_attribute('innerHTML')
print(roster_table.get_attribute('innerHTML'))
driver.close()


In [None]:
soup = BeautifulSoup(raw_html,'html.parser')

In [None]:
all_rows = soup.findAll('tr')

In [None]:
del all_rows[0]

In [None]:
player_dict = []
player_list = []
keys = ['Player_Name','Position','Height','Weight','DOB','COB','Number','Alma_Matter']
for player in all_rows:
    found_rows= player.find_all('td')
    stripped_text = [ele.text for ele in found_rows]
    player_dict.append(dict(zip(keys,stripped_text)))


player_dict

In [None]:
df = pd.DataFrame(player_dict)

In [None]:
df

In [None]:
driver = webdriver.Chrome()
driver.implicitly_wait(2)



driver.get("https://www.basketball-reference.com/teams/PHI/2022.html")
salaries_table = driver.find_element(By.ID, "salaries2")
raw_html = salaries_table.get_attribute('innerHTML')
print(salaries_table.get_attribute('innerHTML'))
driver.close()


In [None]:
soup = BeautifulSoup(raw_html,'html.parser')

In [None]:
player_salaries = soup.findAll('tr')

In [None]:
del player_salaries[0]

In [None]:
player_salaries

In [None]:
player_salaries_list = []
for player in player_salaries:
    found_rows= player.find_all('td')
    stripped_text = [ele.text for ele in found_rows]
    player_salaries_list.append(stripped_text)




In [None]:
player_salaries_list