In [318]:
import pandas as pd
from splinter import Browser
from bs4 import BeautifulSoup as bs
from bs4 import Comment
import requests
from webdriver_manager.chrome import ChromeDriverManager

In [319]:
teams = ['ATL', 'BRK', 'BOS', 'CHO', 'CHI', 
         'CLE', 'DAL', 'DEN', 'DET', 'GSW', 
         'HOU', 'IND', 'LAC', 'LAL', 'MEM', 
         'MIA', 'MIL', 'MIN', 'NOP', 'NYK', 
         'OKC', 'ORL', 'PHI', 'PHO', 'POR', 
         'SAC', 'SAS', 'TOR', 'UTA', 'WAS']

# create dictionary with info for each baller on an nba roster at the end of the 2020 season
ballers = {'Name': [], 'Page': []}

ogs = []

# iterate through each team
for team in teams:
    
    # set url for team's basketball reference page in 2020
    url = 'https://www.basketball-reference.com/teams/'+team+'/2020.html'

    # retrieve page with the requests module
    html = requests.get(url)

    # create BeautifulSoup object; parse with 'html.parser'
    soup = bs(html.text, 'html.parser')

    # scrape the name and bball ref page of each baller listed on the roster
    for x in range(1,50):
        try:
            name = soup.find_all('tr')[x].a.text
            ballers['Name'].append(name)
            ogs.append(name)
            ballers['Page'].append('https://www.basketball-reference.com/' + soup.find_all('tr')[x].a['href'])
        except:
            # break loop when team's roster has been exhausted
            break

# show ballers dictionary
ballers

    

{'Name': ["DeAndre' Bembry",
  'Charlie Brown',
  'Vince Carter',
  'John Collins',
  'Allen Crabbe',
  'Dewayne Dedmon',
  'Bruno Fernando',
  'Brandon Goodwin',
  'Treveon Graham',
  'Kevin Huerter',
  "De'Andre Hunter",
  'Damian Jones',
  'Alex Len',
  'Jabari Parker',
  'Chandler Parsons',
  'Cam Reddish',
  'Jeff Teague',
  'Evan Turner',
  'Tyrone Wallace',
  'Paul Watson',
  'Trae Young',
  'Jarrett Allen',
  'Justin Anderson',
  'Wilson Chandler',
  'Chris Chiozza',
  'Nicolas Claxton',
  'Jamal Crawford',
  'Spencer Dinwiddie',
  'Henry Ellenson',
  'Donta Hall',
  'Joe Harris',
  'Kyrie Irving',
  'Tyler Johnson',
  'DeAndre Jordan',
  'Rodions Kurucs',
  'Caris LeVert',
  'Timothé Luwawu-Cabarrot',
  'Jeremiah Martin',
  'Džanan Musa',
  'David Nwaba',
  'Theo Pinson',
  'Taurean Prince',
  'Iman Shumpert',
  'Garrett Temple',
  'Lance Thomas',
  'Jaylen Brown',
  'Carsen Edwards',
  'Tacko Fall',
  'Javonte Green',
  'Gordon Hayward',
  'Enes Kanter',
  'Romeo Langford',
 

In [322]:
nope = []

ballers['Photo'] = [] 
ballers['City'] = []
ballers['Country'] = []

# go through each ballers bballref page
for i in range(0, (len(ballers['Page'])-1)):

    try:
        print("")
        print(ballers['Name'][i])

        # setup scrape
        url = ballers['Page'][i]
        html = requests.get(url)

        # go to page and scrape
        soup = bs(html.text, 'html.parser')

        # get conventional per game stats and put in pandas df
        stats = pd.read_html(url)[0]

        # filter for full seasons from 2015-2020
        standard = stats.loc[(stats['Season'] == '2019-20') | 
                  (stats['Season'] == '2018-19') |
                  (stats['Season'] == '2017-18') | 
                  (stats['Season'] == '2016-17') | 
                  (stats['Season'] == '2015-16') ]

        # get advanced per game stats and put in pandas df
        table = soup.find_all(string=lambda text: isinstance(text, Comment))[46]
        stats = pd.read_html(table)[0]

        # filter for full seasons from 2015-2020 and remove redundant columns
        advanced = stats.loc[(stats['Season'] == '2019-20') | 
                  (stats['Season'] == '2018-19') |
                  (stats['Season'] == '2017-18') | 
                  (stats['Season'] == '2016-17') | 
                  (stats['Season'] == '2015-16') ].drop(['Age', 'Tm', 'Lg', 'Pos', 'G', 'MP'], axis=1)

        # merge conventional and advanced per game stats into one pandas df
        stats = pd.merge(standard, advanced, on='Season')

        # create new keys in "ballers" dictionary for each statistical category
        if i == 0:
            for c in stats.columns:
                    ballers[c] = []

        # place individual baller's stats in dictionary for all ballers
        for j in range(0, len(stats.columns)):

            # set stat being refered to as a variable
            stat = stats.columns[j]

            # most recent autobiographical/contextual data appended without manipulation
            if j < 5:
                ballers[stat].append(stats[stat][len(stats[stat])-1])


            # games played/started are appended as an accumulative sum over last 5 years
            elif j < 7:
                ballers[stat].append(stats[stat].sum())


            # per game stats are appended as an average over last 5 years
            else:
                stats[stat] = pd.to_numeric(stats[stat])
#                 print(stat + ': ')
#                 print(str(stats[stat].mean()))
                ballers[stat].append(stats[stat].mean())

        # get baller's photo, home city, and home country
        print(ballers['Name'][i])
        ballers['Photo'].append(soup.find_all('img', itemscope="image")[0]['src'])
        ballers['City'].append(soup.find_all('span', itemprop="birthPlace")[0].text.replace('\n    in\xa0', "").replace('\xa0', " "))
        ballers['Country'].append(soup.find_all('span')[13].text)
    
    except:
        nope.append(ballers['Name'][i])
        ballers['Name'].remove(ballers['Name'][i])
        ballers['Page'].remove(ballers['Page'][i])




# # remove unnecessary keys
# ballers.pop('Page')
# ballers.pop('Season')
# ballers.pop('Unnamed: 24')
# ballers.pop('Unnamed: 19')

# show ballers dictionary
ballers



DeAndre' Bembry
DeAndre' Bembry

Charlie Brown
Charlie Brown

Vince Carter
Vince Carter

John Collins
John Collins

Allen Crabbe
Allen Crabbe

Dewayne Dedmon
Dewayne Dedmon

Bruno Fernando
Bruno Fernando

Brandon Goodwin
Brandon Goodwin

Treveon Graham
Treveon Graham

Kevin Huerter
Kevin Huerter

De'Andre Hunter
De'Andre Hunter

Damian Jones
Damian Jones

Alex Len
Alex Len

Jabari Parker
Jabari Parker

Chandler Parsons
Chandler Parsons

Cam Reddish
Cam Reddish

Jeff Teague
Jeff Teague

Evan Turner
Evan Turner

Tyrone Wallace
Tyrone Wallace

Paul Watson
Paul Watson

Trae Young
Trae Young

Jarrett Allen
Jarrett Allen

Justin Anderson
Justin Anderson

Wilson Chandler
Wilson Chandler

Chris Chiozza
Chris Chiozza

Nicolas Claxton
Nicolas Claxton

Jamal Crawford
Jamal Crawford

Spencer Dinwiddie
Spencer Dinwiddie

Henry Ellenson
Henry Ellenson

Donta Hall
Donta Hall

Joe Harris
Joe Harris

Kyrie Irving
Kyrie Irving

Tyler Johnson
Tyler Johnson

DeAndre Jordan
DeAndre Jordan

Rodions Kurucs


Dillon Brooks

Bruno Caboclo
Bruno Caboclo

Brandon Clarke
Brandon Clarke

Jae Crowder
Jae Crowder

Gorgui Dieng
Gorgui Dieng

Marko Guduric
Marko Guduric

Dusty Hannahs

Jaren Jackson
Jaren Jackson

Josh Jackson
Josh Jackson

Tyus Jones
Tyus Jones

John Konchar
John Konchar

De'Anthony Melton
De'Anthony Melton

Ja Morant
Ja Morant

Anthony Tolliver
Anthony Tolliver

Jarrod Uthoff
Jarrod Uthoff

Jonas Valančiūnas
Jonas Valančiūnas

Yuta Watanabe
Yuta Watanabe

Bam Adebayo
Bam Adebayo

Kyle Alexander
Kyle Alexander

Jimmy Butler
Jimmy Butler

Jae Crowder
Jae Crowder

Goran Dragić
Goran Dragić

Udonis Haslem
Udonis Haslem

Tyler Herro
Tyler Herro

Solomon Hill
Solomon Hill

Andre Iguodala
Andre Iguodala

James Johnson
James Johnson

Derrick Jones
Derrick Jones

Meyers Leonard
Meyers Leonard

Daryl Macon
Daryl Macon

Kendrick Nunn
Kendrick Nunn

KZ Okpala
KZ Okpala

Kelly Olynyk
Kelly Olynyk

Duncan Robinson
Duncan Robinson

Chris Silva
Chris Silva

Gabe Vincent
Gabe Vincent

Dion Waiters

Justin Wright-Foreman

Bradley Beal
Bradley Beal

Dāvis Bertāns
Dāvis Bertāns

Isaac Bonga
Isaac Bonga

Troy Brown
Troy Brown

Thomas Bryant
Thomas Bryant

Chris Chiozza
Chris Chiozza

Jerian Grant
Jerian Grant

Rui Hachimura
Rui Hachimura

Ian Mahinmi
Ian Mahinmi

Garrison Mathews
Garrison Mathews

Jordan McRae
Jordan McRae

C.J. Miles
C.J. Miles

Shabazz Napier
Shabazz Napier

Anžejs Pasečņiks
Anžejs Pasečņiks

Gary Payton
Gary Payton

Jerome Robinson
Jerome Robinson

Justin Robinson
Justin Robinson

Admiral Schofield
Admiral Schofield

Ish Smith
Ish Smith

Isaiah Thomas
Isaiah Thomas

Jarrod Uthoff
Jarrod Uthoff

Moritz Wagner
Moritz Wagner

Johnathan Williams
Johnathan Williams



IndexError: list index out of range

In [339]:
for stat in advanced.columns:
    print(stat)

Season
PER
TS%
3PAr
FTr
ORB%
DRB%
TRB%
AST%
STL%
BLK%
TOV%
USG%
Unnamed: 19
OWS
DWS
WS
WS/48
Unnamed: 24
OBPM
DBPM
BPM
VORP


In [323]:
nope

['Sviatoslav Mykhailiuk', 'Dusty Hannahs', 'Chimezie Metu']

In [335]:
this = pd.DataFrame({"Name": ballers["Name"], "Page": ballers["Page"]})

that = this["Name"].value_counts()

for baller in that:
    print(baller)


3
3
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


In [328]:
# for mmm in ballers.keys():
#     print(mmm + ": " + str(len(ballers[mmm])))


# for baller in ballers["Name"]:
#     if baller == 'Chimezie Metu':
#         print(baller)

# import numpy as np
# diff = np.setdiff1d(ogs,ballers["Name"])
# diff

ballers.loc[[]]
    
    

Name: 589
Page: 589
Photo: 586
City: 586
Country: 586
Season: 586
Age: 586
Tm: 586
Lg: 586
Pos: 586
G: 586
GS: 586
MP: 586
FG: 586
FGA: 586
FG%: 586
3P: 586
3PA: 586
3P%: 586
2P: 586
2PA: 586
2P%: 586
eFG%: 586
FT: 586
FTA: 586
FT%: 586
ORB: 586
DRB: 586
TRB: 586
AST: 586
STL: 586
BLK: 586
TOV: 586
PF: 586
PTS: 586
PER: 586
TS%: 586
3PAr: 586
FTr: 586
ORB%: 586
DRB%: 586
TRB%: 586
AST%: 586
STL%: 586
BLK%: 586
TOV%: 586
USG%: 586
Unnamed: 19: 586
OWS: 586
DWS: 586
WS: 586
WS/48: 586
Unnamed: 24: 586
OBPM: 586
DBPM: 586
BPM: 586
VORP: 586


In [317]:
ayy = [0,1,2,3]

ayy.remove(0)

len(ayy)

3

In [179]:
teams = ['ATL', 'BRK', 'BOS', 'CHO', 'CHI', 
         'CLE', 'DAL', 'DEN', 'DET', 'GSW', 
         'HOU', 'IND', 'LAC', 'LAL', 'MEM', 
         'MIA', 'MIL', 'MIN', 'NOP', 'NYK', 
         'OKC', 'ORL', 'PHI', 'PHO', 'POR', 
         'SAC', 'SAS', 'TOR', 'UTA', 'WAS']

for team in teams:
    team_url = 'https://www.basketball-reference.com/teams/'+team+'/2020.html'
    print(team+':')
    print(pd.read_html(team_url)[0])
    

ATL:
    No.            Player Pos    Ht   Wt          Birth Date Unnamed: 6 Exp  \
0    95   DeAndre' Bembry  SG   6-5  210        July 4, 1994         us   3   
1     4     Charlie Brown  SG   6-6  199    February 2, 1997         us   R   
2    15      Vince Carter  PF   6-6  220    January 26, 1977         us  21   
3    20      John Collins  PF   6-9  235  September 23, 1997         us   2   
4    33      Allen Crabbe  SG   6-5  212       April 9, 1992         us   6   
5    14    Dewayne Dedmon   C   7-0  245     August 12, 1989         us   6   
6    24    Bruno Fernando   C   6-9  240     August 15, 1998         ao   R   
7     0   Brandon Goodwin  PG   6-0  180     October 2, 1995         us   1   
8     2    Treveon Graham  SF   6-5  219    October 28, 1993         us   3   
9     3     Kevin Huerter  SG   6-7  190     August 27, 1998         us   1   
10   12   De'Andre Hunter  SF   6-8  225    December 2, 1997         us   R   
11   30      Damian Jones   C  6-11  245       

    No.              Player Pos    Ht   Wt         Birth Date Unnamed: 6 Exp  \
0    51    Ryan Arcidiacono  PG   6-3  195     March 26, 1994         us   2   
1    34      Wendell Carter   C  6-10  270     April 16, 1999         us   1   
2    32           Kris Dunn  PG   6-3  205     March 18, 1994         us   3   
3     6   Cristiano Felício   C  6-11  270       July 7, 1992         br   4   
4    12      Daniel Gafford   C  6-10  234    October 1, 1998         us   R   
5     3  Shaquille Harrison  SG   6-7  190    October 6, 1993         us   2   
6    15  Chandler Hutchison  SF   6-6  210     April 26, 1996         us   1   
7     2         Luke Kornet   C   7-2  250      July 15, 1995         us   2   
8     8         Zach LaVine  SG   6-5  200     March 10, 1995         us   5   
9    24     Lauri Markkanen  PF   7-0  240       May 22, 1997         fi   2   
10   20         Adam Mokoka  SF   6-4  190      July 18, 1998         fr   R   
11   22         Otto Porter  SF   6-8  1

      No.                 Player Pos    Ht   Wt          Birth Date  \
0      18            Jordan Bone  PG   6-2  180    November 5, 1997   
1       6            Bruce Brown  PG   6-4  202     August 15, 1996   
2      45        Sekou Doumbouya  PF   6-8  230   December 23, 2000   
3       0         Andre Drummond   C  6-10  279     August 10, 1993   
4      12            Tim Frazier  PG   6-0  170    November 1, 1990   
5       9      Langston Galloway  SG   6-1  200    December 9, 1991   
6      23          Blake Griffin  PF  6-10  250      March 16, 1989   
7      42             Donta Hall  PF   6-9  232      August 7, 1997   
8      31            John Henson   C   6-9  219   December 28, 1990   
9       1         Reggie Jackson  PG   6-2  208      April 16, 1990   
10      5           Luke Kennard  SG   6-5  206       June 24, 1996   
11     14             Louis King  SF   6-7  205       April 6, 1999   
12     20         Brandon Knight  PG   6-2  182    December 2, 1991   
13    

      No.             Player Pos    Ht   Wt         Birth Date Unnamed: 6 Exp  \
0      21   Patrick Beverley  PG   6-1  180      July 12, 1988         us   7   
1       7        Amir Coffey  SG   6-7  210      June 17, 1997         us   R   
2      13        Paul George  SF   6-8  220        May 2, 1990         us   9   
3       4     JaMychal Green  PF   6-8  227      June 21, 1990         us   5   
4   8, 11   Maurice Harkless  PF   6-7  220       May 11, 1993         us   7   
5       5   Montrezl Harrell   C   6-7  240   January 26, 1994         us   4   
6       1     Reggie Jackson  SG   6-2  208     April 16, 1990         it   8   
7      25  Mfiondu Kabengele   C   6-9  250    August 14, 1997         ca   R   
8       2      Kawhi Leonard  SF   6-7  225      June 29, 1991         us   8   
9      14       Terance Mann  SF   6-5  215   October 18, 1996         us   R   
10     19    Rodney McGruder  SG   6-4  205      July 29, 1991         us   3   
11     31      Marcus Morris

    No.                  Player Pos    Ht   Wt         Birth Date Unnamed: 6  \
0    34   Giannis Antetokounmpo  PF  6-11  242   December 6, 1994         gr   
1    43  Thanasis Antetokounmpo  SF   6-6  219      July 18, 1992         gr   
2    17           Dragan Bender   C   7-0  225  November 17, 1997         ba   
3     6            Eric Bledsoe  PG   6-1  214   December 9, 1989         us   
4    23          Sterling Brown  SF   6-5  219  February 10, 1995         us   
5    24         Pat Connaughton  SG   6-5  209    January 6, 1993         us   
6     0        Donte DiVincenzo  SG   6-4  203   January 31, 1997         us   
7     3             George Hill  PG   6-4  188        May 4, 1986         us   
8     7          Ersan İlyasova  PF   6-9  235       May 15, 1987         tr   
9    26             Kyle Korver  SF   6-7  212     March 17, 1981         us   
10   11             Brook Lopez   C   7-0  282      April 1, 1988         us   
11   42             Robin Lopez   C   7-

    No.                   Player Pos    Ht   Wt          Birth Date  \
0    12             Steven Adams   C  6-11  265       July 20, 1993   
1     7            Darius Bazley  PF   6-8  208       June 12, 2000   
2    30            Deonte Burton  SF   6-4  240    January 31, 1994   
3     6           Hamidou Diallo  SF   6-5  202       July 31, 1998   
4     5            Luguentz Dort  SF   6-3  215      April 19, 1999   
5    23        Terrance Ferguson  SF   6-6  190        May 17, 1998   
6     8         Danilo Gallinari  PF  6-10  233      August 8, 1988   
7     2  Shai Gilgeous-Alexander  SG   6-6  180       July 12, 1998   
8    14               Devon Hall  PG   6-2  215        July 7, 1995   
9    15             Kevin Hervey  PF   6-9  230        July 9, 1996   
10   33             Mike Muscala   C  6-10  240        July 1, 1991   
11   11              Abdel Nader  SF   6-5  225  September 25, 1993   
12    9             Nerlens Noel   C  6-11  220      April 10, 1994   
13   1

    No.            Player Pos    Ht   Wt          Birth Date Unnamed: 6 Exp  \
0    10      Jaylen Adams  PG   6-0  225         May 4, 1996         us   1   
1     0   Carmelo Anthony  PF   6-7  238        May 29, 1984         us  16   
2     8      Trevor Ariza  SF   6-8  215       June 30, 1985         us  15   
3    24     Kent Bazemore  SF   6-4  195        July 1, 1989         us   7   
4     4       Moses Brown   C   7-2  245    October 13, 1999         us   R   
5    33      Zach Collins  PF  6-11  250   November 19, 1997         us   2   
6    35    Wenyen Gabriel  PF   6-9  205      March 26, 1997         sd   R   
7    44     Mario Hezonja  PF   6-8  220   February 25, 1995         hr   4   
8     6      Jaylen Hoard  PF   6-8  216      March 30, 1999         fr   R   
9     5       Rodney Hood  SF   6-8  208    October 20, 1992         us   5   
10   17   Skal Labissière   C  6-10  235      March 18, 1996         ht   3   
11    0    Damian Lillard  PG   6-2  195       July 

       No.                 Player Pos    Ht   Wt          Birth Date  \
0       44       Bojan Bogdanović  SF   6-7  226      April 18, 1989   
1       13           Tony Bradley   C  6-10  248     January 8, 1998   
2        5       Jarrell Brantley  PF   6-5  250        June 7, 1996   
3       00        Jordan Clarkson  SG   6-4  194        June 7, 1992   
4       10            Mike Conley  PG   6-1  175    October 11, 1987   
5       17               Ed Davis   C   6-9  218        June 5, 1989   
6       11             Dante Exum  SG   6-5  214       July 13, 1995   
7       27            Rudy Gobert   C   7-1  258       June 26, 1992   
8       22             Jeff Green  PF   6-8  235     August 28, 1986   
9        2             Joe Ingles  PF   6-8  220     October 2, 1987   
10      15           Stanton Kidd  SG   6-6  215      March 18, 1992   
11      45       Donovan Mitchell  SG   6-1  215   September 7, 1996   
12      16           Juwan Morgan  PF   6-7  232      April 17, 

In [189]:
# Try to scrape the site using the URL
url = 'https://www.basketball-reference.com/teams/BRK/2020.html'

# Retrieve page with the requests module
html = requests.get(url)

# Create BeautifulSoup object; parse with 'html.parser'
soup = bs(html.text, 'html.parser')

# Get article title and summary paragraph

ballers = {'Name': [], 'Page': [], 'Photo': [], 'City': [], 'Country': []}

for x in range(1,50):
    try:
        ballers['Name'].append(soup.find_all('tr')[x].a.text)
        ballers['Page'].append('https://www.basketball-reference.com/' + soup.find_all('tr')[x].a['href'])
    except:
        break

ballers


{'Name': ['Jarrett Allen',
  'Justin Anderson',
  'Wilson Chandler',
  'Chris Chiozza',
  'Nicolas Claxton',
  'Jamal Crawford',
  'Spencer Dinwiddie',
  'Henry Ellenson',
  'Donta Hall',
  'Joe Harris',
  'Kyrie Irving',
  'Tyler Johnson',
  'DeAndre Jordan',
  'Rodions Kurucs',
  'Caris LeVert',
  'Timothé Luwawu-Cabarrot',
  'Jeremiah Martin',
  'Džanan Musa',
  'David Nwaba',
  'Theo Pinson',
  'Taurean Prince',
  'Iman Shumpert',
  'Garrett Temple',
  'Lance Thomas'],
 'Page': ['https://www.basketball-reference.com//players/a/allenja01.html',
  'https://www.basketball-reference.com//players/a/anderju01.html',
  'https://www.basketball-reference.com//players/c/chandwi01.html',
  'https://www.basketball-reference.com//players/c/chiozch01.html',
  'https://www.basketball-reference.com//players/c/claxtni01.html',
  'https://www.basketball-reference.com//players/c/crawfja01.html',
  'https://www.basketball-reference.com//players/d/dinwisp01.html',
  'https://www.basketball-reference.co

In [190]:
for i in range(0, len(ballers['Page'])):
    
    # setup scrape
    url = ballers['Page'][i]
    html = requests.get(url)

    # scrape
    soup = bs(html.text, 'html.parser')
    table = soup.find_all(string=lambda text: isinstance(text, Comment))[46]

    stats = pd.read_html(table)[0]

    advanced = stats.loc[(stats['Season'] == '2019-20') | 
              (stats['Season'] == '2018-19') |
              (stats['Season'] == '2017-18') | 
              (stats['Season'] == '2016-17') | 
              (stats['Season'] == '2015-16') ].drop(['Age', 'Tm', 'Lg', 'Pos', 'G', 'MP'], axis=1)

    stats = pd.read_html(url)[0]

    standard = stats.loc[(stats['Season'] == '2019-20') | 
              (stats['Season'] == '2018-19') |
              (stats['Season'] == '2017-18') | 
              (stats['Season'] == '2016-17') | 
              (stats['Season'] == '2015-16') ]

    stats = pd.merge(standard, advanced, on='Season')
    
#     if i == 0:
#         for c in stats.columns:
#                 ballers[c] = []    

#     for j in range(0, len(stats.columns)):
#         stat = stats.columns[j]
#         if j < 5:
#             ballers[stat].append(stats[stat][len(stats[stat])-1])
#         elif j < 7:
#             ballers[stat].append(stats[stat].sum())
#         else:
#             ballers[stat].append(stats[stat].mean())
            
    ballers['Photo'].append(soup.find_all('img', alt=("Photo of " + ballers['Name'][i]))[0]['src'])
    ballers['City'].append(soup.find_all('span', itemprop="birthPlace")[0].text.replace('\n    in\xa0', "").replace('\xa0', " "))
    ballers['Country'].append(soup.find_all('span')[13].text)

            

# ballers.pop('Page')
# ballers.pop('Season')
# ballers.pop('Unnamed: 24')
# ballers.pop('Unnamed: 19')
# ballers


In [191]:
stats

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,...,Unnamed: 19,OWS,DWS,WS,WS/48,Unnamed: 24,OBPM,DBPM,BPM,VORP
0,2015-16,27.0,NYK,NBA,PF,59.0,5.0,22.3,2.9,6.5,...,,1.2,0.5,1.7,0.062,,-1.7,-1.5,-3.2,-0.4
1,2016-17,28.0,NYK,NBA,PF,46.0,15.0,21.0,2.1,5.3,...,,0.6,0.4,0.9,0.046,,-2.4,-1.1,-3.5,-0.4
2,2017-18,29.0,NYK,NBA,PF,73.0,31.0,18.5,1.4,3.7,...,,0.3,0.6,0.9,0.031,,-3.2,-0.5,-3.7,-0.6
3,2018-19,30.0,NYK,NBA,PF,46.0,17.0,17.0,1.7,4.3,...,,-0.3,0.3,0.1,0.004,,-4.8,-0.9,-5.7,-0.7
4,2019-20,31.0,BRK,NBA,PF,7.0,4.0,14.0,1.1,3.3,...,,-0.1,0.1,0.0,-0.015,,-5.7,-2.6,-8.3,-0.2


In [162]:
baller_df = pd.DataFrame(ballers)
baller_df

Unnamed: 0,Name,Photo,City,Country,Age,Tm,Lg,Pos,G,GS,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
0,Kostas Antetokounmpo,https://www.basketball-reference.com/req/20210...,"Athens, Greece",gr,22.0,LAL,NBA,PF,7.0,0.0,...,18.95,16.4,0.0,0.0,0.0,-0.027,-6.2,-0.5,-6.7,0.0
1,Avery Bradley,https://www.basketball-reference.com/req/20210...,"Tacoma, Washington",us,29.0,LAL,NBA,SG,834.0,825.0,...,12.295238,19.57619,-0.442857,0.92381,0.490476,0.020381,-2.490476,-0.728571,-3.22381,-0.361905
2,Devontae Cacok,https://www.basketball-reference.com/req/20210...,"Chicago, Illinois",us,23.0,LAL,NBA,C,1.0,0.0,...,0.0,28.1,0.0,0.0,0.0,0.226,4.8,-4.6,0.2,0.0
3,Kentavious Caldwell-Pope,https://www.basketball-reference.com/req/20210...,"Thomaston, Georgia",us,26.0,LAL,NBA,SG,377.0,274.0,...,8.84,17.42,2.28,2.32,4.56,0.0956,-0.72,0.42,-0.3,1.0
4,Alex Caruso,https://www.basketball-reference.com/req/20210...,"College Station, Texas",us,25.0,LAL,NBA,PG,126.0,13.0,...,18.533333,15.233333,0.3,1.0,1.266667,0.076333,-2.466667,1.333333,-1.1,0.2
5,Quinn Cook,https://www.basketball-reference.com/req/20210...,"Washington, District of Columbia",us,26.0,LAL,NBA,PG,235.0,29.0,...,15.366667,19.691667,0.208333,0.191667,0.408333,0.0455,-1.591667,-1.6,-3.2,-0.058333
6,Troy Daniels,https://www.basketball-reference.com/req/20210...,"Roanoke, Virginia",us,28.0,DEN,NBA,SG,522.0,19.0,...,7.6,17.476923,0.253846,0.330769,0.615385,0.040231,-1.792308,-1.623077,-3.407692,-0.069231
7,Anthony Davis,https://www.basketball-reference.com/req/20210...,"Chicago, Illinois",us,26.0,LAL,NBA,PF,329.0,329.0,...,8.98,30.2,6.44,4.06,10.5,0.2186,5.16,1.62,6.78,5.04
8,Jared Dudley,https://www.basketball-reference.com/req/20210...,"San Diego, California",us,34.0,LAL,NBA,PF,297.0,74.0,...,14.42,11.02,1.02,0.96,1.96,0.0798,-2.12,0.88,-1.24,0.28
9,Danny Green,https://www.basketball-reference.com/req/20210...,"North Babylon, New York",us,32.0,LAL,NBA,SG,365.0,355.0,...,11.68,14.54,0.9,3.08,3.96,0.0982,-0.6,1.54,0.96,1.46


In [163]:
ballers_percentile = {}
ballers_rank = {}



        
for i in range(0, len(baller_df.columns)):
    if (i == 0) | (i == 1) | (i == 2) | (i == 3) |(i == 5) | (i == 6) |(i == 7):
        ballers_percentile[baller_df.columns[i]] = []
        ballers_rank[baller_df.columns[i]] = []
    else:
        ballers_percentile['%ile in ' + baller_df.columns[i]] = []
        ballers_rank['Rank in ' + baller_df.columns[i]] = []

        


for i in range(0, len(baller_df.columns)):

    stat = baller_df.columns[i]

    for j in range(0, len(baller_df[stat])):

        if (i == 0) | (i == 1) | (i == 2) | (i == 3) |(i == 5) | (i == 6) |(i == 7):

            ballers_percentile[stat].append(baller_df[stat][j])
            ballers_rank[stat].append(baller_df[stat][j])

        else:

            ranked = baller_df.sort_values(stat)[stat].to_numpy()

            for k in range(0, len(ranked)):
                if ranked[k] == baller_df[stat][j]:
    #                 print([baller_df['Name'][j], stat, ranked[k], k])
                    below = k
                    break

            percentile = round((below/len(ranked)*100), 2)
            rank = len(ranked) - below
#             print([baller_df['Name'][j], rank, stat])
            
#                 print([baller_df['Name'][j], [rank,len(ranked)], percentile, stat])
            ballers_percentile['%ile in ' + stat].append(percentile)
            ballers_rank['Rank in ' + stat].append(rank)


ballers_percentile.pop('City')
ballers_percentile.pop('Country')
ballers_percentile.pop('Photo')
ballers_percentile.pop('Tm')
ballers_percentile.pop('Lg')
ballers_percentile.pop('Pos')
ballers_percentile


ballers_rank.pop('City')
ballers_rank.pop('Country')
ballers_rank.pop('Photo')
ballers_rank.pop('Tm')
ballers_rank.pop('Lg')
ballers_rank.pop('Pos')
ballers_rank


{'Name': ['Kostas Antetokounmpo',
  'Avery Bradley',
  'Devontae Cacok',
  'Kentavious Caldwell-Pope',
  'Alex Caruso',
  'Quinn Cook',
  'Troy Daniels',
  'Anthony Davis',
  'Jared Dudley',
  'Danny Green',
  'Talen Horton-Tucker',
  'Dwight Howard',
  'LeBron James',
  'Kyle Kuzma',
  'JaVale McGee',
  'Markieff Morris',
  'Zach Norvell',
  'Rajon Rondo',
  'J.R. Smith',
  'Dion Waiters'],
 'Rank in Age': [19,
  9,
  17,
  14,
  15,
  14,
  11,
  14,
  4,
  7,
  20,
  4,
  1,
  16,
  7,
  8,
  19,
  5,
  4,
  11],
 'Rank in G': [18,
  2,
  20,
  4,
  16,
  13,
  3,
  7,
  11,
  5,
  19,
  9,
  6,
  15,
  8,
  1,
  17,
  10,
  14,
  12],
 'Rank in GS': [20,
  1,
  20,
  6,
  16,
  14,
  15,
  5,
  13,
  3,
  17,
  7,
  4,
  12,
  10,
  2,
  20,
  8,
  9,
  11],
 'Rank in MP': [20,
  4,
  18,
  3,
  12,
  14,
  17,
  2,
  13,
  8,
  16,
  7,
  1,
  5,
  15,
  10,
  19,
  6,
  9,
  11],
 'Rank in FG': [20,
  4,
  12,
  6,
  16,
  14,
  17,
  2,
  18,
  11,
  15,
  5,
  1,
  3,
  10,
  8

In [164]:
percentile_df = pd.DataFrame(ballers_percentile)
rank_df = pd.DataFrame(ballers_rank)


In [165]:
all_stats = pd.merge(baller_df, rank_df, on='Name')
all_stats = pd.merge(all_stats, percentile_df, on='Name')
all_stats

Unnamed: 0,Name,Photo,City,Country,Age,Tm,Lg,Pos,G,GS,...,%ile in TOV%,%ile in USG%,%ile in OWS,%ile in DWS,%ile in WS,%ile in WS/48,%ile in OBPM,%ile in DBPM,%ile in BPM,%ile in VORP
0,Kostas Antetokounmpo,https://www.basketball-reference.com/req/20210...,"Athens, Greece",gr,22.0,LAL,NBA,PF,7.0,0.0,...,90.0,25.0,15.0,0.0,5.0,5.0,5.0,35.0,5.0,25.0
1,Avery Bradley,https://www.basketball-reference.com/req/20210...,"Tacoma, Washington",us,29.0,LAL,NBA,SG,834.0,825.0,...,45.0,50.0,0.0,40.0,30.0,15.0,20.0,25.0,20.0,0.0
2,Devontae Cacok,https://www.basketball-reference.com/req/20210...,"Chicago, Illinois",us,23.0,LAL,NBA,C,1.0,0.0,...,0.0,85.0,15.0,0.0,5.0,95.0,85.0,0.0,75.0,25.0
3,Kentavious Caldwell-Pope,https://www.basketball-reference.com/req/20210...,"Thomaston, Georgia",us,26.0,LAL,NBA,SG,377.0,274.0,...,10.0,30.0,80.0,75.0,80.0,65.0,50.0,60.0,65.0,80.0
4,Alex Caruso,https://www.basketball-reference.com/req/20210...,"College Station, Texas",us,25.0,LAL,NBA,PG,126.0,13.0,...,85.0,15.0,45.0,55.0,45.0,55.0,25.0,75.0,50.0,50.0
5,Quinn Cook,https://www.basketball-reference.com/req/20210...,"Washington, District of Columbia",us,26.0,LAL,NBA,PG,235.0,29.0,...,70.0,55.0,35.0,15.0,20.0,30.0,45.0,10.0,25.0,20.0
6,Troy Daniels,https://www.basketball-reference.com/req/20210...,"Roanoke, Virginia",us,28.0,DEN,NBA,SG,522.0,19.0,...,5.0,35.0,40.0,25.0,35.0,25.0,40.0,5.0,15.0,5.0
7,Anthony Davis,https://www.basketball-reference.com/req/20210...,"Chicago, Illinois",us,26.0,LAL,NBA,PF,329.0,329.0,...,15.0,90.0,90.0,95.0,90.0,90.0,90.0,85.0,90.0,90.0
8,Jared Dudley,https://www.basketball-reference.com/req/20210...,"San Diego, California",us,34.0,LAL,NBA,PF,297.0,74.0,...,60.0,0.0,70.0,50.0,55.0,60.0,30.0,65.0,45.0,55.0
9,Danny Green,https://www.basketball-reference.com/req/20210...,"North Babylon, New York",us,32.0,LAL,NBA,SG,365.0,355.0,...,35.0,10.0,60.0,85.0,75.0,70.0,60.0,80.0,80.0,85.0


In [166]:
for col in all_stats.columns:
    print(col)

Name
Photo
City
Country
Age
Tm
Lg
Pos
G
GS
MP
FG
FGA
FG%
3P
3PA
3P%
2P
2PA
2P%
eFG%
FT
FTA
FT%
ORB
DRB
TRB
AST
STL
BLK
TOV
PF
PTS
PER
TS%
3PAr
FTr
ORB%
DRB%
TRB%
AST%
STL%
BLK%
TOV%
USG%
OWS
DWS
WS
WS/48
OBPM
DBPM
BPM
VORP
Rank in Age
Rank in G
Rank in GS
Rank in MP
Rank in FG
Rank in FGA
Rank in FG%
Rank in 3P
Rank in 3PA
Rank in 3P%
Rank in 2P
Rank in 2PA
Rank in 2P%
Rank in eFG%
Rank in FT
Rank in FTA
Rank in FT%
Rank in ORB
Rank in DRB
Rank in TRB
Rank in AST
Rank in STL
Rank in BLK
Rank in TOV
Rank in PF
Rank in PTS
Rank in PER
Rank in TS%
Rank in 3PAr
Rank in FTr
Rank in ORB%
Rank in DRB%
Rank in TRB%
Rank in AST%
Rank in STL%
Rank in BLK%
Rank in TOV%
Rank in USG%
Rank in OWS
Rank in DWS
Rank in WS
Rank in WS/48
Rank in OBPM
Rank in DBPM
Rank in BPM
Rank in VORP
%ile in Age
%ile in G
%ile in GS
%ile in MP
%ile in FG
%ile in FGA
%ile in FG%
%ile in 3P
%ile in 3PA
%ile in 3P%
%ile in 2P
%ile in 2PA
%ile in 2P%
%ile in eFG%
%ile in FT
%ile in FTA
%ile in FT%
%ile in ORB
%ile in DRB

In [None]:
# STOP HERE!!!!

In [235]:
# for baller in ballers:
#     print(pd.read_html(baller)[0])

# setup scrape
# url = ballers['Page'][0]

url = 'https://www.basketball-reference.com/players/j/jamesle01.html'

html = requests.get(url)

# scrape
soup = bs(html.text, 'html.parser')
table = soup.find_all(string=lambda text: isinstance(text, Comment))[46]

stats = pd.read_html(table)[0]

stats

advanced = stats.loc[(stats['Season'] == '2019-20') | 
          (stats['Season'] == '2018-19') |
          (stats['Season'] == '2017-18') | 
          (stats['Season'] == '2016-17') | 
          (stats['Season'] == '2015-16') ].drop(['Age', 'Tm', 'Lg', 'Pos', 'G', 'MP'], axis=1)

stats = pd.read_html(url)[0]

standard = stats.loc[(stats['Season'] == '2019-20') | 
          (stats['Season'] == '2018-19') |
          (stats['Season'] == '2017-18') | 
          (stats['Season'] == '2016-17') | 
          (stats['Season'] == '2015-16') ]

stats = pd.merge(standard, advanced, on='Season')

this = {}

# this['Photo'] = soup.find_all('img', alt=("Photo of " + ballers['Name'][0]))[0]['src']

photo = soup.find_all('img', itemscope="image")[0]['src']
# [0]['src']
# city = soup.find_all('span', itemprop="birthPlace")[0].text.replace('\n    in\xa0', "").replace('\xa0', " ")
# country = append(soup.find_all('span')[13].text

photo



'https://www.basketball-reference.com/req/202104203/images/players/jamesle01.jpg'

In [210]:

stats["MP"] = pd.to_numeric(stats["MP"])
stats["MP"][0]
# stats["MP"].mean()


24.4

In [None]:
kostas = {}

kostas['Name'] = ballers['Name'][0]

i = 0
for c in stats.columns:
    if i < 5:
        kostas[c] = stats[c][len(stats[c])-1]
    elif i < 7:
        kostas[c] = stats[c].sum()
    else:
        kostas[c] = stats[c].mean()
    i = i+1
    
kostas['Photo'] = ""

ballers['Name'][12]

# kostas.pop('Season')
# kostas.pop('Unnamed: 24')
# kostas.pop('Unnamed: 19')
# kostas

In [None]:
# quit the browser
browser.quit()

In [None]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
browser.visit(url)

# Jump to image link and setup soup
browser.links.find_by_partial_text('FULL IMAGE').click()
html = browser.html
soup = bs(html, 'html.parser')

# Get featured image url
featured_image_url = url.replace('index.html', soup.find_all('img')[1]['src'])

# quit the browser
browser.quit()


In [147]:
# Try to scrape the site using the URL
AD ='https://www.basketball-reference.com/players/d/davisan02.html'
LBJ = 'https://www.basketball-reference.com/players/j/jamesle01.html'
Kos = 'https://www.basketball-reference.com/players/a/antetko01.html'

url = AD

# Retrieve page with the requests module
html = requests.get(url)

# Create BeautifulSoup object; parse with 'html.parser'
soup = bs(html.text, 'html.parser')

# Get article title and summary paragraph

# city = soup.find_all('span', itemprop="birthPlace")[0].text
country = soup.find_all('span')[13].text
st = soup.find_all('table')[0]


city = soup.find_all('span', itemprop="birthPlace")[0].text.replace('\n    in\xa0', "").replace('\xa0', " ")
country = soup.find_all('span')[13].text

print(city)
print(country)


# news_H = soup.find_all('div', class_='content_title')[0].a.text.replace("\n", "")
# news_P = soup.find_all('div', class_='rollover_description_inner')[0].text.replace("\n", "")


Chicago, Illinois
us


In [None]:
# Try to scrape the site using the URL
url = 'https://space-facts.com/mars/'

# Retrieve page with the requests module
html = requests.get(url)

# Create BeautifulSoup object; parse with 'html.parser'
soup = bs(html.text, 'html.parser')

# Get Mars descriptions
col1 = soup.find_all('td', class_='column-1')
col2 = soup.find_all('td', class_='column-2')
props = []
stats = []

# Put descriptions in a dataframe
for x in range(0, len(col1)):
    props.append(col1[x].text)
    stats.append(col2[x].text)
    
fax_table = pd.DataFrame({"Description": props, "Mars": stats}).set_index('Description').to_html().replace("\n", "").replace("   ", " ")



In [None]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)

html = browser.html
soup = bs(html, 'html.parser')

# Get pages to jump to and set arrays to fill
jumps = []
titles = []
img_urls = []
hemisphere_urls = []

that = soup.find_all('h3')

for this in that:
    jumps.append(this.text)
    
# Jump through pages and get image titles and urls
for jump in jumps:
    try:
        browser.links.find_by_partial_text(jump).click()
        html = browser.html
        soup = bs(html, 'html.parser')
        titles.append(soup.find_all('h2')[0].text.replace(" Enhanced", ""))
        img_urls.append(soup.find_all('a')[4]['href'])
        print("1 " + jump)
    except:
        try:
            browser.links.find_by_partial_text('2').click()
            browser.links.find_by_partial_text(jump).click()
            html = browser.html
            soup = bs(html, 'html.parser')
            titles.append(soup.find_all('h2')[0].text.replace(" Enhanced", ""))
            img_urls.append(soup.find_all('a')[4]['href'])
            print("2 " + jump)
        except:
            browser.links.find_by_partial_text('1').click()
            browser.links.find_by_partial_text(jump).click()
            html = browser.html
            soup = bs(html, 'html.parser')
            titles.append(soup.find_all('h2')[0].text.replace(" Enhanced", ""))
            img_urls.append(soup.find_all('a')[4]['href'])
            print("1 " + jump)

# Quit the browser
browser.quit()

# Set results to list of dictionaries
for x in range(0, len(img_urls)):
    hemisphere_urls.append({"title": titles[x], "img_url": img_urls[x]})


In [None]:
mars = {"news_H": news_H,
        "news_P": news_P,
        "featured_image_url": featured_image_url,
        "fax_table": fax_table,
        "hemisphere_urls": hemisphere_urls}
mars
