# In [None]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup, SoupStrainer
import sys
import string
import requests
import datetime
import progressbar
import time
import re


def player_basic_info(min_active_from=20):
    """Scrape the alphabetical player index pages into a DataFrame.

    One index page per lowercase ASCII letter is requested from
    ``http://www.basketball-reference.com/players/<letter>`` and every usable
    row of the first table on that page becomes one record.

    Args:
        min_active_from: Only players whose ``active_from`` value is greater
            than or equal to this number are kept. The default of 20 is the
            threshold the script has always used.
            NOTE(review): with four-digit years a threshold of 20 keeps every
            player; the output file name 'players-1995.csv' used by the script
            suggests 1995 may have been intended -- confirm before changing
            the default.

    Returns:
        pandas.DataFrame with one row per player and the columns ``url``,
        ``name``, ``active_from``, ``active_to``, ``position``, ``college``,
        ``height``, ``weight`` and ``birth_date``. ``active_from`` and
        ``active_to`` are ints, everything else is the cell text as scraped.

    Raises:
        ValueError: If the first or second data cell of a player row is not
            an integer.
    """
    players = []
    base_url = 'http://www.basketball-reference.com/players/'
    for letter in string.ascii_lowercase:
        page_request = requests.get(base_url + letter)
        soup = BeautifulSoup(page_request.text, 'lxml')
        table = soup.find('table')
        if table is None:
            # Some letters may have no player table at all.
            continue
        table_body = table.find('tbody')
        if table_body is None:
            continue
        for row in table_body.findAll('tr'):
            player_url = row.find('a')
            cells = row.findAll('td')
            # Rows without a player link or without the seven data cells
            # (e.g. header rows repeated inside the body) are not players.
            if player_url is None or len(cells) < 7:
                continue
            # Assumes the cell order: from, to, position, height, weight,
            # birth date, college -- the same order the script always used.
            active_from = int(cells[0].text)
            if active_from < min_active_from:
                continue
            players.append({
                'url': player_url['href'],
                'name': player_url.text,
                'active_from': active_from,
                'active_to': int(cells[1].text),
                'position': cells[2].text,
                'college': cells[6].text,
                'height': cells[3].text,
                'weight': cells[4].text,
                'birth_date': cells[5].text,
            })
    return pd.DataFrame(players)


# (result-key suffix, value of the cell's ``data-stat`` attribute), listed in
# the order the keys appear in the dictionary returned by ``player_info``.
_PER_GAME_STATS = (
    ('g', 'g'),
    ('mp', 'mp_per_g'),
    ('pts', 'pts_per_g'),
    # total field goals
    ('fg', 'fg_per_g'),
    ('fga', 'fga_per_g'),
    ('fgp', 'fg_pct'),
    # 2 point stats
    ('2p', 'fg2_per_g'),
    ('2pa', 'fg2a_per_g'),
    ('2pp', 'fg2_pct'),
    # 3 point stats
    ('3p', 'fg3_per_g'),
    ('3pa', 'fg3a_per_g'),
    ('3pp', 'fg3_pct'),
    # free throw stats
    ('ft', 'ft_per_g'),
    ('fta', 'fta_per_g'),
    ('ftp', 'ft_pct'),
    # rebound stats
    ('orb', 'orb_per_g'),
    ('drb', 'drb_per_g'),
    ('trb', 'trb_per_g'),
    # assists / steals / blocks / turnovers
    ('ast', 'ast_per_g'),
    ('stl', 'stl_per_g'),
    ('blk', 'blk_per_g'),
    ('tov', 'tov_per_g'),
)


def _extract_row_stats(row, prefix, entry):
    """Copy every per-game stat found in ``row`` into ``entry``.

    The cell positions are not uniform across the site, so each value is
    located by the ``data-stat`` attribute in the row's serialized ``<td>``
    cells rather than by index. Stats that are absent leave the existing value
    in ``entry`` untouched.

    Returns:
        True if every stat in ``_PER_GAME_STATS`` was found, False otherwise.
    """
    row_html = str(row.findAll('td'))
    complete = True
    for suffix, stat in _PER_GAME_STATS:
        match = re.search('data-stat="' + stat + '">(.*?)</td>', row_html)
        if match is None:
            complete = False
        else:
            entry[prefix + suffix] = match.group(1)
    return complete


def player_info(url):
    """Scrape per-game NBA and NCAA statistics for a single player.

    The NBA values come from the second row of the body of the first table on
    the player's basketball-reference.com page. The NCAA values come from the
    footer rows of the first table on the page returned by
    ``get_player_college_url``; when several footer rows carry the same stat,
    the last one wins.

    Each stat is looked up independently, so a stat that is missing from a
    row (for example three-point numbers) no longer prevents the stats listed
    after it from being collected.

    Args:
        url: Site-relative player page path (it is appended to
            'http://www.basketball-reference.com').

    Returns:
        dict with the keys ``NBA_<stat>`` and ``NCAA_<stat>`` for every stat
        in ``_PER_GAME_STATS``. Values are the raw cell contents captured from
        the HTML (strings), or None when the stat was not found.
    """
    # Define all quantities up front so every key exists even if nothing is
    # found; the insertion order here is the order of the returned dict.
    player_entry = {}
    for prefix in ('NBA_', 'NCAA_'):
        for suffix, _ in _PER_GAME_STATS:
            player_entry[prefix + suffix] = None

    page_request = requests.get('http://www.basketball-reference.com'
                                + str(url))
    soup = BeautifulSoup(page_request.text, 'lxml')
    table = soup.find('table')  # the first table is the per game stats
    table_body = table.find('tbody') if table is not None else None
    if table_body is not None:
        for row in table_body.findAll('tr')[1:2]:
            if not _extract_row_stats(row, 'NBA_', player_entry):
                print('nopeee')

    college_url = get_player_college_url(url)
    if college_url is not None:
        page_request_cbb = requests.get(college_url)
        soupy = BeautifulSoup(page_request_cbb.text, 'lxml')
        table_cbb = soupy.find('table')
        table_foot = table_cbb.find('tfoot') if table_cbb is not None else None
        if table_foot is not None:
            for row in table_foot.findAll('tr'):
                print('player ' + url + 'started')
                if not _extract_row_stats(row, 'NCAA_', player_entry):
                    print('failed')
    else:
        # No college page linked: report which player and the (empty) value.
        print(url)
        print(player_entry['NCAA_g'])

    print('player ' + url + 'complete')
    print(player_entry)

    return player_entry


def get_player_college_url(NBA_url):
    """Return the link to a player's college-stats page, if the page has one.

    The player's basketball-reference.com page is downloaded and its ``<li>``
    elements are searched for an anchor whose text is
    'College Basketball at Sports-Reference.com'.

    Args:
        NBA_url: Site-relative player page path (it is appended to
            'http://www.basketball-reference.com').

    Returns:
        The ``href`` of that anchor as a string, or None when no such link is
        present.
    """
    # Use the argument itself: the function previously read a global named
    # ``url`` and only worked when the caller happened to define one.
    page_request = requests.get('http://www.basketball-reference.com'
                                + str(NBA_url))
    soup = BeautifulSoup(page_request.text, 'lxml')
    links = str(soup.findAll('li'))  # serialize so the link can be regex-matched
    match = \
        re.search(r'<a href="(.*?)">College Basketball at Sports-Reference.com</a>'
                  , links)
    if match is None:
        return None
    return str(match.group(1))


######################################################################################
# MAIN

# Step 1: scrape the index pages (one row of general info + url per player).
players_general_info = player_basic_info()
print('General info/player url loaded...')

# Step 2: scrape every player's detail page. The per-player dicts are
# collected in a list and turned into a DataFrame once at the end:
# DataFrame.append copied the whole frame on every iteration and no longer
# exists in pandas 2.x.
players_details_info_list = []
bar = progressbar.ProgressBar(max_value=len(players_general_info))
for (i, url) in enumerate(players_general_info.url):
    players_details_info_list.append(player_info(url))
    bar.update(i + 1)  # i + 1 players are done, so the bar can reach its maximum
    time.sleep(0.1)  # short pause between players to go easy on the server
print('Done!')  # takes an unholy amount of time

# Step 3: put general info and detailed stats side by side (both frames share
# the same 0..n-1 row index) and write the result. Stat columns follow the
# key order of the dicts returned by player_info.
df = pd.DataFrame(players_details_info_list)
df = pd.concat([players_general_info, df], axis=1)
df.to_csv('players-1995.csv', encoding='utf-8')

######################################################################################

 # print(players_details_info_list[i])
# for i,url in enumerate(players_general_info.url):
# ....try:
# ....    players_details_info_list.append(player_detail_info(url))
# ....    print(players_details_info_list[i])
# ....except:
# ........print('cannot load: %s; location %d' %(url,i))