In [86]:
import datetime
import glob
import os
import time
import urllib2
from contextlib import closing

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as bs

In [2]:
def injury_url_2_df(link):
    """Scrape one search-results page into a DataFrame of transactions.

    The page at ``link`` is downloaded, the text of the first
    ``<table class="datatable center">`` is split on newlines, the first nine
    tokens are skipped (presumably the header row -- TODO confirm against the
    page markup) and the remainder is cut into rows of seven tokens each.
    Only the first five tokens of every row are kept.

    Parameters
    ----------
    link : str
        Full URL of the results page to fetch.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``team``, ``acquired``, ``relinquished``, ``notes``;
        all values are the raw strings taken from the page.

    Raises
    ------
    IndexError
        If the page contains no matching table.
    KeyError
        If the table yields no complete row (columns 0-4 are missing).
        The scraping loop relies on an exception being raised to detect
        that it has run past the last page of results.
    """
    # urllib2 responses are not context managers, so closing() is used to make
    # sure the connection is released even if read() fails.
    with closing(urllib2.urlopen(link)) as response:
        page_html = response.read()
    soup = bs(page_html, "lxml")

    tables = [table.text.split("\n")
              for table in soup.find_all("table", class_="datatable center")]

    # Raises IndexError when no table was found.
    cells = tables[0][9:]

    # Each transaction occupies seven consecutive tokens.
    rows = [cells[start:start + 7] for start in range(0, len(cells), 7)]

    df = pd.DataFrame(rows)[[0, 1, 2, 3, 4]]
    df.columns = ["date", "team", 'acquired', 'relinquished', 'notes']
    return df

In [109]:
# Calendar month (1-12) of the day the notebook is run; used to build the search URL.
mth = datetime.date.today().month
mth

12

In [112]:
# Search URL for all transaction types starting on the first day of the current
# month of 2017. The month is zero-padded so that single-digit months still
# produce a well-formed YYYY-MM-DD date (e.g. 2017-09-01 rather than 2017-9-01).
# The URL ends with "start=" so the result offset can be appended by the caller.
link = (
    "http://www.prosportstransactions.com/basketball/Search/SearchResults.php"
    "?Player=&Team=&BeginDate=2017-{:02d}-01&EndDate="
    "&ILChkBx=yes&InjuriesChkBx=yes&PersonalChkBx=yes&DisciplinaryChkBx=yes"
    "&Submit=Search&start="
).format(mth)

In [122]:
# Start from the previously saved list and scrape only this month's results on
# top of it (use `base = pd.DataFrame()` instead to rebuild from scratch).
base = pd.read_csv("injury_list_2017.csv")

# Pages are collected in a list and concatenated once at the end, instead of
# re-copying the growing frame on every iteration.
pages = [base]
try:
    # Result offsets advance in steps of 25; 1000 is just an upper bound.
    for i in range(0, 1000, 25):
        print("On results " + str(i))
        pages.append(injury_url_2_df(link + str(i)))
        time.sleep(1)  # pause between requests
except Exception:
    # injury_url_2_df raises once an offset has no usable rows, which is how
    # the end of the results is detected. Catching Exception (not a bare
    # except) lets Ctrl-C / SystemExit still interrupt the scrape.
    # NOTE(review): a network failure is also reported as "end of page" here.
    print("Hit end of page! We are done!")

base = pd.concat(pages)

On results 0
On results 25
On results 50
On results 75
Hit end of page! We are done!


In [123]:
# Keep only the first occurrence of every fully identical row, since the
# re-scraped month can overlap with rows already stored in the CSV.
base = base.loc[~base.duplicated()]
base.shape

(1046, 5)

In [124]:
# Write the de-duplicated list back to the same CSV that the scraping cell
# loads as its starting point; the index is not written.
base.to_csv("injury_list_2017.csv", index=False, encoding='utf-8')

### Start afresh from the saved CSV

In [125]:
# Reload the saved list with a fresh 0..n-1 index.
base = pd.read_csv("injury_list_2017.csv").reset_index(drop=True)

# Drop the first five characters of every player cell
# (presumably the bullet/whitespace prefix from the scraped page -- TODO confirm).
base['acquired'] = base['acquired'].map(lambda raw_name: raw_name[5:])
base['relinquished'] = base['relinquished'].map(lambda raw_name: raw_name[5:])

# Parse the date strings into real timestamps.
base['date'] = pd.to_datetime(base['date'], format="%Y-%m-%d")

#### The complicated part is linking each injury to its recovery
- **Injury & recovery may not be 1-1.**
- **Date matching is difficult as they have to be chained together based on recency.**

In [126]:
# Rows with a non-empty 'relinquished' name are treated as the player becoming
# unavailable. Selecting with .loc and using the non-inplace rename() gives an
# independent frame, so adding the 'status' column does not write into a slice
# of `base` (no SettingWithCopyWarning).
# The status value is spelled "injuried" on purpose: later cells match it literally.
injury = base.loc[base['relinquished'] != "", ["date", 'team', 'relinquished', "notes"]]
injury = injury.rename(columns={'relinquished': 'player'})
injury['status'] = "injuried"

# Rows with a non-empty 'acquired' name are treated as the player returning.
recovered = base.loc[base['acquired'] != "", ["date", 'team', 'acquired', "notes"]]
recovered = recovered.rename(columns={'acquired': 'player'})
recovered['status'] = "recovered"

### Creating operational injury list
- **The existing workflow is convoluted because I want to create a list that I can do backfilling with.**
- **I will sort them by player names & dates. Then, I take the latest status of the player by their order.** 
- **When I want to backfill, I will cut the list off at the chosen date.**

In [127]:
# Stack injuries and recoveries into one event log, ordered per player by date,
# so the last row of each player is that player's most recent status.
c_injury_list = pd.concat([injury, recovered]).sort_values(["player", "date"])

# Drop rows whose notes mention "fined". na=False keeps rows with missing notes
# instead of failing when the boolean mask is negated.
c_injury_list = c_injury_list[~c_injury_list.notes.str.contains("fined", na=False)]

### Side story 2 --> Jahlil Okafor 

In [128]:
# Inspect the full event history of a single player.
c_injury_list.loc[c_injury_list['player'] == "Jahlil Okafor"]

Unnamed: 0,date,team,player,notes,status
61,2017-10-18,76ers,Jahlil Okafor,illness (DTD),injuried
189,2017-10-21,76ers,Jahlil Okafor,returned to lineup,recovered
313,2017-10-28,76ers,Jahlil Okafor,placed on IL with upper respiratory infection,injuried
368,2017-10-30,76ers,Jahlil Okafor,activated from IL,recovered
602,2017-11-13,76ers,Jahlil Okafor,placed on IL with upper respiratory infection,injuried
636,2017-11-15,76ers,Jahlil Okafor,activated from IL,recovered
705,2017-11-18,76ers,Jahlil Okafor,placed on IL for personal reasons,injuried
757,2017-11-22,76ers,Jahlil Okafor,activated from IL,recovered
870,2017-11-29,76ers,Jahlil Okafor,placed on IL,injuried


In [129]:
# Backfill view: keep only events strictly before the cut-off date, then reduce
# each player to the last value of every column.
# NOTE: this overwrites c_injury_list, so the cell is not idempotent.
c_injury_list = (
    c_injury_list
    .loc[c_injury_list['date'] < "2017-10-28"]
    .groupby(['player'])
    .last()
    .reset_index()
)

# A player whose latest status is "recovered" is available again, so only the
# rows still marked as injured are kept.
final = c_injury_list.loc[c_injury_list['status'] == 'injuried']

#### Side story --> Test Al Horford --> He was injured on 11-08-2017, and recovered on 11-12-2017 

In [130]:
# Look up one player in the per-player latest-status table.
c_injury_list.loc[c_injury_list['player'] == "Al Horford"]

Unnamed: 0,player,date,team,notes,status


In [131]:
# Alternative cut-off, left disabled:
# c_injury_list = c_injury_list[c_injury_list['date'] < "2017-11-11"]

# Reduce to one row per player again (last value of every column per player).
c_injury_list = (
    c_injury_list
    .groupby(['player'])
    .last()
    .reset_index()
)

# Players whose latest status is "recovered" are dropped; the rest are still out.
final = c_injury_list.loc[c_injury_list['status'] == 'injuried']

In [132]:
# Check whether this player is on the operational injury list.
final.loc[final['player'] == "Al Horford"]

Unnamed: 0,player,date,team,notes,status


### Final operational injury list 

In [133]:
# (rows, columns) of the operational injury list: one row per player still marked as injured.
final.shape

(127, 5)

In [134]:
# Preview the first five rows of the operational injury list.
final.head(5)

Unnamed: 0,player,date,team,notes,status
0,(William) Tony Parker,2017-10-17,Spurs,placed on IL recovering from surgery to repai...,injuried
3,Adreian Payne,2017-10-18,Magic,placed on IL with fractured left hand,injuried
4,Alan Williams,2017-10-18,Suns,placed on IL recovering from surgery on right...,injuried
5,Alec Peters,2017-10-23,Suns,placed on IL,injuried
6,Alex Caruso,2017-10-22,Lakers,placed on IL,injuried


### Part 2 -> Player's team list
- I can just take main_player files from the latest game!

In [135]:
# List the CSV files in the working directory whose name contains "main".
# (`glob` is imported in the notebook's import cell at the top.)
glob.glob("*main*.csv")

['main_players_2017.csv']

In [64]:
# Load the main-players file; later cells filter this frame on TEAM_ID and
# GAME_ID and read PLAYER_NAME from it.
test = pd.read_csv('main_players_2017.csv')

#### Logic --> Team's latest set of players 

In [30]:
# Names of the players listed for team 1610612738 (hard-coded team id) in that
# team's most recent game, i.e. the row(s) with the team's highest GAME_ID.
test_team_players = (
    test[test.TEAM_ID == 1610612738]
    .pipe(lambda team_rows: team_rows[team_rows.GAME_ID == team_rows.GAME_ID.max()])
    .PLAYER_NAME
    .unique()
    .tolist()
)

In [None]:
# Load the per-season player statistics; the next cell filters them by
# `season` and `PLAYER_NAME`.
player_season_stats = pd.read_csv("player_season_statistics.csv")

In [31]:
# Restrict the statistics to the 2016 season, then show the rows for the
# players found in the team's latest game.
player_2016 = player_season_stats.loc[player_season_stats['season'] == 2016]
player_2016.loc[player_2016['PLAYER_NAME'].isin(test_team_players)]

Unnamed: 0,season,PLAYER_ID,PLAYER_NAME,PTS,AST,BLK,REB,STL,games_played,pts_l,...,reb_l,stl_l,type,defenders,facilitator,game_winners,inside_gamers,normal,pure_scorers,useless
10408,2016,201143,Al Horford,14.0,4.955882,1.264706,6.838235,0.764706,68,0,...,0,0,normal,0,0,0,0,1,0,0
10538,2016,203935,Marcus Smart,10.56962,4.607595,0.43038,3.860759,1.582278,79,0,...,0,1,defenders,1,0,0,0,0,0,0
10543,2016,1627759,Jaylen Brown,6.602564,0.820513,0.230769,2.820513,0.448718,79,0,...,0,0,useless,0,0,0,0,0,0,1
10550,2016,202681,Kyrie Irving,25.222222,5.805556,0.333333,3.194444,1.152778,80,1,...,0,0,game_winners,0,0,1,0,0,0,0
10660,2016,202694,Marcus Morris,13.987342,2.025316,0.316456,4.632911,0.658228,82,0,...,0,0,useless,0,0,0,0,0,0,1
10672,2016,203382,Aron Baynes,4.866667,0.426667,0.52,4.44,0.226667,82,0,...,0,0,useless,0,0,0,0,0,0,1
10708,2016,1626179,Terry Rozier,5.540541,1.77027,0.148649,3.067568,0.621622,82,0,...,0,0,useless,0,0,0,0,0,0,1


## Checkpoint on current workflow
1. **Have a daily script to update injuries** -- Current workflow does entire list, but an incremental one would be better tbh...
2. **Have a list to extract players from previous game, based on existing (pre-defined) date**
3. **Match player names with the player season statistics**
4. Potential improvement: **This workflow doesn't include rookies. Might want a system that includes standout rookies, but that can only be done in hindsight and requires a lot more hardcoding**