In [3]:
from bs4 import BeautifulSoup
import quopri
import pandas as pd
from os import listdir

In [21]:
def extract_row_data(row, dtype="current"):
    """Read the fields of interest out of one player row.

    Parameters
    ----------
    row
        Object exposing ``find_all("td")``; the returned cells are indexed
        by position, so the column layout of the page is hard-coded here.
    dtype : {"current", "historical"}
        Which column layout to use when indexing the cells.

    Returns
    -------
    dict
        Column name -> value taken from the cell (the cell's ``.text``, or
        for "Projection" the ``value`` attribute of the cell's ``input``).

    Raises
    ------
    ValueError
        If ``dtype`` is neither "current" nor "historical". Previously this
        case fell through and returned ``None``.
    """
    if dtype not in ("current", "historical"):
        raise ValueError(
            "dtype must be 'current' or 'historical', got %r" % (dtype,)
        )

    cells = row.find_all("td")
    name_cell = cells[5]

    # Fields that sit in the same cells for both layouts.
    record = {
        "Player": name_cell.find(class_="playername").text,
        "Position": cells[4].text,
        # Some teams only have two characters, causing an extra space
        # at the start, so strip that
        "Team": name_cell.find(class_="playerTeam").text[-3:].strip(),
    }

    if dtype == "current":
        record["Opponent"] = cells[13].text
        record["Salary"] = cells[9].text
        record["Projection"] = cells[10].find("input").get("value")
        return record

    # Historical layout: an extra "Scored" cell shifts Projection (and
    # everything after it) one position to the right.
    record["Salary"] = cells[9].text
    record["Scored"] = cells[10].text
    record["Projection"] = cells[11].find("input").get("value")

    # Remaining historical columns are plain cell text; (column name, cell index).
    text_columns = (
        ("Consensus", 12),
        ("Time", 13),
        ("Opponent", 14),
        ("Order", 16),
        ("Bat/Arm", 17),
        ("Consistent", 18),
        ("Floor", 19),
        ("Ceiling", 20),
        ("Avg FP", 22),
        ("Imp Runs", 23),
        ("pOwn", 25),
        ("actOwn", 26),
        ("Leverage", 27),
        ("Safety", 28),
    )
    for column, index in text_columns:
        record[column] = cells[index].text
    return record


def extract_linestar_data(filename, dtype="current"):
    """Parse one saved page into a DataFrame with one row per player.

    The file is read as text, quoted-printable decoded, and parsed with
    BeautifulSoup. Every ``<tr class="playerCardRow">`` of the first
    ``<table>`` is converted with ``extract_row_data``.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    dtype : str
        Passed through unchanged to ``extract_row_data``.

    Returns
    -------
    pandas.DataFrame
        Built from the per-row dicts; empty if no matching rows are found.

    Raises
    ------
    IndexError
        If the document contains no ``<table>`` at all.
    """
    # Context manager so the handle is closed even if reading/decoding fails.
    with open(filename, "r") as source:
        decoded = quopri.decodestring(source.read())

    # NOTE(review): no parser is named, so BeautifulSoup picks whichever is
    # installed; pin one once it is confirmed which parser these cell
    # indices were worked out against.
    soup = BeautifulSoup(decoded)

    table = soup.find_all("table")[0]
    player_rows = table.find_all("tr", class_="playerCardRow")
    return pd.DataFrame([extract_row_data(row, dtype) for row in player_rows])

In [49]:
# Parse every file in ./data with the historical layout, one DataFrame per file.
frames = []
# listdir() returns entries in arbitrary order; sort so the row order of the
# combined data is the same on every run and machine.
for file in sorted(listdir("./data")):
    frame = extract_linestar_data("./data/" + file, dtype="historical")
    # The first 10 characters of the file name become the "Date" label
    # (presumably a YYYY-MM-DD prefix — confirm against the files in ./data).
    frame["Date"] = file[:10]
    frames.append(frame)

In [34]:
# Stack the per-file frames and convert the scraped strings to numeric types.
data = pd.concat(frames)
# Raw strings: "\$" and "\%" are not valid Python string escapes, so the
# non-raw form triggers an invalid-escape warning; the regex itself is unchanged.
# Drop "$" and "," from Salary before casting to int.
data["Salary"] = data["Salary"].replace(r"[\$,]", "", regex=True).astype(int)
data["Projection"] = data["Projection"].astype(float)
data["Scored"] = data["Scored"].astype(float)
# Drop the "%" sign from the ownership columns before casting to float.
data[["pOwn", "actOwn"]] = data[["pOwn", "actOwn"]].replace(r"[\%]", "", regex=True).astype(float)
# Keep only the part of Position before the first "/".
data["Position"] = data["Position"].str.split("/", expand=True)[0]

Unnamed: 0,Player,Position,Team,Salary,Scored,Projection,Consensus,Time,Opponent,Order,Bat/Arm,Consistent,Floor,Ceiling,Avg FP,Imp Runs,pOwn,actOwn,Leverage,Safety
0,Joe Musgrove (R),P,SD,9000,49.0,39.22,0,09:10 PM,vs MIA,-,"6R, 2L",70%,16.1,43.4,26.75,4.5,44.0,44.0,47,86
1,Freddy Peralta (R),P,MIL,10200,0.0,37.64,0,07:05 PM,@CHC,-,"3R, 5L",82%,28.1,49.6,37.09,5.0,39.0,16.0,45,74
2,Jameson Taillon (R),P,NYY,8100,34.0,35.68,0,07:10 PM,@KC,-,"5R, 4L",71%,12.4,46.1,27,5.2,46.0,15.0,43,82
3,Lucas Giolito (R),P,CWS,9500,55.0,32.86,0,07:10 PM,@MIN,-,"4R, 5L",68%,17.3,51.8,32.3,5.6,46.0,18.0,36,76
4,Zach Thompson (R),P,MIA,7400,6.0,26.56,0,09:10 PM,@SD,-,"3R, 5L",69%,11.6,41.9,24.1,3.0,4.0,2.0,60,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,Jose Trevino,C,TEX,2200,3.0,7.33,0,06:10 PM,"Eduardo Rodriguez, BOS",9,R / L,43%,0.0,12.3,4.92,3.9,1.0,0.0,40,0
108,Ronald Torreyes,SS,PHI,2200,3.0,7.30,0,07:40 PM,"Joe Musgrove, SD",7,R / R,56%,0.0,13.5,6.76,3.4,1.0,2.0,40,0
109,Drew Ellis,3B,ARI,3200,6.2,7.10,0,07:10 PM,"Kyle Freeland, COL",8,R / L,42%,0.0,11.4,4.8,5.4,2.0,1.0,29,1
110,Andrew Knapp,C,PHI,2000,0.0,6.78,0,07:40 PM,"Joe Musgrove, SD",8,S / R,54%,0.0,9.1,3.49,3.4,1.0,0.0,40,0
