In [58]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import plotly.express as px
from sklearn.cluster import KMeans
from PyPDF2 import PdfReader
import re

import plotly.graph_objects as go



In [None]:
adp = pd.read_csv("data/fantasypros/adp/HALF_PPR_ADP.csv")
adp.dropna(subset=["PLAYER"], inplace=True)
adp

In [None]:
data = pd.read_csv("data/yearly/2019.csv")
data.dropna(subset=["Pos"], inplace=True)

In [321]:
def plotKmeans(df, x="ADP", y="FantasyPoints", position="RB", clusters=2):
    dataset = df.dropna(subset=[x, y])
    #dataset = dataset[dataset["Pos"] == position]
    points = list(zip(dataset[x].to_list(), dataset[y].to_list()))
    kmeans = KMeans(n_clusters=clusters, random_state=42) 
    kmeans.fit(points)
    kmeans.labels_
    dataset["labels"] = kmeans.labels_
    fig = px.scatter(dataset, x=x, y=y, color="labels", hover_data=["Player", "Year"])
    fig.update_layout({
        "font" : dict(
        family="sans serif",
        size=18,
        color="LightSeaGreen"
    ),
        "xaxis_title": x,
        "yaxis_title": y,
    })
    fig.show()
    
def plotElbow(df, x="ADP", y="FantasyPoints", position="RB"):
    xs = []
    ys = []
    for i in range(1, 15):
        dataset = df.dropna(subset=[x, y])
        dataset = dataset[dataset["Pos"] == position]
        points = list(zip(dataset[x].to_list(), dataset[y].to_list()))
        kmeans = KMeans(n_clusters=i, random_state=42) 
        kmeans.fit(points)
        kmeans.labels_
        dataset["labels"] = kmeans.labels_
        xs.append(i)
        ys.append(kmeans.inertia_)
    px.scatter(x=xs, y=ys).show()
       
    
def pdf_to_text(filename:str) -> str:

    text = ""
    try:
        reader = PdfReader(filename)
        for page in reader.pages:
            text += page.extract_text()
            print(page)
            return text
    except Exception as exc:
        print("you're fucked idiot")

def getNameFromSlice(s):
    try:
        return re.search(r"\) .+(?=,)", s).group()[2:]
    except:
        print(s)
        return re.search(r"\)",s).group()[2:]

def getAdpFromSlice(s):
    return float(re.search(r"\(.+(?=\))", s).group()[1:])

def getColumnIndex(s):
    try:
        return int(re.search(r"\d+\.", s).group()[:-1])
    except:
        return -1

def getNewColumnMap(columns, column_map, year):
    if columns[0] is None:
        column_map[0] = "TE"
    if columns[2] is None:
        column_map[2] = "WR"
    if columns[3] is None:
        if column_map[3] == "WR":
            column_map[3] = "DST"
        elif column_map[3] == "DST":
            column_map[3] = "K"
        elif column_map[3] == "DST":
            column_map[3] = "K"
    

def getDataFromLine2(line, break_num, column_map):
    output = {
        "QB" : [],
        "RB" : [],
        "WR" : [],
        "TE" : [],
        "K"  : [],
        "DST": [],
        "GARBAGE_COLLECT" : []
    }        
    line_split = line.split("$")
    
    if break_num == 1:
        line_split.insert(2, "")
        column_map[2] = "WR"
    elif break_num == 2:
        line_split.insert(3, "")
        column_map[3] = "DST"
    elif break_num == 3:
        column_map[0] = "TE"
        column_map[3] = "K"
        line_split.insert(0, "")
        line_split.insert(3, "")
    elif break_num == 4:
        line_split.insert(3, "")
        column_map[3] = "GARBAGE_COLLECT"
    
    col0 = line_split[0]
    col1 = line_split[1]
    col2 = line_split[2]
    col3 = line_split[3]
    
    if col0:
        output[column_map[0]].append((getNameFromSlice(col0), getAdpFromSlice(col0)))
    if col1:
        output[column_map[1]].append((getNameFromSlice(col1), getAdpFromSlice(col1)))
    if col2:
        output[column_map[2]].append((getNameFromSlice(col2), getAdpFromSlice(col2)))
    if col3 and break_num < 4:
        output[column_map[3]].append((getNameFromSlice(col3), getAdpFromSlice(col3)))
    return output
    

def loadAdp2(year: str) :
    filename = 'espn_' + year + ".pdf"
    text = pdf_to_text(filename)
    #seperate text into a list of rows
    lines = []
    line = ""
    for char in text:
        if char == "\n":
            lines.append(line)
            line=""
            continue
        line+=char
    #Find runningback value in each row
    rb = []
    qb = []
    wr = []
    te = []
    prev_line = None
    column_map = ["QB", "RB", "RB", "WR"]
    second_break = False
    line_break = 0
    for line in lines[:77]:
        #data = getDataFromLine(line, prev_line, column_map, year)
        if (len(line.split("$")) < 5 and not second_break):
            line_break += 1
            second_break = True
        elif (len(line.split("$")) < 5):
                second_break = False
        data = getDataFromLine2(line, line_break, column_map)
        
        #print(data)
        prev_line = line
        
        for k, v in data.items():
            if k == "RB" and len(v):
                rb.extend(v)
            elif k == "WR" and len(v):
                wr.extend(v)
            elif k == "QB" and len(v):
                qb.extend(v)
            elif k == "TE" and len(v):
                te.extend(v)
    qb_df = pd.DataFrame(qb, columns=["Name", "ADP"]) 
    qb_df["Pos"] = "QB"
    wr_df = pd.DataFrame(wr, columns=["Name", "ADP"])
    wr_df["Pos"] = "WR"
    rb_df = pd.DataFrame(rb, columns=["Name", "ADP"])
    rb_df["Pos"] = "RB"
    te_df = pd.DataFrame(te, columns=["Name", "ADP"])
    te_df["Pos"] = "TE"
    
    return (pd.concat([qb_df, wr_df, rb_df, te_df]))
 
        
        
def loadAndMergeData(year):
    adp = loadAdp2(str(year))
    data = pd.read_csv(f"data/yearly/{year}.csv")
    data.dropna(subset=["Pos"], inplace=True)
    data["ADP"] = pd.Series()
    data["Year"] = year
    for i, player in adp.iterrows():  
        try:
            ix = process.extractOne(player["Name"], data["Player"])
            data["ADP"].loc[ix[2]] = player["ADP"]
        except:
            print(player["Name"])
    #adp.dropna(subset=["PLAYER"], inplace=True)
    
    return data
    
    
    

In [319]:
# xs = []
# ys = []
# pos = []
# data["ADP"] = pd.Series()
# for i, player in adp.iterrows():  
#     try:
#         ix = process.extractOne(player["PLAYER"], data["Player"])
#         data["ADP"].loc[ix[2]] = player["AVG"]
#     except:
#         print(player["PLAYER"])
data18 = loadAndMergeData(2018) 
data19 = loadAndMergeData(2019)
data20 = loadAndMergeData(2020) 
data21 = loadAndMergeData(2021) 
aggData = pd.concat([data18, data19, data20, data21])

{'/Type': '/Page', '/Parent': IndirectObject(2, 0, 11424668496), '/Resources': {'/ExtGState': {'/GS6': IndirectObject(6, 0, 11424668496)}, '/Font': {'/F1': IndirectObject(7, 0, 11424668496), '/F2': IndirectObject(9, 0, 11424668496)}, '/XObject': {'/Image11': IndirectObject(11, 0, 11424668496)}, '/ProcSet': ['/PDF', '/Text', '/ImageB', '/ImageC', '/ImageI']}, '/MediaBox': [0, 0, 1176.92, 1523.08], '/Contents': IndirectObject(5, 0, 11424668496), '/Group': {'/Type': '/Group', '/S': '/Transparency', '/CS': '/DeviceRGB'}, '/Tabs': '/S', '/StructParents': 0}






A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



{'/Type': '/Page', '/Parent': IndirectObject(2, 0, 11426704256), '/Resources': {'/ExtGState': {'/GS6': IndirectObject(6, 0, 11426704256), '/GS9': IndirectObject(9, 0, 11426704256)}, '/Font': {'/F1': IndirectObject(7, 0, 11426704256), '/F2': IndirectObject(10, 0, 11426704256)}, '/XObject': {'/Image12': IndirectObject(12, 0, 11426704256)}, '/ProcSet': ['/PDF', '/Text', '/ImageB', '/ImageC', '/ImageI']}, '/MediaBox': [0, 0, 612, 792], '/Contents': IndirectObject(5, 0, 11426704256), '/Group': {'/Type': '/Group', '/S': '/Transparency', '/CS': '/DeviceRGB'}, '/Tabs': '/S', '/StructParents': 0}






A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



{'/Type': '/Page', '/Parent': IndirectObject(2, 0, 11349082224), '/Resources': {'/ExtGState': {'/GS6': IndirectObject(6, 0, 11349082224), '/GS9': IndirectObject(9, 0, 11349082224)}, '/Font': {'/F1': IndirectObject(7, 0, 11349082224), '/F2': IndirectObject(10, 0, 11349082224)}, '/XObject': {'/Image12': IndirectObject(12, 0, 11349082224)}, '/ProcSet': ['/PDF', '/Text', '/ImageB', '/ImageC', '/ImageI']}, '/MediaBox': [0, 0, 612, 792], '/Contents': IndirectObject(5, 0, 11349082224), '/Group': {'/Type': '/Group', '/S': '/Transparency', '/CS': '/DeviceRGB'}, '/Tabs': '/S', '/StructParents': 0}






A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



{'/Type': '/Page', '/Parent': IndirectObject(2, 0, 11367740608), '/Resources': {'/ExtGState': {'/GS6': IndirectObject(6, 0, 11367740608), '/GS9': IndirectObject(9, 0, 11367740608)}, '/Font': {'/F1': IndirectObject(7, 0, 11367740608), '/F2': IndirectObject(10, 0, 11367740608)}, '/XObject': {'/Image12': IndirectObject(12, 0, 11367740608)}, '/ProcSet': ['/PDF', '/Text', '/ImageB', '/ImageC', '/ImageI']}, '/MediaBox': [0, 0, 612, 792], '/Contents': IndirectObject(5, 0, 11367740608), '/Group': {'/Type': '/Group', '/S': '/Transparency', '/CS': '/DeviceRGB'}, '/Tabs': '/S', '/StructParents': 0}






A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [300]:
aggData = pd.concat([data18, data19, data20, data21])

In [62]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=data18["ADP"], y=data["FantasyPoints"], mode='markers', text=data["Player"]))
fig.add_trace(go.Scatter(x=data19["ADP"], y=data["FantasyPoints"], mode='markers', text=data["Player"]))
fig.add_trace(go.Scatter(x=data20["ADP"], y=data["FantasyPoints"], mode='markers', text=data["Player"]))
fig.add_trace(go.Scatter(x=data21["ADP"], y=data["FantasyPoints"], mode='markers', text=data["Player"]))
fig.show()
#px.scatter(data, x="ADP", y="FantasyPoints", color="Pos", text="Player").show()

In [322]:
rbs = aggData[aggData["Pos"] == "WR"]
rbs["normFP"] = rbs["FantasyPoints"] / rbs["FantasyPoints"].max()
plotKmeans(rbs, y="normFP", clusters=10)





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [304]:
plotKmeans(aggData, position="RB", clusters=5)
plotKmeans(aggData, position="WR", clusters=5)
plotKmeans(aggData, position="TE", clusters=5)
plotKmeans(aggData, position="QB", clusters=5)

In [64]:
plotElbow(data, position="RB")
plotElbow(data, position="WR")
plotElbow(data, position="TE")
plotElbow(data, position="QB")

ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [63]:
data18

Unnamed: 0.1,Unnamed: 0,Player,Tm,Pos,Age,G,GS,Tgt,Rec,PassingYds,...,RushingYds,RushingTD,RushingAtt,ReceivingYds,ReceivingTD,FantasyPoints,Int,Fumbles,FumblesLost,ADP
0,0,Todd Gurley,LAR,RB,24.0,14.0,14.0,81.0,59.0,0.0,...,1251.0,17.0,256.0,580.0,4.0,366.10,0.0,1.0,1.0,2.0
1,1,Saquon Barkley,NYG,RB,21.0,16.0,16.0,121.0,91.0,0.0,...,1307.0,11.0,261.0,721.0,4.0,383.80,0.0,0.0,0.0,6.0
2,2,Christian McCaffrey,CAR,RB,22.0,16.0,16.0,124.0,107.0,50.0,...,1098.0,7.0,219.0,867.0,6.0,385.50,0.0,4.0,1.0,18.0
3,3,Alvin Kamara,NOR,RB,23.0,15.0,13.0,105.0,81.0,0.0,...,883.0,14.0,194.0,709.0,4.0,348.20,0.0,1.0,0.0,7.0
4,4,Patrick Mahomes,KAN,QB,23.0,16.0,16.0,0.0,0.0,5097.0,...,272.0,2.0,60.0,0.0,0.0,415.08,12.0,9.0,2.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
617,617,Kaelin Clay,NYG,0,26.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-2.00,0.0,1.0,1.0,
618,618,JJ Jones,2TM,WR,26.0,4.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,3.0,0.0,-0.70,0.0,2.0,1.0,
619,619,Kyle Lauletta,NYG,QB,23.0,2.0,0.0,0.0,0.0,0.0,...,-2.0,0.0,1.0,0.0,0.0,-2.20,1.0,0.0,0.0,
620,620,Riley McCarron,NWE,0,25.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-2.00,0.0,1.0,1.0,


In [283]:

        
qb, wr, rb, te = loadAdp2("2021")

{'/Type': '/Page', '/Parent': IndirectObject(2, 0, 11342576176), '/Resources': {'/ExtGState': {'/GS6': IndirectObject(6, 0, 11342576176), '/GS9': IndirectObject(9, 0, 11342576176)}, '/Font': {'/F1': IndirectObject(7, 0, 11342576176), '/F2': IndirectObject(10, 0, 11342576176)}, '/XObject': {'/Image12': IndirectObject(12, 0, 11342576176)}, '/ProcSet': ['/PDF', '/Text', '/ImageB', '/ImageC', '/ImageI']}, '/MediaBox': [0, 0, 612, 792], '/Contents': IndirectObject(5, 0, 11342576176), '/Group': {'/Type': '/Group', '/S': '/Transparency', '/CS': '/DeviceRGB'}, '/Tabs': '/S', '/StructParents': 0}


In [317]:
aggData.Tm

0      LAR
1      NYG
2      CAR
3      NOR
4      KAN
      ... 
665    TEN
667    2TM
668    LAR
669    ATL
670    DEN
Name: Tm, Length: 2416, dtype: object

In [None]:
# def getDataFromLine(line, prev_line, column_map, year):
    
#     output = {
#         "QB" : [],
#         "RB" : [],
#         "WR" : [],
#         "TE" : [],
#         "K"  : [],
#         "DST": [],
#     }
#     s = line.split("$")
#     if prev_line is None:
#         line_split = line.split("$")
#         col0 = line_split[0]
#         col1 = line_split[1]
#         col2 = line_split[2]
#         col3 = line_split[3]
#         output[column_map[0]].append((getNameFromSlice(col0), getAdpFromSlice(col0)))
#         output[column_map[1]].append((getNameFromSlice(col1), getAdpFromSlice(col1)))
#         output[column_map[2]].append((getNameFromSlice(col2), getAdpFromSlice(col2)))
#         output[column_map[3]].append((getNameFromSlice(col3), getAdpFromSlice(col3)))
#         return output
#     line_split = line.split("$")
#     prev_line_split = prev_line.split("$")
#     if len(line_split) == 5:
#         col0 = line_split[0]
#         col1 = line_split[1]
#         col2 = line_split[2]
#         col3 = line_split[3]
#         output[column_map[0]].append((getNameFromSlice(col0), getAdpFromSlice(col0)))
#         output[column_map[1]].append((getNameFromSlice(col1), getAdpFromSlice(col1)))
#         output[column_map[2]].append((getNameFromSlice(col2), getAdpFromSlice(col2)))
#         output[column_map[3]].append((getNameFromSlice(col3), getAdpFromSlice(col3)))
#         return output
#     elif len(line_split) == len(prev_line_split):
#         col0 = None
#         col1 = None
#         col2 = None
#         col3 = None
        
#         while line_split:
#             col = line_split.pop()
            

#         if col0:
#             output[column_map[0]] = (getNameFromSlice(col0), getAdpFromSlice(col0))
#         if col1:
#             output[column_map[1]] = (getNameFromSlice(col1), getAdpFromSlice(col1))
#         if col2:
#             output[column_map[2]] = (getNameFromSlice(col2), getAdpFromSlice(col2))
#         if col3:
#             output[column_map[3]] = (getNameFromSlice(col3), getAdpFromSlice(col3))
#         return output
        
#     else:
#         col0 = None
#         col1 = None
#         col2 = None
#         col3 = None
        
#         while line_split[:-1]:
#             col = line_split.pop(0)
#             try:
#                 ix = getColumnIndex(col)
#             except:
#                 continue
#             pix0 = getColumnIndex(prev_line_split[0])
#             pix1 = getColumnIndex(prev_line_split[1])
#             pix2 = getColumnIndex(prev_line_split[2])
#             pix3 = getColumnIndex(prev_line_split[3])
#             if ix == pix3+1:
#                 col3 = col
#             elif ix == pix2+1 and col2 is None:
#                 col2 = col
#             elif ix == pix1+1 and col1 is None:
#                 col1 = col
#             elif ix == pix0+1 and col0 is None:
#                 col0 = col
                
#             print(col0, col1, col2, col3)
            
            
            
#         if col0:
#             output[column_map[0]].append((getNameFromSlice(col0), getAdpFromSlice(col0)))
#         if col1:
#             output[column_map[1]].append((getNameFromSlice(col1), getAdpFromSlice(col1)))
#         if col2:
#             output[column_map[2]].append((getNameFromSlice(col2), getAdpFromSlice(col2)))
#         if col3:
#             output[column_map[3]].append((getNameFromSlice(col3), getAdpFromSlice(col3)))
        
#         print(column_map)
#         getNewColumnMap([col0, col1, col2, col3], column_map, year)
#         print(column_map)
#         # do weird shit
#         return output