In [238]:
import polars as pl
import pandas as pd
import requests as re
import nflreadpy as nfl
from app.api import *
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Sleeper_API

In [224]:
# Notebook configuration: values a reader is expected to edit before running.
# Display name handed to get_user() to resolve the account's user_id.
user_name = 'IngrownHair132'
# Name of the league of interest.
# NOTE(review): no cell visible in this notebook reads this value — confirm it is still needed.
league_name = 'Any Given Sunday'
# Season year. NOTE(review): presumably the season meant for the league lookup — confirm.
year = 2025

In [169]:
# Fetch everything the rest of the notebook needs: user, league, league users,
# rosters, and the player catalogue (all via helpers imported from app.api).

# Get user information (mine)
user = get_user(user_name)
user_id = user['user_id']

# Get user league information (mine).
# Use the configured `year` instead of repeating the literal 2025, so changing
# the season in the config cell actually changes which leagues are fetched.
league = get_leagues(user_id, year)
leagues_df = pd.DataFrame(league)

# Prefer the league whose name matches the configured `league_name`; otherwise
# keep the previous behaviour of taking the first league returned.
# NOTE(review): assumes each league record exposes a 'name' field — confirm
# against what app.api.get_leagues returns.
if 'name' in leagues_df.columns and (leagues_df['name'] == league_name).any():
    leagues_df = leagues_df[leagues_df['name'] == league_name]
# Positional access: after filtering, the index label 0 may no longer exist.
league_id = leagues_df['league_id'].iloc[0]

# Get league users
users = league_users(league_id)
user_info = pd.DataFrame(users)[['display_name', 'user_id']]

# Get league rosters
roster = get_rosters(league_id)

# players
players = get_players()

## league_table

In [178]:
# Keep only the roster fields used downstream, then attach them to each league
# user (left join, so every user row is kept even without a matching roster).
roster_fields = pd.DataFrame(roster)[['owner_id', 'metadata', 'players', 'starters', 'settings']]
overview = user_info.merge(
    roster_fields,
    how='left',
    left_on='user_id',
    right_on='owner_id',
)

In [180]:
# New overview column -> key read from each row's `settings` entry.
# Insertion order is the order in which the columns are added.
settings_fields = {
    'points_for': 'fpts',
    'points_against': 'fpts_against',
    'wins': 'wins',
    'losses': 'losses',
    'ties': 'ties',
}
for column, key in settings_fields.items():
    # Bind `key` as a default argument so each lambda reads its own key.
    overview[column] = overview['settings'].apply(lambda record, key=key: record[key])

# The streak label is read from `metadata`, not `settings`.
overview['streak'] = overview['metadata'].apply(lambda record: record['streak'])

# Points scored minus points conceded.
overview['points_delta'] = overview['points_for'] - overview['points_against']

In [181]:
# Standings: rank by wins, then by points scored (both descending), and keep
# only the columns shown in the table.
table_columns = [
    'display_name',
    'streak',
    'wins',
    'losses',
    'points_for',
    'points_against',
    'points_delta',
]
ranked = overview.sort_values(by=['wins', 'points_for'], ascending=[False, False])
league_table = ranked[table_columns]
league_table

Unnamed: 0,display_name,streak,wins,losses,points_for,points_against,points_delta
4,seifmahmoud,1W,9,1,1659,1200,459
1,BlaiseMaweedi,2W,7,3,1553,1366,187
6,ruids,1W,6,4,1266,1172,94
7,RoryQ94,2W,6,4,1213,1311,-98
3,youssefjerome,1L,5,5,1392,1316,76
0,IngrownHair132,1L,5,5,1246,1233,13
2,SeifMattar,5L,2,8,1110,1456,-346
5,Kiki99,10L,0,10,1078,1462,-384


## team_roster

In [267]:
# Preview each manager's display name next to their list of rostered players.
overview.loc[:, ['display_name', 'players']]

Unnamed: 0,display_name,players
0,IngrownHair132,"[10229, 11564, 11581, 11584, 11604, 11627, 116..."
1,BlaiseMaweedi,"[11563, 11624, 11632, 12527, 12711, 1479, 4033..."
2,SeifMattar,"[11565, 11575, 11589, 12481, 12490, 12530, 146..."
3,youssefjerome,"[10236, 12489, 12498, 12514, 3321, 4018, 4037,..."
4,seifmahmoud,"[10859, 11533, 11560, 11637, 12504, 12509, 125..."
5,Kiki99,"[10219, 11576, 11620, 11655, 12484, 12506, 125..."
6,ruids,"[10222, 11583, 11631, 11635, 11638, 12508, 125..."
7,RoryQ94,"[11539, 11566, 12485, 12526, 12533, 2216, 4199..."


# Retrieval-Augmented Generation (RAG)

**Retrieval:** Accessing and retrieving information from a knowledge source, such as a database or memory

**Augment:** Enhancing or enriching something, in this case, the text generation process, with additional information or context

**Generation:** The process of creating or producing something, in this context, generating text or language

Vector Database Search:
  - Exact KNN is not feasible because it searches every vector in the database, which is inefficient
  - ANN (approximate nearest neighbour) is an alternative
    - sacrifices precision for speed

Strategies for chunking (5):
  - Fixed-size
  - Semantic
  - Recursive
  - Document structure based
  - LLM based 

Workflow of RAG system:
  - Custom knowledge base -> (chunking) -> Chunks -> Embedding Model -> Vector Database
  - Query -> Embedding Model -> Vector Database
  - Vector Database -> (context) -> ReRanker model -> (context) -> Prompt template -> (prompt) -> LLM -> Response

RAG Step-by-step:
  1) Create chunks - break down additional knowledge into chunks before embedding and storing (there are chunking strategies)
  2) Generate embeddings - this step relies on context embedding models, so models such as bi-encoders are highly relevant
  3) Store Embeddings in a vector database - these act as memory banks for your RAG applications 
  4) User input query - string representing the query
  5) Embed the query - query is transformed using the same embedding model
  6) Retrieve similar chunks - the vectorized query is compared against the existing vectors in the database to find the most similar information. The vector database returns the k (a pre-defined parameter) most similar documents/chunks, using approximate nearest neighbor search
  7) Re-rank the chunks - after retrieval, the selected chunks might need further refinement to ensure the most relevant information is prioritized. A more sophisticated model (often a cross-encoder) evaluates the initial list of retrieved chunks alongside the query and assigns a relevance score to each chunk