1) create a fake dataset like I was just sent out:

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# --- 1. Create some fake basketball-style data ---
np.random.seed(42)
n = 200
data = {
    "player_id": np.arange(1, n+1),
    "name": [f"Player_{i}" for i in range(1, n+1)],
    "points": np.random.poisson(12, n),
    "assists": np.random.poisson(4, n),
    "rebounds": np.random.poisson(6, n),
    "minutes": np.random.randint(10, 40, n),
    "team": np.random.choice(["Heat", "Mavs", "Lakers", "Celtics"], n),
}

df_fake = pd.DataFrame(data)

# Add some missing values + outliers
df_fake.loc[5:10, "assists"] = np.nan
df_fake.loc[15, "points"] = 100   # outlier
df_fake.loc[25, "rebounds"] = 0   # edge case

# --- 2. Save as if we just got it from somewhere ---
Path("data").mkdir(exist_ok=True)
csv_path = Path("notebooks/5080_gpu/interview_prep/data/fake_basketball.csv")
df_fake.to_csv(csv_path, index=False)

print(f"Sample CSV saved at {csv_path}")
print(df_fake.head())


Sample CSV saved at notebooks/5080_gpu/interview_prep/data/fake_basketball.csv
   player_id      name  points  assists  rebounds  minutes     team
0          1  Player_1      11      8.0         6       28  Celtics
1          2  Player_2      14      2.0         3       22   Lakers
2          3  Player_3       8      3.0         4       37  Celtics
3          4  Player_4      13      5.0         6       16     Mavs
4          5  Player_5      16      6.0         5       24     Mavs


# upgraded fake dataset for PER, VORP, EWA, PER, PIE


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# --- 0) Config & helpers ---
rng = np.random.default_rng(42)
TEAMS = ["HEA", "MAV", "LAL", "BOS"]     # tiny league for demo
N_PLAYERS = 200
SEASONS = ["2023-24", "2024-25"]
GAMES_PER_SEASON = 120  # total games in this tiny league

def season_from_date(dt: pd.Timestamp) -> str:
    y = dt.year if dt.month >= 8 else dt.year - 1
    return f"{y}-{str((y+1) % 100).zfill(2)}"

# --- 1) Players master (id, name, team, position) ---
positions = ["PG", "SG", "SF", "PF", "C"]
players = pd.DataFrame({
    "player_id": np.arange(1, N_PLAYERS+1),
    "name": [f"Player_{i}" for i in range(1, N_PLAYERS+1)],
    "team_id": rng.choice(TEAMS, N_PLAYERS),
    "position": rng.choice(positions, N_PLAYERS, p=[0.22, 0.22, 0.20, 0.20, 0.16])
})

# --- 2) Schedule + team-game totals ---
rows_team_game = []
rows_player_game = []

for season in SEASONS:
    # pick a date window for the season
    start = pd.Timestamp(int(season[:4]), 10, 1)
    dates = start + pd.to_timedelta(rng.integers(0, 180, size=GAMES_PER_SEASON), unit="D")

    for gid, gdate in enumerate(dates, start=1):
        # two distinct teams
        home, away = rng.choice(TEAMS, 2, replace=False)
        for is_home, team, opp in [(1, home, away), (0, away, home)]:
            # generate plausible team totals
            fga = rng.integers(75, 100)
            three_pa = rng.integers(int(0.28*fga), int(0.45*fga))
            two_pa = max(fga - three_pa, 0)

            two_fg = rng.binomial(two_pa, rng.uniform(0.48, 0.58))
            three_pm = rng.binomial(three_pa, rng.uniform(0.33, 0.41))
            fgm = two_fg + three_pm

            fta = rng.integers(15, 30)
            ftm = rng.binomial(fta, rng.uniform(0.72, 0.85))

            orb = rng.integers(8, 15)
            drb = rng.integers(25, 35)
            trb = orb + drb

            ast = rng.integers(18, 30)
            stl = rng.integers(5, 10)
            blk = rng.integers(3, 8)
            tov = rng.integers(10, 18)
            pf  = rng.integers(15, 24)
            pts = int(2*two_fg + 3*three_pm + ftm)

            rows_team_game.append({
                "team_id": team, "opp_team_id": opp,
                "game_id": f"{season.replace('-','')}_{gid}",
                "game_date": gdate, "season": season,
                "is_home": bool(is_home),
                "minutes_team": 240,
                "FGM": fgm, "FGA": fga, "3PM": three_pm, "3PA": three_pa,
                "FTM": ftm, "FTA": fta,
                "ORB": orb, "DRB": drb, "TRB": trb,
                "AST": ast, "STL": stl, "BLK": blk, "TOV": tov, "PF": pf, "PTS": pts
            })

            # distribute team totals to 8-10 players (very rough minutes share)
            roster = players.loc[players["team_id"] == team, "player_id"].tolist()
            if len(roster) < 10:
                roster = rng.choice(players["player_id"], 10, replace=False).tolist()
            game_players = rng.choice(roster, size=rng.integers(8, 11), replace=False)

            minute_shares = rng.dirichlet(np.ones(len(game_players))) * 240
            # clip to realistic on-court minutes
            minute_shares = np.clip(np.round(minute_shares, 1), 8, 42)
            minute_shares *= 240.0 / minute_shares.sum()

            def split_stat(total, shares):
                # integer-ish split with sum preserved
                raw = shares / shares.sum() * total
                vals = np.floor(raw).astype(int)
                for k in range(total - vals.sum()):
                    vals[k % len(vals)] += 1
                return vals

            # allocate counting stats roughly by minutes
            alloc_cols = ["FGM","FGA","3PM","3PA","FTM","FTA","ORB","DRB","AST","STL","BLK","TOV","PF","PTS"]
            alloc = {c: split_stat(int(rows_team_game[-1][c]), minute_shares) for c in alloc_cols}
            # derive TRB per player from ORB+DRB later

            for pid, mp, i in zip(game_players, minute_shares, range(len(game_players))):
                p_orb = int(alloc["ORB"][i]); p_drb = int(alloc["DRB"][i])
                rows_player_game.append({
                    "player_id": pid,
                    "name": players.loc[players.player_id==pid, "name"].iloc[0],
                    "team_id": team, "opp_team_id": opp,
                    "game_id": f"{season.replace('-','')}_{gid}",
                    "game_date": gdate, "season": season,
                    "home_away": "H" if is_home else "A",
                    "minutes": float(np.round(mp,1)),
                    "FGM": int(alloc["FGM"][i]), "FGA": int(alloc["FGA"][i]),
                    "3PM": int(alloc["3PM"][i]), "3PA": int(alloc["3PA"][i]),
                    "FTM": int(alloc["FTM"][i]), "FTA": int(alloc["FTA"][i]),
                    "ORB": p_orb, "DRB": p_drb, "TRB": p_orb + p_drb,
                    "AST": int(alloc["AST"][i]), "STL": int(alloc["STL"][i]),
                    "BLK": int(alloc["BLK"][i]), "TOV": int(alloc["TOV"][i]),
                    "PF":  int(alloc["PF"][i]),  "PTS": int(alloc["PTS"][i]),
                })

team_game_totals = pd.DataFrame(rows_team_game)
player_game_box = pd.DataFrame(rows_player_game)

# --- 3) League season totals (for PER constants later) ---
league_season_totals = (team_game_totals
    .groupby("season", as_index=False)
    .agg(lg_FG=("FGM","sum"), lg_FGA=("FGA","sum"),
         lg_3P=("3PM","sum"), lg_FT=("FTM","sum"), lg_FTA=("FTA","sum"),
         lg_ORB=("ORB","sum"), lg_TRB=("TRB","sum"),
         lg_TOV=("TOV","sum"), lg_AST=("AST","sum"), lg_PTS=("PTS","sum"))
)

# --- 4) Team-season possessions & pace (Oliver/BBR style, needs opponent stats) ---
# merge opponent stats per game
opp = team_game_totals.copy()
opp_cols = [c for c in opp.columns if c not in {"team_id","is_home"}]
opp = opp.rename(columns={c: f"opp_{c}" for c in opp_cols if c not in {"opp_team_id","game_id","game_date","season"}})
merged = (team_game_totals
          .merge(opp, left_on=["opp_team_id","game_id"],
                       right_on=["team_id","opp_game_id"], suffixes=("","_o")))

# Possessions estimate (BBR/Oliver)
tm = merged
tm_poss = 0.5 * (
    (tm["FGA"] + 0.4*tm["FTA"]
     - 1.07*(tm["ORB"]/(tm["ORB"] + tm["opp_DRB"]))*(tm["FGA"] - tm["FGM"])
     + tm["TOV"])
    +
    (tm["opp_FGA"] + 0.4*tm["opp_FTA"]
     - 1.07*(tm["opp_ORB"]/(tm["opp_ORB"] + tm["DRB"]))*(tm["opp_FGA"] - tm["opp_FGM"])
     + tm["opp_TOV"])
)

team_game_totals["possessions"] = tm_poss.values

team_season_pace = (team_game_totals
    .groupby(["team_id","season"], as_index=False)
    .agg(games=("game_id","nunique"),
         team_possessions=("possessions","sum"))
)
team_season_pace["pace"] = 48.0 * team_season_pace["team_possessions"] / team_season_pace["games"]

# --- 5) Player-season totals (for EWA/VORP aggregation later) ---
player_season_totals = (player_game_box
    .merge(players[["player_id","position"]], on="player_id", how="left")
    .groupby(["player_id","name","position","team_id","season"], as_index=False)
    .agg(games=("game_id","nunique"),
         minutes=("minutes","sum"),
         FGM=("FGM","sum"), FGA=("FGA","sum"), PM3=("3PM","sum"), PA3=("3PA","sum"),
         FTM=("FTM","sum"), FTA=("FTA","sum"),
         ORB=("ORB","sum"), DRB=("DRB","sum"), TRB=("TRB","sum"),
         AST=("AST","sum"), STL=("STL","sum"), BLK=("BLK","sum"),
         TOV=("TOV","sum"), PF=("PF","sum"), PTS=("PTS","sum"))
)

# --- 6) Save everything alongside your original path ---
Path("notebooks/5080_gpu/interview_prep/data").mkdir(parents=True, exist_ok=True)
base = Path("notebooks/5080_gpu/interview_prep/data")

player_game_box.to_csv(base/"player_game_box.csv", index=False)
team_game_totals.to_csv(base/"team_game_totals.csv", index=False)
league_season_totals.to_csv(base/"league_season_totals.csv", index=False)
team_season_pace.to_csv(base/"team_season_pace.csv", index=False)
player_season_totals.to_csv(base/"player_season_totals.csv", index=False)

print("Wrote:")
for f in ["player_game_box.csv","team_game_totals.csv","league_season_totals.csv",
          "team_season_pace.csv","player_season_totals.csv"]:
    print("-", base/f)


# Utils

In [8]:
%%writefile notebooks/5080_gpu/interview_prep/data/schema.yaml
# column roles for fake_basketball.csv
y_variable: points
numerical: [assists, rebounds, minutes]
nominal: [team, name, player_id]
ordinal: []          # add later if needed
id_cols: [player_id] # optional: kept for joins/reporting, dropped from X


Writing notebooks/5080_gpu/interview_prep/data/schema.yaml


In [None]:
%%writefile notebooks/5080_gpu/interview_prep/data/config.yaml

fake_bball_basic_csv_path = Path("notebooks/5080_gpu/interview_prep/data/fake_basketball.csv")
yaml_path = Path("notebooks/5080_gpu/interview_prep/data/schema.yaml")



In [None]:
%%writefile notebooks/5080_gpu/interview_prep/data/schema.yaml

y_variable: 
ordinal: []
nominal: []
numerical: []
id_cols: []

In [None]:
"""
csv_path = Path("notebooks/5080_gpu/interview_prep/data/fake_basketball.csv")
yaml_path = Path("notebooks/5080_gpu/interview_prep/data/schema.yaml")

Questions:
XX“Given a CSV dataset, how would you explore and summarize it?”

“Given a DataFrame, how would you handle missing values?”

“How would you detect and address outliers in a dataset?”

“Perform univariate, bivariate, and multivariate analysis on given columns.”

“Given a dataset, how would you normalize or standardize its features?”

“Write a function to compute summary statistics (mean, median, std, etc.) of a column.”

“Given a dataset, how would you identify the type of each variable and choose feature encoding?”

“Given a dataset and a target variable, how would you check for relationships or correlations?”

“Write code to detect missingness patterns and decide how to impute.”

“Describe the full data-analysis pipeline: from loading to insight delivery—then code accordingly.”
"""
import pandas as pd
import numpy as np
from pathlib import Path
from omegaconf import OmegaConf
from pydantic import BaseModel
from typing import List


def load_data(csv_path, sample_size=100, debug=False):
    
    # 1) pull in sample data for data type check and transform if needed
    sample_data = pd.read_csv(csv_path, nrows=sample_size)
    if debug:
        print(f"sample data dtypes ======={sample_data.dtypes} and head {sample_data.head()}")

    dtype_map = {}
    for c in sample_data.select_dtypes(include=["object"]):
        dtype_map[c] = "string"
    for c in sample_data.select_dtypes(include=["int64"]):
        dtype_map[c] = "float64"
    if "player_id" in sample_data.columns:
        dtype_map["player_id"] = "string"

    #2) update load in data with updated dtypes
    data = pd.read_csv(csv_path, dtype= dtype_map)
    if debug:
        print(f"updated data dtypes ======={data.dtypes} and head {data.head()}")

    return data

def eda(data, debug=False):
    if debug:
        print(f"data columns======{data.columns} data dtypes========{data.dtypes} data head()========{data.head()}")
    
    report = {}
    
    nulls = data.isnull().sum()
    total_rows = len(data)
    null_perc = (nulls / total_rows) *100
    
    report["total data rows"] = total_rows
    report["null percentages"] = null_perc
    
    report_df = pd.DataFrame(report)
    
    return report_df


def feature_engineering(data, debug=False):
    
    return
    

class ColumnSchema(BaseModel):
    y_variable: str
    nominal: List[str]
    ordinal: List[str]
    numerical: List[str]
    id_cols: List[str]
    
    
def load_schema(yaml_path, debug=False):
    cfg = OmegaConf.load(str(yaml_path))
    if debug:
        print(f"check on config if needed before dict form=========={cfg}")

    cfg_dict = OmegaConf.to_container(cfg, resolve=True)
    if debug:
        print(f"check on config if needed after dict form=========={cfg_dict}")

    return ColumnSchema(**cfg_dict)



if __name__ == "__main__":
    # XX“Given a CSV dataset, how would you explore and summarize it?”
    csv_path = Path("notebooks/5080_gpu/interview_prep/data/fake_basketball.csv")
    yaml_path = Path("notebooks/5080_gpu/interview_prep/data/schema.yaml")

    data = load_data(csv_path,debug=True)

    reports = eda(data, debug=True)
    print(reports)
    
    schema = load_schema(yaml_path)
    
    y_var = data[schema.y_variable]
    ord = data[schema.ordinal]
    nom = data[schema.nominal]
    num = data[schema.numerical]
    id_cols = data[schema.id_cols]
    
    categoricals = pd.concat([nom, ord], axis=1)
    print(categoricals)
    
    
    # 
    



name          object
points         int64
assists      float64
rebounds       int64
minutes        int64
team          object
dtype: object and head    player_id      name  points  assists  rebounds  minutes     team
0          1  Player_1      11      8.0         6       28  Celtics
1          2  Player_2      14      2.0         3       22   Lakers
2          3  Player_3       8      3.0         4       37  Celtics
3          4  Player_4      13      5.0         6       16     Mavs
4          5  Player_5      16      6.0         5       24     Mavs
name         string[python]
points              float64
assists             float64
rebounds            float64
minutes             float64
team         string[python]
dtype: object and head   player_id      name  points  assists  rebounds  minutes     team
0         1  Player_1    11.0      8.0       6.0     28.0  Celtics
1         2  Player_2    14.0      2.0       3.0     22.0   Lakers
2         3  Player_3     8.0      3.0       4.0   

In [5]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, KFold

# assume you already loaded: data, schema (with .numerical / .nominal / .ordinal)
num_cols = schema.numerical
cat_cols = schema.nominal + schema.ordinal

# --- Define interchangeable numerical imputers ---
num_imputer_options = {
    "median": SimpleImputer(strategy="median", add_indicator=True),  # fast, robust
    "knn": Pipeline([
        ("scale_for_knn", StandardScaler(with_mean=True, with_std=True)),
        ("impute", KNNImputer(n_neighbors=5)),
        ("ind", SimpleImputer(strategy="constant", fill_value=0, add_indicator=True)),
    ]),
    "iterative": IterativeImputer(
        estimator=RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
        random_state=42,
        max_iter=10,
        sample_posterior=False
    ),
}

# Pick a default; GridSearch will override
num_imputer = num_imputer_options["median"]

numeric_pipe = Pipeline([
    ("imputer", num_imputer),
    ("scaler", StandardScaler(with_mean=True, with_std=True))
])

categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

preproc = ColumnTransformer([
    ("num", numeric_pipe, num_cols),
    ("cat", categorical_pipe, cat_cols)
])

# downstream estimator — replace with your task model
model = Ridge(random_state=42)

pipe = Pipeline([
    ("preproc", preproc),
    ("model", model),
])

# --- Search over imputation strategies cleanly (no leakage) ---
param_grid = [
    {
        "preproc__num__imputer": [num_imputer_options["median"]],
        "model__alpha": [0.1, 1.0, 10.0],
    },
    {
        "preproc__num__imputer": [num_imputer_options["knn"]],
        "preproc__num__imputer__impute__n_neighbors": [3, 5, 7],
        "model__alpha": [0.1, 1.0, 10.0],
    },
    {
        "preproc__num__imputer": [num_imputer_options["iterative"]],
        "preproc__num__imputer__max_iter": [5, 10],
        "model__alpha": [0.1, 1.0, 10.0],
    },
]


cv = KFold(n_splits=5, shuffle=True, random_state=42)
gs = GridSearchCV(pipe, param_grid=param_grid, cv=cv, scoring="neg_mean_absolute_error", n_jobs=-1)
gs.fit(data[num_cols + cat_cols], data[schema.y_variable])

best_pipe = gs.best_estimator_
print("Best imputer:", gs.best_params_.get("preproc__num__imputer"))
print("Best score (MAE):", -gs.best_score_)


Best imputer: SimpleImputer(add_indicator=True, strategy='median')
Best score (MAE): 3.510450550399642


In [None]:
"""
“How would you detect and address outliers in a dataset?”
"""

In [None]:
"""
“Perform univariate, bivariate, and multivariate analysis on given columns.”
"""

In [None]:
"""
“Given a dataset, how would you normalize or standardize its features?”
"""



In [None]:
"""
“Write a function to compute summary statistics (mean, median, std, etc.) of a column.”
"""



In [None]:
"""
“Given a dataset, how would you identify the type of each variable and choose feature encoding?”

"""

In [None]:
"""
“Given a dataset and a target variable, how would you check for relationships or correlations?”

"""

In [None]:
"""
“Write code to detect missingness patterns and decide how to impute.”

"""

In [None]:
"""
“Describe the full data-analysis pipeline: from loading to insight delivery—then code accordingly.”
"""

In [None]:
"""
here you go — 10 fresh, live-coding-friendly prompts you can answer with your current fake\_basketball dataset (`player_id, name, team, points, assists, rebounds, minutes`). they’re simple, pythonic, and reusable.

## 10 new questions (not in your list)

1. Who are the top 3 scorers **per team** by **points per-36** (filter to minutes ≥ 15)?

2. Build a simple **composite impact score** per player using standardized per-36 stats:
   `impact = z(points/36) + 0.7*z(assists/36) + 0.7*z(rebounds/36)`. Who are the top 10?

3. Do players **outperform their expected points** for their minutes/assists/rebounds?
   Fit a quick linear model `points ~ minutes + assists + rebounds` and list top 10 **positive residuals**.

4. Which team is **most balanced vs. star-heavy** by scoring?
   Compute **coefficient of variation** (std/mean) of `points` per team and rank.

5. Bucket players into **minutes tiers**: `[10–19, 20–29, 30–40]`.
   What are the mean/median of `points/assists/rebounds` per tier?

6. “Three-above-median” players: per team, who is **above the team median** in **points, assists, and rebounds** simultaneously?

7. Write a reusable helper `top_k(df, by, k, group=None)` and use it to return the **top 2 rebounders per team**.

8. What’s the **team effect** on scoring after controlling for other stats?
   One-hot encode `team`, fit a Ridge `points ~ minutes + assists + rebounds + team_*`, and show team coefficients.

9. Give a quick **bootstrap 95% CI** for **mean points per team** (1,000 resamples). Which teams have clearly higher means?

10. Detect potential **duplicate identity issues**: do we have any duplicate `(player_id, name)` rows? If so, keep the one with the **max minutes**.

"""





here you go — 10 fresh, live-coding-friendly prompts you can answer with your current fake\_basketball dataset (`player_id, name, team, points, assists, rebounds, minutes`). they’re simple, pythonic, and reusable.

## 10 new questions (not in your list)

1. Who are the top 3 scorers **per team** by **points per-36** (filter to minutes ≥ 15)?

2. Build a simple **composite impact score** per player using standardized per-36 stats:
   `impact = z(points/36) + 0.7*z(assists/36) + 0.7*z(rebounds/36)`. Who are the top 10?

3. Do players **outperform their expected points** for their minutes/assists/rebounds?
   Fit a quick linear model `points ~ minutes + assists + rebounds` and list top 10 **positive residuals**.

4. Which team is **most balanced vs. star-heavy** by scoring?
   Compute **coefficient of variation** (std/mean) of `points` per team and rank.

5. Bucket players into **minutes tiers**: `[10–19, 20–29, 30–40]`.
   What are the mean/median of `points/assists/rebounds` per tier?

6. “Three-above-median” players: per team, who is **above the team median** in **points, assists, and rebounds** simultaneously?

7. Write a reusable helper `top_k(df, by, k, group=None)` and use it to return the **top 2 rebounders per team**.

8. What’s the **team effect** on scoring after controlling for other stats?
   One-hot encode `team`, fit a Ridge `points ~ minutes + assists + rebounds + team_*`, and show team coefficients.

9. Give a quick **bootstrap 95% CI** for **mean points per team** (1,000 resamples). Which teams have clearly higher means?

10. Detect potential **duplicate identity issues**: do we have any duplicate `(player_id, name)` rows? If so, keep the one with the **max minutes**.

---

## Answers & quick steps (copy-ready)

> Assumes `import pandas as pd, numpy as np` and `df = pd.read_csv(csv_path)` are already done.

### 1) Top 3 points per-36 by team (minutes ≥ 15)

**Steps:** compute per-36, filter, groupby-rank, keep top 3.

```python
df1 = df.copy()
df1["pts_per36"] = df1["points"] * 36 / df1["minutes"]
ans1 = (df1.loc[df1["minutes"] >= 15]
          .assign(rk=lambda x: x.groupby("team")["pts_per36"].rank(ascending=False, method="first"))
          .query("rk <= 3")
          .sort_values(["team","rk"])[["team","name","pts_per36","rk"]])
ans1.head(12)
```

### 2) Composite impact (standardized per-36) — top 10

**Steps:** build per-36, z-score, combine, sort.

```python
df2 = df.copy()
for c, newc in [("points","pts36"),("assists","ast36"),("rebounds","reb36")]:
    df2[newc] = df2[c] * 36 / df2["minutes"]
for c in ["pts36","ast36","reb36"]:
    mu, sd = df2[c].mean(), df2[c].std(ddof=0)
    df2[c+"_z"] = (df2[c] - mu) / sd
df2["impact"] = df2["pts36_z"] + 0.7*df2["ast36_z"] + 0.7*df2["reb36_z"]
ans2 = df2.sort_values("impact", ascending=False)[["name","team","impact","pts36","ast36","reb36"]].head(10)
ans2
```

### 3) “Over-expected” scorers via residuals

**Steps:** fit quick linear model, compute residuals, rank.

```python
from sklearn.linear_model import LinearRegression
X = df[["minutes","assists","rebounds"]]
y = df["points"]
m = LinearRegression().fit(X, y)
df3 = df.assign(pred=m.predict(X), resid=lambda x: x["points"] - x["pred"])
ans3 = df3.sort_values("resid", ascending=False)[["name","team","points","pred","resid"]].head(10)
ans3
```

### 4) Team scoring parity (coefficient of variation)

**Steps:** groupby team; cv = std/mean; sort.

```python
g = df.groupby("team")["points"]
ans4 = (g.agg(mean="mean", std="std")
          .assign(cv=lambda x: x["std"]/x["mean"])
          .sort_values("cv", ascending=True))  # lower = more balanced
ans4
```

### 5) Minutes tiers summary table

**Steps:** cut into bins, aggregate.

```python
bins = [10, 20, 30, 40]
labels = ["10–19","20–29","30–40"]
df5 = df.assign(min_tier=pd.cut(df["minutes"], bins=bins, labels=labels, include_lowest=True, right=False))
ans5 = (df5.groupby("min_tier")[["points","assists","rebounds"]]
          .agg(["mean","median","count"]).round(2))
ans5
```

### 6) Three-above-median players (per team)

**Steps:** compute team medians, compare per row, filter.

```python
med = df.groupby("team")[["points","assists","rebounds"]].median().rename(columns=lambda c: c+"_med")
df6 = df.join(med, on="team")
mask = (df6["points"]>df6["points_med"]) & (df6["assists"]>df6["assists_med"]) & (df6["rebounds"]>df6["rebounds_med"])
ans6 = df6.loc[mask, ["team","name","points","assists","rebounds"]].sort_values(["team","points"], ascending=[True,False])
ans6
```

### 7) Reusable `top_k` + example (top 2 rebounders per team)

**Steps:** write helper; call with group.

```python
def top_k(d, by, k=3, group=None, ascending=False):
    x = d.copy()
    if group:
        x = x.assign(_rk=x.groupby(group)[by].rank(ascending=ascending, method="first"))
        return x.query(f"_rk <= {k}").sort_values(group + [by], ascending=[True, not ascending])
    else:
        return x.sort_values(by, ascending=not ascending).head(k)

ans7 = top_k(df.assign(reb_per36=df["rebounds"]*36/df["minutes"]),
             by="reb_per36", k=2, group=["team"], ascending=False)[["team","name","reb_per36"]]
ans7
```

### 8) Team effect on points (Ridge with one-hot team)

**Steps:** one-hot team, fit Ridge, show team coefs.

```python
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

X = df[["minutes","assists","rebounds","team"]]
y = df["points"]

pre = ColumnTransformer([
    ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), ["team"])
], remainder="passthrough")

ridge = Pipeline([("pre", pre), ("model", Ridge(alpha=1.0, random_state=42))]).fit(X, y)
feat_names = list(ridge.named_steps["pre"].get_feature_names_out()) + ["minutes","assists","rebounds"]
coefs = pd.Series(ridge.named_steps["model"].coef_, index=feat_names).sort_values(ascending=False)
ans8 = coefs
ans8.head(12)
```

### 9) Bootstrap 95% CI for mean points per team

**Steps:** resample rows within team, compute mean for each resample, take percentiles.

```python
rng = np.random.default_rng(42)
def ci95(a, B=1000):
    boot = [rng.choice(a, size=a.size, replace=True).mean() for _ in range(B)]
    return np.percentile(boot, [2.5, 97.5])

ans9 = (df.groupby("team")["points"]
          .apply(lambda s: pd.Series(ci95(s.values), index=["lo","hi"]))
          .join(df.groupby("team")["points"].mean().rename("mean"))
          .reindex(columns=["mean","lo","hi"]).round(2))
ans9
```

### 10) Duplicate identity check & resolution

**Steps:** find duplicates of `(player_id, name)`; keep max minutes.

```python
dupes = df.duplicated(subset=["player_id","name"], keep=False)
ans10_dupe_rows = df.loc[dupes].sort_values(["player_id","minutes"], ascending=[True,False])
resolved = (df.sort_values("minutes", ascending=False)
              .drop_duplicates(subset=["player_id","name"], keep="first"))
ans10 = {"duplicate_rows": ans10_dupe_rows, "resolved_table": resolved.head(10)}
ans10
```

---

want me to drop these into your notebook cells as a neat section (Questions, then Answers) or wire them behind small functions so you can call `answer_1(df)`, `answer_2(df)`, etc.?
