{ “cells”: \[ { “cell_type”: “markdown”, “metadata”: {}, “source”: \[
“\# Guess the Character – 64 Marvel & DC Heroes”, “”, “Two-stage game:”,
“- **Round 1**: 7 binary questions”, “- **Round 2**: 7 more (5 binary +
2 genre)”, “”, “**Genre = Last 2 Questions**”, “”, “Fixes:”, “- Duplicate
characters that appear only once”, “- No `stratify` → avoids
error”, “- `class_weight='balanced'`”, “”, “---” \] }, { “cell_type”: “code”,
“execution_count”: null, “metadata”: {}, “outputs”: \[\], “source”: \[
“\# Install dependencies (run once)”, “\# !pip install pandas
scikit-learn joblib”, “”, “import pandas as pd”, “import numpy as np”,
“from sklearn.model_selection import train_test_split”, “from
sklearn.ensemble import RandomForestClassifier”, “from sklearn.metrics
import accuracy_score”, “import joblib”, “import os”, “from collections
import Counter”, “import warnings”, “warnings.filterwarnings(‘ignore’)”
\] }, { “cell_type”: “markdown”, “metadata”: {}, “source”: \[ “\## 1.
Load Data” \] }, { “cell_type”: “code”, “execution_count”: null,
“metadata”: {}, “outputs”: \[\], “source”: \[
“\# Read the character table; char.csv must be uploaded beforehand (64 rows, 22 columns)”,
“df_raw = pd.read_csv("char.csv")”,
“”,
“\# Quick report: dimensions, column names, then a preview of the first rows”,
“print("Shape:", df_raw.shape)”,
“print("\nColumns:")”,
“print(list(df_raw.columns))”,
“print("\nSample:")”,
“df_raw.head()”
\] }, { “cell_type”: “markdown”, “metadata”: {}, “source”: \[ “\## 2.
Preprocess & Fix Rare Characters” \] }, { “cell_type”: “code”,
“execution_count”: null, “metadata”: {}, “outputs”: \[\], “source”: \[
“\# Give the raw CSV headers the question-style names used from here on”,
“df = df_raw.rename(columns={”,
“    "name": "character",”,
“    "male": "is_male",”,
“    "superhero": "is_superhero",”,
“    "detective": "is_detective",”,
“    "comedian": "is_comedian",”,
“    "billionaire": "is_billionaire",”,
“    "from_earth": "is_from_earth",”,
“    "team_member": "is_team_member",”,
“    "sidekick": "has_sidekick"”,
“})”,
“”,
“\# One-hot encode genre. dtype=int keeps the dummies as 0/1 integers, the same”,
“\# encoding as the 1/0 answers ask() feeds to the models at play time; newer”,
“\# pandas would otherwise produce boolean dummy columns.”,
“genre_dummies = pd.get_dummies(df\[‘genre’\], prefix=‘genre’, dtype=int)”,
“df = pd.concat(\[df.drop(‘genre’, axis=1), genre_dummies\], axis=1)”,
“”,
“\# Fix: append a copy of every character that appears only once,”,
“\# so each of them ends up with two rows”,
“char_counts = Counter(df\[‘character’\])”,
“rare_chars = \[c for c, cnt in char_counts.items() if cnt == 1\]”,
“”,
“if rare_chars:”,
“    print(f"Duplicating {len(rare_chars)} rare characters…")”,
“    extra = df\[df\[‘character’\].isin(rare_chars)\].copy()”,
“    df = pd.concat(\[df, extra\], ignore_index=True)”,
“    print(f"New shape: {df.shape}")”,
“else:”,
“    print("All characters appear ≥2 times.")”,
“”,
“df.head()” \] }, { “cell_type”: “markdown”, “metadata”:
{}, “source”: \[ “\## 3. Select Questions via Information Gain” \] }, {
“cell_type”: “code”, “execution_count”: null, “metadata”: {}, “outputs”:
\[\], “source”: \[
“\# Question columns = everything except the label and the one-hot genre columns”,
“binary_cols = \[c for c in df.columns if c not in \[‘character’\] + list(genre_dummies.columns)\]”,
“genre_cols = list(genre_dummies.columns)”,
“”,
“def entropy(p):”,
“    """Binary entropy in bits of a yes-probability p; 0.0 when p is 0 or 1 (or outside that range)."""”,
“    if p \<= 0 or p \>= 1:”,
“        return 0.0”,
“    return -p \* np.log2(p) - (1-p) \* np.log2(1-p)”,
“”,
“def info_gain(col):”,
“    """Score a yes/no column by how much a single answer narrows down the candidates.”,
“”,
“    col is a column of 0/1 answers; the score is the binary entropy of its yes-share:”,
“    1 bit for an even split, 0 for a column where every row gives the same answer.”,
“”,
“    The earlier formula subtracted the p0/p1-weighted average of entropy(p0) and”,
“    entropy(p1) from entropy(p1). Those two entropies are equal and the weights sum”,
“    to 1, so every column scored zero up to rounding and the ranking was arbitrary.”,
“    """”,
“    \# entropy() already returns 0.0 for constant columns (mean of exactly 0 or 1)”,
“    return entropy(col.mean())”,
“”,
“gains = df\[binary_cols\].apply(info_gain)”,
“top_binary = gains.sort_values(ascending=False).index.tolist()”,
“”,
“\# Round 1: Top 7”,
“TOP_7 = top_binary\[:7\]”,
“”,
“\# Round 2: Top 12 binary + 2 genre”,
“TOP_12_BINARY = top_binary\[:12\]”,
“TOP_14 = TOP_12_BINARY + genre_cols\[:2\] \# Last 2 = genre”,
“”,
“print("TOP 7 QUESTIONS:")”,
“for i, q in enumerate(TOP_7, 1):”,
“    print(f" {i}. {q}")”,
“”,
“print("\nTOP 14 QUESTIONS (last 2 = genre):")”,
“for i, q in enumerate(TOP_14, 1):”,
“    print(f" {i}. {q}")” \] }, { “cell_type”: “markdown”, “metadata”: {},
“source”: \[ “\## 4. Prepare Features & Labels” \] }, { “cell_type”:
“code”, “execution_count”: null, “metadata”: {}, “outputs”: \[\],
“source”: \[
“\# Labels first, then one feature matrix per round (columns in TOP_7 / TOP_14 order)”,
“y = df\[‘character’\].values”,
“X7 = df\[TOP_7\].values”,
“X14 = df\[TOP_14\].values”,
“”,
“print(f"X7 shape: {X7.shape}, X14 shape: {X14.shape}, y shape: {y.shape}")” \] }, { “cell_type”:
“markdown”, “metadata”: {}, “source”: \[ “\## 5. Train-Test Split (No
Stratify)” \] }, { “cell_type”: “code”, “execution_count”: null,
“metadata”: {}, “outputs”: \[\], “source”: \[
“\# Both splits share one validation fraction and one seed”,
“\# NOTE(review): rows duplicated during preprocessing can land on both sides of this”,
“\# split, so validation accuracy may be optimistic; confirm that this is intended”,
“val_fraction, split_seed = 0.2, 42”,
“X7_train, X7_val, y7_train, y7_val = train_test_split(X7, y, test_size=val_fraction, random_state=split_seed)”,
“X14_train, X14_val, y14_train, y14_val = train_test_split(X14, y, test_size=val_fraction, random_state=split_seed)”,
“”,
“print(f"Train size: {len(X7_train)}, Val size: {len(X7_val)}")” \] }, {
“cell_type”: “markdown”, “metadata”: {}, “source”: \[ “\## 6. Train
Models” \] }, { “cell_type”: “code”, “execution_count”: null,
“metadata”: {}, “outputs”: \[\], “source”: \[
“def \_balanced_forest(n_trees, depth):”,
“    """Build an unfitted random forest; both models differ only in tree count and depth."""”,
“    return RandomForestClassifier(”,
“        n_estimators=n_trees,”,
“        max_depth=depth,”,
“        random_state=42,”,
“        n_jobs=-1,”,
“        class_weight=‘balanced’”,
“    )”,
“”,
“\# Round 1 model: the 7 top-ranked questions”,
“print("Training 7-Q Model…")”,
“model_7 = \_balanced_forest(500, 12)”,
“model_7.fit(X7_train, y7_train)”,
“acc7 = accuracy_score(y7_val, model_7.predict(X7_val))”,
“print(f"7-Q Accuracy: {acc7:.3%}")”,
“”,
“\# Round 2 model: all 14 questions”,
“print("\nTraining 14-Q Model…")”,
“model_14 = \_balanced_forest(800, 16)”,
“model_14.fit(X14_train, y14_train)”,
“acc14 = accuracy_score(y14_val, model_14.predict(X14_val))”,
“print(f"14-Q Accuracy: {acc14:.3%}")” \] }, { “cell_type”: “markdown”, “metadata”:
{}, “source”: \[ “\## 7. Save Models & Questions” \] }, { “cell_type”:
“code”, “execution_count”: null, “metadata”: {}, “outputs”: \[\],
“source”: \[
“SAVE_DIR = "guess_game_models"”,
“os.makedirs(SAVE_DIR, exist_ok=True)”,
“”,
“\# One pickle per artifact: both models, both question lists, and the label column as a list”,
“artifacts = (”,
“    ("model_7.pkl", model_7),”,
“    ("model_14.pkl", model_14),”,
“    ("questions_7.pkl", TOP_7),”,
“    ("questions_14.pkl", TOP_14),”,
“    ("characters.pkl", df\[‘character’\].tolist()),”,
“)”,
“for filename, payload in artifacts:”,
“    joblib.dump(payload, f"{SAVE_DIR}/{filename}")”,
“”,
“print(f"All files saved to {SAVE_DIR}/")” \] }, { “cell_type”:
“markdown”, “metadata”: {}, “source”: \[ “\## 8. Play the Game!” \] }, {
“cell_type”: “code”, “execution_count”: null, “metadata”: {}, “outputs”:
\[\], “source”: \[
“def ask(q):”,
“    """Show q followed by "? (y/n): " until the reply is y/yes or n/no; return 1 for yes, 0 for no."""”,
“    while True:”,
“        reply = input(f"{q}? (y/n): ").strip().lower()”,
“        if reply in ("y", "yes"):”,
“            return 1”,
“        if reply in ("n", "no"):”,
“            return 0”,
“        print("Please type y or n")”,
“”,
“def play_game():”,
“    """Play one game: guess after the TOP_7 questions, then again after the remaining TOP_14 questions.”,
“”,
“    Reads TOP_7, TOP_14, model_7 and model_14 from the notebook namespace and returns None.”,
“    """”,
“    print("\n" + "="*60)”,
“    print(" GUESS THE CHARACTER – 64 HEROES")”,
“    print("="*60)”,
“”,
“    \# Round 1”,
“    ans7 = \[ask(q) for q in TOP_7\]”,
“    guess1 = model_7.predict(\[ans7\])\[0\]”,
“    print(f"\nAfter 7 questions → **{guess1}**")”,
“    \# Confirm through ask() so the reply is stripped and validated; a typo no longer counts as a no”,
“    if ask("Correct"):”,
“        print("I WIN IN 7!")”,
“        return”,
“”,
“    \# Round 2: every TOP_14 entry after the Round 1 questions, in the column order”,
“    \# model_14 was trained on. Slicing by len(TOP_7) keeps the answers aligned even”,
“    \# when fewer than 12 binary columns exist; genre columns get a readable prompt.”,
“    round2 = TOP_14\[len(TOP_7):\]”,
“    more = \[ask(q.replace("genre\_", "Genre: ") if q.startswith("genre\_") else q) for q in round2\]”,
“    ans14 = ans7 + more”,
“”,
“    guess2 = model_14.predict(\[ans14\])\[0\]”,
“    print(f"\nAfter 14 questions → **{guess2}**")”,
“    print("GOT IT!" if ask("Correct now") else "Better luck next time!")”,
“”,
“\# Uncomment to play:”,
“\# play_game()” \] }, { “cell_type”: “markdown”,
“metadata”: {}, “source”: \[ “---”, “\## Done! Models trained, saved, and
game ready.”, “”, “**Next Step**: Say **`Deploy web app`** → I’ll give
you Flask + HTML + Docker.” \] } \], “metadata”: { “kernelspec”: {
“display_name”: “Python 3”, “language”: “python”, “name”: “python3” },
“language_info”: { “name”: “python”, “version”: “3.11.0” } },
“nbformat”: 4, “nbformat_minor”: 2 }