In [1]:
print("hello world")

hello world


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib
import os

In [7]:
df_raw = pd.read_csv('marvel_dc_characters.csv')

In [8]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   name            64 non-null     object
 1   male            64 non-null     int64 
 2   superhero       64 non-null     int64 
 3   uses_magic      64 non-null     int64 
 4   wears_mask      64 non-null     int64 
 5   super_strength  64 non-null     int64 
 6   uses_gadgets    64 non-null     int64 
 7   flies           64 non-null     int64 
 8   detective       64 non-null     int64 
 9   comedian        64 non-null     int64 
 10  billionaire     64 non-null     int64 
 11  from_earth      64 non-null     int64 
 12  alien_origin    64 non-null     int64 
 13  team_member     64 non-null     int64 
 14  wears_armor     64 non-null     int64 
 15  uses_weapon     64 non-null     int64 
 16  sidekick        64 non-null     int64 
 17  immortal        64 non-null     int64 
 18  leader      

In [9]:
df_raw.isnull().sum()

name              0
male              0
superhero         0
uses_magic        0
wears_mask        0
super_strength    0
uses_gadgets      0
flies             0
detective         0
comedian          0
billionaire       0
from_earth        0
alien_origin      0
team_member       0
wears_armor       0
uses_weapon       0
sidekick          0
immortal          0
leader            0
scientist         0
mutant            0
genre             0
dtype: int64

In [10]:
df = df_raw.rename(columns={
    "name": "character",
    "male": "is_male",
    "superhero": "is_superhero",
    "detective": "is_detective",
    "comedian": "is_comedian",
    "billionaire": "is_billionaire",
    "from_earth": "is_from_earth",
    "team_member": "is_team_member",
    "sidekick": "has_sidekick"
})

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   character       64 non-null     object
 1   is_male         64 non-null     int64 
 2   is_superhero    64 non-null     int64 
 3   uses_magic      64 non-null     int64 
 4   wears_mask      64 non-null     int64 
 5   super_strength  64 non-null     int64 
 6   uses_gadgets    64 non-null     int64 
 7   flies           64 non-null     int64 
 8   is_detective    64 non-null     int64 
 9   is_comedian     64 non-null     int64 
 10  is_billionaire  64 non-null     int64 
 11  is_from_earth   64 non-null     int64 
 12  alien_origin    64 non-null     int64 
 13  is_team_member  64 non-null     int64 
 14  wears_armor     64 non-null     int64 
 15  uses_weapon     64 non-null     int64 
 16  has_sidekick    64 non-null     int64 
 17  immortal        64 non-null     int64 
 18  leader      

In [12]:
# ------------------- 3. One-hot encode 'genre' -------------------
genre_dummies = pd.get_dummies(df['genre'], prefix='genre')
df = pd.concat([df.drop('genre', axis=1), genre_dummies], axis=1)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 23 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   character       64 non-null     object
 1   is_male         64 non-null     int64 
 2   is_superhero    64 non-null     int64 
 3   uses_magic      64 non-null     int64 
 4   wears_mask      64 non-null     int64 
 5   super_strength  64 non-null     int64 
 6   uses_gadgets    64 non-null     int64 
 7   flies           64 non-null     int64 
 8   is_detective    64 non-null     int64 
 9   is_comedian     64 non-null     int64 
 10  is_billionaire  64 non-null     int64 
 11  is_from_earth   64 non-null     int64 
 12  alien_origin    64 non-null     int64 
 13  is_team_member  64 non-null     int64 
 14  wears_armor     64 non-null     int64 
 15  uses_weapon     64 non-null     int64 
 16  has_sidekick    64 non-null     int64 
 17  immortal        64 non-null     int64 
 18  leader      

In [14]:
# ------------------- 4. Full column list (after one-hot) -------------------
binary_cols = [c for c in df.columns if c not in ['character'] + list(genre_dummies.columns)]
genre_cols = list(genre_dummies.columns)  # e.g. ['genre_DC', 'genre_Marvel']

In [15]:
# ------------------- 5. Information Gain (on binary features only) -------------------
def entropy(p):
    if p <= 0 or p >= 1: return 0.0
    return -p * np.log2(p) - (1-p) * np.log2(1-p)

def info_gain(col):
    p1 = col.mean()
    if p1 == 0 or p1 == 1: return 0
    H_parent = entropy(p1)
    H0 = entropy((col==0).mean()) * (col==0).mean()
    H1 = entropy((col==1).mean()) * p1
    return H_parent - (H0 + H1)

gains = df[binary_cols].apply(info_gain)
top_binary = gains.sort_values(ascending=False).index.tolist()

In [None]:
# Round 1: Top 7 binary
TOP_7 = top_binary[:7]

# Round 2: Top 12 binary + 2 genre → 14 total
TOP_12_BINARY = top_binary[:12]
TOP_14 = TOP_12_BINARY + genre_cols[:2]  # Last 2 = genre

print("\nTOP 7 QUESTIONS:")
for i, q in enumerate(TOP_7, 1): print(f"  {i}. {q}")

print("\nTOP 14 QUESTIONS (last 2 are genre):")
for i, q in enumerate(TOP_14, 1): print(f"  {i}. {q}")




TOP 7 QUESTIONS:
  1. uses_magic
  2. flies
  3. leader
  4. is_male
  5. wears_mask
  6. is_superhero
  7. uses_gadgets

TOP 14 QUESTIONS (last 2 are genre):
  1. uses_magic
  2. flies
  3. leader
  4. is_male
  5. wears_mask
  6. is_superhero
  7. uses_gadgets
  8. super_strength
  9. is_comedian
  10. is_billionaire
  11. is_from_earth
  12. is_detective
  13. genre_DC
  14. genre_Marvel


In [19]:
# ------------------- 6. Prepare X and y -------------------
X7 = df[TOP_7].values
X14 = df[TOP_14].values
y = df['character'].values  # Actual names
# ------------------- 7. Train-test split -------------------
X7_train, X7_val, y7_train, y7_val = train_test_split(
X7, y, test_size=0.2, random_state=42, stratify=y)
X14_train, X14_val, y14_train, y14_val = train_test_split(
    X14, y, test_size=0.2, random_state=42, stratify=y)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [1]:
import numpy as np
print(np.__version__)

2.3.4
