**The S.H.I.E.L.D. Protocol: Hero vs. Villain Classifier**

Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('charcters_stats.csv') # Importing Data

In [3]:
df.head()
df.info()
df.shape
df.isnull().sum()
df['Alignment'].value_counts()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 611 entries, 0 to 610
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Name          611 non-null    object
 1   Alignment     608 non-null    object
 2   Intelligence  611 non-null    int64 
 3   Strength      611 non-null    int64 
 4   Speed         611 non-null    int64 
 5   Durability    611 non-null    int64 
 6   Power         611 non-null    int64 
 7   Combat        611 non-null    int64 
 8   Total         611 non-null    int64 
dtypes: int64(7), object(2)
memory usage: 43.1+ KB


Alignment
good       432
bad        165
neutral     11
Name: count, dtype: int64

In [4]:
#Removing rows where Alignment is 'neutral'
df = df[df['Alignment'] != 'neutral']
df = df.reset_index(drop=True)

df['Alignment'].value_counts()

Alignment
good    432
bad     165
Name: count, dtype: int64

Binary Conversion: Bad = 0 : Good = 1

In [5]:
df['label'] = df['Alignment'].map({'good': 1, 'bad' : 0})
print(df[['Name', 'Alignment', 'label']].head())

          Name Alignment  label
0      3-D Man      good    1.0
1       A-Bomb      good    1.0
2   Abe Sapien      good    1.0
3     Abin Sur      good    1.0
4  Abomination       bad    0.0


In [6]:
# Filter out rows where the 'Total' power is suspiciously low
# (Most real heroes have a Total > 100)
df_clean = df[df['Total'] > 50] 

# This is final dataset
print(f"Final Clean Army Size: {df_clean.shape}")

Final Clean Army Size: (422, 10)


In [7]:
feature_col = ['Intelligence', 'Strength', 'Speed', 'Durability', 'Power', 'Combat']
X = df_clean[feature_col].values # Containing all the Input Features in Numpy Array Format
Y = df_clean['label'].values.reshape(-1, 1) # Reshaping to make it vertical Comumn

# Checking Shapes
print(f'Features Matrix: {X.shape}')
print(f'Target: {Y.shape}')

Features Matrix: (422, 6)
Target: (422, 1)


Applying Sigmoid Function

In [8]:
X = X / 100.0 # Nomalizing the Data

def sigmoid(z):
    s = 1 / (1 + np.exp(-z))
    return s

def initialize_parameters(dim):
    w = np.zeros((dim, 1)) # Weight (y=wx+b), controls the slope and angle of the line
    b = 0.0 # Bais (y=wx+b), controls ups and downs in predection
    return w, b

# Let's test if these work before moving on
w_test, b_test = initialize_parameters(6)
print(f"Weights shape: {w_test.shape}")  
print(f"Sigmoid Test: {sigmoid(0)}")     

Weights shape: (6, 1)
Sigmoid Test: 0.5


In [11]:
def propagate(w, b, X, Y):
    """
    Implement the cost function and its gradient for the propagation
    """
    
    m = X.shape[0] # Number of examples (422)
    
    # --- FORWARD PROPAGATION (From X to Cost) ---
    # 1. Calculate Z (The Linear Part)
    # Matrix Multiplication: (422,6) dot (6,1) -> (422,1)
    Z = np.dot(X, w) + b 
    
    # 2. Calculate A (The Activation / Probability)
    A = sigmoid(Z)
    
    # 3. Calculate Cost (Log Loss)
    # We add a tiny epsilon (1e-5) inside log to prevent log(0) errors
    cost = (-1/m) * np.sum(Y * np.log(A + 1e-5) + (1-Y) * np.log(1-A + 1e-5))
    
    # --- BACKWARD PROPAGATION (To find gradients) ---
    dw = (1/m) * np.dot(X.T, (A - Y))
    db = (1/m) * np.sum(A - Y)
    
    cost = np.squeeze(cost) # Ensure cost is a scalar
    
    grads = {"dw": dw, "db": db}
    
    return grads, cost

def optimize(w, b, X, Y, num_iterations, learning_rate, print_cost=False):
    """
    This function optimizes w and b by running a gradient descent algorithm
    """
    costs = []
    
    for i in range(num_iterations):
        # 1. Calculate Gradients and Cost
        grads, cost = propagate(w, b, X, Y)
        
        # 2. Retrieve derivatives
        dw = grads["dw"]
        db = grads["db"]
        
        # 3. Update Parameters (Gradient Descent)
        w = w - learning_rate * dw
        b = b - learning_rate * db
        
        # 4. Record cost every 100 iterations
        if i % 100 == 0:
            costs.append(cost)
            if print_cost:
                print(f"Cost after iteration {i}: {cost}")
    
    params = {"w": w, "b": b}
    grads = {"dw": dw, "db": db}
    
    return params, grads, costs

*Now we are going to predict*

In [13]:
# Model will try to predict whether the label is 0 or 1 using learned logistic regression parameters (w, b)
def predict(w, b, X):
    
    m = X.shape[0] # Accesing all the rows
    Y_predection = np.zeros((m, 1)) # Creates an empty list to store my pred

    # Now, we wil calculate the Probablity
    Z = np.dot(X, w) + b    # y = mx + c
    A = sigmoid(Z)  # sigmoid function compresses the probablity between 0 & 1.

    # Looping through every single probablity
    for i in range (A.shape[0]):
        if A[i, 0] > 0.5: 
            Y_predection[i, 0] = 1
        else:
            Y_predection[i, 0] = 0
    return Y_predection


In [14]:
# 1. Initialize empty weights
w, b = initialize_parameters(X.shape[1])

# 2. TRAIN the model (The Gym)
# We will train for 2000 rounds with a learning rate of 0.01
print("Training Started...")
params, grads, costs = optimize(w, b, X, Y, num_iterations=2000, learning_rate=0.01, print_cost=True)

# 3. PREDICT using the learned weights
print("---------------------------------")
print("Training Complete. Testing Accuracy...")
Y_prediction_train = predict(params["w"], params["b"], X)

# 4. Check Accuracy
accuracy = 100 - np.mean(np.abs(Y_prediction_train - Y)) * 100
print(f"Final Training Accuracy: {accuracy:.2f} %")

Training Started...
Cost after iteration 0: nan
Cost after iteration 100: nan
Cost after iteration 200: nan
Cost after iteration 300: nan
Cost after iteration 400: nan
Cost after iteration 500: nan
Cost after iteration 600: nan
Cost after iteration 700: nan
Cost after iteration 800: nan
Cost after iteration 900: nan
Cost after iteration 1000: nan
Cost after iteration 1100: nan
Cost after iteration 1200: nan
Cost after iteration 1300: nan
Cost after iteration 1400: nan
Cost after iteration 1500: nan
Cost after iteration 1600: nan
Cost after iteration 1700: nan
Cost after iteration 1800: nan
Cost after iteration 1900: nan
---------------------------------
Training Complete. Testing Accuracy...
Final Training Accuracy: nan %
