**Libraries/Imports**

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.dummy import DummyClassifier
import torch
import torch.nn as nn
import torch.optim as optim


**Data Preprocessing**

In [2]:
# Reading in raw Pokemon Database.csv
raw = pd.read_csv('Pokemon Database.csv')

# Cleaning string values
for index, pokemon in raw.iterrows():
    for column in raw.columns:
        if isinstance(pokemon[column], str):
            raw.at[index, column] = pokemon[column][1:-1]

# Converting Alternate Form Name to Correct Names
raw["Alternate Form Name"] = raw["Alternate Form Name"].replace({
    "Hisui": "Hisuian",
    "Alola": "Alolan",
    "Galar": "Galarian"
})    

# Updating Pokemon Names and Handling Missing Values
for index, pokemon in raw.iterrows():
    if pd.isna(pokemon['Legendary Type']):
        raw.at[index, "Legendary Type"] = "Regular"
    if pd.isna(pokemon["Secondary Type"]):
        raw.at[index, "Secondary Type"] = pokemon["Primary Type"]     
    alternate_form = pokemon['Alternate Form Name']
    if not pd.isna(alternate_form) and isinstance(alternate_form, str):
        if alternate_form in ["Mega X", "Mega Y"]:
            raw.at[index, "Pokemon Name"] = f"Mega {raw.at[index, 'Pokemon Name']} {alternate_form[-1]}"
        elif pokemon["Pokemon Name"] in ["Unown", "Hoopa"]:
            raw.at[index, "Pokemon Name"] = f"{raw.at[index, 'Pokemon Name']} {alternate_form}"
        else:
            raw.at[index, "Pokemon Name"] = f"{alternate_form} {raw.at[index, 'Pokemon Name']}"

# Selecting Relevant Columns
relevant = raw[['Pokemon Id', 'Pokedex Number', 'Pokemon Name',
       'Alternate Form Name', 'Original Pokemon ID', 'Legendary Type',
       'Pokemon Height', 'Pokemon Weight', 'Primary Type', 'Secondary Type', 
       'Male Ratio', 'Female Ratio', 'Base Happiness', 'Health Stat', 'Attack Stat',
       'Defense Stat', 'Special Attack Stat', 'Special Defense Stat',
       'Speed Stat', 'Base Stat Total', 'Health EV', 'Attack EV', 'Defense EV',
       'Special Attack EV', 'Special Defense EV', 'Speed EV', 'EV Yield Total',
       'Catch Rate', 'Experience Growth', 'Experience Growth Total', 'Egg Cycle Count']]

# Remove Gigantamax Forms
relevant = relevant.loc[relevant['Alternate Form Name'] != 'Gigantamax']
relevant = relevant.reset_index()

# Define Features for Transformation
features = ['Legendary Type', 'Pokemon Height', 'Pokemon Weight', 'Primary Type', 'Secondary Type',
            'Male Ratio', 'Female Ratio', 'Base Happiness', 'Health Stat', 'Attack Stat', 'Defense Stat', 
            'Special Attack Stat', 'Special Defense Stat', 'Speed Stat', 'Base Stat Total', 'Health EV', 
            'Attack EV', 'Defense EV', 'Special Attack EV', 'Special Defense EV', 'Speed EV', 
            'EV Yield Total', 'Catch Rate', 'Experience Growth', 'Experience Growth Total', 'Egg Cycle Count'] 

# Define Numerical Features
numerical_features = [col for col in features if col not in ['Legendary Type', 'Experience Growth', 'Primary Type', 'Secondary Type']]

# Apply Label Encoding to Typings (For Classification)
label_encoder_primary = LabelEncoder()
label_encoder_secondary = LabelEncoder()

relevant['Primary Typing Label'] = label_encoder_primary.fit_transform(relevant['Primary Type'])
relevant['Secondary Typing Label'] = label_encoder_secondary.fit_transform(relevant['Secondary Type'])

# Define One-Hot Encoding for Categorical Features, Scaling Values
categorical_features = ['Legendary Type', 'Experience Growth']
transformer = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),  
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)  
])

# Apply Transformations
transformed = transformer.fit_transform(relevant[features])
encoded_feature_names = transformer.get_feature_names_out()

# Convert to DataFrame
processed = pd.DataFrame(transformed, columns=encoded_feature_names)

# Add Primary & Secondary Typing Labels
processed['Primary Typing Label'] = relevant['Primary Typing Label']
processed['Secondary Typing Label'] = relevant['Secondary Typing Label']

# Save Processed Data
print(processed.shape)
processed.to_csv('processed_data.csv', index=False)

(1350, 34)


**Splitting Test and Train Datasets**

In [8]:
# Defining Features and Labels Matricies
X = processed.drop(columns=['Primary Typing Label', 'Secondary Typing Label'])
y = processed[['Primary Typing Label']]

# Split into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**Random Guess**

In [4]:
# Declari Classifier and Train
random_guess = DummyClassifier(strategy='uniform', random_state=42)
random_guess.fit(X_train, y_train)

# Predict
y_pred = random_guess.predict(X_test)

# Accuracy Report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.04      0.04      0.04        26
           1       0.13      0.12      0.13        16
           2       0.00      0.00      0.00         9
           3       0.14      0.15      0.15        13
           4       0.00      0.00      0.00        12
           5       0.00      0.00      0.00         9
           6       0.00      0.00      0.00        13
           7       0.00      0.00      0.00         0
           8       0.06      0.14      0.09         7
           9       0.00      0.00      0.00        17
          10       0.00      0.00      0.00         8
          11       0.00      0.00      0.00         9
          12       0.11      0.03      0.05        32
          13       0.00      0.00      0.00        11
          14       0.07      0.05      0.06        22
          15       0.12      0.08      0.10        24
          16       0.06      0.09      0.07        11
          17       0.10    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Probabilistic Guess**

In [5]:
# Declari Classifier and Train
prob_guess = DummyClassifier(strategy='stratified', random_state=42)
prob_guess.fit(X_train, y_train)

# Predict
y_pred = prob_guess.predict(X_test)

# Accuracy Report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.08      0.08      0.08        26
           1       0.08      0.06      0.07        16
           2       0.00      0.00      0.00         9
           3       0.00      0.00      0.00        13
           4       0.00      0.00      0.00        12
           5       0.11      0.11      0.11         9
           6       0.07      0.08      0.07        13
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         7
           9       0.11      0.18      0.14        17
          10       0.00      0.00      0.00         8
          11       0.14      0.11      0.12         9
          12       0.04      0.03      0.03        32
          13       0.10      0.09      0.10        11
          14       0.08      0.09      0.09        22
          15       0.00      0.00      0.00        24
          16       0.09      0.09      0.09        11
          17       0.12    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**SVM Model**

In [6]:
# Declare Classifier and Train
svm_classifier = SVC(kernel='rbf', decision_function_shape='ovr')  # 'ovo' also works
svm_classifier.fit(X_train, y_train)

# Predict
y_pred = svm_classifier.predict(X_test)

# Classification Report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.50      0.38      0.43        26
           1       0.00      0.00      0.00        16
           2       0.36      0.56      0.43         9
           3       0.36      0.31      0.33        13
           4       0.89      0.67      0.76        12
           5       0.25      0.22      0.24         9
           6       0.43      0.23      0.30        13
           8       0.14      0.14      0.14         7
           9       0.21      0.35      0.27        17
          10       0.10      0.12      0.11         8
          11       0.00      0.00      0.00         9
          12       0.37      0.91      0.53        32
          13       0.00      0.00      0.00        11
          14       0.59      0.45      0.51        22
          15       0.88      0.29      0.44        24
          16       0.60      0.55      0.57        11
          17       0.23      0.32      0.27        31

    accuracy              

  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Neural Network**


In [None]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score

# Defining Features and Labels Matricies
X = processed.drop(columns=['Primary Typing Label', 'Secondary Typing Label'])
y = processed[['Primary Typing Label', 'Secondary Typing Label']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_tensor = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.long)  # Ensure labels are long integers for multi-class
X_test_tensor = torch.tensor(X_test.to_numpy(), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.long)

input_dim = X.shape[1]
hidden_dim = 64
output_dim = 18  # Since there are 18 possible types

# Define two separate output layers for primary and secondary types
layer1 = nn.Linear(input_dim, hidden_dim)
activation = nn.ReLU()
primary_output = nn.Linear(hidden_dim, output_dim)
secondary_output = nn.Linear(hidden_dim, output_dim)

# Optimizer and loss function
params = list(layer1.parameters()) + list(primary_output.parameters()) + list(secondary_output.parameters())
criterion = nn.CrossEntropyLoss()  # Use CrossEntropyLoss for multi-class classification
optimizer = optim.Adam(params, lr=0.001)

# Training loop
for epoch in range(100):
    layer1.train()
    primary_output.train()
    secondary_output.train()

    out1 = layer1(X_train_tensor)
    act1 = activation(out1)

    primary_logits = primary_output(act1)
    secondary_logits = secondary_output(act1)

    primary_loss = criterion(primary_logits, y_train_tensor[:, 0]) 
    secondary_loss = criterion(secondary_logits, y_train_tensor[:, 1]) 

    total_loss = primary_loss + secondary_loss

    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss.item():.4f}")
        
with torch.no_grad():
    final_primary_logits = primary_output(layer1(X_test_tensor))
    final_secondary_logits = secondary_output(layer1(X_test_tensor))

    final_primary_preds = torch.argmax(final_primary_logits, dim=1)
    final_secondary_preds = torch.argmax(final_secondary_logits, dim=1)

    final_predictions_list = list(zip(final_primary_preds.numpy(), final_secondary_preds.numpy()))
    final_actual_labels_list = list(zip(y_test_tensor[:, 0].numpy(), y_test_tensor[:, 1].numpy()))

    final_primary_loss = criterion(final_primary_logits, y_test_tensor[:, 0])
    final_secondary_loss = criterion(final_secondary_logits, y_test_tensor[:, 1])
    final_total_loss = final_primary_loss + final_secondary_loss
    
    correct_preds = ((final_primary_preds == y_test_tensor[:, 0]) | (final_secondary_preds == y_test_tensor[:, 1])).float()
    accuracy = correct_preds.mean()

    primary_classes = torch.unique(y_test_tensor[:, 0])
    secondary_classes = torch.unique(y_test_tensor[:, 1])
    primary_target_names = [f'Class {i}' for i in primary_classes.numpy()]
    secondary_target_names = [f'Class {i}' for i in secondary_classes.numpy()]
    
    primary_report = classification_report(y_test_tensor[:, 0].numpy(), final_primary_preds.numpy(),
                                           target_names=primary_target_names)
    secondary_report = classification_report(y_test_tensor[:, 1].numpy(), final_secondary_preds.numpy(),
                                             target_names=secondary_target_names)

print(f"Overall Accuracy (either primary or secondary correct): {accuracy.item():.4f}")
print("\nPrimary Type Classification Report:")
print(primary_report)
print("\nSecondary Type Classification Report:")
print(secondary_report)


final_predictions_flat = [(p[0], p[1]) for p in final_predictions_list]
final_labels_flat = [(l[0], l[1]) for l in final_actual_labels_list] 


results_df = pd.DataFrame({
    'Predictions Primary Type': [final_predictions_flat[i][0] for i in range(len(final_predictions_flat))],
    'Actual Primary Type': [final_labels_flat[i][0] for i in range(len(final_labels_flat))],
    'Predictions Secondary Type': [final_predictions_flat[i][1] for i in range(len(final_predictions_flat))],
    'Actual Secondary Type': [final_labels_flat[i][1] for i in range(len(final_labels_flat))],
    'Loss': [final_total_loss.item()] * len(final_predictions_flat)
})

results_df.to_csv('final_predictions_and_loss.csv', index=False)


Epoch 0, Loss: 5.8575
Epoch 10, Loss: 5.6689
Epoch 20, Loss: 5.5055
Epoch 30, Loss: 5.3503
Epoch 40, Loss: 5.1991
Epoch 50, Loss: 5.0536
Epoch 60, Loss: 4.9167
Epoch 70, Loss: 4.7914
Epoch 80, Loss: 4.6785
Epoch 90, Loss: 4.5765
Overall Accuracy (either primary or secondary correct): 0.3630

Primary Type Classification Report:
              precision    recall  f1-score   support

     Class 0       0.16      0.23      0.19        26
     Class 1       0.00      0.00      0.00        16
     Class 2       0.08      0.11      0.10         9
     Class 3       0.13      0.38      0.20        13
     Class 4       0.26      0.67      0.37        12
     Class 5       0.16      0.44      0.24         9
     Class 6       0.50      0.08      0.13        13
     Class 8       0.00      0.00      0.00         7
     Class 9       0.12      0.18      0.14        17
    Class 10       0.00      0.00      0.00         8
    Class 11       0.00      0.00      0.00         9
    Class 12       0.4

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
