# Pokemon Ability Prediction Model

This notebook analyzes Pokemon types and builds machine learning models (Random Forest, XGBoost, and a neural network) to predict each Pokemon's primary ability from its base stats and types.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide display configuration
pd.options.display.max_columns = None  # show every column of wide frames
plt.style.use('seaborn-v0_8-darkgrid')

## 2. Load and Explore Data

In [None]:
# Read the Pokemon types CSV, report its dimensions and preview it
types_csv_path = 'data/pokemon.csv'
df_types = pd.read_csv(types_csv_path)
print("Original dataset shape:", df_types.shape)
print("\nFirst few rows:")
df_types.head()

In [None]:
# Collect every type token: each non-missing 'Type' cell is split on commas
# and the surrounding whitespace of each piece is removed.
all_types = [
    token.strip()
    for cell in df_types['Type']
    if pd.notna(cell)
    for token in cell.split(',')
]

# De-duplicate and order alphabetically for a stable listing
unique_types = sorted(set(all_types))
print(f"\nTotal number of unique Pokemon types: {len(unique_types)}")
print("\nUnique Pokemon types:")
for position, type_name in enumerate(unique_types, start=1):
    print(f"{position:2d}. {type_name}")

In [None]:
# Read the semicolon-delimited dataset that also carries abilities, then summarise it
df_complete = pd.read_csv('data/pokemon_complete.csv', delimiter=';')
print("\nComplete dataset shape:", df_complete.shape)
print("\nColumn names:")
print(list(df_complete.columns))
print("\nFirst few rows:")
df_complete.head()

## 3. Data Cleaning and Preparation

In [None]:
# Parse the abilities column - it's stored as a string representation of a list
import ast

def parse_abilities(abilities_str):
    """Return the first ability from a stringified list, or None.

    Parameters:
    - abilities_str: text holding a Python list/tuple literal,
      e.g. "['Overgrow', 'Chlorophyll']".

    Returns:
    - The first element of the parsed sequence, or None when the input is not
      a string, cannot be parsed as a literal, is not a list/tuple, or is empty.
    """
    # Missing cells arrive as NaN/None rather than text
    if not isinstance(abilities_str, str):
        return None
    try:
        abilities = ast.literal_eval(abilities_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only the failures literal_eval is documented to raise on malformed
        # input are treated as "no ability"; anything else should surface.
        return None
    # A bare string literal would otherwise yield its first character
    if not isinstance(abilities, (list, tuple)) or not abilities:
        return None
    return abilities[0]

# Store the first listed ability of every row as the prediction target
df_complete['Primary_Ability'] = df_complete['Abilities'].map(parse_abilities)

# Parse types
def parse_types(types_str):
    """Parse a stringified list of types into a real list.

    Parameters:
    - types_str: text holding a Python list/tuple literal,
      e.g. "['Fire', 'Flying']".

    Returns:
    - A list with the parsed elements, or an empty list when the input is not
      a string, cannot be parsed as a literal, or is not a list/tuple.
    """
    # Missing cells arrive as NaN/None rather than text
    if not isinstance(types_str, str):
        return []
    try:
        types = ast.literal_eval(types_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Malformed literal: treat as "no types" instead of hiding every error
        return []
    # Callers index the result and take its length, so scalars and bare
    # strings must not leak through.
    if isinstance(types, (list, tuple)):
        return list(types)
    return []

def _type_at(position):
    """Build an accessor returning the type at ``position``, or None when absent."""
    def pick(type_names):
        return type_names[position] if len(type_names) > position else None
    return pick

# Parsed list first, then the first and second entries as separate columns
df_complete['Types_List'] = df_complete['Types'].apply(parse_types)
df_complete['Type1'] = df_complete['Types_List'].apply(_type_at(0))
df_complete['Type2'] = df_complete['Types_List'].apply(_type_at(1))

print("Parsed abilities and types successfully!")

In [None]:
# Keep only rows that have a primary ability, all six base stats and a first type
required_columns = [
    'Primary_Ability', 'HP', 'Attack', 'Defense',
    'Special Attack', 'Special Defense', 'Speed', 'Type1',
]
df_clean = df_complete.dropna(subset=required_columns)

print(f"\nDataset after cleaning: {df_clean.shape}")
print(f"Number of unique abilities: {df_clean['Primary_Ability'].nunique()}")

# Frequency of each primary ability, most common first
ability_counts = df_clean['Primary_Ability'].value_counts()
print("\nTop 20 most common abilities:")
print(ability_counts.head(20))

In [None]:
# Retain only the abilities that occur on 10 or more Pokemon
ability_counts = df_clean['Primary_Ability'].value_counts()
is_common = ability_counts >= 10
common_abilities = ability_counts[is_common].index.tolist()

# Copy so later column assignments do not touch df_clean
has_common_ability = df_clean['Primary_Ability'].isin(common_abilities)
df_filtered = df_clean[has_common_ability].copy()

print(f"\nDataset after filtering rare abilities: {df_filtered.shape}")
print(f"Number of abilities to predict: {len(common_abilities)}")

## 4. Exploratory Data Analysis

In [None]:
# One histogram per base stat, laid out on a 2x3 grid
stat_columns = ['HP', 'Attack', 'Defense', 'Special Attack', 'Special Defense', 'Speed']

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()  # flatten the grid so panels pair up with stats one-to-one

for ax, stat in zip(axes, stat_columns):
    ax.hist(df_filtered[stat], bins=30, edgecolor='black', alpha=0.7)
    ax.set_title(f'Distribution of {stat}')
    ax.set_xlabel(stat)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation between the base stats, drawn as an annotated heatmap
plt.figure(figsize=(10, 8))
correlation_matrix = df_filtered[stat_columns].corr()
heatmap_options = dict(annot=True, cmap='coolwarm', center=0, square=True, linewidths=1)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Heatmap of Pokemon Stats')
plt.show()

In [None]:
# Mean of each base stat per primary type (rounded to one decimal), plus the
# row-wise sum of those rounded means, ordered from highest total to lowest
type_stats = (
    df_filtered.groupby('Type1')[stat_columns]
    .mean()
    .round(1)
    .assign(Total=lambda means: means.sum(axis=1))
    .sort_values('Total', ascending=False)
)

plt.figure(figsize=(12, 8))
type_stats['Total'].plot.bar()
plt.title('Average Total Stats by Primary Type')
plt.xlabel('Pokemon Type')
plt.ylabel('Average Total Stats')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 5. Feature Engineering

In [None]:
# Create additional features
def _stat_share(part, other, neutral=0.5):
    """Return part / (part + other) element-wise, or `neutral` where the sum is 0."""
    total = part + other
    # 0 / 0 would yield NaN, which then flows into the feature matrix and is
    # rejected by estimators that do not accept missing values.
    return (part / total).where(total != 0, neutral)

# Sum of the six base stats
df_filtered['Total_Stats'] = df_filtered[stat_columns].sum(axis=1)
# Share of physical attack within total attack, and of physical defense within total defense
df_filtered['Physical_Ratio'] = _stat_share(df_filtered['Attack'], df_filtered['Special Attack'])
df_filtered['Defense_Ratio'] = _stat_share(df_filtered['Defense'], df_filtered['Special Defense'])
# Speed quartile label (not part of the feature list assembled for the models)
df_filtered['Speed_Percentile'] = pd.qcut(df_filtered['Speed'], q=4, labels=['Slow', 'Medium', 'Fast', 'Very Fast'])

# Create binary features for type combinations
# NOTE(review): unique_types comes from the 'Type' column of the first CSV while
# Types_List comes from the second CSV; this assumes both spell type names
# identically - confirm, otherwise every Has_* flag will be 0.
for ptype in unique_types:
    df_filtered[f'Has_{ptype}'] = df_filtered['Types_List'].apply(
        lambda type_names, wanted=ptype: 1 if wanted in type_names else 0
    )

print("Created additional features successfully!")

In [None]:
# Assemble the model inputs: base stats, engineered ratios/total, one flag per type
engineered_columns = ['Total_Stats', 'Physical_Ratio', 'Defense_Ratio']
type_flag_columns = [f'Has_{ptype}' for ptype in unique_types]
feature_columns = stat_columns + engineered_columns + type_flag_columns

X = df_filtered[feature_columns]
y = df_filtered['Primary_Ability']

# Map ability names to integer class labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(f"\nFeature matrix shape: {X.shape}")
print(f"Number of classes (abilities): {len(label_encoder.classes_)}")

## 6. Model Building

In [None]:
# Hold out 20% of the rows for testing, keeping class proportions via stratification
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

# Standardise using statistics learned from the training rows only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

### 6.1 Random Forest Classifier

In [None]:
# Fit a 100-tree Random Forest on the unscaled training features
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Score it on the held-out rows
rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)

print(f"Random Forest Accuracy: {rf_accuracy:.4f}")

# Rank the input columns by the forest's importance scores
feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': rf_model.feature_importances_}
).sort_values('importance', ascending=False)

top_features = feature_importance.head(20)

plt.figure(figsize=(10, 8))
importance_ax = plt.gca()
importance_ax.barh(top_features['feature'], top_features['importance'])
importance_ax.set_xlabel('Importance')
importance_ax.set_title('Top 20 Most Important Features - Random Forest')
importance_ax.invert_yaxis()  # largest importance at the top
plt.tight_layout()
plt.show()

### 6.2 XGBoost Classifier

In [None]:
# Fit a gradient-boosted tree classifier on the unscaled training features
# NOTE(review): `use_label_encoder` appears to be deprecated in newer xgboost
# releases - confirm against the installed version.
xgb_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss',
)
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)

# Score it on the held-out rows
xgb_pred = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, xgb_pred)

print(f"XGBoost Accuracy: {xgb_accuracy:.4f}")

### 6.3 Neural Network

In [None]:
# Fit a three-hidden-layer perceptron on the standardised training features
nn_params = dict(
    hidden_layer_sizes=(128, 64, 32),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=42,
)
nn_model = MLPClassifier(**nn_params)
nn_model.fit(X_train_scaled, y_train)

# Score it on the standardised held-out rows
nn_pred = nn_model.predict(X_test_scaled)
nn_accuracy = accuracy_score(y_test, nn_pred)

print(f"Neural Network Accuracy: {nn_accuracy:.4f}")

## 7. Model Evaluation

In [None]:
# Tabulate the test accuracy of the three models and chart them side by side
model_results = pd.DataFrame(
    [
        ('Random Forest', rf_accuracy),
        ('XGBoost', xgb_accuracy),
        ('Neural Network', nn_accuracy),
    ],
    columns=['Model', 'Accuracy'],
)

plt.figure(figsize=(10, 6))
plt.bar(model_results['Model'], model_results['Accuracy'])
plt.ylim(0, 1)
plt.ylabel('Accuracy')
plt.title('Model Comparison - Ability Prediction Accuracy')
# Write each score just above its bar
for bar_position, score in enumerate(model_results['Accuracy']):
    plt.text(bar_position, score + 0.01, f'{score:.3f}', ha='center')
plt.show()

In [None]:
# Detailed evaluation of the Random Forest model.
# NOTE: the choice is hard-coded, not derived from the accuracies computed
# earlier; if another model is preferred, update both assignments together.
best_model = rf_model
best_pred = rf_pred

# Encoded labels of the 10 most frequent abilities in the modelling set
top_abilities = df_filtered['Primary_Ability'].value_counts().head(10).index
top_ability_indices = [i for i, ability in enumerate(label_encoder.classes_) if ability in top_abilities]

print("\nClassification Report for Top 10 Most Common Abilities:")
print("="*60)

# Keep only the test rows whose TRUE ability is one of the top abilities
mask = np.isin(y_test, top_ability_indices)
if mask.sum() > 0:
    y_test_filtered = y_test[mask]
    best_pred_filtered = best_pred[mask]

    # Predictions on these rows may be abilities outside the reported set.
    # Without an explicit `labels`, classification_report infers the classes
    # from the union of true and predicted values, so `target_names` would no
    # longer line up with them.
    report_labels = np.unique(y_test_filtered)
    report = classification_report(
        y_test_filtered,
        best_pred_filtered,
        labels=report_labels,
        target_names=[label_encoder.classes_[i] for i in report_labels],
        zero_division=0,
    )
    print(report)

## 8. Prediction Function

In [None]:
def predict_pokemon_ability(name, hp, attack, defense, sp_attack, sp_defense, speed, types):
    """
    Predict the ability of a Pokemon based on its stats and types.

    Builds the same feature columns used for training, asks `best_model` for a
    prediction and class probabilities, prints a short report and returns the
    predicted ability name.

    Parameters:
    - name: Pokemon name (str), used only in the printed report
    - hp, attack, defense, sp_attack, sp_defense, speed: Base stats (int)
    - types: List of Pokemon types (e.g., ['Fire', 'Flying']); a single type
      may also be passed as a plain string

    Returns:
    - The predicted ability name, decoded with `label_encoder`

    Uses the notebook globals `unique_types`, `feature_columns`, `best_model`
    and `label_encoder`.
    """
    # A bare string would otherwise be substring-matched and indexed by character
    if isinstance(types, str):
        types = [types]
    else:
        types = list(types) if types is not None else []

    # Types that have no Has_* column contribute nothing to the features; say so
    unknown_types = [t for t in types if t not in unique_types]
    if unknown_types:
        print(f"Warning: unrecognized types ignored: {unknown_types}")

    def _share(part, other):
        # Neutral 0.5 when both stats are 0 instead of raising ZeroDivisionError
        total = part + other
        return part / total if total else 0.5

    # Create feature vector
    features = {
        'HP': hp,
        'Attack': attack,
        'Defense': defense,
        'Special Attack': sp_attack,
        'Special Defense': sp_defense,
        'Speed': speed,
        'Total_Stats': hp + attack + defense + sp_attack + sp_defense + speed,
        'Physical_Ratio': _share(attack, sp_attack),
        'Defense_Ratio': _share(defense, sp_defense)
    }

    # Add type features
    for ptype in unique_types:
        features[f'Has_{ptype}'] = 1 if ptype in types else 0

    # Create dataframe with correct column order
    feature_df = pd.DataFrame([features])[feature_columns]

    # Make prediction
    ability_pred = best_model.predict(feature_df)[0]
    ability_name = label_encoder.inverse_transform([ability_pred])[0]

    # Get prediction probabilities. Their positions follow best_model.classes_,
    # so translate positions to encoded labels before decoding the names.
    probabilities = best_model.predict_proba(feature_df)[0]
    top_3_positions = np.argsort(probabilities)[-3:][::-1]
    top_3_labels = best_model.classes_[top_3_positions]
    top_3_names = label_encoder.inverse_transform(top_3_labels)
    top_3_abilities = list(zip(top_3_names, probabilities[top_3_positions]))

    print(f"\nPrediction for {name}:")
    print(f"Primary Type: {types[0] if types else 'Unknown'}")
    print(f"Total Stats: {features['Total_Stats']}")
    print("\nPredicted Ability: {}\n".format(ability_name))
    print("Top 3 Most Likely Abilities:")
    for ability, prob in top_3_abilities:
        print(f"  - {ability}: {prob:.2%}")

    return ability_name

# Demonstrate the helper on two hand-entered Pokemon
print("Example Predictions:")
print("=" * 50)

# Dual-type Fire/Flying example
charizard_stats = dict(hp=78, attack=84, defense=78,
                       sp_attack=109, sp_defense=85, speed=100)
predict_pokemon_ability("Charizard", types=['Fire', 'Flying'], **charizard_stats)

# Single-type Water example; kept as the final expression so the cell shows its result
blastoise_stats = dict(hp=79, attack=83, defense=100,
                       sp_attack=85, sp_defense=105, speed=78)
predict_pokemon_ability("Blastoise", types=['Water'], **blastoise_stats)

## 9. Summary and Insights

In [None]:
# Final recap of the dataset sizes, model scores and most important features
print("Pokemon Ability Prediction Model Summary")
print("=" * 50)
print("\nDataset Statistics:")
print(f"- Total Pokemon analyzed: {len(df_filtered)}")
print(f"- Number of unique types: {len(unique_types)}")
print(f"- Number of abilities predicted: {len(common_abilities)}")
print("\nModel Performance:")
print(f"- Random Forest Accuracy: {rf_accuracy:.2%}")
print(f"- XGBoost Accuracy: {xgb_accuracy:.2%}")
print(f"- Neural Network Accuracy: {nn_accuracy:.2%}")
print("\nMost Important Features for Prediction:")
print(feature_importance.head(5).to_string(index=False))

# State conclusions from the values computed above rather than fixed text, so
# the closing lines cannot contradict the accuracies and importance table.
accuracy_by_model = {
    'Random Forest': rf_accuracy,
    'XGBoost': xgb_accuracy,
    'Neural Network': nn_accuracy,
}
best_model_name = max(accuracy_by_model, key=accuracy_by_model.get)
top_feature_names = feature_importance['feature'].head(3).tolist()
print(f"\nBest test accuracy: {accuracy_by_model[best_model_name]:.2%} ({best_model_name}).")
print(f"Top Random Forest features by importance: {', '.join(top_feature_names)}.")