# Data Cleaning

* Formatting data into readable formats
* Removing unnecessary features

In [141]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,  confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split




In [142]:
import csv
import pandas as pd


data = pd.read_csv('Pokemon Database.csv')


for index, pokemon in data.iterrows():
    for column in data.columns:
        if isinstance(pokemon[column],str):
            data.at[index, column] = pokemon[column][1:-1] 
            

data["Alternate Form Name"] = data["Alternate Form Name"].replace({
    "Hisui": "Hisuian",
    "Alola": "Alolan",
    "Galar": "Galarian"
})    
data=data[data["Alternate Form Name"]!="Gigantamax"]
            
for index, pokemon in data.iterrows():
    if pd.isna(pokemon["Secondary Type"]):
        data.at[index, "Secondary Type"] = pokemon["Primary Type"]     
    alternate_form = pokemon['Alternate Form Name']
    if not pd.isna(alternate_form) and isinstance(alternate_form,str):
        if alternate_form=="Mega X" or alternate_form == "Mega Y":
            data.at[index,"Pokemon Name"] = f"Mega {data.at[index,"Pokemon Name"]} {alternate_form[-1]}"
        elif pokemon["Pokemon Name"] in ["Unown", "Hoopa"]:
            data.at[index,"Pokemon Name"] = f"{data.at[index,"Pokemon Name"]} {alternate_form}"
        else:
            data.at[index,"Pokemon Name"] = f"{alternate_form} {data.at[index,"Pokemon Name"]}"
 
columns = ['Pokedex Number', 'Pokemon Name', 'Pokemon Height', 'Pokemon Weight', 
            'Primary Type','Secondary Type', 'Male Ratio', 'Female Ratio', 'Base Happiness','Health Stat', 
            'Attack Stat', 'Defense Stat', 'Special Attack Stat', 'Special Defense Stat', 'Speed Stat', 
            'Base Stat Total', 'Health EV', 'Attack EV', 'Defense EV', 'Special Attack EV', 'Special Defense EV', 
            'Speed EV', 'EV Yield Total', 'Catch Rate', 'Experience Growth Total','Egg Cycle Count']    
        
data = data[columns]
data.to_csv('Processed Data.csv')



In [155]:
features = ['Pokemon Height', 'Pokemon Weight',  'Male Ratio', 'Female Ratio', 'Base Happiness', 'Health Stat', 'Attack Stat', 'Defense Stat', 
            'Special Attack Stat', 'Special Defense Stat', 'Speed Stat', 'Base Stat Total', 'Health EV', 'Attack EV', 'Defense EV', 
            'Special Attack EV', 'Special Defense EV', 'Speed EV', 'EV Yield Total', 'Catch Rate', 'Experience Growth Total','Egg Cycle Count']

scaler = StandardScaler()
data_scaled = pd.DataFrame(scaler.fit_transform(data[features]), columns=features)

data_scaled
 

Unnamed: 0,Pokemon Height,Pokemon Weight,Male Ratio,Female Ratio,Base Happiness,Health Stat,Attack Stat,Defense Stat,Special Attack Stat,Special Defense Stat,...,Health EV,Attack EV,Defense EV,Special Attack EV,Special Defense EV,Speed EV,EV Yield Total,Catch Rate,Experience Growth Total,Egg Cycle Count
0,-0.216971,-0.494278,1.521864,-0.954514,0.134340,-0.993876,-1.008905,-0.830771,-0.293764,-0.277041,...,-0.345765,-0.577535,-0.385043,0.668869,-0.349988,-0.454746,-1.275222,-0.650938,-0.006715,-0.382050
1,-0.122188,-0.446446,1.521864,-0.954514,0.134340,-0.425201,-0.594607,-0.368212,0.172254,0.266151,...,-0.345765,-0.577535,-0.385043,0.668869,1.219727,-0.454746,0.045998,-0.650938,-0.006715,-0.382050
2,0.193755,0.235741,1.521864,-0.954514,0.134340,0.333033,0.042775,0.292587,0.793612,0.990406,...,-0.345765,-0.577535,-0.385043,1.826527,1.219727,-0.454746,1.367218,-0.650938,-0.006715,-0.382050
3,0.320132,0.670930,1.521864,-0.954514,1.122674,0.333033,0.616419,1.614184,1.477105,1.714662,...,-0.345765,-0.577535,-0.385043,1.826527,1.219727,-0.454746,1.367218,-0.650938,-0.006715,-0.382050
4,-0.248565,-0.481732,1.521864,-0.954514,0.134340,-1.221346,-0.913298,-1.029010,-0.449103,-0.820233,...,-0.345765,-0.577535,-0.385043,-0.488789,-0.349988,0.969633,-1.275222,-0.650938,-0.006715,-0.382050
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1345,0.067378,0.674851,-1.539356,-1.425147,-2.336495,0.712150,-0.275916,0.854266,1.477105,1.280109,...,-0.345765,-0.577535,-0.385043,2.984185,-0.349988,-0.454746,1.367218,-1.111025,1.236991,0.657455
1346,0.098972,0.055393,-1.539356,-1.425147,-2.336495,3.365967,0.775765,1.184665,1.725648,1.352534,...,4.353395,-0.577535,-0.385043,-0.488789,-0.349988,-0.454746,1.367218,2.109583,1.236991,-1.075054
1347,-0.374943,-0.497414,-1.539356,-1.425147,-2.336495,0.712150,-0.498999,0.358667,-0.293764,0.447214,...,-0.345765,-0.577535,1.171269,-0.488789,-0.349988,-0.454746,-1.275222,2.109583,1.236991,-1.075054
1348,-0.343348,-0.422923,-1.539356,-1.425147,-2.336495,0.901708,0.457074,1.184665,0.948951,1.352534,...,-0.345765,-0.577535,2.727581,-0.488789,2.789442,-0.454746,2.688437,2.109583,1.236991,-1.075054


In [158]:
mlb = MultiLabelBinarizer()
encoded_primary = mlb.fit_transform(data['Primary Type'])
primary = pd.DataFrame(encoded_primary, columns=mlb.classes_)

encoded_secondary = mlb.fit_transform(data['Secondary Type'])
secondary = pd.DataFrame(encoded_secondary, columns=mlb.classes_)

In [156]:
X = data_scaled 
X_train, X_test, y_train, y_test = train_test_split(X, primary, test_size=0.2, random_state=42)

svm_model = SVC(kernel='rbf', probability=True)
svm_model_primary = OneVsRestClassifier(SVC(kernel='rbf', probability=True))
svm_model_secondary = OneVsRestClassifier(SVC(kernel='rbf', probability=True))

svm_model_primary.fit(X_train, y_train)
y_pred_primary = svm_model_primary.predict(X_test)
accuracy_primary = accuracy_score(y_test, y_pred_primary)

print(f"Primary Type Accuracy: {accuracy_primary}")




Primary Type Accuracy: 0.07037037037037037
