In [1]:
import pandas as pd
import numpy as np

In [2]:
tiers = pd.read_json('tiers.json',orient='index')
tiers

Unnamed: 0,tier,doublesTier,natDexTier,isNonstandard
bulbasaur,LC,,,
ivysaur,NFE,,,
venusaur,PU,(DUU),RU,
venusaurmega,Illegal,,UU,Past
venusaurgmax,Illegal,,,Past
...,...,...,...,...
pumpkaboosuper,,,,Past
gourgeistsmall,,,,Past
gourgeistlarge,,,,Past
gourgeistsuper,,,,Past


In [3]:
tiers = tiers[['tier']] # We are only analyzing the singles tier data

In [4]:
tiers.value_counts()

tier   
Illegal    459
LC         227
ZU         191
NFE        108
Uber        52
RU          44
NU          40
CAP         39
OU          36
UU          35
PU          33
CAP LC      28
NUBL        15
UUBL        12
RUBL        12
ZUBL        11
CAP NFE      9
PUBL         7
AG           2
Name: count, dtype: int64

In [5]:
# filter out other tiers
valid = ['AG','Uber','OU','UUBL','UU','RUBL','RU','NUBL','NU','PUBL','PU','ZUBL','ZU'] 
tiers=tiers[tiers['tier'].isin(valid)]

NFE, CAP, and LC formats are for unevolved Pokemon, and are not categorized into viability tiers, so they cannot be used. Although some of the "banlist" tiers do not have very many values, we will ultimately be turning the tiers into a numerical scale, so it is not a particularly big issue. 

In [6]:
# read in pokedex
dex = pd.read_json('pokedex.json',orient='index')


In [7]:
# when merging, only merge in pokemon that we have valid tiering data for
combined = dex.join(tiers, how='right')
combined

Unnamed: 0,num,name,types,genderRatio,baseStats,abilities,heightm,weightkg,color,evos,...,baseForme,cosmeticFormes,maxHP,requiredAbility,battleOnly,requiredMove,requiredItems,cannotDynamax,forceTeraType,tier
venusaur,3,Venusaur,"[Grass, Poison]","{'M': 0.875, 'F': 0.125}","{'hp': 80, 'atk': 82, 'def': 83, 'spa': 100, '...","{'0': 'Overgrow', 'H': 'Chlorophyll'}",2.0,100.0,Green,,...,,,,,,,,,,PU
charizard,6,Charizard,"[Fire, Flying]","{'M': 0.875, 'F': 0.125}","{'hp': 78, 'atk': 84, 'def': 78, 'spa': 109, '...","{'0': 'Blaze', 'H': 'Solar Power'}",1.7,90.5,Red,,...,,,,,,,,,,ZU
blastoise,9,Blastoise,[Water],"{'M': 0.875, 'F': 0.125}","{'hp': 79, 'atk': 83, 'def': 100, 'spa': 85, '...","{'0': 'Torrent', 'H': 'Rain Dish'}",1.6,85.5,Blue,,...,,,,,,,,,,RUBL
arbok,24,Arbok,[Poison],,"{'hp': 60, 'atk': 95, 'def': 69, 'spa': 65, 's...","{'0': 'Intimidate', '1': 'Shed Skin', 'H': 'Un...",3.5,65.0,Purple,,...,,,,,,,,,,ZU
pikachu,25,Pikachu,[Electric],,"{'hp': 35, 'atk': 55, 'def': 40, 'spa': 50, 's...","{'0': 'Static', 'H': 'Lightning Rod'}",0.4,6.0,Yellow,"[Raichu, Raichu-Alola]",...,,,,,,,,,,ZU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ironboulder,1022,Iron Boulder,"[Rock, Psychic]",,"{'hp': 90, 'atk': 120, 'def': 80, 'spa': 68, '...",{'0': 'Quark Drive'},1.5,162.5,Gray,,...,,,,,,,,,,UUBL
ironcrown,1023,Iron Crown,"[Steel, Psychic]",,"{'hp': 90, 'atk': 72, 'def': 100, 'spa': 122, ...",{'0': 'Quark Drive'},1.6,156.0,Blue,,...,,,,,,,,,,OU
terapagos,1024,Terapagos,[Normal],,"{'hp': 90, 'atk': 65, 'def': 85, 'spa': 65, 's...",{'0': 'Tera Shift'},0.2,6.5,Blue,,...,,,,,,,,,Stellar,Uber
terapagosstellar,1024,Terapagos-Stellar,[Normal],,"{'hp': 160, 'atk': 105, 'def': 110, 'spa': 130...",{'0': 'Teraform Zero'},1.7,77.0,Blue,,...,,,,,Terapagos,,,,Stellar,Uber


In [8]:
# some variables are exclusive to very few pokemon, such as those pertaining to gimmicks/mechanics only present in one generation
# as the vast majority of our pokemon will not have valid information for these variables, we are dropping them

combined.dropna(axis=1,thresh=combined.shape[0]*.9, inplace=True)
combined

Unnamed: 0,num,name,types,baseStats,abilities,heightm,weightkg,color,eggGroups,tier
venusaur,3,Venusaur,"[Grass, Poison]","{'hp': 80, 'atk': 82, 'def': 83, 'spa': 100, '...","{'0': 'Overgrow', 'H': 'Chlorophyll'}",2.0,100.0,Green,"[Monster, Grass]",PU
charizard,6,Charizard,"[Fire, Flying]","{'hp': 78, 'atk': 84, 'def': 78, 'spa': 109, '...","{'0': 'Blaze', 'H': 'Solar Power'}",1.7,90.5,Red,"[Monster, Dragon]",ZU
blastoise,9,Blastoise,[Water],"{'hp': 79, 'atk': 83, 'def': 100, 'spa': 85, '...","{'0': 'Torrent', 'H': 'Rain Dish'}",1.6,85.5,Blue,"[Monster, Water 1]",RUBL
arbok,24,Arbok,[Poison],"{'hp': 60, 'atk': 95, 'def': 69, 'spa': 65, 's...","{'0': 'Intimidate', '1': 'Shed Skin', 'H': 'Un...",3.5,65.0,Purple,"[Field, Dragon]",ZU
pikachu,25,Pikachu,[Electric],"{'hp': 35, 'atk': 55, 'def': 40, 'spa': 50, 's...","{'0': 'Static', 'H': 'Lightning Rod'}",0.4,6.0,Yellow,"[Field, Fairy]",ZU
...,...,...,...,...,...,...,...,...,...,...
ironboulder,1022,Iron Boulder,"[Rock, Psychic]","{'hp': 90, 'atk': 120, 'def': 80, 'spa': 68, '...",{'0': 'Quark Drive'},1.5,162.5,Gray,[Undiscovered],UUBL
ironcrown,1023,Iron Crown,"[Steel, Psychic]","{'hp': 90, 'atk': 72, 'def': 100, 'spa': 122, ...",{'0': 'Quark Drive'},1.6,156.0,Blue,[Undiscovered],OU
terapagos,1024,Terapagos,[Normal],"{'hp': 90, 'atk': 65, 'def': 85, 'spa': 65, 's...",{'0': 'Tera Shift'},0.2,6.5,Blue,[Undiscovered],Uber
terapagosstellar,1024,Terapagos-Stellar,[Normal],"{'hp': 160, 'atk': 105, 'def': 110, 'spa': 130...",{'0': 'Teraform Zero'},1.7,77.0,Blue,[Undiscovered],Uber


In [9]:
# "unnest" baseStats column and expand it into its own set of columns

stats = pd.json_normalize(combined['baseStats'])
stats
stats.index = combined.index
combined = pd.concat([combined.drop(columns=['baseStats']), stats], axis=1)
combined


Unnamed: 0,num,name,types,abilities,heightm,weightkg,color,eggGroups,tier,hp,atk,def,spa,spd,spe
venusaur,3,Venusaur,"[Grass, Poison]","{'0': 'Overgrow', 'H': 'Chlorophyll'}",2.0,100.0,Green,"[Monster, Grass]",PU,80,82,83,100,100,80
charizard,6,Charizard,"[Fire, Flying]","{'0': 'Blaze', 'H': 'Solar Power'}",1.7,90.5,Red,"[Monster, Dragon]",ZU,78,84,78,109,85,100
blastoise,9,Blastoise,[Water],"{'0': 'Torrent', 'H': 'Rain Dish'}",1.6,85.5,Blue,"[Monster, Water 1]",RUBL,79,83,100,85,105,78
arbok,24,Arbok,[Poison],"{'0': 'Intimidate', '1': 'Shed Skin', 'H': 'Un...",3.5,65.0,Purple,"[Field, Dragon]",ZU,60,95,69,65,79,80
pikachu,25,Pikachu,[Electric],"{'0': 'Static', 'H': 'Lightning Rod'}",0.4,6.0,Yellow,"[Field, Fairy]",ZU,35,55,40,50,50,90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ironboulder,1022,Iron Boulder,"[Rock, Psychic]",{'0': 'Quark Drive'},1.5,162.5,Gray,[Undiscovered],UUBL,90,120,80,68,108,124
ironcrown,1023,Iron Crown,"[Steel, Psychic]",{'0': 'Quark Drive'},1.6,156.0,Blue,[Undiscovered],OU,90,72,100,122,108,98
terapagos,1024,Terapagos,[Normal],{'0': 'Tera Shift'},0.2,6.5,Blue,[Undiscovered],Uber,90,65,85,65,85,60
terapagosstellar,1024,Terapagos-Stellar,[Normal],{'0': 'Teraform Zero'},1.7,77.0,Blue,[Undiscovered],Uber,160,105,110,130,110,85


In [10]:
# total Base State Totals column

combined['BST'] = combined[['hp','atk','def','spa','spd','spe']].sum(axis=1)
combined

Unnamed: 0,num,name,types,abilities,heightm,weightkg,color,eggGroups,tier,hp,atk,def,spa,spd,spe,BST
venusaur,3,Venusaur,"[Grass, Poison]","{'0': 'Overgrow', 'H': 'Chlorophyll'}",2.0,100.0,Green,"[Monster, Grass]",PU,80,82,83,100,100,80,525
charizard,6,Charizard,"[Fire, Flying]","{'0': 'Blaze', 'H': 'Solar Power'}",1.7,90.5,Red,"[Monster, Dragon]",ZU,78,84,78,109,85,100,534
blastoise,9,Blastoise,[Water],"{'0': 'Torrent', 'H': 'Rain Dish'}",1.6,85.5,Blue,"[Monster, Water 1]",RUBL,79,83,100,85,105,78,530
arbok,24,Arbok,[Poison],"{'0': 'Intimidate', '1': 'Shed Skin', 'H': 'Un...",3.5,65.0,Purple,"[Field, Dragon]",ZU,60,95,69,65,79,80,448
pikachu,25,Pikachu,[Electric],"{'0': 'Static', 'H': 'Lightning Rod'}",0.4,6.0,Yellow,"[Field, Fairy]",ZU,35,55,40,50,50,90,320
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ironboulder,1022,Iron Boulder,"[Rock, Psychic]",{'0': 'Quark Drive'},1.5,162.5,Gray,[Undiscovered],UUBL,90,120,80,68,108,124,590
ironcrown,1023,Iron Crown,"[Steel, Psychic]",{'0': 'Quark Drive'},1.6,156.0,Blue,[Undiscovered],OU,90,72,100,122,108,98,590
terapagos,1024,Terapagos,[Normal],{'0': 'Tera Shift'},0.2,6.5,Blue,[Undiscovered],Uber,90,65,85,65,85,60,450
terapagosstellar,1024,Terapagos-Stellar,[Normal],{'0': 'Teraform Zero'},1.7,77.0,Blue,[Undiscovered],Uber,160,105,110,130,110,85,700


In [11]:
# turn the type 'lists' into 'sets', which are unordered


combined['typeset'] = combined['types'].apply(set)
combined['typeset'].value_counts()

typeset
{Normal}           24
{Electric}         24
{Water}            22
{Psychic}          19
{Grass}            13
                   ..
{Rock, Ice}         1
{Fire, Water}       1
{Fighting, Ice}     1
{Bug, Fairy}        1
{Psychic, Rock}     1
Name: count, Length: 150, dtype: int64

The original plan was to convert the typings into sets, and then allow our models to interpret the typings as a single variable. However, There are just too many different type combinations, and far too many have only usable Pokemon. Ultimately, I chose to split the typings into two columns.

Another option would have been to have a boolean column for every type. However, I think that adding 16 extra columns would make for a very messy solution.

Although it has no functional impacts gameplay wise, it's still possible that the orders of the Primary/Secondary typings might still be able to predict Pokemon viability. For instance, many of the most important "box legendaries" (on the box art) will have "dragon" as their main typing rather than their secondary typing, so 'dragon' in 'type1' might be more indicitave of a top pokemon than in 'type2'.

In [12]:
# i will never not feel clever using lambda 
combined[['type1','type2']] = combined['types'].apply(lambda x: x if len(x) == 2 else [x[0], x[0]]).tolist()
combined.drop(columns=['types'])

Unnamed: 0,num,name,abilities,heightm,weightkg,color,eggGroups,tier,hp,atk,def,spa,spd,spe,BST,typeset,type1,type2
venusaur,3,Venusaur,"{'0': 'Overgrow', 'H': 'Chlorophyll'}",2.0,100.0,Green,"[Monster, Grass]",PU,80,82,83,100,100,80,525,"{Poison, Grass}",Grass,Poison
charizard,6,Charizard,"{'0': 'Blaze', 'H': 'Solar Power'}",1.7,90.5,Red,"[Monster, Dragon]",ZU,78,84,78,109,85,100,534,"{Fire, Flying}",Fire,Flying
blastoise,9,Blastoise,"{'0': 'Torrent', 'H': 'Rain Dish'}",1.6,85.5,Blue,"[Monster, Water 1]",RUBL,79,83,100,85,105,78,530,{Water},Water,Water
arbok,24,Arbok,"{'0': 'Intimidate', '1': 'Shed Skin', 'H': 'Un...",3.5,65.0,Purple,"[Field, Dragon]",ZU,60,95,69,65,79,80,448,{Poison},Poison,Poison
pikachu,25,Pikachu,"{'0': 'Static', 'H': 'Lightning Rod'}",0.4,6.0,Yellow,"[Field, Fairy]",ZU,35,55,40,50,50,90,320,{Electric},Electric,Electric
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ironboulder,1022,Iron Boulder,{'0': 'Quark Drive'},1.5,162.5,Gray,[Undiscovered],UUBL,90,120,80,68,108,124,590,"{Psychic, Rock}",Rock,Psychic
ironcrown,1023,Iron Crown,{'0': 'Quark Drive'},1.6,156.0,Blue,[Undiscovered],OU,90,72,100,122,108,98,590,"{Psychic, Steel}",Steel,Psychic
terapagos,1024,Terapagos,{'0': 'Tera Shift'},0.2,6.5,Blue,[Undiscovered],Uber,90,65,85,65,85,60,450,{Normal},Normal,Normal
terapagosstellar,1024,Terapagos-Stellar,{'0': 'Teraform Zero'},1.7,77.0,Blue,[Undiscovered],Uber,160,105,110,130,110,85,700,{Normal},Normal,Normal


Although not present in Smogon's dataset, I wanted to add the age of different Pokemon into my dataset. A common concept in competitive video games is 'power creep', where developers will add increasingly powerful additions to a game in order to keep people interested in buying newer versions. This is a particularly common complaint in the Pokemon community, where every generation seems to add increasingly overpowered Pokemon. I wanted to see if it would be possible to quantify this through my models.

*Generation*: Nintendo regularly releases a new game every few years. The new Pokemon and mechanics added to each mainline game entry are referred to as "generations" by the community, starting with 1 for the original Red/Blue/Yellow to 9 for the most recent Scarlet/Violet games.

For the  most part, you can cleanly seperate Pokemon generations by the Pokedex number, as each generation just adds their new pokemon to the end of the pokedex.

However, an interesting roadblock presents itself in *regional forms*-- in new games, Nintendo occasionally will release a new "regional variant" of an older Pokemon. while these Pokemon will have new designs and play completely differently from their old variants, they still have the same Pokedex number. Thankfully, all of these variants include their region in their name, so it's simple to just search for any duplicates and adjust their generation based on the Pokemon's name. If no region names are present in the Pokemon's name, then it can be assumed to be a variant released alongside the original Pokemon, as is the case for many of the biggest legendary Pokemon.

In [13]:
generations = {
    range(1, 152): 1, 
    range(152, 252): 2, 
    range(252, 387): 3,
    range(387, 494): 4, 
    range(494, 650): 5, 
    range(650, 722): 6,
    range(722, 810): 7, 
    range(810, 906): 8, 
    range(906, 1026): 9
}

regions = {
    'Kanto': 1, 
    'Johto': 2, 
    'Hoenn': 3, 
    'Sinnoh': 4, 
    'Unova': 5, 
    'Kalos': 6, 
    'Alola': 7, 
    'Galar': 8, 
    'Hisui': 8, 
    'Paldea': 9
}

def assign_generation(num):
    return next((gen for r, gen in generations.items() if num in r), None)

combined['generation'] = combined['num'].apply(assign_generation)

for i in range(1, len(combined)):
    if combined.iloc[i]['num'] == combined.iloc[i-1]['num']: # duplicates
        for keyword, gen in regions.items():
            if keyword in combined.iloc[i]['name']:
                combined.at[combined.index[i], 'generation'] = gen
                break

In order to be able to give the model a numerical value to predict, I just assigned every tier a number, going from 1 up to 13. This is a very simple solution, and a future optimization might be to adjust these values to provide for a more uniform and/or accurate distribution. As the catch-all "trash" tier, ZU for instance has way more Pokemon than any other tier, and there isn't necessariliy an equal gap between a "Banlist" Pokemon and the next tier up.

In [14]:
tiers = {
    'ZU': 1, 
    'ZUBL': 2, 
    'PU': 3, 
    'PUBL': 4, 
    'NU': 5, 
    'NUBL': 6, 
    'RU': 7, 
    'RUBL': 8, 
    'UU': 9, 
    'UUBL': 10,
    'OU': 11,
    'Uber': 12,
    'AG': 13,
}

def assign_tier(value):
    return tiers.get(value)

combined['tiernum'] = combined['tier'].apply(assign_tier)
combined

Unnamed: 0,num,name,types,abilities,heightm,weightkg,color,eggGroups,tier,hp,...,def,spa,spd,spe,BST,typeset,type1,type2,generation,tiernum
venusaur,3,Venusaur,"[Grass, Poison]","{'0': 'Overgrow', 'H': 'Chlorophyll'}",2.0,100.0,Green,"[Monster, Grass]",PU,80,...,83,100,100,80,525,"{Poison, Grass}",Grass,Poison,1,3
charizard,6,Charizard,"[Fire, Flying]","{'0': 'Blaze', 'H': 'Solar Power'}",1.7,90.5,Red,"[Monster, Dragon]",ZU,78,...,78,109,85,100,534,"{Fire, Flying}",Fire,Flying,1,1
blastoise,9,Blastoise,[Water],"{'0': 'Torrent', 'H': 'Rain Dish'}",1.6,85.5,Blue,"[Monster, Water 1]",RUBL,79,...,100,85,105,78,530,{Water},Water,Water,1,8
arbok,24,Arbok,[Poison],"{'0': 'Intimidate', '1': 'Shed Skin', 'H': 'Un...",3.5,65.0,Purple,"[Field, Dragon]",ZU,60,...,69,65,79,80,448,{Poison},Poison,Poison,1,1
pikachu,25,Pikachu,[Electric],"{'0': 'Static', 'H': 'Lightning Rod'}",0.4,6.0,Yellow,"[Field, Fairy]",ZU,35,...,40,50,50,90,320,{Electric},Electric,Electric,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ironboulder,1022,Iron Boulder,"[Rock, Psychic]",{'0': 'Quark Drive'},1.5,162.5,Gray,[Undiscovered],UUBL,90,...,80,68,108,124,590,"{Psychic, Rock}",Rock,Psychic,9,10
ironcrown,1023,Iron Crown,"[Steel, Psychic]",{'0': 'Quark Drive'},1.6,156.0,Blue,[Undiscovered],OU,90,...,100,122,108,98,590,"{Psychic, Steel}",Steel,Psychic,9,11
terapagos,1024,Terapagos,[Normal],{'0': 'Tera Shift'},0.2,6.5,Blue,[Undiscovered],Uber,90,...,85,65,85,60,450,{Normal},Normal,Normal,9,12
terapagosstellar,1024,Terapagos-Stellar,[Normal],{'0': 'Teraform Zero'},1.7,77.0,Blue,[Undiscovered],Uber,160,...,110,130,110,85,700,{Normal},Normal,Normal,9,12


In [15]:
combined.to_csv('combined.csv')

Next up: exploring the variables.