In [None]:
'''
Dataset
The Pokemon dataset consists of two CSV files:

pokemon.csv: Contains stats like HP, Attack, Type 1, Legendary, etc., for 800+ Pokemon.
combats.csv: Records 50,000 battle outcomes between Pokemon.

What you need to do:
Data Preparation

Load and merge pokemon.csv and combats.csv.
Fix missing values:
Fill the missing Name for Pokemon #62 (Primeape).
Handle NaN values in Type 2 (mark as “None” if missing).
Calculate each Pokemon’s win percentage using the combat data.
Exploratory Analysis & Visualization

Create a correlation matrix to identify relationships between stats (HP, Attack, Speed) and win percentage.
Plot a Seaborn pairplot or PairGrid for stats vs. win percentage.
Analyze the top 10 Pokemon by win percentage and their stats.
Machine Learning

Split data into training/testing sets (80/20 split).
Train and evaluate 3 regression models (e.g., Linear Regression, Random Forest, XGBoost) to predict win percentage.
Compare model performance using Mean Absolute Error (MAE).

'''

In [1]:
import pandas as pd

# Read the Pokemon stats table and the battle log.
pokemon = pd.read_csv("/content/pokemon.csv")
combats = pd.read_csv("/content/combats.csv")

# One Pokemon (Primeape) has no Name in the data; write it in.
# Note: 62 is a row label of the default index created by read_csv,
# which is not the same thing as a value of the '#' column.
pokemon.at[62, "Name"] = "Primeape"

# Rows with a missing "Type 2" are Pokemon without a secondary type.
# Show them first, then store the literal string "None" in place of NaN.
has_no_second_type = pokemon["Type 2"].isna()
print(pokemon.loc[has_no_second_type])
pokemon["Type 2"] = pokemon["Type 2"].fillna("None")


       #        Name  Type 1 Type 2   HP  Attack  Defense  Sp. Atk  Sp. Def  \
4      5  Charmander    Fire    NaN   39      52       43       60       50   
5      6  Charmeleon    Fire    NaN   58      64       58       80       65   
9     10    Squirtle   Water    NaN   44      48       65       50       64   
10    11   Wartortle   Water    NaN   59      63       80       65       80   
11    12   Blastoise   Water    NaN   79      83      100       85      105   
..   ...         ...     ...    ...  ...     ...      ...      ...      ...   
775  776     Sliggoo  Dragon    NaN   68      75       53       83      113   
776  777      Goodra  Dragon    NaN   90     100       70      110      150   
788  789    Bergmite     Ice    NaN   55      69       85       32       35   
789  790     Avalugg     Ice    NaN   95     117      184       44       46   
792  793     Xerneas   Fairy    NaN  126     131       95      131       98   

     Speed  Generation  Legendary  
4       65     

In [2]:
# Win percentage per Pokemon, derived from the combat log.

# Number of battles each Pokemon id won.
winners_games_counts = combats["Winner"].value_counts()

# Number of battles each Pokemon id took part in, counted per side of the battle
# (a Pokemon can appear either as the first or as the second combatant).
first_list_counts_of_pokemon_games = combats["First_pokemon"].value_counts()
second_list_counts_of_pokemon_games = combats["Second_pokemon"].value_counts()

# Total battles per Pokemon: Series.add aligns on the index (the Pokemon id);
# fill_value=0 treats an id that only ever appears on one side as 0 on the other.
# Pokemon that never fought are absent from both counts, so they are absent here too
# (16 of them in this dataset) - they are dealt with when merging into the main frame.
all_games_counts = first_list_counts_of_pokemon_games.add(
    second_list_counts_of_pokemon_games, fill_value=0
)

# wins / battles * 100, ordered by Pokemon id for later alignment with other features.
# fill_value=0 matters for a Pokemon that fought but never won: it has no entry in the
# winners count, and without the fill its percentage would come out as NaN instead of 0.
winning_percentage_of_pokemons_that_played = (
    winners_games_counts
    .div(all_games_counts, fill_value=0)
    .mul(100)
    .sort_index()
)

# Best performers first.
print(winning_percentage_of_pokemons_that_played.sort_values(ascending=False))


155    98.449612
513    97.478992
704    96.800000
20     96.638655
154    96.453901
         ...    
237     3.252033
639     3.100775
190     2.459016
290     2.173913
231     0.000000
Name: count, Length: 784, dtype: float64


In [3]:
# Merge the win percentage into the pokemon DataFrame by looking up each '#' id
# in the win-percentage Series. Ids that are not in that Series - Pokemon that
# never took part in any battle - get NaN.

pokemon['WinningPercentage'] = pokemon['#'].map(winning_percentage_of_pokemons_that_played)

# Null count per column, to see how many Pokemon received no win percentage.
print(pokemon.isnull().sum())

# Drop the rows that have no win percentage (16 in this dataset). These are Pokemon
# that never fought, not Pokemon that never won: a Pokemon that fought and lost
# every battle has a win percentage of 0, not NaN, and is kept.
pokemon = pokemon.dropna(subset=['WinningPercentage'])


#                     0
Name                  0
Type 1                0
Type 2                0
HP                    0
Attack                0
Defense               0
Sp. Atk               0
Sp. Def               0
Speed                 0
Generation            0
Legendary             0
WinningPercentage    16
dtype: int64


In [4]:
# Correlation (DataFrame.corr with its default method) between the three stats
# and the win percentage.
stat_columns = ['HP', 'Attack', 'Speed']
target_column = 'WinningPercentage'
correlation_matrix = pokemon[stat_columns + [target_column]].corr()

# Only the target's column is of interest; selecting the stat rows leaves out
# the target's trivial correlation with itself.
print(correlation_matrix.loc[stat_columns, target_column])

# Speed is by far the stat most strongly correlated with winning.


HP        0.261602
Attack    0.502825
Speed     0.938055
Name: WinningPercentage, dtype: float64
