In [None]:
import pandas as pd
import numpy as np

# Getting the data

In [None]:
# Location of the pokemon stats CSV used throughout this notebook.
DATASET_URL = "https://raw.githubusercontent.com/fabricio21x/introduction-to-ml/main/datasets/pokemon_with_stats.csv"

# Read the CSV straight from the URL into a DataFrame.
df = pd.read_csv(DATASET_URL)

# Performing Exploratory Data Analysis (EDA)

In [None]:
# Preview the first 4 rows of the table.
df.head(n=4)

In [None]:
# sample() returns n randomly picked rows; no random_state is passed here,
# so the selection is not pinned to a seed.
df.sample(n=4)

In [None]:
# Dimensions of the DataFrame as a (rows, columns) tuple.
df.shape

In [None]:
# List the column labels as they were read from the CSV.
df.columns

In [None]:
# Map the original CSV headers to snake_case names that are easier to type.
column_renames = {
    "#": "number",
    "Name": "name",
    "Type 1": "type_1",
    "Type 2": "type_2",
    "Total": "total",
    "HP": "hp",
    "Attack": "attack",
    "Defense": "defense",
    "Sp. Atk": "sp_atk",
    "Sp. Def": "sp_def",
    "Speed": "speed",
    "Generation": "generation",
    "Legendary": "legendary",
}

# Apply the mapping directly on df.
df.rename(columns=column_renames, inplace=True)

In [None]:
# Show the column labels again, now that the rename has been applied.
df.columns

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Mean, maximum and minimum of the attack stat, displayed as one tuple.
attack = df["attack"]
attack.mean(), attack.max(), attack.min()

In [None]:
# Summary statistics for the attack column.
df["attack"].describe()

## Cleaning the data

For example, the pokemons with "Mega" have some unnecessary text at the beginning, e.g. CharizardMega Charizard. Let's remove it.

In [None]:
# Drop everything that precedes "Mega" in a name, e.g. "CharizardMega Charizard"
# becomes "Mega Charizard". The lookahead (?=Mega) keeps the word "Mega" itself.
# regex=True is required: since pandas 2.0, str.replace treats the pattern as a
# literal string by default, which would leave every name untouched.
df["name"] = df["name"].str.replace(r"^.*(?=Mega)", "", regex=True)

In [None]:
# Look at the first rows again after the name clean-up.
df.head()

Also, the "number" column is not giving much information, so let's remove it

In [None]:
# Remove the "number" column (alternative way: del df["number"]).
# errors="ignore" keeps this cell safe to re-run: once the column is gone, a
# second run is a no-op instead of raising KeyError.
df = df.drop(columns=["number"], errors="ignore")

Let's check for null values in our data. It seems that almost 50% of the values in the column "type_2" are null

In [None]:
# Percentage of missing values per column: the per-column fraction of nulls,
# rounded to 4 decimals, then scaled to a percentage.
null_fraction = df.isna().mean().round(4)
null_fraction * 100

In [None]:
# Absolute number of missing entries in the "type_2" column.
type_2_missing = df["type_2"].isnull()
type_2_missing.sum()

We can fill the nulls in that column with a value that makes sense. In this case it makes a lot of sense to just duplicate "type_1" when a pokemon doesn't have an additional type

In [None]:
# Fill missing "type_2" values with the same row's "type_1".
# The result is assigned back rather than calling fillna(inplace=True) on
# df["type_2"]: that chained in-place call operates on an intermediate object,
# emits a FutureWarning in pandas 2.x and leaves df unchanged once
# Copy-on-Write is enabled.
df["type_2"] = df["type_2"].fillna(df["type_1"])

## Filtering and selecting

In [None]:
# Five randomly picked pokemons among those whose attack is above 10.
attack_above_10 = df["attack"] > 10
df[attack_above_10].sample(n=5)

In [None]:
# Filtering with df[mask] returns a copy of the matching rows:
# here, the legendary pokemons of generation 1.
is_legendary = df["legendary"] == True
is_first_gen = df["generation"] == 1
df[is_legendary & is_first_gen].head(5)

In [None]:
# .loc takes a row mask and a list of columns in a single call:
# non-legendary generation-1 pokemons whose primary type is Fire or Dragon.
primary_type = df["type_1"]
not_legendary = df["legendary"] == False
first_gen = df["generation"] == 1
fire_or_dragon = (primary_type == "Fire") | (primary_type == "Dragon")
shown_columns = ["name", "attack", "defense", "type_1", "type_2"]
df.loc[not_legendary & first_gen & fire_or_dragon, shown_columns].head()

In [None]:
print(df.loc[1]) # label-based lookup: prints the complete row whose index label is 1

In [None]:
print(df.iloc[1]) # position-based lookup: prints the complete row at integer position 1 (integer counterpart of loc)

- loc works on labels in the index.
- iloc works on the positions in the index (so it only takes integers).

In [None]:
# Report the distinct primary types and how many of them there are.
primary_type = df["type_1"]
print("The unique types are:", primary_type.unique())
print("The number of unique types is", primary_type.nunique())

Strongest pokemon of each type

In [None]:
# Strongest pokemon per primary type: order the rows by "total" from highest to
# lowest, then keep only the first row encountered for each "type_1" value,
# which after the sort is the one with the highest total for that type.
strongest = (
    df.sort_values(by=["total"], ascending=False)
    .drop_duplicates(subset=["type_1"], keep="first")
)
strongest

# Visualizing the data

Pie chart of the distribution of types

In [None]:
# Share of each primary type, expressed as a percentage of all rows.
type_counts = df["type_1"].value_counts()
type_counts * 100 / df["type_1"].shape[0]