In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import pandas and Data

In [2]:
import pandas as pd

# Location of the dataset inside the read-only Kaggle input directory.
POKEMON_CSV_PATH = '/kaggle/input/pokemon-data/pokemon_data.csv'

# Load the CSV; the later cells all work on `pokedata`.
pokedata = pd.read_csv(POKEMON_CSV_PATH)

# Reading Data

In [None]:
# Explicitly construct a DataFrame from the loaded data
# (pd.read_csv already returned one, so this only re-wraps the same table).
pokedata = pd.DataFrame(data=pokedata)

# Preview the first five rows
pokedata.head(n=5)

In [None]:
# Inspect the column labels of the table
column_labels = pokedata.columns
column_labels

In [None]:
# Pick out a subset of columns by passing a list of column names
wanted_columns = ['Name', 'HP']
pokedata[wanted_columns]

In [None]:
# Read specific rows using iloc (integer-position based)

# A notebook cell only renders its last expression automatically, so the
# single-row lookup is shown explicitly with display(); otherwise its result
# would be computed and silently thrown away.
display(pokedata.iloc[6])  # The row at integer position 6

pokedata.iloc[1:4]  # Rows at positions 1-3 (the end of the slice is exclusive)

In [None]:
# Select rows with loc using a condition on a column

# Boolean mask: True for every row whose Type 1 is Fire
is_fire = pokedata['Type 1'] == 'Fire'

pokedata.loc[is_fire]  # Only the pokemon whose Type 1 is Fire

In [None]:
# Walk the table one row at a time and pull a single field out of each row
for row_label, record in pokedata.iterrows():
    legendary_flag = record['Legendary']
    print(row_label, legendary_flag)

In [None]:
# Read one cell by (row position, column position), e.g. Charmander's name
row_pos, col_pos = 4, 1
pokedata.iloc[row_pos, col_pos]

# Sorting and Describing Data

In [None]:
# Descriptive statistics in one call (they may not be useful for every column)
summary_stats = pokedata.describe()
summary_stats

In [None]:
# Sort data by column

# Only the last expression of a cell is rendered automatically, so the first
# two sorted views are shown explicitly with display().
display(pokedata.sort_values('Name'))                   # Name a->z (ascending is the default)
display(pokedata.sort_values('Name', ascending=False))  # Name z->a

# One flag per sort key: Type 1 a->z, then HP high->low within each type.
# Booleans are used because `ascending` is documented as bool / list of bool.
pokedata.sort_values(['Type 1', 'HP'], ascending=[True, False])

# Changing the Data

In [None]:
# Add a 'Total' column holding the row-wise sum of the stat columns

# All rows, columns at positions 4 through 9
stat_columns = pokedata.iloc[:, 4:10]
pokedata['Total'] = stat_columns.sum(axis=1)

"""
iloc[:,4:10] ---
:, means all rows
4:10 is the range of indexes of the columns to add (end parameter'10' is exclusive, so only
adds up to column index 9, which is speed)
axis=1 means add horizontally, axis=0 would add vertically
"""

# Preview the table with the new column
pokedata.head()

In [None]:
"""
Drop a column using...

pokedata = pokedata.drop(columns=['Total'])
pokedata

"""

In [None]:
# Move column (Total column to directly after Type 2)

# Build the new column order by name instead of by hard-coded positions:
# take every column except 'Total', then re-insert 'Total' at position 4
# (Type 2 sits at position 3). Unlike slicing with a fixed end index and
# cols[-1], this keeps every column regardless of how many there are, and
# re-running the cell leaves the order unchanged instead of moving whichever
# column happens to be last.
cols = [col for col in pokedata.columns if col != 'Total']
cols.insert(4, 'Total')

pokedata = pokedata[cols]  # Re-select the columns in the new order

pokedata.head()

# Saving and Exporting Data

In [None]:
# CSV

# Default export, which also writes the row index as an extra first column:
#     pokedata.to_csv('modified.csv')

# Passing index=False leaves the index out of the exported file.
# The file name must be a quoted string; a bare modified.csv is evaluated as
# attribute access on an undefined variable and raises NameError.
#
# The file is really written here (to the current working directory, which is
# /kaggle/working/ on Kaggle) because a later cell reloads modified.csv after
# the conditional-change examples have altered the in-memory data.
pokedata.to_csv('modified.csv', index=False)

In [None]:
# Excel

# Example only (not executed); index=False can be added here as well:
#     pokedata.to_excel('modified.xlsx')

In [None]:
# Text

# Example only (not executed); index=False can be added here as well.
# sep='\t' separates the values with tabs - note the separator must be quoted:
#     pokedata.to_csv('modified.txt', sep='\t')

# Filtering Data

In [None]:
# Only show pokemon with Type 1 Grass AND Type 2 Poison

# Build one boolean mask per condition, then combine them element-wise.
# pandas masks are combined with & (not 'and') and | (not 'or').
is_grass = pokedata['Type 1'] == 'Grass'
is_poison = pokedata['Type 2'] == 'Poison'

pokedata.loc[is_grass & is_poison]

In [None]:
# Only show pokemon with Type 1 Grass OR Type 2 Poison

# Either condition being True is enough for a row to be kept
is_grass = pokedata['Type 1'] == 'Grass'
is_poison = pokedata['Type 2'] == 'Poison'

pokedata.loc[is_grass | is_poison]

In [None]:
# Only show pokemon with Type 1 Grass and Type 2 Poison and HP > 70

# All three conditions must hold for a row to be kept
is_grass = pokedata['Type 1'] == 'Grass'
is_poison = pokedata['Type 2'] == 'Poison'
has_high_hp = pokedata['HP'] > 70

pokedata.loc[is_grass & is_poison & has_high_hp]

In [None]:
# A filtered result can be stored as a new DataFrame

new_df = pokedata.loc[(pokedata['Type 1']=='Grass') & (pokedata['Type 2']=='Poison') & (pokedata['HP']>70)]

# Shown with display() because only the last expression of a cell is rendered
# automatically; without it this "before" view would never appear.
display(new_df)  # Still carries the row labels from the original table

new_df = new_df.reset_index(drop=True)  # Renumber from 0; drop=True discards the old index instead of keeping it as a column

new_df

In [None]:
# Filter on whether the name contains 'Mega'

# Boolean mask: True where the Name contains the substring 'Mega'
has_mega = pokedata['Name'].str.contains('Mega')

# Shown with display() because only the last expression of a cell is rendered
# automatically.
display(pokedata.loc[has_mega])  # All rows whose name contains 'Mega'

pokedata.loc[~has_mega]  # ~ inverts the mask: all rows whose name does NOT contain 'Mega'

In [None]:
# Another way of using .str.contains: regular expressions

import re  # regular expression (regex) package; provides the re.I flag used below

# Case-sensitive regex: Type 1 contains 'Fire' or 'Grass'.
# Shown with display() because only the last expression of a cell is rendered
# automatically.
display(pokedata.loc[pokedata['Type 1'].str.contains('Fire|Grass', regex=True)])

# Same search written in lower case; flags=re.I makes the match ignore case,
# so it produces the same rows as above.
pokedata.loc[pokedata['Type 1'].str.contains('fire|grass', flags=re.I, regex=True)]

In [None]:
# All pokemon names containing 'pi'

# 'pi[a-z]*' means: 'pi' followed by any number of letters a-z (including
# none), so with contains() it matches every name that has 'pi' anywhere in it.
# flags=re.I ignores case.
#
# The explanation is kept as a comment rather than a trailing string literal:
# a string as the last expression would be rendered as the cell's output in
# place of the filtered table.
pokedata.loc[pokedata['Name'].str.contains('pi[a-z]*', flags=re.I, regex=True)]

In [None]:
# All pokemon names beginning with 'pi'

# '^pi[a-z]*' means: starts with 'pi' (^ anchors the match to the beginning
# of the name), followed by any letters a-z. flags=re.I ignores case.
starts_with_pi = pokedata['Name'].str.contains('^pi[a-z]*', flags=re.I, regex=True)

pokedata.loc[starts_with_pi]

# Conditional Changes

In [None]:
# Changing values in a column

# loc[row condition, column] = value overwrites that column only in the matching rows
pokedata.loc[pokedata['Type 1'] == 'Flying', 'Type 1'] = 'Fly'

# See the rows it changed. display() is needed because the cell ends with an
# assignment, so this view would otherwise never be rendered.
display(pokedata.loc[pokedata['Type 1'] == 'Fly'])

pokedata.loc[pokedata['Type 1'] == 'Fly', 'Type 1'] = 'Flying'  # Changing it back

In [None]:
# The condition can test one column while the assignment targets another

# Rows whose Type 1 is Fire...
is_fire = pokedata['Type 1'] == 'Fire'

# ...get their Legendary value set to True
pokedata.loc[is_fire, 'Legendary'] = True

pokedata

In [None]:
# Several columns can be modified at once - e.g. here the Generation and
# Legendary columns are overwritten wherever the Total column is over 500

target_cols = ['Generation', 'Legendary']

# The placeholder values are strings. Writing a value of an incompatible dtype
# into an existing column through .loc is deprecated in recent pandas versions
# (it warns, and is slated to become an error), so the target columns are
# converted to the generic object dtype first. Casting is harmless if they
# already hold objects.
pokedata = pokedata.astype({col: object for col in target_cols})

over_500 = pokedata['Total'] > 500

pokedata.loc[over_500, target_cols] = 'TEST'  # A single value: both columns become 'TEST'

pokedata.loc[over_500, target_cols] = ['TEST1', 'TEST2']  # One value per column: Generation -> 'TEST1', Legendary -> 'TEST2'

pokedata

# The last two code cells overwrote many values in the dataset, so from here on we use the 'modified' dataset saved earlier (the original data plus the Total column).


# Resetting DataFrame to what it was before these modifications using the modified dataset...

In [None]:
# Reload the CSV exported earlier, replacing the in-memory table
MODIFIED_CSV_PATH = '/kaggle/working/modified.csv'
pokedata = pd.read_csv(MODIFIED_CSV_PATH)

pokedata

# Aggregate Statistics (groupby)

In [None]:
# Averages by each Type 1

# Compute the per-type means once and reuse them for each view.
# numeric_only=True restricts the mean to the numeric columns.
type_means = pokedata.groupby(['Type 1']).mean(numeric_only=True)

# display() is used for the first two views because only the last expression
# of a cell is rendered automatically.
display(type_means)

display(type_means.sort_values('Defense', ascending=False))  # Highest average Defense first

type_means.sort_values('Attack', ascending=False)  # Highest average Attack first

In [None]:
# Other aggregates work the same way, e.g. sum, max, min

# Group the rows by Type 1, then total the numeric columns within each group
by_type = pokedata.groupby(['Type 1'])
by_type.sum(numeric_only=True)

In [None]:
# Count

# display() is used for the intermediate views because only the last
# expression of a cell is rendered automatically.

# count() on the grouped frame returns a count for every column, which is
# usually more than you want or need.
display(pokedata.groupby(['Type 1']).count())

# Adding a 'Count' column whose value is 1 in every row (one per row) gives a
# single column to select from the .count() result, for cleaner, simpler output.
pokedata['Count'] = 1

display(pokedata.head())  # The new Count column is visible at the end

pokedata.groupby(['Type 1']).count()['Count']  # Clear, readable output: one number per Type 1

In [None]:
# Group and count by several columns at once

# Count rows for every (Type 1, Type 2) combination, keep only the Count
# column, and wrap the resulting Series in a DataFrame for display
pair_counts = pokedata.groupby(['Type 1', 'Type 2']).count()
count_series = pair_counts['Count']

pd.DataFrame(count_series)