## Loading data into Pandas

In [59]:
import pandas as pd

df = pd.read_csv('pokemon_data.csv')

# print(df.head(5))

# df_xlsx = pd.read_excel('pokemon_data.xlsx')
# print(df_xlsx.head(3))

# df = pd.read_csv('pokemon_data.txt', delimiter='\t') # delimiter tabs = '\t'

# print(df.head(5))

df['HP']

0      45
1      60
2      80
3      80
4      39
       ..
795    50
796    50
797    80
798    80
799    80
Name: HP, Length: 800, dtype: int64

## Reading Data in Pandas

In [60]:
#### Read Headers
print(df.columns)

## Read each Column
print(df[['Name', 'Type 1', 'HP']])

## Read Each Row
print(df.iloc[0:4])

#list comprehension (iterate): 
for index, row in df.iterrows():
    print(index, row['Name'])

#Alternative way to access df data, shows all grass type data:
# df.loc[df['Type 1'] == "Grass"]

## Read a specific location (R,C)
print(df.iloc[2,1])

Index(['#', 'Name', 'Type 1', 'Type 2', 'HP', 'Attack', 'Defense', 'Sp. Atk',
       'Sp. Def', 'Speed', 'Generation', 'Legendary'],
      dtype='object')
                      Name   Type 1  HP
0                Bulbasaur    Grass  45
1                  Ivysaur    Grass  60
2                 Venusaur    Grass  80
3    VenusaurMega Venusaur    Grass  80
4               Charmander     Fire  39
..                     ...      ...  ..
795                Diancie     Rock  50
796    DiancieMega Diancie     Rock  50
797    HoopaHoopa Confined  Psychic  80
798     HoopaHoopa Unbound  Psychic  80
799              Volcanion     Fire  80

[800 rows x 3 columns]
   #                   Name Type 1  Type 2  HP  Attack  Defense  Sp. Atk  \
0  1              Bulbasaur  Grass  Poison  45      49       49       65   
1  2                Ivysaur  Grass  Poison  60      62       63       80   
2  3               Venusaur  Grass  Poison  80      82       83      100   
3  3  VenusaurMega Venusaur  Grass  P

## Sorting/Describing Data

In [61]:
# Sort ascending:
df.sort_values(['Type 1'], ascending=True)

# Sort descending:
df.sort_values(['Type 1'], ascending=False)

# Sort with two columns, [1, 0] denotes Type 1 is ascending, while HP is descending
df.sort_values(['Type 1', 'HP'], ascending=[1,0])

print(df)

# Describes Standard Statistics:
df.describe()

       #                   Name   Type 1  Type 2  HP  Attack  Defense  \
0      1              Bulbasaur    Grass  Poison  45      49       49   
1      2                Ivysaur    Grass  Poison  60      62       63   
2      3               Venusaur    Grass  Poison  80      82       83   
3      3  VenusaurMega Venusaur    Grass  Poison  80     100      123   
4      4             Charmander     Fire     NaN  39      52       43   
..   ...                    ...      ...     ...  ..     ...      ...   
795  719                Diancie     Rock   Fairy  50     100      150   
796  719    DiancieMega Diancie     Rock   Fairy  50     160      110   
797  720    HoopaHoopa Confined  Psychic   Ghost  80     110       60   
798  720     HoopaHoopa Unbound  Psychic    Dark  80     160       60   
799  721              Volcanion     Fire   Water  80     110      120   

     Sp. Atk  Sp. Def  Speed  Generation  Legendary  
0         65       65     45           1      False  
1         80   

Unnamed: 0,#,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation
count,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0
mean,362.81375,69.25875,79.00125,73.8425,72.82,71.9025,68.2775,3.32375
std,208.343798,25.534669,32.457366,31.183501,32.722294,27.828916,29.060474,1.66129
min,1.0,1.0,5.0,5.0,10.0,20.0,5.0,1.0
25%,184.75,50.0,55.0,50.0,49.75,50.0,45.0,2.0
50%,364.5,65.0,75.0,70.0,65.0,70.0,65.0,3.0
75%,539.25,80.0,100.0,90.0,95.0,90.0,90.0,5.0
max,721.0,255.0,190.0,230.0,194.0,230.0,180.0,6.0


## Making changes to the data

In [62]:
# Method 1:
#df['Total'] = df['HP'] + df['Attack'] + df['Defense'] + df['Sp. Atk'] + df['Sp. Def'] + df['Speed']

In [63]:
# Remove column from memory (even if you do not display or print the column, it generally stays in memory unless removed):
# df = df.drop(columns=['Total'])

In [64]:
# Method 2 (better, not ideal since iloc is hard-coded in...ideally select using index of column labels):
#sum(axis = #), # = 0 (sum vertically), # = 1 (sum horizontally)
df['Total'] = df.iloc[:, 4:10].sum(axis=1)

In [65]:
# Changing column order (ideally use column labels rather than numbers):
cols = list(df.columns)
df = df[cols[0:4] + [cols[-1]] + cols[4:12]]
df.head(5)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [66]:
# # Changing column order2:
# cols = df.columns.values
# df = df[['Total', 'HP', 'Defense']]
# df.head(5)

In [67]:
# checking sum totals for 1 line:
45+49+49+65+65+45

318

## Saving our Data (Exporting into Desired Format)

In [68]:
# Save to CSV:
# df.to_csv('modified.csv', index=False)

# Save to Excel:
#df.to_excel('modified.xlsx', index=False)

# Save to CSV without Index:
df.to_csv('modified.csv', index=False, sep='\t')

## Filtering Data

In [69]:
# # Single Filter:
# new_df = df.loc(df['Type 1'] == 'Grass')

# Multiple filters:
new_df = df.loc[(df['Type 1'] == 'Grass') & (df['Type 2'] == 'Poison') & (df['HP'] > 70)]

print(new_df)

## Reset index count:
# new_df.reset_index()

# Reset index to remove old index inplace:
new_df.reset_index(drop=True, inplace=True)

## Filtering by Specific Content of a Column:
# new_df = df.loc[df['Name'].str.contains('Mega')]
# new_df = df.loc[~df['Name'].str.contains('Mega')] # Takes the reverse of the above statement, excludes all 'Mega'

new_df.to_csv('filtered.csv')

## Using re package:
# import re
# df.loc[df['Type 1'].str.contains('fire|grass', flags=re.I, regex=True)] #flags=re.I ignores capitalization
## or:
# df.loc[df['Type 1'].str.contains('Fire|Grass', regex=True)]

## specific segments of content:
# df.loc[df['Name'].str.contains('^pi[a-z]*', flags=re.I, regex=True)]

print(new_df)

       #                   Name Type 1  Type 2  Total   HP  Attack  Defense  \
2      3               Venusaur  Grass  Poison    525   80      82       83   
3      3  VenusaurMega Venusaur  Grass  Poison    625   80     100      123   
50    45              Vileplume  Grass  Poison    490   75      80       85   
77    71             Victreebel  Grass  Poison    490   80     105       65   
652  591              Amoonguss  Grass  Poison    464  114      85       70   

     Sp. Atk  Sp. Def  Speed  Generation  Legendary  
2        100      100     80           1      False  
3        122      120     80           1      False  
50       110       90     50           1      False  
77       100       70     70           1      False  
652       85       80     30           5      False  
     #                   Name Type 1  Type 2  Total   HP  Attack  Defense  \
0    3               Venusaur  Grass  Poison    525   80      82       83   
1    3  VenusaurMega Venusaur  Grass  Poison   

## Conditional Changes

In [70]:
# Change specific data values:
# df.loc[df['Type 1'] == 'Fire', 'Type 1'] = 'Flamer'
# Changes all fire type pokemon to legendary:
# df.loc[df['Type 1'] == 'Fire', 'Legendary'] = True

# print(df)

df.loc[df['Total'] > 500, ['Generation','Legendary']] = ['Test 1', 'Test 2']

print(df)

       #                   Name   Type 1  Type 2  Total  HP  Attack  Defense  \
0      1              Bulbasaur    Grass  Poison    318  45      49       49   
1      2                Ivysaur    Grass  Poison    405  60      62       63   
2      3               Venusaur    Grass  Poison    525  80      82       83   
3      3  VenusaurMega Venusaur    Grass  Poison    625  80     100      123   
4      4             Charmander     Fire     NaN    309  39      52       43   
..   ...                    ...      ...     ...    ...  ..     ...      ...   
795  719                Diancie     Rock   Fairy    600  50     100      150   
796  719    DiancieMega Diancie     Rock   Fairy    700  50     160      110   
797  720    HoopaHoopa Confined  Psychic   Ghost    600  80     110       60   
798  720     HoopaHoopa Unbound  Psychic    Dark    680  80     160       60   
799  721              Volcanion     Fire   Water    600  80     110      120   

     Sp. Atk  Sp. Def  Speed Generation

## Aggregate Statistics (Groupby)

In [73]:
df = pd.read_csv('modified.csv', delimiter='\t')

# Adding a count column to the DataFrame, useful if your counting method has NaN in some locations:
df['count'] = 1

print(df)

# Group by single label:
df.groupby(['Type 1']).count() #include ['count'] to only show count column

# Group by multiple labels:
df.groupby(['Type 1', 'Type 2']).count()['count']

# Find best label 1 (Type 1) for given value of label 2 (HP):
# df.groupby(['Type 1']).mean().sort_values('HP', ascending = False)

# can also use sum(), mean(), count(), etc...
# df.groupby(['Type 1']).count()

       #                   Name   Type 1  Type 2  Total  HP  Attack  Defense  \
0      1              Bulbasaur    Grass  Poison    318  45      49       49   
1      2                Ivysaur    Grass  Poison    405  60      62       63   
2      3               Venusaur    Grass  Poison    525  80      82       83   
3      3  VenusaurMega Venusaur    Grass  Poison    625  80     100      123   
4      4             Charmander     Fire     NaN    309  39      52       43   
..   ...                    ...      ...     ...    ...  ..     ...      ...   
795  719                Diancie     Rock   Fairy    600  50     100      150   
796  719    DiancieMega Diancie     Rock   Fairy    700  50     160      110   
797  720    HoopaHoopa Confined  Psychic   Ghost    600  80     110       60   
798  720     HoopaHoopa Unbound  Psychic    Dark    680  80     160       60   
799  721              Volcanion     Fire   Water    600  80     110      120   

     Sp. Atk  Sp. Def  Speed  Generatio

Type 1  Type 2  
Bug     Electric     2
        Fighting     2
        Fire         2
        Flying      14
        Ghost        1
                    ..
Water   Ice          3
        Poison       3
        Psychic      5
        Rock         4
        Steel        1
Name: count, Length: 136, dtype: int64

## Working with large amounts of data

In [72]:
# Loading data in segments:
new_df = pd.DataFrame(columns=df.columns)

for df in pd.read_csv('modified.csv', delimiter='\t', chunksize=5): # Chunk size = 5 rows passed in at a time
    results = df.groupby(['Type 1']).count()
    
    new_df = pd.concat([new_df, results])
    
# # To see chunk working:
# for df in pd.read_csv('modified.csv', delimiter='\t', chunksize=5):
#     print ('CHUNK DF')
#     print(df)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  
