In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the Pokemon CSV on GitHub. The two adjacent literals are joined
# by Python into one string, so the path is unchanged.
# NOTE(review): 'classfication' is the spelling used in the path as given —
# confirm against the upstream repo before "correcting" it.
url = (
    'https://raw.githubusercontent.com/digipodium/Datasets/main/'
    'classfication/pokemon_type.csv'
)
# read_csv downloads and parses the file; df is a pandas DataFrame object.
df = pd.read_csv(url)

In [None]:
# Optional: remove the cap on displayed columns so wide frames show every column.
pd.options.display.max_columns = None

In [None]:
# A bare expression on the last line of a cell is shown with the notebook's
# rich table display; wrapping it in print() would give plain text instead.
df # don't display df using the print() function

# basic pandas operations
- `head(n)` : returns the first n rows of a DataFrame
- `tail(n)` : returns the last n rows of a DataFrame
- `info()` : Index, Column Datatype and Memory information
- `describe()` : Summary statistics for numerical columns
- `describe(include=['object'])` : Summary statistics for object columns


In [None]:
df.head() # no argument -> head() falls back to its default and shows the first 5 rows

In [None]:
# Passing n explicitly: show the first 10 rows.
df.head(10)

In [None]:
# Prints the index, each column's datatype and memory information.
df.info()

In [None]:
# Summary statistics for the numerical columns.
df.describe()

The `object` dtype in pandas can hold arbitrary Python objects; it is most commonly seen on columns of Python strings (`str`).

In [None]:
# Summary statistics for the object-dtype columns instead of the numerical ones.
df.describe(include=['object'])

# Selection in pandas
- column selection
    - `df_var['column_name']`
    - `df_var.column_name`
    - `df_var[['column_name1', 'column_name2']]`
- row selection
    - `df_var.loc['row_name']`
    - `df_var.iloc[row_index]`
    - `df_var.iloc[row_index, column_index]`

In [None]:
df['Name'] # a single column label inside [] returns a Series

In [None]:
# Attribute-style access to the 'Attack' column. This form only works when the
# column name is a valid Python identifier that does not clash with an existing
# DataFrame attribute or method.
df.Attack # series

In [None]:
df[['Name', 'Attack']] # a list of column labels inside [] returns a DataFrame

In [None]:
# Keep the wanted column labels in a list so the selection is easy to read and reuse.
cols = ['Name','Attack','Defense','Speed', 'HP']
df[cols] # indexing with a list of labels returns a DataFrame

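Column names are case-sensitive. If you prefer lowercase names, assign the result back: `df_var.columns = df_var.columns.str.lower()` (calling `df_var.columns.str.lower()` on its own only returns a new Index and leaves the DataFrame unchanged).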

In [None]:
# List all column names as a plain Python list.
df.columns.tolist()

In [None]:
# Bar chart of Attack for the first 25 rows, one bar per Name.
(
    df.head(25)[['Name', 'Attack']]
    .plot.bar(x='Name', y='Attack', figsize=(15, 5))
)

In [None]:
# Grouped bar chart comparing Attack and Defense for the first 25 rows.
(
    df.head(25)[['Name', 'Attack', 'Defense']]
    .plot.bar(x='Name', y=['Attack', 'Defense'], figsize=(15, 5))
)

row selection

In [None]:
df.loc[4] # .loc is label-based: 4 is looked up as an index label, not as a position

In [None]:
df.iloc[100] # .iloc is position-based: 100 is the zero-based row position

In [None]:
df.iloc[5:15] # positional slice, end-exclusive: positions 5..14 -> 10 rows

In [None]:
df.iloc[5:15, 2: 6] # rows at positions 5..14 (10 rows), columns at positions 2..5 (4 columns)

# basic mathematical operations
`df_var['column_name'].operation()`
- sum()
- mean()
- median()
- mode()
- std()
- var()

In [None]:
# Each call reduces one column (a Series) to a single value.
print(df.Attack.sum())
print(df.Attack.mean())
# argmax() gives the integer position of the maximum (use idxmax() for the
# index label); the two only coincide on a default 0..n-1 index.
print(df.Attack.max(), '@', df.Attack.argmax())
print(df.Defense.std())
print(df.Defense.median())

# statistical operations on dataframe
- `df_var.corr()` : pairwise correlation between columns (defined for numerical columns only — select the numeric columns first or pass `numeric_only=True`)
- `df_var.count()` : count of non-null values in each column
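- `df_var.max()` : maximum value in each column (pass `numeric_only=True`, or select the numeric columns first, to restrict it to numerical columns)
- `df_var.min()` : minimum value in each column (pass `numeric_only=True`, or select the numeric columns first, to restrict it to numerical columns)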

In [None]:
# Pairwise correlation matrix of the selected stat columns.
(
    df.loc[:, ['Attack', 'Defense', 'Speed', 'HP', 'Total']]
    .corr()
)

In [None]:
# Scatter plot of Defense against Attack ('scatter' is relplot's default kind).
sns.relplot(data=df, x='Attack', y='Defense', kind='scatter')

In [None]:
df.count() # number of non-null values in each column

In [None]:
# Keep only the numeric columns, then take the maximum of each one.
(
    df.select_dtypes(include=np.number)
    .max()
)

dataframe manipulation
- sorting
    - `df_var.sort_values(by='column_name')`
    - `df_var.sort_values(by='column_name', ascending=False)`
- adding new columns
    - `df_var['new_column_name'] = df_var['column_name1'] + df_var['column_name2']`
- dropping columns
    - `df_var.drop('column_name', axis=1)`

In [None]:
# Sort ascending by Attack and keep the last 25 rows (the 25 highest Attack
# values), then chart Attack, Defense and Speed side by side for each Name.
(
    df.sort_values(by='Attack')
    .tail(25)
    .plot.bar(x='Name', y=['Attack', 'Defense', 'Speed'], figsize=(15, 5))
)