In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

### What you need to learn
- loading data ✅
- selection
    - cols ✅
    - rows ✅
- manipulation
    - add column ✅
    - remove columns ✅
    - rename columns ✅
    - change data type
    - delete data
    - remove duplicates
    - sort
    - group
    - pivot
    - merge
- visualization
    - line plot
    - bar plot
    - histogram
    - box plot
    - scatter plot
    - pie chart
    - heatmap
    - word cloud

In [None]:
# Load seaborn's bundled "penguins" example dataset as a DataFrame.
# NOTE(review): seaborn fetches example datasets online unless already cached - confirm
# the notebook is run with network access the first time.
penguins = sns.load_dataset('penguins')
penguins

- analysis
    - univariate analysis
    - bi-variate analysis
    - multi-variate analysis

- loading data
    - custom data from a variable
    - excel data
    - csv/ json/ etc
    - database (sql)

In [None]:
# Custom data held in a plain Python variable: each key becomes a column
# and each list supplies that column's values, one entry per row.
students = {
    'name': [
        'Vaibhav', 'Siya', 'Aman', 'Amjad', 'Sneha', 'Aditi', 'Bhupendra',
    ],
    'college': [
        'SRMCEM', 'SRMCEM', 'KMC', 'KMC', 'CU', 'SRMCEM', 'SRMCEM',
    ],
    'dob': [
        '25 Jan', '2 Feb', '2 Apr', '16 Aug', '16 Sep', '17 Dec', '22 May',
    ],
}

# Build the frame from the dict of equal-length lists and display it.
sdf = pd.DataFrame(data=students)
sdf

In [None]:
# Read an Excel workbook into a DataFrame.
# NOTE(review): this is an absolute path on one specific machine; the cell fails
# anywhere else unless the file exists at the same location.
excel_path = 'c:/users/zaid/Documents/example_dataset.xlsx'
data = pd.read_excel(excel_path)
data

In [None]:
# Print a summary of the frame: columns, non-null counts, dtypes and memory usage.
data.info()

In [None]:
# Column labels as a plain Python list.
data.columns.tolist()

In [None]:
# Bracket (dictionary-style) access selects a single column by its label.
data['Item']

In [None]:
# Attribute access also selects a column. It only works when the label is a valid
# Python identifier and does not clash with an existing DataFrame attribute or method.
data.amt

In [None]:
# General form: data[list_of_column_labels]
# Passing a list of labels selects several columns at once.
data[['Item','amt']]

operations between columns (numeric)

In [None]:
# Row-by-row product of the two columns; the result is only displayed, not stored.
data['amt'] * data['qty']

adding columns

In [None]:
# General form: data['new_col'] = expression
# Assigning to a label that does not exist yet adds it as a new column.
data['total'] = data['amt'] * data['qty']
data

In [None]:
# Assigning a single value fills every row of the new column with that value.
data['crap'] = 'some crap'
data

removing columns

In [None]:
# inplace=True modifies `data` itself; without it, drop() returns a new frame
# and leaves `data` untouched.
data.drop(columns=['crap'], inplace=True)
data

In [None]:
# iloc selects by integer position: position 0 is the first row.
data.iloc[0]

In [None]:
# Positional slice: rows at positions 0 and 1 (the stop position is excluded).
data.iloc[:2]

In [None]:
# Returns a new frame without the row labelled 2. `data` is not modified here:
# the result is neither assigned nor dropped in place.
data.drop(index=[2])

In [None]:
# Display `data` again to check the frame after the non-assigned drop above.
data

In [None]:
# Display the students frame.
sdf

In [None]:
# Overwrite an existing column: reduce every quantity by 10.
# NOTE: this cell is not idempotent - each re-run subtracts another 10.
data['qty'] = data['qty'] - 10
# 'total' was computed as amt * qty when the column was added, so it must be
# recomputed here; otherwise it keeps the values for the old quantities.
data['total'] = data['amt'] * data['qty']

In [None]:
# Display the frame to check the updated qty values.
data

In [None]:
# List the current column labels before renaming them.
data.columns.tolist()

In [None]:
# Old label -> new label. Labels that are not listed keep their current name.
new_labels = {
    'Item': 'product',
    'qty': 'quantity',
    'amt': 'Price',
    'total': 'Expenditure',
}

# columns=... targets the column axis; inplace=True renames on `data` itself.
data.rename(columns=new_labels, inplace=True)
data

23-oct-23

Operations
- delete data (drop)
    - columns
    - indexes
- remove duplicates
- sort
- group
- pivot
- merge

In [None]:
# Column labels of the penguins frame as a plain list.
penguins.columns.tolist()

In [None]:
# A single label drops one column. The result is a new frame; penguins is unchanged.
penguins.drop(columns='body_mass_g')

In [None]:
# A list of labels drops several columns at once (again without modifying penguins).
penguins.drop(columns=['island','body_mass_g'])

In [None]:
# Drop the row whose index label is 2 - the third row, assuming the default
# 0-based index is still in place.
penguins.drop(index=2)

In [None]:
# Drop several rows by passing a list of index labels.
penguins.drop(index=[0,1,2,3,4])

In [None]:
# Any iterable of labels works: range(10) drops the rows labelled 0 through 9.
penguins.drop(index=range(10))

In [None]:
# NumPy's NaN ("not a number") value, which pandas uses to mark missing data.
np.nan

Find the number of missing values in each column

In [None]:
# isnull() flags every missing cell as True; summing the booleans counts them per column.
penguins.isnull().sum()

In [None]:
# Heatmap of the boolean null mask (rows x columns): shows where values are missing
# across the whole frame. cbar=False hides the colour bar.
sns.heatmap(penguins.isnull(), cbar=False)

In [None]:
# Bar chart of the number of missing values in each column.
penguins.isnull().sum().plot(kind='bar')

removing missing values (nan values)

In [None]:
# Returns a new frame without the rows that contain any missing value.
# The result is only displayed; penguins itself still holds those rows.
penguins.dropna()

In [None]:
# (rows, columns) of the frame as it currently stands.
penguins.shape

In [None]:
# Shape the frame would have once rows with missing values are removed.
penguins.dropna().shape

In [None]:
# Keep only the rows without missing values. Reassigning the returned frame is
# preferred over inplace=True and yields the same data.
penguins = penguins.dropna()
# Shape after the drop, shown as the cell's output.
penguins.shape

In [None]:
# Example: treat rows with the same bill_length_mm as duplicates and keep the first
# of each. Returns a new frame; the result is not stored.
penguins.drop_duplicates(subset=['bill_length_mm'])

In [None]:
# Sort the rows by the sex column in descending order (returns a new, sorted frame).
penguins.sort_values(by='sex',ascending=False)

In [None]:
# Sort by sex first, then by island within each sex (ascending is the default).
penguins.sort_values(by=['sex','island'])

datatypes in pandas
- int
- float
- boolean
- datetime
- object (if datatype is not understood by pandas then object is used)
- string (text is stored as `object` by default; the dedicated `string` dtype is used only if you request it, e.g. `.astype('string')`)
- category

In [None]:
# Print a summary of the frame: columns, non-null counts, dtypes and memory usage.
penguins.info()

In [None]:
# Data type of each column.
penguins.dtypes

In [None]:
# Keep only the numeric columns.
penguins.select_dtypes(include='number')

In [None]:
# Keep only the columns stored with the object dtype.
penguins.select_dtypes(include='object')

In [None]:
# exclude= inverts the filter: keep every column that is not numeric.
penguins.select_dtypes(exclude='number')

change the datatype of a column

In [None]:
# astype() returns a converted copy. Without assigning it back, the column in
# penguins keeps its original dtype.
penguins['island'].astype('category')

In [None]:
# Assign the converted Series back so the island column is stored as category.
penguins['island'] = penguins['island'].astype('category')

In [None]:
# Check the column dtypes after the conversion.
penguins.dtypes

In [None]:
# Casting text to bool with astype('bool') marks every non-empty string as True,
# so 'Male' and 'Female' would both become True and the information would be lost.
# Build the boolean from an explicit comparison instead, and keep the original
# sex column so its labels stay available.
penguins['is_male'] = penguins['sex'] == 'Male'
penguins.head()

simple visualization - univariate
- for numerical data
    - line plot -> `.plot()`
    - bar plot -> `.plot(kind='bar')`
    - box plot -> `.plot(kind='box')`
    - area plot -> `.plot(kind='area')`
    - histogram -> `.hist()`
- for categorical data
    - bar plot -> `.plot(kind='bar')`
    - pie chart -> `.plot(kind='pie')`


In [None]:
# Histogram of bill depth: how the values are distributed.
penguins['bill_depth_mm'].hist()

In [None]:
# Default plot kind is a line: bill depth against the row index, on a 15x5 inch figure.
penguins['bill_depth_mm'].plot(figsize=(15,5))

In [None]:
# Same series drawn as an area plot (the region under the line is filled).
penguins['bill_depth_mm'].plot(kind='area', figsize=(15,5))

In [None]:
# Box plot of bill depth: median, quartiles and outliers at a glance.
penguins['bill_depth_mm'].plot(kind='box', figsize=(15,5))

In [None]:
# Column labels (as a pandas Index) to pick from for the next plots.
penguins.columns

In [None]:
# vert=False lays the box plot out horizontally; a smaller 5x3 inch figure is enough.
penguins['bill_depth_mm'].plot(kind='box', figsize=(5,3), vert=False)

In [None]:
# Number of rows for each island.
penguins['island'].value_counts()

In [None]:
# Categorical data: plot the per-island counts as a bar chart.
penguins['island'].value_counts().plot(kind='bar')

comparing univariate data in a single graph

In [None]:
# Select the two bill measurements that are compared in the following plots.
penguins[['bill_depth_mm','bill_length_mm']]

In [None]:
# Plotting a DataFrame draws one line per column on shared axes.
penguins[['bill_depth_mm','bill_length_mm']].plot()

In [None]:
# One box per column, side by side on the same axes.
penguins[['bill_depth_mm','bill_length_mm']].plot(kind='box', figsize=(15,5))

In [None]:
# One histogram per column, each with 20 bins.
penguins[['bill_depth_mm','bill_length_mm']].hist(bins=20, figsize=(10,5))

Plotting each column in its own subplot

In [None]:
# The two bill measurements, each drawn in its own panel.
bill_columns = penguins[['bill_depth_mm','bill_length_mm']]

# subplots=True gives every column a separate axes; layout=(1, 2) arranges
# them as one row of two panels.
bill_columns.plot(subplots=True, layout=(1, 2), figsize=(15,5))

the power of seaborn - better visualization

In [None]:
# Distribution plot of bill depth (displot draws a histogram by default).
sns.displot(data=penguins, x='bill_depth_mm')

In [None]:
# Distribution of body mass, with hue= colouring the bars by island.
sns.displot(data=penguins, x='body_mass_g', hue='island')

In [None]:
# Count of rows per island, drawn as bars.
sns.countplot(data=penguins, x='island')

In [None]:
# Same counts, split into one bar per species within each island.
sns.countplot(data=penguins, x='island', hue='species')

bivariate analysis
- scatter plot (numeric vs numeric)
- violin plot (alternative to box plot)
- pair plot

In [None]:
# Numeric vs numeric: bill length on x against bill depth on y.
sns.scatterplot(data=penguins, x='bill_length_mm', y='bill_depth_mm')

In [None]:
# Same scatter, with points coloured by species.
sns.scatterplot(data=penguins, x='bill_length_mm', y='bill_depth_mm', hue='species')

In [None]:
# Same scatter, with points coloured by island.
sns.scatterplot(data=penguins, x='bill_length_mm', y='bill_depth_mm', hue='island')

In [None]:
# Returns the 'RdBu' palette so the notebook can preview its colours.
# This call alone does not change the palette used by later plots.
sns.color_palette('RdBu')

In [None]:
# Make 'rainbow_r' the default palette for the plots drawn after this cell.
sns.set_palette('rainbow_r')

In [None]:
# Bill length vs bill depth, coloured by island. The palette= argument chooses
# the colours for this plot explicitly ('hot').
sns.scatterplot(
    data=penguins,
    x='bill_length_mm',
    y='bill_depth_mm',
    hue='island',
    palette='hot',
)