In [None]:
import pandas as pd # data frame library
import numpy as np # scientific computing library
from plotnine import * # plotting library

import warnings
warnings.filterwarnings('ignore')

### Let's go over the basics of plotnine/ggplot

##### First we load a dataset

In [None]:
# Read the CSV (path is relative to the notebook's working directory) into a DataFrame
titanic = pd.read_csv(filepath_or_buffer="02-titanic.csv")

# Preview the first five rows
titanic.head(5)

### Next, we will always use the following combination to draw a plot:

##### 1) What data to use
##### 2) What plot to show
##### 3) Add more information to display

![Illustration of the plot-building steps](https://miro.medium.com/max/1593/1*hd6-LkI_sy4b4nu720eV_A.png)

### 1) What data to use

#### We first call the ggplot function and pass our data variable to it


In [None]:
# Only the data is supplied here: no aesthetics and no layers have been added yet
ggplot(data=titanic)

#### Depending on the plot, we also need to specify what the x and y axes are

In [None]:
# Map `age` to the x axis and `fare` to the y axis; still no geom layer
ggplot(
    data=titanic,
    mapping=aes(x='age', y='fare'),
)

#### Next we specify what kind of plot we want to show

In [None]:
# Add a point layer on top of the age/fare mapping to get a scatter plot
(
    ggplot(data=titanic, mapping=aes(x='age', y='fare'))
    + geom_point()
)

#### And finally, we get to add extra information to the plot

In [None]:
# Same scatter plot, with a title attached through labs()
(
    ggplot(data=titanic, mapping=aes(x='age', y='fare'))
    + geom_point()
    + labs(title="Scatter Plot (age vs fare)")
)

#### Gaining more insight

In [None]:
# Colour each point by the `sex` column to compare the groups
(
    ggplot(data=titanic, mapping=aes(x='age', y='fare', color='sex'))
    + geom_point()
    + labs(title="Scatter Plot (age vs fare)")
)

In [None]:
# Store `survived` as a categorical column.
# NOTE(review): presumably so plotnine uses a discrete colour scale for it - confirm.
titanic['survived'] = pd.Categorical(values=titanic['survived'])

# Scatter plot of age vs fare, coloured by the (now categorical) `survived` column
(
    ggplot(data=titanic, mapping=aes(x='age', y='fare', color='survived'))
    + geom_point()
    + labs(title="Scatter Plot (age vs fare)")
)

### Let's look at other types of plots:

## Histogram
* Shows the underlying frequency distribution of continuous data
* Helps identify the distribution, outliers, and skewness of the data

#### We plot histograms of attributes one at a time

In [None]:
# Histogram of `age` using plotnine's default binning
(
    ggplot(data=titanic, mapping=aes(x='age'))
    + geom_histogram()
)

#### We get to see that the median age lies roughly between 20 and 30 years

In [None]:
# Summary statistics for the `age` column, to check the histogram reading numerically
titanic.loc[:, 'age'].describe()

#### We can modify how many bins are generated to better visualize the plot

In [None]:
# Same histogram, with the number of bins set explicitly to 15
(
    ggplot(data=titanic, mapping=aes(x='age'))
    + geom_histogram(bins=15)
)

![Histogram example](https://chartio.com/assets/7fe114/tutorials/charts/histograms/64918b209c6e60b56bee9d8c7ba22dcd854370078e6b8377f272c85119080728/histogram-example-2.png)

## Bar Plot
* Represents categorical data with rectangular bars with height/length proportional to some attribute

In [None]:
# Bar plot counting the rows in each `sex` category.
# Open question for the reader: how to add coloration?
(
    ggplot(data=titanic, mapping=aes(x='sex'))
    + geom_bar()
)

In [None]:
def num2cat(value, low_max=8, mid_max=31):
    """Bucket a numeric value into the labels 'low', 'mid' or 'high'.

    Parameters
    ----------
    value : number
        The value to classify (the notebook applies this to the `fare` column).
    low_max : number, default 8
        Inclusive upper bound of the 'low' bucket.
    mid_max : number, default 31
        Inclusive upper bound of the 'mid' bucket.

    Returns
    -------
    str or missing value
        'low' when ``value <= low_max``, 'mid' when ``low_max < value <= mid_max``
        and 'high' when ``value > mid_max``. A missing value (NaN/None) is
        returned unchanged instead of being given a label.
    """
    # NaN fails every comparison, so without this guard a missing value would
    # fall through to the last branch and be reported as 'high'.
    if pd.isna(value):
        return value
    # No lower bound: a value of exactly 0 (or below) belongs to the lowest bucket.
    if value <= low_max:
        return 'low'
    if value <= mid_max:
        return 'mid'
    return 'high'

# Derive a `fare_cat` column by running num2cat over every value of `fare`
titanic['fare_cat'] = titanic['fare'].apply(func=num2cat)

In [None]:
# Count of rows per fare category; mapping `fill` to the same column colours each bar
(
    ggplot(data=titanic, mapping=aes(x='fare_cat', fill='fare_cat'))
    + geom_bar()
)

##### Q. Can a bar plot only show the counts for each category?

In [None]:
# Mean of every numeric column within each fare category.
# numeric_only=True is passed explicitly: pandas >= 2.0 no longer drops non-numeric
# columns (such as the categorical `survived`) automatically and raises TypeError.
titanic.groupby(['fare_cat']).mean(numeric_only=True)

In [None]:
# Restrict the aggregation to `age`: mean age within each fare category
(
    titanic
    .groupby(by=['fare_cat'])[['age']]
    .mean()
)

In [None]:
# Mean age per fare category; reset_index() turns `fare_cat` back into a regular
# column so it can be referenced by name in aes()
mean_age = (
    titanic
    .groupby(by=['fare_cat'])[['age']]
    .mean()
    .reset_index()
)
mean_age

In [None]:
# stat='identity' makes bar height come from the `age` column itself rather than a row count
(
    ggplot(data=mean_age, mapping=aes(x='fare_cat', y='age'))
    + geom_bar(stat='identity')
)

## Box & Whisker Plot
* Depicts numerical data through quartiles
* Ability to detect outliers and overall spread of data

In [None]:
# Box plot of `age` for each `survived` value, with the boxes filled by the same column
(
    ggplot(data=titanic, mapping=aes(x='survived', y='age', fill='survived'))
    + geom_boxplot()
)

In [None]:
# Box plot of `age` for each fare category
(
    ggplot(data=titanic, mapping=aes(x='fare_cat', y='age', fill='fare_cat'))
    + geom_boxplot()
)

In [None]:
# Same box plot with the individual observations drawn on top as points
(
    ggplot(data=titanic, mapping=aes(x='fare_cat', y='age', fill='fare_cat'))
    + geom_boxplot()
    + geom_point()
)

In [None]:
# Fare category on x, but fill mapped to `survived`: the boxes within each fare
# category are split by survival value
(
    ggplot(data=titanic, mapping=aes(x='fare_cat', y='age', fill='survived'))
    + geom_boxplot()
)