# Pandas - Guided Practice
<img src='data/titanic_bw.jpg' width=80%>

---

### Objective
*Review or introduce the most common methods and uses in Pandas.*

## Import Library

In [1]:
# Import Pandas and matplotlib/seaborn for visualizations.
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

## Creating Pandas DataFrame

In [4]:
# A list of lists: each inner list becomes one row of the DataFrame.
pd.DataFrame(data=[
    [1, 2, 3, 4, 5],   # row 0
    [6, 7, 8, 9, 10],  # row 1
])

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,5
1,6,7,8,9,10


In [6]:
# A list of dictionaries: each dict is one row and its keys are the column names.
# Key order inside a dict does not matter; values are matched to columns by key.
# (Alternative input to try: [{'a': 1}, {'b': 2}, {'c': 3}], where no two rows
# share a key.)
a = [
    dict(a=1, b=10),
    dict(b=5, a=11),
    dict(a=50, b=100),
]
pd.DataFrame(a)

Unnamed: 0,a,b
0,1,10
1,11,5
2,50,100


In [7]:
# A dictionary of lists: each key is a column name and its list holds that
# column's values, one per row.
pd.DataFrame(dict(
    Age=[25, 31, 65, 41],
    Gender=['M', 'F', 'F', 'M'],
))

Unnamed: 0,Age,Gender
0,25,M
1,31,F
2,65,F
3,41,M


## Loading Data from `csv`

In [8]:
!ls

2022-04-07.ipynb        [34mdata[m[m
README.md               practice-template.ipynb


In [11]:
# Load the Titanic data from `data/titanic.csv` (a path relative to the
# notebook's working directory) and save it as `df`, which the cells below use.
# Ending the cell with `df` shows the dataframe.
df = pd.read_csv('data/titanic.csv')
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


## View Summary and Description of Data

In [13]:
# View summary of data: number of entries, column names, non-null counts,
# dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     891 non-null    int64  
 1   pclass       891 non-null    int64  
 2   sex          891 non-null    object 
 3   age          714 non-null    float64
 4   sibsp        891 non-null    int64  
 5   parch        891 non-null    int64  
 6   fare         891 non-null    float64
 7   embarked     889 non-null    object 
 8   class        891 non-null    object 
 9   who          891 non-null    object 
 10  adult_male   891 non-null    bool   
 11  deck         203 non-null    object 
 12  embark_town  889 non-null    object 
 13  alive        891 non-null    object 
 14  alone        891 non-null    bool   
dtypes: bool(2), float64(2), int64(4), object(7)
memory usage: 92.4+ KB


In [14]:
# View descriptive statistics of data (count, mean, std, min, quartiles, max).
# Only the numeric columns appear in the result.
df.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [15]:
# Dimensions of the dataframe as a (rows, columns) tuple.
df.shape

(891, 15)

In [17]:
# First 3 rows.
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


In [19]:
# Last rows; with no argument `tail()` returns 5.
df.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


## Explore
- Check number of rows / columns.
- Look at individual columns.
- Check if (and where) there are missing values.

In [20]:
# Number of rows via `len(df)` - works, but not preferred.
len(df)

891

In [23]:
# This is better! `df.shape[0]` is the number of rows.
df.shape[0]

891

In [26]:
# Number of columns: the second element of `df.shape`.
df.shape[1]

15

In [34]:
# Look at a single column: one label in square brackets returns a Series.
df['fare']

0       7.2500
1      71.2833
2       7.9250
3      53.1000
4       8.0500
        ...   
886    13.0000
887    30.0000
888    23.4500
889    30.0000
890     7.7500
Name: fare, Length: 891, dtype: float64

In [35]:
# Look at several columns: a list of labels returns a DataFrame.
df[['fare', 'sex']]

Unnamed: 0,fare,sex
0,7.2500,male
1,71.2833,female
2,7.9250,female
3,53.1000,female
4,8.0500,male
...,...,...
886,13.0000,male
887,30.0000,female
888,23.4500,female
889,30.0000,male


In [39]:
# Check for missing values: `isna()` flags each missing cell and `sum()`
# adds the flags up per column.
df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

## Alter
- Group By.
- Drop rows.
- Drop columns.
- Slice out data based on some conditions.
- Alter values based on some criteria.

In [43]:
# Use `groupby` to aggregate: rows are split by every (sex, survived) pair and
# each remaining column is averaged within its group. The group keys become
# the index of the result.
# `numeric_only=True` restricts the mean to int, float and boolean columns.
# It is required on pandas >= 2.0, where string columns such as `embarked`
# raise a TypeError instead of being dropped silently.
df.groupby(['sex', 'survived']).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,pclass,age,sibsp,parch,fare,adult_male,alone
sex,survived,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
female,0,2.851852,25.046875,1.209877,1.037037,23.024385,0.0,0.333333
female,1,1.918455,28.847716,0.515021,0.515021,51.938573,0.0,0.424893
male,0,2.476496,31.618056,0.440171,0.207265,21.960993,0.959402,0.741453
male,1,2.018349,27.276022,0.385321,0.357798,40.821484,0.807339,0.587156


In [46]:
# Apply several aggregations at once: mean and median of each column per sex.
# The result has two column levels (original column, aggregation name).
# Only numeric and boolean columns are aggregated. They are selected
# explicitly because pandas >= 2.0 raises a TypeError when asked for the
# mean/median of string columns instead of dropping them silently.
numeric_cols = df.select_dtypes(include=['number', 'bool']).columns.tolist()
df.groupby('sex')[numeric_cols].agg(['mean', 'median'])

Unnamed: 0_level_0,survived,survived,pclass,pclass,age,age,sibsp,sibsp,parch,parch,fare,fare,adult_male,adult_male,alone,alone
Unnamed: 0_level_1,mean,median,mean,median,mean,median,mean,median,mean,median,mean,median,mean,median,mean,median
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
female,0.742038,1,2.159236,2,27.915709,27.0,0.694268,0,0.649682,0,44.479818,23.0,0.0,False,0.401274,False
male,0.188908,0,2.389948,3,30.726645,29.0,0.429809,0,0.235702,0,25.523893,10.5,0.930676,True,0.712305,True


In [None]:
# Dropping missing values.


In [None]:
# Dropping rows.


In [None]:
# Dropping columns.


In [None]:
# Slicing data based on a condition. (using square brackets)


In [None]:
# Preferred method (using .loc)


In [None]:
# Altering data based on some criteria.


### Simple Plotting

In [None]:
# Survivors.


In [None]:
# Filtering


In [None]:
# Histogram showing male / female ages.
## Longhand: matplotlib and pandas filtering.


In [None]:
## Shorthand: seaborn.


In [None]:
# Most popular `embark_town`.
