# Pandas - Guided Practice
<img src='data/titanic_bw.jpg' width=80%>

---

### Objective
*Review or introduce the most common methods and uses in Pandas.*

## Import Libraries

In [1]:
# Import Pandas and matplotlib/seaborn for visualizations.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# List the working directory to confirm the `data` folder is present.
!ls

README.md               [34mdata[m[m                    practice-template.ipynb


## Creating Pandas DataFrame

In [5]:
# A list of lists: each inner list becomes one row.
# With no labels supplied, rows and columns are numbered from 0.
pd.DataFrame([[1,2,3,4,5],[6,7,8,9,10]])

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,5
1,6,7,8,9,10


In [11]:
# A list of dictionaries: one dictionary per row, keys become the column labels.
# A key missing from a row shows up as NaN in that row.
# A JSON file in this format is handled easily with pandas.
a = pd.DataFrame(
    [
        dict(a=1, b=10),
        dict(a=4, b=6),
        dict(a=3, b=9),
        dict(c=3, a=5),
    ]
)
a

Unnamed: 0,a,b,c
0,1,10.0,
1,4,6.0,
2,3,9.0,
3,5,,3.0


In [12]:
# The raw records as a plain Python list of dictionaries.
# Note: this rebinds `a`, so from here on `a` is a list, not a DataFrame.
a = [
    dict(a=1, b=10),
    dict(a=4, b=6),
    dict(a=3, b=9),
    dict(c=3, a=5),
]
a

[{'a': 1, 'b': 10}, {'a': 4, 'b': 6}, {'a': 3, 'b': 9}, {'c': 3, 'a': 5}]

In [15]:
# A dictionary of lists: each key becomes a column label and
# each list supplies that column's values, one per row.
pd.DataFrame({'Age': [25,31,45,64],
               'Gender':['M','F','F','M']})

Unnamed: 0,Age,Gender
0,25,M
1,31,F
2,45,F
3,64,M


In [31]:
# A dictionary of one-element lists gives a single-row DataFrame.
pd.DataFrame({'Age': [4], 'Gender':['M']})

Unnamed: 0,Age,Gender
0,4,M


In [28]:
# A one-element list of dictionaries also gives a single row; the keys become the columns.
pd.DataFrame([{'x':1,'y':2,'z':3 }])

Unnamed: 0,x,y,z
0,1,2,3


## Loading Data from `csv`

In [16]:
# List the working directory; the csv to load lives inside the `data` folder.
!ls

README.md               [34mdata[m[m                    practice-template.ipynb


In [32]:
# Load the csv into a DataFrame, save it as the variable `df`, and show it.
df = pd.read_csv('data/titanic.csv')
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


## View Summary and Description of Data

In [24]:
# View summary of data: index range, column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     891 non-null    int64  
 1   pclass       891 non-null    int64  
 2   sex          891 non-null    object 
 3   age          714 non-null    float64
 4   sibsp        891 non-null    int64  
 5   parch        891 non-null    int64  
 6   fare         891 non-null    float64
 7   embarked     889 non-null    object 
 8   class        891 non-null    object 
 9   who          891 non-null    object 
 10  adult_male   891 non-null    bool   
 11  deck         203 non-null    object 
 12  embark_town  889 non-null    object 
 13  alive        891 non-null    object 
 14  alone        891 non-null    bool   
dtypes: bool(2), float64(2), int64(4), object(7)
memory usage: 92.4+ KB


In [29]:
# View descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


## Explore
- Check number of rows / columns.
- Look at individual columns.
- Check if (and where) there are missing values.

In [36]:
# Check shape: a tuple of (number of rows, number of columns).
df.shape

(891, 15)

In [33]:
# len(df) works but is not preferred: it only gives the length of the
# first dimension (the number of rows).
len(df)

891

In [34]:
# This is better: `shape` holds every dimension, and element 0 is the row count.
df.shape[0]

891

In [35]:
# Number of columns: element 1 of `shape`.
df.shape[1]

15

In [None]:
# Check for missing values.


In [38]:
df[['fare']]  # double brackets (a list of labels) -> one-column DataFrame, 2-D like a matrix

Unnamed: 0,fare
0,7.2500
1,71.2833
2,7.9250
3,53.1000
4,8.0500
...,...
886,13.0000
887,30.0000
888,23.4500
889,30.0000


In [39]:
df['fare']   # single brackets (one label) -> Series, 1-D like a vector

0       7.2500
1      71.2833
2       7.9250
3      53.1000
4       8.0500
        ...   
886    13.0000
887    30.0000
888    23.4500
889    30.0000
890     7.7500
Name: fare, Length: 891, dtype: float64

In [40]:
# Underlying NumPy array of the Series: 1-D, one entry per row.
# `.to_numpy()` is the accessor the pandas documentation recommends over `.values`.
df['fare'].to_numpy()

array([  7.25  ,  71.2833,   7.925 ,  53.1   ,   8.05  ,   8.4583,
        51.8625,  21.075 ,  11.1333,  30.0708,  16.7   ,  26.55  ,
         8.05  ,  31.275 ,   7.8542,  16.    ,  29.125 ,  13.    ,
        18.    ,   7.225 ,  26.    ,  13.    ,   8.0292,  35.5   ,
        21.075 ,  31.3875,   7.225 , 263.    ,   7.8792,   7.8958,
        27.7208, 146.5208,   7.75  ,  10.5   ,  82.1708,  52.    ,
         7.2292,   8.05  ,  18.    ,  11.2417,   9.475 ,  21.    ,
         7.8958,  41.5792,   7.8792,   8.05  ,  15.5   ,   7.75  ,
        21.6792,  17.8   ,  39.6875,   7.8   ,  76.7292,  26.    ,
        61.9792,  35.5   ,  10.5   ,   7.2292,  27.75  ,  46.9   ,
         7.2292,  80.    ,  83.475 ,  27.9   ,  27.7208,  15.2458,
        10.5   ,   8.1583,   7.925 ,   8.6625,  10.5   ,  46.9   ,
        73.5   ,  14.4542,  56.4958,   7.65  ,   7.8958,   8.05  ,
        29.    ,  12.475 ,   9.    ,   9.5   ,   7.7875,  47.1   ,
        10.5   ,  15.85  ,  34.375 ,   8.05  , 263.    ,   8.0

In [41]:
# Underlying NumPy array of the one-column DataFrame: 2-D, one single-element row per record.
# `.to_numpy()` is the accessor the pandas documentation recommends over `.values`.
df[['fare']].to_numpy()

array([[  7.25  ],
       [ 71.2833],
       [  7.925 ],
       [ 53.1   ],
       [  8.05  ],
       [  8.4583],
       [ 51.8625],
       [ 21.075 ],
       [ 11.1333],
       [ 30.0708],
       [ 16.7   ],
       [ 26.55  ],
       [  8.05  ],
       [ 31.275 ],
       [  7.8542],
       [ 16.    ],
       [ 29.125 ],
       [ 13.    ],
       [ 18.    ],
       [  7.225 ],
       [ 26.    ],
       [ 13.    ],
       [  8.0292],
       [ 35.5   ],
       [ 21.075 ],
       [ 31.3875],
       [  7.225 ],
       [263.    ],
       [  7.8792],
       [  7.8958],
       [ 27.7208],
       [146.5208],
       [  7.75  ],
       [ 10.5   ],
       [ 82.1708],
       [ 52.    ],
       [  7.2292],
       [  8.05  ],
       [ 18.    ],
       [ 11.2417],
       [  9.475 ],
       [ 21.    ],
       [  7.8958],
       [ 41.5792],
       [  7.8792],
       [  8.05  ],
       [ 15.5   ],
       [  7.75  ],
       [ 21.6792],
       [ 17.8   ],
       [ 39.6875],
       [  7.8   ],
       [ 76.

In [57]:
df[['fare','sex']]  # plain brackets (no .loc) are fine for a quick, read-only look at columns

Unnamed: 0,fare,sex
0,7.2500,male
1,71.2833,female
2,7.9250,female
3,53.1000,female
4,8.0500,male
...,...,...
886,13.0000,male
887,30.0000,female
888,23.4500,female
889,30.0000,male


In [64]:
# Prefer .loc[rows, columns] whenever you update data: it addresses the original frame
# in one step, whereas chained indexing such as df[cond]['col'] = value triggers
# SettingWithCopyWarning. Here `:` selects every row.
df.loc[:,['fare', 'sex']]


Unnamed: 0,fare,sex
0,7.2500,male
1,71.2833,female
2,7.9250,female
3,53.1000,female
4,8.0500,male
...,...,...
886,13.0000,male
887,30.0000,female
888,23.4500,female
889,30.0000,male


## Alter
- Group By.
- Drop rows.
- Drop columns.
- Slice out data based on some conditions.
- Alter values based on some criteria.

In [49]:
# Boolean mask with the same shape as `df`: True wherever a value is missing.
df.isna()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
887,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
888,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False
889,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [48]:
# Count the missing values in each column (each True counts as 1).
df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [51]:
df.isna().sum(axis=1)  # axis=1 sums across the columns -> number of missing values in each row

0      1
1      0
2      1
3      0
4      1
      ..
886    1
887    0
888    2
889    0
890    1
Length: 891, dtype: int64

In [53]:
# Use `groupby` to split the rows by `sex` and average each column per group;
# the group labels become the index of the result.
# numeric_only=True leaves out text columns such as `embarked`, which cannot be
# averaged (pandas >= 2.0 raises a TypeError on them unless they are excluded).
df.groupby('sex').mean(numeric_only=True)

Unnamed: 0_level_0,survived,pclass,age,sibsp,parch,fare,adult_male,alone
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
female,0.742038,2.159236,27.915709,0.694268,0.649682,44.479818,0.0,0.401274
male,0.188908,2.389948,30.726645,0.429809,0.235702,25.523893,0.930676,0.712305


In [62]:
# Group by two keys: the result is indexed by every (sex, survived) combination.
# numeric_only=True leaves out text columns such as `embarked`, which cannot be
# averaged (pandas >= 2.0 raises a TypeError on them unless they are excluded).
df.groupby(['sex', 'survived']).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,pclass,age,sibsp,parch,fare,adult_male,alone
sex,survived,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
female,0,2.851852,25.046875,1.209877,1.037037,23.024385,0.0,0.333333
female,1,1.918455,28.847716,0.515021,0.515021,51.938573,0.0,0.424893
male,0,2.476496,31.618056,0.440171,0.207265,21.960993,0.959402,0.741453
male,1,2.018349,27.276022,0.385321,0.357798,40.821484,0.807339,0.587156


In [67]:
# Apply several aggregations at once: every value column gets a `mean` and a
# `median` sub-column for each (sex, survived) group.
# Only numeric/boolean columns are aggregated; mean and median are undefined for
# text columns and pandas >= 2.0 raises a TypeError if they are left in.
group_keys = ['sex', 'survived']
value_cols = [
    col
    for col in df.select_dtypes(include=['number', 'bool']).columns
    if col not in group_keys
]
df.groupby(group_keys)[value_cols].agg(['mean', 'median'])

Unnamed: 0_level_0,Unnamed: 1_level_0,pclass,pclass,age,age,sibsp,sibsp,parch,parch,fare,fare,adult_male,adult_male,alone,alone
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,mean,median,mean,median,mean,median,mean,median,mean,median,mean,median
sex,survived,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
female,0,2.851852,3,25.046875,24.5,1.209877,1,1.037037,0,23.024385,15.2458,0.0,False,0.333333,False
female,1,1.918455,2,28.847716,28.0,0.515021,0,0.515021,0,51.938573,26.0,0.0,False,0.424893,False
male,0,2.476496,3,31.618056,29.0,0.440171,0,0.207265,0,21.960993,9.41665,0.959402,True,0.741453,True
male,1,2.018349,2,27.276022,28.0,0.385321,0,0.357798,0,40.821484,26.2875,0.807339,True,0.587156,True


In [None]:
# Dropping missing values.


In [None]:
# Dropping rows.


In [None]:
# Dropping columns.


In [None]:
# Slicing data based on a condition. (using square brackets)


In [None]:
# Preferred method (using .loc)


In [None]:
# Altering data based on some criteria.


### Simple Plotting

In [None]:
# Survivors.


In [None]:
# Filtering


In [None]:
# Histogram showing male / female ages.
## LONGHAND: matplotlib and pandas filtering.


In [None]:
## Shorthand: seaborn.


In [None]:
# Most popular `embark_town`.
