In [2]:
import pandas as pd
import numpy as np

In [19]:
# Sample records that mix several representations of "missing":
# np.nan, None, and the placeholder strings 'NA' / 'Missing'.
people = dict(
    first=['Griffin', 'Jane', 'John', 'Chris', np.nan, None, 'NA'],
    last=['Sargent', 'Doe', 'Doe', 'Sargent', np.nan, np.nan, 'Missing'],
    email=[
        'griffin.a.sargent@gmail.com',
        'jane@email.com',
        'MikeJ@email.com',
        None,
        np.nan,
        'anonymous@email.com',
        'NA',
    ],
    age=['33', '55', '63', '36', None, None, 'Missing'],
)

In [20]:
# Build the frame and normalise the placeholder strings 'NA' and 'Missing'
# to real missing values, so dropna/isna/fillna treat them as missing.
# A single chained replace() avoids mutating the frame in place, which keeps
# this cell idempotent when it is re-run.
df = pd.DataFrame(people).replace(['NA', 'Missing'], np.nan)
df

Unnamed: 0,first,last,email,age
0,Griffin,Sargent,griffin.a.sargent@gmail.com,33.0
1,Jane,Doe,jane@email.com,55.0
2,John,Doe,MikeJ@email.com,63.0
3,Chris,Sargent,,36.0
4,,,,
5,,,anonymous@email.com,
6,,,,


# Drop rows that contain any NA values

In [21]:
# With no arguments, dropna() removes every row that has at least one missing
# value and returns the result as a new frame.
df.dropna()

Unnamed: 0,first,last,email,age
0,Griffin,Sargent,griffin.a.sargent@gmail.com,33
1,Jane,Doe,jane@email.com,55
2,John,Doe,MikeJ@email.com,63


In [22]:
df.dropna(axis='index', how='any')  # Same as df.dropna(): axis='index' and how='any' are the defaults

Unnamed: 0,first,last,email,age
0,Griffin,Sargent,griffin.a.sargent@gmail.com,33
1,Jane,Doe,jane@email.com,55
2,John,Doe,MikeJ@email.com,63


In [23]:
# Drop only the rows in which *all* values are missing, as opposed to
# how='any', which drops a row as soon as any one of its columns is missing.
df.dropna(axis='index', how='all')

Unnamed: 0,first,last,email,age
0,Griffin,Sargent,griffin.a.sargent@gmail.com,33.0
1,Jane,Doe,jane@email.com,55.0
2,John,Doe,MikeJ@email.com,63.0
3,Chris,Sargent,,36.0
5,,,anonymous@email.com,


In [24]:
# Drop every column that contains at least one NA. Here each column has a
# missing value, so only the index remains.
df.dropna(axis='columns', how='any')

0
1
2
3
4
5
6


In [25]:
# Only the 'email' column is inspected when deciding which rows to drop;
# missing values in the other columns are ignored.
df.dropna(axis='index', how='any', subset=['email'])

Unnamed: 0,first,last,email,age
0,Griffin,Sargent,griffin.a.sargent@gmail.com,33.0
1,Jane,Doe,jane@email.com,55.0
2,John,Doe,MikeJ@email.com,63.0
5,,,anonymous@email.com,


In [26]:
# Drop a row only when both 'last' and 'email' are missing; a row with at
# least one of the two present is kept.
df.dropna(axis='index', how='all', subset=['last', 'email'])

Unnamed: 0,first,last,email,age
0,Griffin,Sargent,griffin.a.sargent@gmail.com,33.0
1,Jane,Doe,jane@email.com,55.0
2,John,Doe,MikeJ@email.com,63.0
3,Chris,Sargent,,36.0
5,,,anonymous@email.com,


In [27]:
# Boolean mask with the same shape as df: True wherever a value is missing.
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [33]:
# Replace every missing value with 0. This returns a new frame; df itself is
# left unchanged. Note that the integer 0 also lands in the text columns.
df.fillna(0)

Unnamed: 0,first,last,email,age
0,Griffin,Sargent,griffin.a.sargent@gmail.com,33
1,Jane,Doe,jane@email.com,55
2,John,Doe,MikeJ@email.com,63
3,Chris,Sargent,0,36
4,0,0,0,0
5,0,0,anonymous@email.com,0
6,0,0,0,0


In [34]:
# Every column, including 'age', is stored as object dtype at this point
# because the ages were supplied as strings.
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [37]:
# Convert the string ages to 64-bit floats. A float dtype is used because the
# column contains NaN, which has no representation in a plain integer dtype.
df['age'] = df['age'].astype('float64')

In [39]:
# Confirm that 'age' is now float64 while the text columns remain object.
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [40]:
# mean() skips NaN by default, so only the four known ages are averaged.
df['age'].mean()

46.75