In [2]:
import pandas as pd

In [3]:
# Sample records stored column-wise: each key is a column name and each list
# holds that column's values, one entry per person.
people = dict(
    first=["Corey", "Jane", "John"],
    last=["Schafer", "Doe", "Doe"],
    email=["CoreyMSchafer@gmail.com", "JaneDoe@email.com", "JohnDoe@gmail.com"],
)

# Plain dict lookup: yields the list of first names.
people["first"]

['Corey', 'Jane', 'John']

In [4]:
# Build a DataFrame from the dict: keys become column labels and each list
# becomes that column's values.
df = pd.DataFrame(people)

In [5]:
# Selecting with a single column label picks out one column.
df["first"]  # returns a Series

0    Corey
1     Jane
2     John
Name: first, dtype: object

In [6]:
# Passing a list of labels selects several columns at once.
df[["first", "last"]]  # returns a DataFrame

Unnamed: 0,first,last
0,Corey,Schafer
1,Jane,Doe
2,John,Doe


In [7]:
# Column labels of the frame.
df.columns

Index(['first', 'last', 'email'], dtype='object')

In [8]:
# Replace the default 0, 1, 2 row labels with custom string labels, one per row.
df.index = ['cs', 'janed', 'johnd']

In [9]:
# .loc selects rows by index label; a list of labels returns those rows with
# all of their columns.
df.loc[['cs', 'janed']]

Unnamed: 0,first,last,email
cs,Corey,Schafer,CoreyMSchafer@gmail.com
janed,Jane,Doe,JaneDoe@email.com


In [10]:
# .iloc selects by integer position: rows 0-1 and columns 0-1 here.
df.iloc[[0, 1], [0, 1]]

Unnamed: 0,first,last
cs,Corey,Schafer
janed,Jane,Doe


In [11]:
# Positional slice: the end position (2) is excluded, so this returns the
# first two rows.
df.iloc[0:2]

Unnamed: 0,first,last,email
cs,Corey,Schafer,CoreyMSchafer@gmail.com
janed,Jane,Doe,JaneDoe@email.com


In [12]:
# Inspect the email column before it is promoted to the index in the next cell.
df['email']

cs       CoreyMSchafer@gmail.com
janed          JaneDoe@email.com
johnd          JohnDoe@gmail.com
Name: email, dtype: object

In [13]:
# Use the 'email' column as the row index. inplace=True modifies df itself,
# so this cell cannot be re-run as-is: after the first run 'email' is no
# longer a column.
df.set_index('email', inplace=True)

In [14]:
# Display the frame: 'email' now labels the rows instead of being a column.
df

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
CoreyMSchafer@gmail.com,Corey,Schafer
JaneDoe@email.com,Jane,Doe
JohnDoe@gmail.com,John,Doe


In [15]:
# The index now holds the email values and carries the name 'email'.
df.index

Index(['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@gmail.com'], dtype='object', name='email')

In [16]:
# Move the index back into a regular column and restore default integer labels.
df.reset_index()  # returns a new frame and leaves df unchanged; pass inplace=True to modify df itself

Unnamed: 0,email,first,last
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@gmail.com,John,Doe


In [17]:
# Boolean mask: True for each row whose last name is 'Doe'.
filt = df['last'] == 'Doe'

In [18]:
# Boolean-mask indexing keeps only the rows where filt is True.
df[filt]

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
JaneDoe@email.com,Jane,Doe
JohnDoe@gmail.com,John,Doe


In [19]:
# The same row filter expressed through .loc.
df.loc[filt]

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
JaneDoe@email.com,Jane,Doe
JohnDoe@gmail.com,John,Doe


In [20]:
# .loc also takes a column selector: filtered rows, 'first' column only.
df.loc[filt, 'first']

email
JaneDoe@email.com    Jane
JohnDoe@gmail.com    John
Name: first, dtype: object

In [21]:
# Combine two conditions with | (element-wise OR). The parentheses are
# needed because | binds more tightly than ==.
filt = (df['last'] == 'Schafer') | (df['first'] == 'John')

In [22]:
# ~ inverts the mask: rows that match neither condition.
df.loc[~filt, 'first']

email
JaneDoe@email.com    Jane
Name: first, dtype: object

In [23]:
# Rebuild df from the dict, which discards the email index set earlier.
df = pd.DataFrame(people)

In [24]:
# Display the fresh frame with its default integer index.
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@gmail.com


In [25]:
# Element-wise string concatenation of two columns; the result is only
# displayed here, not stored.
df['first'] + ' ' + df['last']

0    Corey Schafer
1         Jane Doe
2         John Doe
dtype: object

In [26]:
# Store the combined name as a new 'full_name' column.
df['full_name'] = df['first'] + ' ' + df['last']

In [27]:
# Preview the frame without the two name columns. drop() returns a new
# frame; df itself is not modified because the result is not assigned.
df.drop(['first', 'last'], axis='columns')

Unnamed: 0,email,full_name
0,CoreyMSchafer@gmail.com,Corey Schafer
1,JaneDoe@email.com,Jane Doe
2,JohnDoe@gmail.com,John Doe


In [28]:
# Split each full name on a space; every element of the result is a list of parts.
df['full_name'].str.split(' ')

0    [Corey, Schafer]
1         [Jane, Doe]
2         [John, Doe]
Name: full_name, dtype: object

In [29]:
# expand=True spreads the parts into separate columns (labelled 0 and 1).
df['full_name'].str.split(' ', expand=True)

Unnamed: 0,0,1
0,Corey,Schafer
1,Jane,Doe
2,John,Doe


In [30]:
# df still has 'first' and 'last': the earlier drop() returned a new frame
# that was never assigned back.
df

Unnamed: 0,first,last,email,full_name
0,Corey,Schafer,CoreyMSchafer@gmail.com,Corey Schafer
1,Jane,Doe,JaneDoe@email.com,Jane Doe
2,John,Doe,JohnDoe@gmail.com,John Doe


In [31]:
# Assign both split columns back to 'first' and 'last' in a single statement.
df[['first', 'last']] = df['full_name'].str.split(' ', expand=True)

In [32]:
# Display df after the assignment.
df

Unnamed: 0,first,last,email,full_name
0,Corey,Schafer,CoreyMSchafer@gmail.com,Corey Schafer
1,Jane,Doe,JaneDoe@email.com,Jane Doe
2,John,Doe,JohnDoe@gmail.com,John Doe


In [33]:
# Preview df with one extra row. DataFrame.append() was removed in pandas 2.0,
# so the new record is wrapped in a one-row DataFrame and joined with
# pd.concat. Columns missing from the record ('last', 'email', 'full_name')
# come out as NaN, and ignore_index=True renumbers the rows 0..n-1.
# The result is not assigned, so df itself is unchanged.
pd.concat([df, pd.DataFrame([{'first': 'Tony'}])], ignore_index=True)

Unnamed: 0,first,last,email,full_name
0,Corey,Schafer,CoreyMSchafer@gmail.com,Corey Schafer
1,Jane,Doe,JaneDoe@email.com,Jane Doe
2,John,Doe,JohnDoe@gmail.com,John Doe
3,Tony,,,


In [34]:
# A second, smaller set of column-wise records (this rebinds `people`).
people = dict(
    first=["Tony", "Steve"],
    last=["Stark", "Rogers"],
    email=["ironman@avenge.com", "cap@avenge.com"],
)
# Separate frame holding only these two rows; it has no 'full_name' column.
df2 = pd.DataFrame(data=people)


In [35]:
# Preview df with the rows of df2 stacked underneath. pd.concat replaces
# DataFrame.append(), which was removed in pandas 2.0. Columns that df2 lacks
# ('full_name') are filled with NaN; ignore_index=True renumbers the rows.
pd.concat([df, df2], ignore_index=True)

Unnamed: 0,first,last,email,full_name
0,Corey,Schafer,CoreyMSchafer@gmail.com,Corey Schafer
1,Jane,Doe,JaneDoe@email.com,Jane Doe
2,John,Doe,JohnDoe@gmail.com,John Doe
3,Tony,Stark,ironman@avenge.com,
4,Steve,Rogers,cap@avenge.com,


In [36]:
# Keep the combined frame by rebinding df. pd.concat replaces
# DataFrame.append(), which was removed in pandas 2.0; it returns a new frame
# rather than modifying its inputs, hence the assignment.
df = pd.concat([df, df2], ignore_index=True)

In [37]:
# Boolean mask marking every row whose last name is 'Doe'.
the_does = df['last'] == 'Doe'
# Look up the matching row labels on the index and drop those rows from df itself.
df.drop(index=df.index[the_does], inplace=True)

In [38]:
# The Doe rows (labels 1 and 2) are gone; the remaining rows keep their
# original labels 0, 3 and 4.
df

Unnamed: 0,first,last,email,full_name
0,Corey,Schafer,CoreyMSchafer@gmail.com,Corey Schafer
3,Tony,Stark,ironman@avenge.com,
4,Steve,Rogers,cap@avenge.com,


In [39]:
# Start over with a two-row frame for the first sorting example.
people = dict(
    first=["Tony", "Steve"],
    last=["Stark", "Rogers"],
    email=["ironman@avenge.com", "cap@avenge.com"],
)
df = pd.DataFrame(data=people)


In [40]:
# Sort rows by the 'last' column, ascending by default. The sorted frame is
# only displayed; df is not reassigned.
df.sort_values(by='last')

Unnamed: 0,first,last,email
1,Steve,Rogers,cap@avenge.com
0,Tony,Stark,ironman@avenge.com


In [41]:
# Back to the original three-person data set, which has a tie on last name ('Doe').
people = dict(
    first=["Corey", "Jane", "John"],
    last=["Schafer", "Doe", "Doe"],
    email=["CoreyMSchafer@gmail.com", "JaneDoe@email.com", "JohnDoe@gmail.com"],
)

df = pd.DataFrame(data=people)

In [42]:
# Sort by last name only; the two 'Doe' rows tie on the sort key.
df.sort_values(by='last')

Unnamed: 0,first,last,email
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@gmail.com
0,Corey,Schafer,CoreyMSchafer@gmail.com


In [43]:
# Sort by 'last', then by 'first' to break ties; a single ascending=False
# applies to both keys.
df.sort_values(by=['last', 'first'], ascending=False)

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
2,John,Doe,JohnDoe@gmail.com
1,Jane,Doe,JaneDoe@email.com


In [44]:
# Per-key direction: 'last' ascending, 'first' descending.
# inplace=True reorders df itself instead of returning a sorted copy.
df.sort_values(by=['last', 'first'], ascending=[True, False], inplace=True)

In [45]:
# Display df in its new order; each row keeps its original index label.
df

Unnamed: 0,first,last,email
2,John,Doe,JohnDoe@gmail.com
1,Jane,Doe,JaneDoe@email.com
0,Corey,Schafer,CoreyMSchafer@gmail.com


In [46]:
# Put the rows back in index-label order, modifying df in place.
df.sort_index(inplace=True)

In [47]:
# Display df, now ordered by index label again.
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@gmail.com


In [48]:
import numpy as np

In [49]:
# Data set for the missing-value examples. It mixes real missing markers
# (np.nan, None) with placeholder strings ("NA", "Missing") that are ordinary
# text as far as Python is concerned.
people = dict(
    first=["Corey", "Jane", "John", "Chris", np.nan, None, "NA"],
    last=["Schafer", "Doe", "Doe", "Schafer", np.nan, np.nan, "Missing"],
    email=["CoreyMSchafer@gmail.com", "JaneDoe@email.com", "JohnDoe@gmail.com", None, np.nan, "Anonymous@email.com", "NA"],
    age=[33, 55, 63, 36, None, None, "Missing"],
)

In [73]:
# Build the frame used by the missing-value examples below.
df = pd.DataFrame(people)

In [74]:
# Rows 4 and 5 contain real missing values (np.nan / None); row 6 contains
# the placeholder strings 'NA' and 'Missing'.
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@gmail.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [75]:
# With default arguments, drop every row that has at least one missing value.
# Row 6 survives because its 'NA' / 'Missing' entries are strings, not
# missing values.
df.dropna()

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@gmail.com,63
6,,Missing,,Missing


In [76]:
# how='all': drop a row only when every value in it is missing (row 4 here).
df.dropna(how='all')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@gmail.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [77]:
# subset restricts the check to the listed columns; other columns are ignored.
df.dropna(subset=['email'])  # drop any row with a missing email

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@gmail.com,63
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [78]:
# Drop a row only when both 'last' and 'email' are missing (row 4 here).
df.dropna(how='all', subset=['last', 'email'])

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@gmail.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [79]:
# Convert the placeholder strings to real NaN first, so that a row made up
# only of placeholders (row 6) is dropped along with the all-NaN row 4.
df.replace(['Missing', 'NA'], np.nan).dropna(how='all')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@gmail.com,63.0
3,Chris,Schafer,,36.0
5,,,Anonymous@email.com,


In [80]:
# Substitute the marker 'Missing' for every missing value. The result is only
# displayed; df is not reassigned.
df.fillna('Missing')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@gmail.com,63
3,Chris,Schafer,Missing,36
4,Missing,Missing,Missing,Missing
5,Missing,Missing,Anonymous@email.com,Missing
6,,Missing,,Missing


In [81]:
# 'age' is object dtype because the column mixes numbers, None and the
# string 'Missing'.
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [82]:
# Display df again before cleaning up the 'age' column.
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@gmail.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [86]:
# Replace the 'Missing' placeholder in 'age' with a real NaN.
# The result is assigned back instead of calling replace(..., inplace=True)
# on df['age']: an in-place call on a column selection acts on an
# intermediate object and is not guaranteed to update df (it does not under
# Copy-on-Write). np.nan is used rather than None because the meaning of
# value=None in replace() has varied between pandas versions.
df['age'] = df['age'].replace('Missing', np.nan)
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@gmail.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,


In [88]:
# Cast 'age' to float so it can be used in numeric operations. Float is used
# because NaN is a float value, so it can represent the missing ages.
df['age'] = df['age'].astype(float)

In [90]:
# Confirm the conversion: 'age' is now float64, the other columns are still object.
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [91]:
# Average of the numeric columns only. numeric_only=True is required on
# pandas 2.x, where df.mean() no longer silently skips the text columns and
# raises a TypeError instead. NaN entries are skipped by default. This only
# works for 'age' because the 'Missing' string was replaced and the column
# was cast to float in the cells above.
df.mean(numeric_only=True)

age    46.75
dtype: float64