# DataFrame and Series Basics

In [68]:
import pandas as pd

In [69]:
# Column-oriented sample records: each key is a future column name and each
# list holds that column's values, one entry per person.
people = dict(
    first=["Corey", "Jane", "John"],
    last=["Schafer", "Doe", "Doe"],
    email=["CoreyMSchafer@gmail.com", "JaneDoe@email.com", "JohnDoe@gmail.com"],
)

people["first"]

['Corey', 'Jane', 'John']

In [70]:
# Build a DataFrame from the dict: each key becomes a column and the rows get
# a default integer index (0, 1, 2).
df = pd.DataFrame(people)

In [71]:
df["first"]  # a single column label returns a Series that keeps the row index

0    Corey
1     Jane
2     John
Name: first, dtype: object

In [72]:
df[["first", "last"]]  # a list of column labels returns a DataFrame with just those columns

Unnamed: 0,first,last
0,Corey,Schafer
1,Jane,Doe
2,John,Doe


In [73]:
# The column labels are exposed as an Index object.
df.columns

Index(['first', 'last', 'email'], dtype='object')

In [74]:
# Swap the default integer index for custom string labels, one per row.
df.index = ['cs', 'janed', 'johnd']

In [75]:
# .loc selects by index label; a list of labels returns those rows as a DataFrame.
df.loc[['cs', 'janed']]

Unnamed: 0,first,last,email
cs,Corey,Schafer,CoreyMSchafer@gmail.com
janed,Jane,Doe,JaneDoe@email.com


In [76]:
# .iloc selects by integer position: rows 0 and 1, columns 0 and 1.
df.iloc[[0, 1], [0, 1]]

Unnamed: 0,first,last
cs,Corey,Schafer
janed,Jane,Doe


In [77]:
# Positional row slice: the stop (2) is excluded, so rows 0 and 1 are returned.
df.iloc[0:2]

Unnamed: 0,first,last,email
cs,Corey,Schafer,CoreyMSchafer@gmail.com
janed,Jane,Doe,JaneDoe@email.com


In [78]:
# Selecting a column keeps the custom row labels as the Series index.
df['email']

cs       CoreyMSchafer@gmail.com
janed          JaneDoe@email.com
johnd          JohnDoe@gmail.com
Name: email, dtype: object

# Using indexes

In [79]:
# Promote the 'email' column to the row index. inplace=True modifies df itself,
# so 'email' is no longer a regular column afterwards.
df.set_index('email', inplace=True)

In [80]:
# 'email' is now the index; only 'first' and 'last' remain as columns.
df

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
CoreyMSchafer@gmail.com,Corey,Schafer
JaneDoe@email.com,Jane,Doe
JohnDoe@gmail.com,John,Doe


In [81]:
# The index now holds the email addresses and carries the name 'email'.
df.index

Index(['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@gmail.com'], dtype='object', name='email')

In [82]:
df.reset_index()  # returns a new DataFrame and leaves df unchanged; pass inplace=True to modify df itself

Unnamed: 0,email,first,last
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@gmail.com,John,Doe


# Filtering

In [84]:
# Boolean mask: True for every row whose 'last' value equals 'Doe'.
# NOTE(review): "critieria" is a misspelling of "criteria"; the name is kept
# because the following cells reference it exactly as written.
critieria = df['last'].eq('Doe')

In [85]:
# A boolean mask inside [] keeps only the rows where the mask is True.
df[critieria]

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
JaneDoe@email.com,Jane,Doe
JohnDoe@gmail.com,John,Doe


In [86]:
# .loc accepts the same boolean mask and returns the same rows.
df.loc[critieria]

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
JaneDoe@email.com,Jane,Doe
JohnDoe@gmail.com,John,Doe


In [87]:
# Rows come from the mask, the column from the second argument: the result is
# a Series of the matching first names.
df.loc[critieria, 'first']

email
JaneDoe@email.com    Jane
JohnDoe@gmail.com    John
Name: first, dtype: object

In [88]:
# Combine two element-wise comparisons with | (logical OR): a row is selected
# when its last name is 'Schafer' or its first name is 'John'.
critieria = df['last'].eq('Schafer') | df['first'].eq('John')


In [89]:
# First names of the rows that satisfy either condition of the combined mask.
df.loc[critieria, 'first']

email
CoreyMSchafer@gmail.com    Corey
JohnDoe@gmail.com           John
Name: first, dtype: object

In [90]:
# ~ inverts the mask, selecting the rows that do NOT match.
df.loc[~critieria, 'first']

email
JaneDoe@email.com    Jane
Name: first, dtype: object

# Updating Rows and Columns

In [145]:
# Rebuild the frame from the original dict, discarding the email index that
# was set in the earlier section.
df = pd.DataFrame(people)

In [146]:
# The freshly built frame has the original three column names again.
df.columns

Index(['first', 'last', 'email'], dtype='object')

In [147]:
# Assigning a list to .columns renames all columns at once, by position.
df.columns = ['first_name', 'last_name', 'email']
df

Unnamed: 0,first_name,last_name,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@gmail.com


In [148]:
# Upper-case every column label with plain Python string handling.
df.columns = list(map(str.upper, df.columns))
df

Unnamed: 0,FIRST_NAME,LAST_NAME,EMAIL
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@gmail.com


In [149]:
# Same transformation through the Index's vectorised .str accessor. The labels
# are already upper-case at this point, so the displayed frame is unchanged.
df.columns = df.columns.str.upper()
df

Unnamed: 0,FIRST_NAME,LAST_NAME,EMAIL
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@gmail.com


In [150]:
# rename takes an {old name: new name} mapping; inplace=True applies it to df
# itself instead of returning a renamed copy.
df.rename(columns = {'FIRST_NAME': 'first', 'LAST_NAME': 'last', 'EMAIL': 'email'}, inplace=True)
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@gmail.com


In [151]:
# Assigning a list to df.loc[label] overwrites the whole row, with the values
# given in column order.
df.loc[2] = ['John', 'Smith', 'JohnSmith@email.com']
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Smith,JohnSmith@email.com


In [152]:
# Update only selected columns of row 2 by passing a list of column labels;
# 'first' is left as it was.
df.loc[2, ['last', 'email']] = ['Doe', 'JohnDoe@email.com']
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [153]:
# A row label plus a single column label addresses exactly one cell.
df.loc[2, 'last'] = 'Smith'
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Smith,JohnDoe@email.com


In [154]:
# .at also addresses a single cell by row/column label. The value is already
# 'Smith' from the previous cell, so the displayed frame does not change.
df.at[2, 'last'] = 'Smith'
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Smith,JohnDoe@email.com


What happens if you forget to use `.at` or `.loc` when updating a value? A common mistake is chained indexing, as in the next cell:

In [155]:
# Boolean mask selecting the row whose email is 'JohnDoe@email.com'.
criteria = (df['email'] == 'JohnDoe@email.com')
# Deliberate mistake (chained indexing): df[criteria] produces a separate
# object, and the assignment lands on that object instead of on df.
df[criteria]['last'] = 'Doe'  # pandas warns that a value is being set on a copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [156]:
df  # not updated: row 2 still has last == 'Smith' because the assignment went to a copy

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Smith,JohnDoe@email.com


In [157]:
# Correct form: one .loc call with the row mask and the column label writes
# straight into df.
df.loc[criteria, 'last'] = 'Doe'
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [158]:
# Lower-case the whole column with the vectorised .str accessor and assign the
# result back to the same column.
df['email'] = df['email'].str.lower()
df

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschafer@gmail.com
1,Jane,Doe,janedoe@email.com
2,John,Doe,johndoe@email.com


## apply()

In [159]:
# On a Series, apply calls the function once per element; here it returns the
# length of each address.
df['email'].apply(len)

0    23
1    17
2    17
Name: email, dtype: int64

In [160]:
def update_email(email):
    """Return ``email`` with every character converted to upper case.

    Args:
        email: The address to convert; any object with an ``upper()`` method.

    Returns:
        The result of ``email.upper()``.
    """
    shouted = email.upper()
    return shouted

In [166]:
# Series.apply calls str.lower on each address. The column was already
# lower-cased in an earlier cell, so the displayed values do not change.
# NOTE(review): update_email, defined in the cell above, is not used here.
df['email'] = df['email'].apply(str.lower)
df

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschafer@gmail.com
1,Jane,Doe,janedoe@email.com
2,John,Doe,johndoe@email.com


In [171]:
df.apply(pd.Series.min)  # on a DataFrame, apply runs the function once per column, giving one result per column

first                      Corey
last                         Doe
email    coreymschafer@gmail.com
dtype: object

In [172]:
df.apply(pd.Series.min, axis='columns')  # unless you tell it to change the axis: here the function runs once per row

0    Corey
1      Doe
2      Doe
dtype: object

## applymap

In [173]:
# applymap calls the function on every individual cell of the DataFrame.
# NOTE(review): pandas 2.1+ deprecates DataFrame.applymap in favour of
# DataFrame.map — switch if the pandas version in use allows it.
df.applymap(len)

Unnamed: 0,first,last,email
0,5,7,23
1,4,3,17
2,4,3,17


In [174]:
# Lower-case every cell; this returns a new frame and df itself is not reassigned.
# NOTE(review): pandas 2.1+ deprecates DataFrame.applymap in favour of
# DataFrame.map — switch if the pandas version in use allows it.
df.applymap(str.lower)

Unnamed: 0,first,last,email
0,corey,schafer,coreymschafer@gmail.com
1,jane,doe,janedoe@email.com
2,john,doe,johndoe@email.com


## map

In [176]:
# map substitutes values using the dict; values without a key in the dict
# ('John') come back as NaN.
df['first'].map({'Corey': 'Chris', 'Jane': 'Mary'})

0    Chris
1     Mary
2      NaN
Name: first, dtype: object

## replace

In [181]:
# replace substitutes only the values listed in the dict and keeps everything
# else ('John' stays 'John'); the result is assigned back to the column.
df['first'] = df['first'].replace({'Corey': 'Chris', 'Jane': 'Mary'})
df

Unnamed: 0,first,last,email
0,Chris,Schafer,coreymschafer@gmail.com
1,Mary,Doe,janedoe@email.com
2,John,Doe,johndoe@email.com
