In [110]:
import pandas as pd

In [111]:
# Sample address-book records: each key becomes a column name and each list
# holds that column's values, one entry per person (same order in every list).
# Every value, including phone and age, is stored as a string.
contacts = {
    'name': [
        'Susan Calvin',
        'Bently Powell',
        'Gregory Powell',
        'Mike Donovan',
    ],
    'city': ['London', 'Kathmandu', 'Moskow', 'Bangalore'],
    'phone': ['056152358', '096523995', '895712365', '886549702'],
    'age': ['28', '42', '66', '67'],
    'e-mail': [
        'SusanCalvin@email.com',
        'BentlyP@email.com',
        'GregP14@email.com',
        'MDonovan@email.com',
    ],
}

In [112]:
# build a pandas DataFrame from the dict: each key becomes a column, each list its values
df = pd.DataFrame(contacts)

In [113]:
# display the DataFrame: ending a cell with the bare variable renders it as a table
df

Unnamed: 0,name,city,phone,age,e-mail
0,Susan Calvin,London,56152358,28,SusanCalvin@email.com
1,Bently Powell,Kathmandu,96523995,42,BentlyP@email.com
2,Gregory Powell,Moskow,895712365,66,GregP14@email.com
3,Mike Donovan,Bangalore,886549702,67,MDonovan@email.com


In [114]:
# shape of the DataFrame as a (rows, columns) tuple
df.shape

(4, 5)

In [115]:
# list the column labels of the DataFrame
df.columns

Index(['name', 'city', 'phone', 'age', 'e-mail'], dtype='object')

### Working with columns

In [116]:
# select a single column by its label
df['name']

0      Susan Calvin
1     Bently Powell
2    Gregory Powell
3      Mike Donovan
Name: name, dtype: object

In [117]:
# select two or more columns: pass a list of labels, hence the extra pair of brackets
df[['name', 'city']]

Unnamed: 0,name,city
0,Susan Calvin,London
1,Bently Powell,Kathmandu
2,Gregory Powell,Moskow
3,Mike Donovan,Bangalore


In [118]:
# select a single entry: the row labelled 1 in the 'e-mail' column.
# One .loc[row, column] lookup is used instead of chained indexing (df[col][row]),
# which performs two separate lookups and is unsafe to reuse for assignment.
df.loc[1, 'e-mail']

'BentlyP@email.com'

### Renaming columns

In [119]:
# note that df.columns is a pandas Index of labels, not a plain Python list
df.columns

Index(['name', 'city', 'phone', 'age', 'e-mail'], dtype='object')

In [120]:
# rename all columns at once by assigning one new label per column, in column order
df.columns = ['NAME', 'CITY', 'PHONE', 'AGE', 'E-MAIL']
df

Unnamed: 0,NAME,CITY,PHONE,AGE,E-MAIL
0,Susan Calvin,London,56152358,28,SusanCalvin@email.com
1,Bently Powell,Kathmandu,96523995,42,BentlyP@email.com
2,Gregory Powell,Moskow,895712365,66,GregP14@email.com
3,Mike Donovan,Bangalore,886549702,67,MDonovan@email.com


In [121]:
# rename every column with a list comprehension (here: lower-case each label)
df.columns = [x.lower() for x in df.columns]
df

Unnamed: 0,name,city,phone,age,e-mail
0,Susan Calvin,London,56152358,28,SusanCalvin@email.com
1,Bently Powell,Kathmandu,96523995,42,BentlyP@email.com
2,Gregory Powell,Moskow,895712365,66,GregP14@email.com
3,Mike Donovan,Bangalore,886549702,67,MDonovan@email.com


In [122]:
# rename selected columns one by one with an {old_label: new_label} mapping;
# inplace=True modifies df itself instead of returning a renamed copy
df.rename(columns = {'name': 'full_name', 'e-mail': 'email'}, inplace=True)
df

Unnamed: 0,full_name,city,phone,age,email
0,Susan Calvin,London,56152358,28,SusanCalvin@email.com
1,Bently Powell,Kathmandu,96523995,42,BentlyP@email.com
2,Gregory Powell,Moskow,895712365,66,GregP14@email.com
3,Mike Donovan,Bangalore,886549702,67,MDonovan@email.com


### Working with rows
The most common ways to access rows are the two commands **iloc** and **loc**.
* iloc uses integer positions to access items.
* loc uses row and column labels to access items, which gives more options.

In [123]:
# iloc selects by integer position: fetch the row at position 3
df.iloc[3]

full_name          Mike Donovan
city                  Bangalore
phone                 886549702
age                          67
email        MDonovan@email.com
Name: 3, dtype: object

In [124]:
# to access more than one row or column, pass lists of labels (note the brackets);
# the rows are returned in the order they are requested
df.loc[[1,3,0], ['full_name', 'email']]

Unnamed: 0,full_name,email
1,Bently Powell,BentlyP@email.com
3,Mike Donovan,MDonovan@email.com
0,Susan Calvin,SusanCalvin@email.com


In [125]:
# slicing needs no extra brackets; unlike standard Python slicing, the stop label is included with loc
# the order is [rows, columns]
df.loc[1:3, 'full_name':'phone']

Unnamed: 0,full_name,city,phone
1,Bently Powell,Kathmandu,96523995
2,Gregory Powell,Moskow,895712365
3,Mike Donovan,Bangalore,886549702


### Updating rows

In [126]:
# inspect the row labelled 3 before updating it
df.loc[3]

full_name          Mike Donovan
city                  Bangalore
phone                 886549702
age                          67
email        MDonovan@email.com
Name: 3, dtype: object

In [127]:
# update every item of the row labelled 3: supply one value per column, in column order
df.loc[3] = ['Mike Howard', 'New Delhi', '225896337', '68', 'MHoward@email.com']
df

Unnamed: 0,full_name,city,phone,age,email
0,Susan Calvin,London,56152358,28,SusanCalvin@email.com
1,Bently Powell,Kathmandu,96523995,42,BentlyP@email.com
2,Gregory Powell,Moskow,895712365,66,GregP14@email.com
3,Mike Howard,New Delhi,225896337,68,MHoward@email.com


In [128]:
# update only selected columns of the row labelled 3; the other columns keep their values
df.loc[3, ['full_name', 'email']] = ['Mike Donovan', 'MDonovan@email.com']
df

Unnamed: 0,full_name,city,phone,age,email
0,Susan Calvin,London,56152358,28,SusanCalvin@email.com
1,Bently Powell,Kathmandu,96523995,42,BentlyP@email.com
2,Gregory Powell,Moskow,895712365,66,GregP14@email.com
3,Mike Donovan,New Delhi,225896337,68,MDonovan@email.com


### Indexing

In [129]:
# use the email column as the index; inplace=True applies the change to df itself
# instead of returning a new DataFrame
df.set_index('email', inplace=True)
df

Unnamed: 0_level_0,full_name,city,phone,age
email,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SusanCalvin@email.com,Susan Calvin,London,56152358,28
BentlyP@email.com,Bently Powell,Kathmandu,96523995,42
GregP14@email.com,Gregory Powell,Moskow,895712365,66
MDonovan@email.com,Mike Donovan,New Delhi,225896337,68


In [130]:
# the e-mail addresses are now the row labels, so loc selects rows by e-mail
# rather than by the former integer labels
df.loc['GregP14@email.com']

full_name    Gregory Powell
city                 Moskow
phone             895712365
age                      66
Name: GregP14@email.com, dtype: object

In [131]:
# select one row by its e-mail label and restrict the result to a list of columns
df.loc['MDonovan@email.com', ['full_name', 'phone']]

full_name    Mike Donovan
phone           225896337
Name: MDonovan@email.com, dtype: object

In [132]:
# iloc is position-based, so integer positions still work whatever the index labels are
df.iloc[1]

full_name    Bently Powell
city             Kathmandu
phone            096523995
age                     42
Name: BentlyP@email.com, dtype: object

In [133]:
# sort the rows by index label; ascending order is the default
df.sort_index(inplace=True)
df

Unnamed: 0_level_0,full_name,city,phone,age
email,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BentlyP@email.com,Bently Powell,Kathmandu,96523995,42
GregP14@email.com,Gregory Powell,Moskow,895712365,66
MDonovan@email.com,Mike Donovan,New Delhi,225896337,68
SusanCalvin@email.com,Susan Calvin,London,56152358,28


In [134]:
# sort the rows by index label in descending order
df.sort_index(ascending=False, inplace=True)
df

Unnamed: 0_level_0,full_name,city,phone,age
email,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SusanCalvin@email.com,Susan Calvin,London,56152358,28
MDonovan@email.com,Mike Donovan,New Delhi,225896337,68
GregP14@email.com,Gregory Powell,Moskow,895712365,66
BentlyP@email.com,Bently Powell,Kathmandu,96523995,42


In [135]:
# reset to the default integer index; the old index is restored as a regular 'email' column
df.reset_index(inplace=True)
df

Unnamed: 0,email,full_name,city,phone,age
0,SusanCalvin@email.com,Susan Calvin,London,56152358,28
1,MDonovan@email.com,Mike Donovan,New Delhi,225896337,68
2,GregP14@email.com,Gregory Powell,Moskow,895712365,66
3,BentlyP@email.com,Bently Powell,Kathmandu,96523995,42


### Adding/removing Columns and rows

In [136]:
# split column full_name into 2 new columns: first and last
# n=1: split only at the first space, so a name with more than two words still yields
#      exactly two parts (everything after the first space goes to 'last'); without it
#      such a name would produce extra columns and the two-column assignment would fail.
# expand=True: return the parts as separate DataFrame columns; with expand=False the
#      result is a single Series holding a list of strings per row.
# https://www.geeksforgeeks.org/python-pandas-split-strings-into-two-list-columns-using-str-split/

df[['first', 'last']] = df['full_name'].str.split(' ', n=1, expand=True)
df

Unnamed: 0,email,full_name,city,phone,age,first,last
0,SusanCalvin@email.com,Susan Calvin,London,56152358,28,Susan,Calvin
1,MDonovan@email.com,Mike Donovan,New Delhi,225896337,68,Mike,Donovan
2,GregP14@email.com,Gregory Powell,Moskow,895712365,66,Gregory,Powell
3,BentlyP@email.com,Bently Powell,Kathmandu,96523995,42,Bently,Powell


In [137]:
# concatenate the strings of two columns (with a space in between) to form a new column
df['full_name_2'] = df['first'] + ' ' + df['last']
df

Unnamed: 0,email,full_name,city,phone,age,first,last,full_name_2
0,SusanCalvin@email.com,Susan Calvin,London,56152358,28,Susan,Calvin,Susan Calvin
1,MDonovan@email.com,Mike Donovan,New Delhi,225896337,68,Mike,Donovan,Mike Donovan
2,GregP14@email.com,Gregory Powell,Moskow,895712365,66,Gregory,Powell,Gregory Powell
3,BentlyP@email.com,Bently Powell,Kathmandu,96523995,42,Bently,Powell,Bently Powell


In [138]:
# remove columns by label; inplace=True applies the removal to df itself
df.drop(columns=['full_name','full_name_2'], inplace=True)
df

Unnamed: 0,email,city,phone,age,first,last
0,SusanCalvin@email.com,London,56152358,28,Susan,Calvin
1,MDonovan@email.com,New Delhi,225896337,68,Mike,Donovan
2,GregP14@email.com,Moskow,895712365,66,Gregory,Powell
3,BentlyP@email.com,Kathmandu,96523995,42,Bently,Powell


In [139]:
# remove rows by index label; without inplace=True this only returns a new DataFrame
# for display and df itself keeps all of its rows
df.drop(index= [1, 3])

Unnamed: 0,email,city,phone,age,first,last
0,SusanCalvin@email.com,London,56152358,28,Susan,Calvin
2,GregP14@email.com,Moskow,895712365,66,Gregory,Powell


In [140]:
### Updating rows

### apply
`apply()` applies a function to each value of the selected row or column.

In [146]:
# with the .str accessor: upper-case every e-mail using the built-in string method
df['email'] = df['email'].str.upper()
df

Unnamed: 0,email,city,phone,age,first,last
0,SUSANCALVIN@EMAIL.COM,London,56152358,28,Susan,Calvin
1,MDONOVAN@EMAIL.COM,New Delhi,225896337,68,Mike,Donovan
2,GREGP14@EMAIL.COM,Moskow,895712365,66,Gregory,Powell
3,BENTLYP@EMAIL.COM,Kathmandu,96523995,42,Bently,Powell


In [147]:
# lower-case all e-mails using apply. Note that the function is passed without the
# trailing (): apply receives the function itself and calls it on each value
df['email'] = df['email'].apply(str.lower)
df

Unnamed: 0,email,city,phone,age,first,last
0,susancalvin@email.com,London,56152358,28,Susan,Calvin
1,mdonovan@email.com,New Delhi,225896337,68,Mike,Donovan
2,gregp14@email.com,Moskow,895712365,66,Gregory,Powell
3,bentlyp@email.com,Kathmandu,96523995,42,Bently,Powell


In [148]:
# apply the built-in len to each e-mail to get its length in characters
df['email'].apply(len)

0    21
1    18
2    17
3    17
Name: email, dtype: int64

### Filtering

### Concatenating