# Pandas DataFrame Tips & Tricks

In [1]:
# import pandas
import pandas as pd

## 1. Select multiple rows and columns using .loc

In [2]:
# Example data used by the following sections: one row per country.
countries = pd.DataFrame(
    data={
        'country': ['United States', 'The Netherlands', 'Spain', 'Mexico', 'Australia'],
        'capital': ['Washington D.C.', 'Amsterdam', 'Madrid', 'Mexico City', 'Canberra'],
        'continent': ['North America', 'Europe', 'Europe', 'North America', 'Australia'],
        'language': ['English', 'Dutch', 'Spanish', 'Spanish', 'English'],
    }
)

In [3]:
# The .loc indexer selects subsets of rows and columns by index label and column name.
# Here ':' keeps every row, and the label slice 'country':'continent' keeps both of those
# columns plus everything between them (label slices include both endpoints).
countries.loc[:, 'country':'continent']

Unnamed: 0,country,capital,continent
0,United States,Washington D.C.,North America
1,The Netherlands,Amsterdam,Europe
2,Spain,Madrid,Europe
3,Mexico,Mexico City,North America
4,Australia,Canberra,Australia


In [4]:
# Row label slices are inclusive as well: 0:2 returns the rows labelled 0, 1 and 2
countries.loc[0:2, 'country':'continent']

Unnamed: 0,country,capital,continent
0,United States,Washington D.C.,North America
1,The Netherlands,Amsterdam,Europe
2,Spain,Madrid,Europe


In [5]:
# Pass lists of labels to pick specific rows and columns that are not next to each other
countries.loc[[0, 4], ['country', 'language']]

Unnamed: 0,country,language
0,United States,English
4,Australia,English


## 2. Filter DataFrames by category

In [6]:
# For a single category, compare the column with the == operator and use the resulting boolean mask to select rows.
# For multiple categories, use the isin method instead (see the next cell).
countries[countries.continent == 'Europe']

Unnamed: 0,country,capital,continent,language
1,The Netherlands,Amsterdam,Europe,Dutch
2,Spain,Madrid,Europe,Spanish


In [7]:
# isin gives a boolean mask that is True where the language is any of the listed values
countries[countries.language.isin(['Dutch', 'English'])]

Unnamed: 0,country,capital,continent,language
0,United States,Washington D.C.,North America,English
1,The Netherlands,Amsterdam,Europe,Dutch
4,Australia,Canberra,Australia,English


## 3. Filter DataFrames by excluding categories

In [8]:
# Instead of keeping certain categories, we may want to filter the DataFrame by excluding them.
# Prefix the boolean mask with ~ (tilde), which inverts it: rows where isin is True are dropped.
countries[~countries.continent.isin(['Europe'])]

Unnamed: 0,country,capital,continent,language
0,United States,Washington D.C.,North America,English
3,Mexico,Mexico City,North America,Spanish
4,Australia,Canberra,Australia,English


In [9]:
# The same pattern excludes several categories at once: keep rows whose language is neither Dutch nor English
countries[~countries.language.isin(['Dutch', 'English'])]

Unnamed: 0,country,capital,continent,language
2,Spain,Madrid,Europe,Spanish
3,Mexico,Mexico City,North America,Spanish


## 4. Rename columns

In [10]:
# You might want to rename certain columns, for example because a name is incorrect or incomplete.
# Here we rename 'capital' to 'capital_city' and 'language' to 'most_spoken_language'.
# rename returns a new DataFrame; the result is only displayed here and not assigned back to countries.
countries.rename({'capital': 'capital_city', 'language': 'most_spoken_language'}, axis='columns')

Unnamed: 0,country,capital_city,continent,most_spoken_language
0,United States,Washington D.C.,North America,English
1,The Netherlands,Amsterdam,Europe,Dutch
2,Spain,Madrid,Europe,Spanish
3,Mexico,Mexico City,North America,Spanish
4,Australia,Canberra,Australia,English


In [13]:
# Alternatively, assign a complete list of names to .columns (one name per column, in order).
# This changes countries itself, so the sections below see the new column names.
countries.columns = ['country', 'capital_city', 'continent', 'most_spoken_language']
countries.head()

Unnamed: 0,country,capital_city,continent,most_spoken_language
0,United States,Washington D.C.,North America,English
1,The Netherlands,Amsterdam,Europe,Dutch
2,Spain,Madrid,Europe,Spanish
3,Mexico,Mexico City,North America,Spanish
4,Australia,Canberra,Australia,English


## 5. Reverse row order

In [14]:
# To reverse the row order, slice the rows with a step of -1 using the .loc indexer
countries.loc[::-1]

Unnamed: 0,country,capital_city,continent,most_spoken_language
4,Australia,Canberra,Australia,English
3,Mexico,Mexico City,North America,Spanish
2,Spain,Madrid,Europe,Spanish
1,The Netherlands,Amsterdam,Europe,Dutch
0,United States,Washington D.C.,North America,English


In [15]:
# The reversed rows keep their original index labels (4 down to 0).
# Chain the reset_index method to renumber them from 0; drop=True discards the old labels
# instead of adding them as a new column.
countries.loc[::-1].reset_index(drop=True)

Unnamed: 0,country,capital_city,continent,most_spoken_language
0,Australia,Canberra,Australia,English
1,Mexico,Mexico City,North America,Spanish
2,Spain,Madrid,Europe,Spanish
3,The Netherlands,Amsterdam,Europe,Dutch
4,United States,Washington D.C.,North America,English


## 6. Reverse column order

In [16]:
# Reversing the column order works the same way as for rows:
# keep every row with ':' and slice the columns with a step of -1
countries.loc[:, ::-1]

Unnamed: 0,most_spoken_language,continent,capital_city,country
0,English,North America,Washington D.C.,United States
1,Dutch,Europe,Amsterdam,The Netherlands
2,Spanish,Europe,Madrid,Spain
3,Spanish,North America,Mexico City,Mexico
4,English,Australia,Canberra,Australia


## 7. Split a DataFrame into two random subsets

In [19]:
# In some cases, we want to split a DataFrame into two random subsets. For this, we use the sample method.
# frac=0.6 draws 60% of the rows; random_state fixes the seed so the same rows are drawn on every run.
# The second subset is whatever remains after dropping the sampled index labels
# (this works because the index labels are unique here).
countries_1 = countries.sample(frac=0.6, random_state=999)
countries_2 = countries.drop(countries_1.index)

In [20]:
# The sampled subset: 60% of the 5 rows, i.e. 3 rows, in sampled (shuffled) order
countries_1

Unnamed: 0,country,capital_city,continent,most_spoken_language
3,Mexico,Mexico City,North America,Spanish
2,Spain,Madrid,Europe,Spanish
1,The Netherlands,Amsterdam,Europe,Dutch


In [21]:
# The remaining rows that were not part of the sample
countries_2

Unnamed: 0,country,capital_city,continent,most_spoken_language
0,United States,Washington D.C.,North America,English
4,Australia,Canberra,Australia,English


## 8. Create dummy variables

In [22]:
# Example data with two categorical (string) columns.
students = pd.DataFrame(
    data={
        'name': ['Ben', 'Tina', 'John', 'Eric'],
        'gender': ['male', 'female', 'male', 'male'],
    }
)

In [23]:
# We might want to convert categorical variables into dummy/indicator variables.
# The pd.get_dummies function does this: each distinct value of a column becomes its own
# indicator column named <column>_<value>.
pd.get_dummies(students)

Unnamed: 0,name_Ben,name_Eric,name_John,name_Tina,gender_female,gender_male
0,True,False,False,False,False,True
1,False,False,False,True,True,False
2,False,False,True,False,False,True
3,False,True,False,False,False,True


In [24]:
# One indicator column per variable is redundant, because it can be inferred from the others.
# Passing drop_first=True drops the first level of each variable (here name_Ben and gender_female).
pd.get_dummies(students, drop_first=True)

Unnamed: 0,name_Eric,name_John,name_Tina,gender_male
0,False,False,False,True
1,False,False,True,False
2,False,True,False,True
3,True,False,False,True


## 9. Check equality of columns

In [26]:
# To check whether two columns are identical, the == operator is the first thing that comes to mind.
# However, == compares element by element and treats NaN as not equal to NaN, so it returns a
# Series of booleans and reports a mismatch wherever both columns hold NaN.
# The equals method instead returns a single boolean and treats NaNs in the same position as equal.
df = pd.DataFrame(
    data={
        'col_1': [1, 0],
        'col_2': [0, 1],
        'col_3': [1, 0],
    }
)
df

Unnamed: 0,col_1,col_2,col_3
0,1,0,1
1,0,1,0


In [27]:
# col_1 is [1, 0] and col_2 is [0, 1], so the columns differ and equals returns False
df['col_1'].equals(df['col_2'])

False

In [28]:
# col_1 and col_3 hold the same values in the same order, so equals returns True
df['col_1'].equals(df['col_3'])

True

## 10. Concatenate DataFrames

In [32]:
# We might want to combine two DataFrames into a single DataFrame that contains the rows of both.
# The pd.concat function does this; first build a second DataFrame with the same three columns.
df_1 = pd.DataFrame(
    data={
        'col_1': [6, 7, 8],
        'col_2': [1, 2, 3],
        'col_3': [5, 6, 7],
    }
)
df_1

Unnamed: 0,col_1,col_2,col_3
0,6,1,5
1,7,2,6
2,8,3,7


In [31]:
# Stack the rows of df on top of the rows of df_1. Each frame brings its own index labels
# (0, 1 and 0, 1, 2), so reset_index(drop=True) renumbers the combined rows from 0 to 4.
pd.concat([df, df_1]).reset_index(drop=True)

Unnamed: 0,col_1,col_2,col_3
0,1,0,1
1,0,1,0
2,6,1,5
3,7,2,6
4,8,3,7
