# Wrangling DataFrames with Pandas    
## Author: Erika Duan    

![](../02_figures/04_manipulating-pandas-header.jpg)

In [62]:
#-----load Python packages-----  
import os  
import numpy as np  
import pandas as pd   

Pandas DataFrames can be created by:  
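+ Passing a dictionary of column names and values to `pd.DataFrame()` (see example 1 below)  
+ Reading data from a file, for example with `pd.read_csv()`  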

In [63]:
#-----example 1-----
# Describe the table as {column name: list of values}; every list has one entry per student.
column_order = ["id", "subject", "hours_per_day"]
student_records = {
    "id": [1, 2, 3, 4, 5],
    "subject": ["maths", "science", "english", "science", "music"],
    "hours_per_day": [2, 2, 1.5, 1, 3],
}

# `columns` fixes the left-to-right column order of the resulting DataFrame.
students = pd.DataFrame(student_records, columns = column_order)

students

Unnamed: 0,id,subject,hours_per_day
0,1,maths,2.0
1,2,science,2.0
2,3,english,1.5
3,4,science,1.0
4,5,music,3.0


In [65]:
# Print a summary of the DataFrame: index range, column names, non-null counts, dtypes and memory usage
students.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             5 non-null      int64  
 1   subject        5 non-null      object 
 2   hours_per_day  5 non-null      float64
dtypes: float64(1), int64(1), object(1)
memory usage: 248.0+ bytes


In [66]:
# The column labels are held in an Index object
students.columns

Index(['id', 'subject', 'hours_per_day'], dtype='object')

# Subsetting rows  

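+ `loc` is primarily label-based indexing (rows and columns are selected by their labels)  
+ `iloc` is primarily integer-position-based indexing (rows and columns are selected by their position, counting from 0)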

In [67]:
#-----example 2-----
# iloc selects by integer position, in [row position, column position] order
first_cell = students.iloc[0, 0]    # first row, first column
first_row = students.iloc[0]        # first row, returned as a Series
first_column = students.iloc[:, 0]  # first column, returned as a Series

for selection in (first_cell, first_row, first_column):
    print(selection)

# syntax reminder: a function is called as function(a),
# whereas a method is called on the object itself as a.method()

1
id                   1
subject          maths
hours_per_day        2
Name: 0, dtype: object
0    1
1    2
2    3
3    4
4    5
Name: id, dtype: int64


In [68]:
# .loc takes labels in [row label(s), column label(s)] order; ":" keeps every row
students.loc[:, "subject"]

# note that students.loc["subject"] produces an error, because a single label
# passed to .loc is looked up among the row labels (0 to 4 here), not the column names

0      maths
1    science
2    english
3    science
4      music
Name: subject, dtype: object

Subsetting columns based on column names  

In [35]:
# Indexing with a single column name returns a Series,
# indexing with a list of column names returns a DataFrame.
print(type(students["id"])) # series
print(type(students[["id"]])) # DataFrame

# subset DataFrame columns
# Kept as the last expression of the cell: Jupyter only renders the value of the
# final expression, so the subset was silently discarded when it came first.
students[["id", "subject"]]

<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>


Subsetting (filtering) rows based on values  

In [39]:
# filtering by rows: build a boolean mask (True where id equals 2), then keep the True rows
has_id_two = students["id"] == 2
students[has_id_two]

Unnamed: 0,id,subject,hours_per_day
1,2,science,2.0


In [43]:
student_list = [1, 2, 5]

# use the isin() method for a list of values: the mask is True where id appears in student_list
id_in_list = students["id"].isin(student_list)
students[id_in_list]

Unnamed: 0,id,subject,hours_per_day
0,1,maths,2.0
1,2,science,2.0
4,5,music,3.0


In [69]:
#-----transform columns, i.e. derive a new column from an existing one-----
DAYS_PER_WEEK = 7

# Multiplying a column by a scalar is vectorised over every row, so no row-wise
# apply() with a lambda is needed. Assigning to a single column label creates
# (or overwrites) that column, which keeps the cell safe to re-run.
students["hours_per_week"] = students["hours_per_day"] * DAYS_PER_WEEK

students

Unnamed: 0,id,subject,hours_per_day,hours_per_week
0,1,maths,2.0,14.0
1,2,science,2.0,14.0
2,3,english,1.5,10.5
3,4,science,1.0,7.0
4,5,music,3.0,21.0


In [60]:
# Two plain Python lists, printed one after the other to show their contents
a, b = [1, 2, 3], [4, 5, 6]
for values in (a, b):
    print(values)

# Python lists are zero-indexed, so position 0 holds the first element
a[0]

[1, 2, 3]
[4, 5, 6]


1