In [1]:
import pandas as pd

# Weekly training plan: one Series per day, keyed by muscle-group slot.
# "day 3" has one more slot ("group 4") than the other two days.
d0 = {
    "day 1": pd.Series({"main group": "front side legs", "group 2": "calfs", "group 3": "abs"}),
    "day 2": pd.Series({"main group": "back side legs", "group 2": "back", "group 3": "abs"}),
    "day 3": pd.Series({"main group": "pecs", "group 2": "shoulders", "group 3": "arms", "group 4": "abs"}),
}
# Each dict key becomes a column; the row index is the union of the Series labels,
# and cells with no value for a given day are left missing.
dfa = pd.DataFrame(d0)
dfa # in the output below the row labels come out in alphabetical order

Unnamed: 0,day 1,day 2,day 3
group 2,calfs,back,shoulders
group 3,abs,abs,arms
group 4,,,abs
main group,front side legs,back side legs,pecs


In [2]:
del dfa["day 3"] # how to delete a column from a dataframe (the column is removed from dfa itself)
print(dfa)
dfa["day 1"] # selecting a single column by its label returns a Series

                      day 1           day 2
group 2               calfs            back
group 3                 abs             abs
group 4                 NaN             NaN
main group  front side legs  back side legs


group 2                 calfs
group 3                   abs
group 4                   NaN
main group    front side legs
Name: day 1, dtype: object

In [3]:
# How to add a new column to a dataframe: assign a Series to a new column label.
# The Series must carry the same row labels as dfa; the values are matched by label,
# so without matching labels the new column would contain only NaN.
day3_plan = pd.Series({"main group": "pecs", "group 2": "shoulders", "group 3": "arms", "group 4": "abs"})
dfa["day 3"] = day3_plan
print(dfa)

                      day 1           day 2      day 3
group 2               calfs            back  shoulders
group 3                 abs             abs       arms
group 4                 NaN             NaN        abs
main group  front side legs  back side legs       pecs


In [4]:
dfa.loc["main group"] # how to select a row by label; the row comes back as a Series indexed by the column names

day 1    front side legs
day 2     back side legs
day 3               pecs
Name: main group, dtype: object

In [5]:
dfa.iloc[2] # how to select a row by integer position (0-based), regardless of its label

day 1    NaN
day 2    NaN
day 3    abs
Name: group 4, dtype: object

In [6]:
# Build a dataframe from a list of rows: one row per training day, one column per muscle-group slot.
# Rows shorter than the column list are padded with missing values (shown as None in the output).
dfb = pd.DataFrame(
    data = [["front side legs", "calfs", "abs"],
            ["back side legs", "back", "abs"],
            ["pecs", "shoulders", "arms", "abs"]
           ],
    index = ["day 1", "day 2", "day 3"],
    columns = ["muscle group 1", "muscle group 2", "muscle group 3", "muscle group 4"],
    dtype = str
)
# Adding a column from a plain list: one value per existing row, assigned in row order.
dfb["cardio"] = ["cyclette", "elliptical machine", "row machine"]
print(dfb)

# How to add a row to a dataframe: put the new row in its own one-row dataframe and concatenate.
dfc = pd.DataFrame([["treadmill"]], index = ["day 4"], columns = ["cardio"])
# DataFrame.append() was deprecated in pandas 1.4 and removed in pandas 2.0; pd.concat() is its replacement.
# Columns that dfc does not have are filled with NaN in the new row.
# pd.concat() returns a new dataframe, so the assignment is really necessary
# (see the pd.DataFrame.drop() example below).
dfd = pd.concat([dfb, dfc])
print(dfd)

        muscle group 1 muscle group 2 muscle group 3 muscle group 4  \
day 1  front side legs          calfs            abs           None   
day 2   back side legs           back            abs           None   
day 3             pecs      shoulders           arms            abs   

                   cardio  
day 1            cyclette  
day 2  elliptical machine  
day 3         row machine  
        muscle group 1 muscle group 2 muscle group 3 muscle group 4  \
day 1  front side legs          calfs            abs           None   
day 2   back side legs           back            abs           None   
day 3             pecs      shoulders           arms            abs   
day 4              NaN            NaN            NaN            NaN   

                   cardio  
day 1            cyclette  
day 2  elliptical machine  
day 3         row machine  
day 4           treadmill  


In [7]:
dfd[2:4] # how to select a range of rows by integer position: positions 2 and 3 (the end of the slice is excluded)

Unnamed: 0,muscle group 1,muscle group 2,muscle group 3,muscle group 4,cardio
day 3,pecs,shoulders,arms,abs,row machine
day 4,,,,,treadmill


In [8]:
dfe = dfd.drop('day 4') # pd.DataFrame.drop() returns a new DataFrame,
# so the result has to be stored in a variable.
dfe

Unnamed: 0,muscle group 1,muscle group 2,muscle group 3,muscle group 4,cardio
day 1,front side legs,calfs,abs,,cyclette
day 2,back side legs,back,abs,,elliptical machine
day 3,pecs,shoulders,arms,abs,row machine


In [9]:
# A dataframe loaded from an external CSV file (downloaded over HTTPS, so this cell needs network access).
dff = pd.read_csv('https://raw.githubusercontent.com/QuantEcon/lecture-source-py/master/source/_static/lecture_specific/pandas/data/test_pwt.csv')
print(type(dff))
# Only the last expression of a cell is displayed, so dff is named once, at the end.
dff

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,country,country isocode,year,POP,XRAT,tcgdp,cc,cg
0,Argentina,ARG,2000,37335.653,0.9995,295072.2,75.716805,5.578804
1,Australia,AUS,2000,19053.186,1.72483,541804.7,67.759026,6.720098
2,India,IND,2000,1006300.297,44.9416,1728144.0,64.575551,14.072206
3,Israel,ISR,2000,6114.57,4.07733,129253.9,64.436451,10.266688
4,Malawi,MWI,2000,11801.505,59.543808,5026.222,74.707624,11.658954
5,South Africa,ZAF,2000,45064.098,6.93983,227242.4,72.71871,5.726546
6,United States,USA,2000,282171.957,1.0,9898700.0,72.347054,6.032454
7,Uruguay,URY,2000,3219.793,12.099592,25255.96,78.97874,5.108068


In [10]:
dff[3:6] # rows 3 to 5: the end of the slice (6) is excluded

Unnamed: 0,country,country isocode,year,POP,XRAT,tcgdp,cc,cg
3,Israel,ISR,2000,6114.57,4.07733,129253.89423,64.436451,10.266688
4,Malawi,MWI,2000,11801.505,59.543808,5026.221784,74.707624,11.658954
5,South Africa,ZAF,2000,45064.098,6.93983,227242.36949,72.71871,5.726546


In [11]:
dff[["POP", "XRAT", "tcgdp"]] # selects just these columns, for all rows

Unnamed: 0,POP,XRAT,tcgdp
0,37335.653,0.9995,295072.2
1,19053.186,1.72483,541804.7
2,1006300.297,44.9416,1728144.0
3,6114.57,4.07733,129253.9
4,11801.505,59.543808,5026.222
5,45064.098,6.93983,227242.4
6,282171.957,1.0,9898700.0
7,3219.793,12.099592,25255.96


In [17]:
# use pd.DataFrame.iloc to select just some cells of a dataframe by integer positions

dff.iloc[4:7, 2:5] # selects rows 4 to 6 (i.e. 7-1) and columns 2 to 4 (i.e. 5-1): slice ends are excluded.

# HINT: Review advanced indexing in Python, in general, and NumPy arrays.

Unnamed: 0,year,POP,XRAT
4,2000,11801.505,59.543808
5,2000,45064.098,6.93983
6,2000,282171.957,1.0


In [25]:
# use pd.DataFrame.loc to select just some cells of a dataframe by a range of index labels and a list of column labels

dff.loc[2:5, ['country', 'POP', 'XRAT']] # in this case columns 'country', 'POP' and 'XRAT'
# from row label 2 to row label 5; unlike iloc, the last element of the row range is included

# HINT: Review advanced indexing in Python, in general, and NumPy arrays. Read the pandas documentation about this.

Unnamed: 0,country,POP,XRAT
2,India,1006300.297,44.9416
3,Israel,6114.57,4.07733
4,Malawi,11801.505,59.543808
5,South Africa,45064.098,6.93983


In [34]:
# Keep three columns of dff and promote 'country' from an ordinary column to the row index.
# Both the column selection and set_index() return new dataframes, so the result is assigned to dfg.
dfg = dff[['country', 'POP', 'XRAT']].set_index('country')
dfg

Unnamed: 0_level_0,POP,XRAT
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Argentina,37335.653,0.9995
Australia,19053.186,1.72483
India,1006300.297,44.9416
Israel,6114.57,4.07733
Malawi,11801.505,59.543808
South Africa,45064.098,6.93983
United States,282171.957,1.0
Uruguay,3219.793,12.099592


In [36]:
# Renaming the columns: assign a complete list of new labels, one per column, in order.
# 'country' is the index now, not an ordinary column, so only two labels are needed.
new_column_names = ["Population", "x-rat"]
dfg.columns = new_column_names
dfg

Unnamed: 0_level_0,Population,x-rat
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Argentina,37335.653,0.9995
Australia,19053.186,1.72483
India,1006300.297,44.9416
Israel,6114.57,4.07733
Malawi,11801.505,59.543808
South Africa,45064.098,6.93983
United States,282171.957,1.0
Uruguay,3219.793,12.099592
