# Pandas

In [919]:
import numpy as np
import pandas as pd

## Creating a Pandas Series

pd.Series(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False)

One-dimensional ndarray with axis labels (including time series).

In [920]:
labels = ['a','b','c']
my_list = [10, 20, 30]
arr = np.array([10, 20, 30])
d = {'a': 10, 'b': 20, 'c': 30}

### **Using Lists**

In [921]:
pd.Series(labels)

0    a
1    b
2    c
dtype: object

In [922]:
pd.Series(my_list)

0    10
1    20
2    30
dtype: int64

In [923]:
pd.Series([10, 20, 30])

0    10
1    20
2    30
dtype: int64

### **Using NumPy Arrays**

In [924]:
arr

array([10, 20, 30])

In [925]:
pd.Series(data = arr, index = labels)

a    10
b    20
c    30
dtype: int64

### **Using Dictionary**

In [926]:
d

{'a': 10, 'b': 20, 'c': 30}

In [927]:
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [928]:
# ***
pd.Series(data = d, index= ['q', 'c', 'y'])

q     NaN
c    30.0
y     NaN
dtype: float64

### Using Scalar Value

In [929]:
pd.Series(data = "Steve")

0    Steve
dtype: object

In [930]:
pd.Series(data = "Steve", index = range(10,13))

10    Steve
11    Steve
12    Steve
dtype: object

In [931]:
pd.Series(data = 10, index= ['a', 'b', 'c'])

a    10
b    10
c    10
dtype: int64

### Data in a Series

A pandas Series can hold a variety of object types:

{} or set()

In [932]:
set1 = (1,2,3,6)

In [933]:
pd.Series(set1)

0    1
1    2
2    3
3    6
dtype: int64

In [934]:
pd.Series([set, list, dict])

0     <class 'set'>
1    <class 'list'>
2    <class 'dict'>
dtype: object

In [935]:
pd.Series(set)[0]([2,5,7,5,8,2])

{2, 5, 7, 8}

In [936]:
pd.Series([sum, print, len])

0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object

In [937]:
pd.Series([sum, print, len])[1]("a")

a


### Basic Attributes of Series

In [938]:
ser = pd.Series(np.random.randint(0,100,7))
ser

0    79
1    14
2    61
3    61
4    46
5    61
6    50
dtype: int64

In [1446]:
type(ser)

pandas.core.series.Series

In [1445]:
ser.dtype

dtype('int64')

In [941]:
ser.shape

(7,)

In [942]:
ser.size

7

In [943]:
len(ser)

7

In [944]:
ser.ndim

1

In [945]:
ser

0    79
1    14
2    61
3    61
4    46
5    61
6    50
dtype: int64

In [946]:
ser.index

RangeIndex(start=0, stop=7, step=1)

In [947]:
list(ser.index)

[0, 1, 2, 3, 4, 5, 6]

In [948]:
for i in ser.index:
    print(i)

0
1
2
3
4
5
6


In [949]:
ser.keys()

RangeIndex(start=0, stop=7, step=1)

In [950]:
ser.values

array([79, 14, 61, 61, 46, 61, 50])

In [951]:
[i for i in ser.values]

[79, 14, 61, 61, 46, 61, 50]

In [952]:
ser.items()

<zip at 0x127c19940>

In [953]:
list(ser.items())

[(0, 79), (1, 14), (2, 61), (3, 61), (4, 46), (5, 61), (6, 50)]

In [954]:
for index, value in ser.items():
    print(f"Index : {index}, Value : {value}")

Index : 0, Value : 79
Index : 1, Value : 14
Index : 2, Value : 61
Index : 3, Value : 61
Index : 4, Value : 46
Index : 5, Value : 61
Index : 6, Value : 50


In [955]:
ser = pd.Series(data = np.random.randint(0,25,10), index = [i for i in "cbaefdgihj"])

In [956]:
ser

c    11
b    22
a    19
e    24
f     2
d     4
g    18
i     6
h    20
j     8
dtype: int64

In [957]:
ser.head(3)

c    11
b    22
a    19
dtype: int64

In [958]:
ser.tail(2)

h    20
j     8
dtype: int64

In [959]:
ser.sample(2)

a    19
d     4
dtype: int64

In [960]:
ser.sort_index(ascending = True)

a    19
b    22
c    11
d     4
e    24
f     2
g    18
h    20
i     6
j     8
dtype: int64

In [961]:
ser.sort_values(ascending = False)

e    24
b    22
h    20
a    19
g    18
c    11
j     8
i     6
d     4
f     2
dtype: int64

In [962]:
ser

c    11
b    22
a    19
e    24
f     2
d     4
g    18
i     6
h    20
j     8
dtype: int64

In [963]:
ser[ser.isin([2, 13])]

f    2
dtype: int64

In [964]:
ser.isin([3, 9])

c    False
b    False
a    False
e    False
f    False
d    False
g    False
i    False
h    False
j    False
dtype: bool

## Indexing Pandas Series

The key to using a Series is understanding its index. Pandas makes use of these index names or numbers by allowing for fast look up of information.

Let's see some examples of how to grab information from a Series. Let us create two sereis, ser1 and ser2:

In [965]:
ser1 = pd.Series([1,2,3,4],index = ['USA', 'Germany','USSR', 'Japan'])                      

In [966]:
ser2 = pd.Series([1,2,5,4],index = ['USA', 'Germany','Italy', 'Japan'])

In [967]:
ser1

USA        1
Germany    2
USSR       3
Japan      4
dtype: int64

In [968]:
ser1.sort_index()

Germany    2
Japan      4
USA        1
USSR       3
dtype: int64

In [969]:
ser2.sort_values()

USA        1
Germany    2
Japan      4
Italy      5
dtype: int64

In [970]:
ser1[1]

2

In [971]:
ser1['Germany']

2

In [972]:
ser2['Japan']

4

In [973]:
ser1.Germany

2

Operations are then also done based off of index:

In [974]:
ser1 + ser2

Germany    4.0
Italy      NaN
Japan      8.0
USA        2.0
USSR       NaN
dtype: float64

In [975]:
ser3 = pd.Series(data = ["a", "b", "c", "d"])

ser3

0    a
1    b
2    c
3    d
dtype: object

In [976]:
ser3[0]

'a'

In [977]:
ser3[2:]

2    c
3    d
dtype: object

In [978]:
ser3[::-1]

3    d
2    c
1    b
0    a
dtype: object

### pandas_series[index] | pandas_series[[indices, indices...]]

In [979]:
panser = pd.Series(data = [121, 200, 150, 99], index = ["terry", "micheal", "orion", "jason"])
panser

terry      121
micheal    200
orion      150
jason       99
dtype: int64

In [980]:
panser['terry']

121

In [981]:
panser[0]

121

In [982]:
index1 = ['terry', 'micheal', 'jason']

In [983]:
panser[index1]

terry      121
micheal    200
jason       99
dtype: int64

In [984]:
panser['terry':'orion']

terry      121
micheal    200
orion      150
dtype: int64

In [985]:
panser[0:3]

terry      121
micheal    200
orion      150
dtype: int64

### Recap for Several Selecting Attributes and Some New Attributes

In [986]:
panser.keys()

Index(['terry', 'micheal', 'orion', 'jason'], dtype='object')

In [987]:
panser.index

Index(['terry', 'micheal', 'orion', 'jason'], dtype='object')

In [988]:
panser.values

array([121, 200, 150,  99])

In [989]:
panser.items()

<zip at 0x16b4f31c0>

In [990]:
list(panser.items())

[('terry', 121), ('micheal', 200), ('orion', 150), ('jason', 99)]

In [991]:
for index, value in panser.items():
    print(index, value)

terry 121
micheal 200
orion 150
jason 99


In [992]:
panser

terry      121
micheal    200
orion      150
jason       99
dtype: int64

In [993]:
'terry' in panser

True

In [994]:
'bob' in panser

False

In [995]:
121 in panser.values

True

In [996]:
55 in panser.values

False

In [997]:
panser.isin([200,150])

terry      False
micheal     True
orion       True
jason      False
dtype: bool

In [998]:
panser

terry      121
micheal    200
orion      150
jason       99
dtype: int64

In [999]:
panser['terry'] = 90

In [1000]:
panser

terry       90
micheal    200
orion      150
jason       99
dtype: int64

In [1001]:
panser[panser < 100]

terry    90
jason    99
dtype: int64

In [1002]:
panser[panser < 100] = 100
panser

terry      100
micheal    200
orion      150
jason      100
dtype: int64

In [1003]:
panser[panser > 100]

micheal    200
orion      150
dtype: int64

# Data Frames

 - ### ``DataFrames`` are the workhorse of pandas and are directly inspired by the R programming language. We can think of a DataFrame as a bunch of Series objects put together to share the same index. Let's use pandas to explore this topic!

 ## Creating a DataFrame using the ``list``s of data and columns

pd.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)

Two-dimensional, size-mutable, potentially heterogeneous tabular data.

In [1004]:
data = [1, 3, 5, 7, 9, 18]
data

[1, 3, 5, 7, 9, 18]

In [1005]:
pd.DataFrame(data)

Unnamed: 0,0
0,1
1,3
2,5
3,7
4,9
5,18


In [1006]:
pd.Series(data)

0     1
1     3
2     5
3     7
4     9
5    18
dtype: int64

In [1007]:
pd.DataFrame(data, columns=['column1'])

Unnamed: 0,column1
0,1
1,3
2,5
3,7
4,9
5,18


 ## Creating a DataFrame using a ``NumPy Arrays``

In [1008]:
m = np.arange(1,24,2).reshape(3,4)
m

array([[ 1,  3,  5,  7],
       [ 9, 11, 13, 15],
       [17, 19, 21, 23]])

In [1009]:
df=pd.DataFrame(data=m, columns=['var1','var2','var3','var4'])
df

Unnamed: 0,var1,var2,var3,var4
0,1,3,5,7
1,9,11,13,15
2,17,19,21,23


In [1010]:
df.head(2)

Unnamed: 0,var1,var2,var3,var4
0,1,3,5,7
1,9,11,13,15


In [1011]:
df.head(2).T

Unnamed: 0,0,1
var1,1,9
var2,3,11
var3,5,13
var4,7,15


In [1012]:
df.tail(2)

Unnamed: 0,var1,var2,var3,var4
1,9,11,13,15
2,17,19,21,23


In [1013]:
df.sample(2).T

Unnamed: 0,2,1
var1,17,9
var2,19,11
var3,21,13
var4,23,15


In [1014]:
df=pd.DataFrame(data=m, columns=['Var 1','var_2','Var  3','vaR4'])
df

Unnamed: 0,Var 1,var_2,Var 3,vaR4
0,1,3,5,7
1,9,11,13,15
2,17,19,21,23


In [1015]:
df.columns = df.columns.str.lower().str.replace(" ","").str.replace("_","")

In [1016]:
df.columns = df.columns.str.lower().str.replace(" ","_")

In [1017]:
df.columns

Index(['var1', 'var2', 'var3', 'var4'], dtype='object')

In [1018]:
df["var1"]  

0     1
1     9
2    17
Name: var1, dtype: int64

In [1019]:
df[["var1"]]

Unnamed: 0,var1
0,1
1,9
2,17


In [1020]:
for i in df.columns:
    print(i)

var1
var2
var3
var4


In [1021]:
df.columns=['new1','new2','new3','new4']
df

Unnamed: 0,new1,new2,new3,new4
0,1,3,5,7
1,9,11,13,15
2,17,19,21,23


In [1022]:
df.index = ["a","b","c"]
df

Unnamed: 0,new1,new2,new3,new4
a,1,3,5,7
b,9,11,13,15
c,17,19,21,23


In [1023]:
df.rename(columns={"new1": "a", "new2": "b"})

Unnamed: 0,a,b,new3,new4
a,1,3,5,7
b,9,11,13,15
c,17,19,21,23


In [1024]:
df.rename(index={"a": 1, "b": 2})

Unnamed: 0,new1,new2,new3,new4
1,1,3,5,7
2,9,11,13,15
c,17,19,21,23


In [1025]:
df

Unnamed: 0,new1,new2,new3,new4
a,1,3,5,7
b,9,11,13,15
c,17,19,21,23


In [1026]:
df.shape

(3, 4)

In [1027]:
df.shape[1]

4

In [1028]:
df.ndim

2

In [1029]:
df.size

12

In [1030]:
len(df)

3

In [1031]:
df.values

array([[ 1,  3,  5,  7],
       [ 9, 11, 13, 15],
       [17, 19, 21, 23]])

In [1032]:
type(df)

pandas.core.frame.DataFrame

In [1033]:
type(df.values)

numpy.ndarray

 ## Creating a DataFrame using a ``dict``

In [1034]:
s1 = np.random.randint(2,10, size = 4)
s2 = np.random.randint(3,10, size = 4)
s3 = np.random.randint(4,15, size = 4)

In [1035]:
s1

array([5, 5, 8, 5])

In [1036]:
s2

array([9, 6, 7, 9])

In [1037]:
s3

array([6, 4, 7, 5])

In [1038]:
myDict= {'var1':s1,'var2':s2,'var3':s3}

In [1039]:
df1 = pd.DataFrame(myDict)
df1

Unnamed: 0,var1,var2,var3
0,5,9,6
1,5,6,4
2,8,7,7
3,5,9,5


In [1040]:
print(*df1.index)

0 1 2 3


In [1041]:
[i for i in df1.index]

[0, 1, 2, 3]

In [1042]:
# we can check any column name whether it belongs to the DataFrame or not
"var2" in df1

True

In [1043]:
'var5' in df1

False

In [1044]:
df1

Unnamed: 0,var1,var2,var3
0,5,9,6
1,5,6,4
2,8,7,7
3,5,9,5


### Now, let's examine again the ***idexing, selection*** and ***slicing*** methods and several ***attributes*** using a different DataFrame

In [1451]:
from numpy.random import randn

In [1452]:
np.random.seed(101)
df3 = pd.DataFrame(randn(5,4), index = 'A B C D E'.split(), columns = 'W X Y Z'.split())
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [1453]:
# creating a DataFrame by "positional arguments"
pd.DataFrame(randn(5,4), 'a b c d e'.split(), 'w x y z'.split())

Unnamed: 0,w,x,y,z
a,0.302665,1.693723,-1.706086,-1.159119
b,-0.134841,0.390528,0.166905,0.184502
c,0.807706,0.07296,0.638787,0.329646
d,-0.497104,-0.75407,-0.943406,0.484752
e,-0.116773,1.901755,0.238127,1.996652


In [1454]:
# creating a DataFrame by "keyword arguments"
pd.DataFrame(randn(5,4),columns='w x y z'.split(), index='a b c d e'.split())

Unnamed: 0,w,x,y,z
a,-0.993263,0.1968,-1.136645,0.000366
b,1.025984,-0.156598,-0.031579,0.649826
c,2.154846,-0.610259,-0.755325,-0.346419
d,0.147027,-0.479448,0.558769,1.02481
e,-0.925874,1.862864,-1.133817,0.610478


## Selection and Indexing

Let's learn the various methods to grab data from a DataFrame

In [1455]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [1456]:
df3['Y']

A    0.907969
B   -0.848077
C    0.528813
D   -0.933237
E    2.605967
Name: Y, dtype: float64

In [1457]:
# SQL Syntax (NOT RECOMMENDED!)
df3.Y

A    0.907969
B   -0.848077
C    0.528813
D   -0.933237
E    2.605967
Name: Y, dtype: float64

#### DataFrame Columns are just Series

In [1458]:
type(df3['Y'])

pandas.core.series.Series

In [1459]:
df3[['Y']]

Unnamed: 0,Y
A,0.907969
B,-0.848077
C,0.528813
D,-0.933237
E,2.605967


In [1460]:
type(df3[['Y']])

pandas.core.frame.DataFrame

In [1461]:
list_columns = ['Z','X']

In [1462]:
df3[list_columns]

Unnamed: 0,Z,X
A,0.503826,0.628133
B,0.605965,-0.319318
C,-0.589001,0.740122
D,0.955057,-0.758872
E,0.683509,1.978757


In [1463]:
# Pass a list of column names
df3[['Z','X']]

Unnamed: 0,Z,X
A,0.503826,0.628133
B,0.605965,-0.319318
C,-0.589001,0.740122
D,0.955057,-0.758872
E,0.683509,1.978757


In [1464]:
#df3['Z','X'] gives error

In [1465]:
df3["X":"Z"]

Unnamed: 0,W,X,Y,Z


In [1466]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [1467]:
df3['B':'D']

Unnamed: 0,W,X,Y,Z
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [1468]:
#df3['C','D'] gives error

In [1469]:
df3["A":"C"][["Y", "Z"]]

Unnamed: 0,Y,Z
A,0.907969,0.503826
B,-0.848077,0.605965
C,0.528813,-0.589001


**Creating a new column:**

In [1470]:
df3


Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [1471]:
df3['X*Y']=df3['X']*df3['Y']
df3

Unnamed: 0,W,X,Y,Z,X*Y
A,2.70685,0.628133,0.907969,0.503826,0.570325
B,0.651118,-0.319318,-0.848077,0.605965,0.270806
C,-2.018168,0.740122,0.528813,-0.589001,0.391387
D,0.188695,-0.758872,-0.933237,0.955057,0.708208
E,0.190794,1.978757,2.605967,0.683509,5.156577


In [1472]:
df3["T"] = [1,2,3,4,5]
df3

Unnamed: 0,W,X,Y,Z,X*Y,T
A,2.70685,0.628133,0.907969,0.503826,0.570325,1
B,0.651118,-0.319318,-0.848077,0.605965,0.270806,2
C,-2.018168,0.740122,0.528813,-0.589001,0.391387,3
D,0.188695,-0.758872,-0.933237,0.955057,0.708208,4
E,0.190794,1.978757,2.605967,0.683509,5.156577,5


## [Removing Columns & Rows](http://localhost:8888/notebooks/pythonic/DAwPythonSessions/w3resource-pandas-dataframe-drop.ipynb)

 ### Removing Columns

In [1473]:
df3

Unnamed: 0,W,X,Y,Z,X*Y,T
A,2.70685,0.628133,0.907969,0.503826,0.570325,1
B,0.651118,-0.319318,-0.848077,0.605965,0.270806,2
C,-2.018168,0.740122,0.528813,-0.589001,0.391387,3
D,0.188695,-0.758872,-0.933237,0.955057,0.708208,4
E,0.190794,1.978757,2.605967,0.683509,5.156577,5


In [1474]:
df3.drop(["X*Y", "T"], axis=1)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [1475]:
df3

Unnamed: 0,W,X,Y,Z,X*Y,T
A,2.70685,0.628133,0.907969,0.503826,0.570325,1
B,0.651118,-0.319318,-0.848077,0.605965,0.270806,2
C,-2.018168,0.740122,0.528813,-0.589001,0.391387,3
D,0.188695,-0.758872,-0.933237,0.955057,0.708208,4
E,0.190794,1.978757,2.605967,0.683509,5.156577,5


In [1476]:
df3.drop(columns = ["X*Y", "T"])

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [1479]:
# Not inplace unless specified!
df3.drop(["X*Y", "T"], axis=1, inplace=True)

KeyError: "['X*Y', 'T'] not found in axis"

In [1480]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


 ### Removing rows

In [1481]:
df3.drop('C',axis=0)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [1482]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [1483]:
df3.drop(index=['C'])

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [1484]:
drop_list = ['A', 'B']

In [1485]:
# the default value of axis is 0 (axis = 0)
df4 = df3.drop(drop_list, axis=0)
df4

Unnamed: 0,W,X,Y,Z
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [1486]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


## Selecting Rows

### First, let's take a quick look at [`.loc[]`](http://localhost:8888/notebooks/pythonic/DAwPythonSessions/w3resource-pandas-dataframe-loc.ipynb) | [`.iloc[]`](http://localhost:8888/notebooks/pythonic/DAwPythonSessions/w3resource-pandas-dataframe-iloc.ipynb)

#### `.loc[]` → allows us to select data using **labels** (names) of rows (index) & columns

#### `.iloc[]` → allows us to select data using **index numbers** of rows (index) & columns. it's like classical indexing logic

In [1487]:
import pandas as pd
import numpy as np

In [1488]:
m = np.random.randint(1,40, size=(8,4))
df4 = pd.DataFrame(m, columns = ["var1","var2","var3",'var4'])
df4

Unnamed: 0,var1,var2,var3,var4
0,4,38,30,23
1,22,22,18,24
2,31,37,8,21
3,28,12,6,23
4,26,19,14,39
5,4,15,24,14
6,25,21,1,30
7,12,28,34,25


In [1489]:
df4.loc[4]

var1    26
var2    19
var3    14
var4    39
Name: 4, dtype: int64

In [1490]:
df4.iloc[4]

var1    26
var2    19
var3    14
var4    39
Name: 4, dtype: int64

In [1491]:
df4.loc[[4]]

Unnamed: 0,var1,var2,var3,var4
4,26,19,14,39


In [1492]:
# Slicing produces the same type of the data. Here, DataFrame
df4.loc[2:5]

Unnamed: 0,var1,var2,var3,var4
2,31,37,8,21
3,28,12,6,23
4,26,19,14,39
5,4,15,24,14


In [1493]:
df4.iloc[2:5]

Unnamed: 0,var1,var2,var3,var4
2,31,37,8,21
3,28,12,6,23
4,26,19,14,39


In [1494]:
df4

Unnamed: 0,var1,var2,var3,var4
0,4,38,30,23
1,22,22,18,24
2,31,37,8,21
3,28,12,6,23
4,26,19,14,39
5,4,15,24,14
6,25,21,1,30
7,12,28,34,25


In [1495]:
df4.index='a b c d e f g h'.split()
df4

Unnamed: 0,var1,var2,var3,var4
a,4,38,30,23
b,22,22,18,24
c,31,37,8,21
d,28,12,6,23
e,26,19,14,39
f,4,15,24,14
g,25,21,1,30
h,12,28,34,25


In [1496]:
df4.iloc[1:4]

Unnamed: 0,var1,var2,var3,var4
b,22,22,18,24
c,31,37,8,21
d,28,12,6,23


In [1497]:
#df4.loc[1:4] gives error

In [1498]:
df4.loc['c':'g']

Unnamed: 0,var1,var2,var3,var4
c,31,37,8,21
d,28,12,6,23
e,26,19,14,39
f,4,15,24,14
g,25,21,1,30


In [1499]:
df4

Unnamed: 0,var1,var2,var3,var4
a,4,38,30,23
b,22,22,18,24
c,31,37,8,21
d,28,12,6,23
e,26,19,14,39
f,4,15,24,14
g,25,21,1,30
h,12,28,34,25


In [1500]:
df4.iloc[4,1]

19

In [1501]:
df4.loc['d':'g','var3']

d     6
e    14
f    24
g     1
Name: var3, dtype: int64

In [1502]:
df4.loc['d':'g']['var3']

d     6
e    14
f    24
g     1
Name: var3, dtype: int64

In [1503]:
# how can we select these data as a DataFrame not a series
df4.loc['d':'g'][['var3']]

Unnamed: 0,var3
d,6
e,14
f,24
g,1


In [1504]:
df4.loc['d':'g', ["var3"]]

Unnamed: 0,var3
d,6
e,14
f,24
g,1


In [1505]:
df4.iloc[2:5,2]

c     8
d     6
e    14
Name: var3, dtype: int64

In [1506]:
df4.iloc[2:5][['var2']]

Unnamed: 0,var2
c,37
d,12
e,19


#### Let's continue to examine `.loc[]` and `.iloc[]` using ``df3`` again

In [1507]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [1508]:
df3.loc['C']

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

Or select based off of position instead of label 

In [1102]:
df3.iloc[2]

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
T    3.000000
Name: C, dtype: float64

In [1103]:
# returns as a DataFrame
df3.loc[['C']]

Unnamed: 0,W,X,Y,Z,T
C,-2.018168,0.740122,0.528813,-0.589001,3


In [1104]:
# returns as a DataFrame
df3.iloc[[2]]

Unnamed: 0,W,X,Y,Z,T
C,-2.018168,0.740122,0.528813,-0.589001,3


In [1105]:
# Well, how can we select entire column "Y" using ".iloc[]"
df3.iloc[:,2]

A    0.907969
B   -0.848077
C    0.528813
D   -0.933237
E    2.605967
Name: Y, dtype: float64

In [1106]:
df3.iloc[:,[2]]

Unnamed: 0,Y
A,0.907969
B,-0.848077
C,0.528813
D,-0.933237
E,2.605967


In [1107]:
df3[['Y','X']]

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
C,0.528813,0.740122
D,-0.933237,-0.758872
E,2.605967,1.978757


### Selecting subset of rows and columns

 - ### `.loc[[row labels|names], [column labels|names]]`

 - ### `.iloc[[row index numbers], [column index numbers]]`

In [1108]:
df3

Unnamed: 0,W,X,Y,Z,T
A,2.70685,0.628133,0.907969,0.503826,1
B,0.651118,-0.319318,-0.848077,0.605965,2
C,-2.018168,0.740122,0.528813,-0.589001,3
D,0.188695,-0.758872,-0.933237,0.955057,4
E,0.190794,1.978757,2.605967,0.683509,5


In [1109]:
df3.loc['C','Z']

-0.5890005332865824

In [1110]:
# let's select the same data as a DataFrame
df3.loc[['C'],['Z']]

Unnamed: 0,Z
C,-0.589001


In [1111]:
df3.loc[['A','C'],['W','Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
C,-2.018168,-0.589001


In [1112]:
df3.iloc[[0,2],[0,3]]

Unnamed: 0,W,Z
A,2.70685,0.503826
C,-2.018168,-0.589001


### Conditional Selection

An important feature of pandas is conditional selection using bracket notation, very similar to numpy:

In [1113]:
df3

Unnamed: 0,W,X,Y,Z,T
A,2.70685,0.628133,0.907969,0.503826,1
B,0.651118,-0.319318,-0.848077,0.605965,2
C,-2.018168,0.740122,0.528813,-0.589001,3
D,0.188695,-0.758872,-0.933237,0.955057,4
E,0.190794,1.978757,2.605967,0.683509,5


In [1114]:
# returns a DataFrame consists of bool type
df3>0.5

Unnamed: 0,W,X,Y,Z,T
A,True,True,True,True,True
B,True,False,False,True,True
C,False,True,True,False,True
D,False,False,False,True,True
E,False,True,True,True,True


In [1115]:
df3[df3>0.5]

Unnamed: 0,W,X,Y,Z,T
A,2.70685,0.628133,0.907969,0.503826,1
B,0.651118,,,0.605965,2
C,,0.740122,0.528813,,3
D,,,,0.955057,4
E,,1.978757,2.605967,0.683509,5


In [1116]:
# It returns based on rows.
df3[df3['Z']>0.5]

Unnamed: 0,W,X,Y,Z,T
A,2.70685,0.628133,0.907969,0.503826,1
B,0.651118,-0.319318,-0.848077,0.605965,2
D,0.188695,-0.758872,-0.933237,0.955057,4
E,0.190794,1.978757,2.605967,0.683509,5


In [1117]:
df3[['Z']]

Unnamed: 0,Z
A,0.503826
B,0.605965
C,-0.589001
D,0.955057
E,0.683509


In [1118]:
df3

Unnamed: 0,W,X,Y,Z,T
A,2.70685,0.628133,0.907969,0.503826,1
B,0.651118,-0.319318,-0.848077,0.605965,2
C,-2.018168,0.740122,0.528813,-0.589001,3
D,0.188695,-0.758872,-0.933237,0.955057,4
E,0.190794,1.978757,2.605967,0.683509,5


In [1119]:
df3[df3['X']<1]['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
Name: W, dtype: float64

In [1120]:
# how can we select the data as a DataFrame


In [1121]:
df3[df3['X']<1][['W']]

Unnamed: 0,W
A,2.70685
B,0.651118
C,-2.018168
D,0.188695


In [1122]:
df3[df3['Y']>0][['Z','W','Y']]

Unnamed: 0,Z,W,Y
A,0.503826,2.70685,0.907969
C,-0.589001,-2.018168,0.528813
E,0.683509,0.190794,2.605967


#### For two conditions you can use **|** → `or`,  **&** →  `and` with parenthesis:

In [1123]:
df3

Unnamed: 0,W,X,Y,Z,T
A,2.70685,0.628133,0.907969,0.503826,1
B,0.651118,-0.319318,-0.848077,0.605965,2
C,-2.018168,0.740122,0.528813,-0.589001,3
D,0.188695,-0.758872,-0.933237,0.955057,4
E,0.190794,1.978757,2.605967,0.683509,5


In [1124]:
df3[(df3['W']>0) & (df3['Y']<1)] = 0

In [1125]:
df3

Unnamed: 0,W,X,Y,Z,T
A,0.0,0.0,0.0,0.0,0
B,0.0,0.0,0.0,0.0,0
C,-2.018168,0.740122,0.528813,-0.589001,3
D,0.0,0.0,0.0,0.0,0
E,0.190794,1.978757,2.605967,0.683509,5


### Conditional selection using ``.loc[]`` and ``.iloc[]``

In [1126]:
df3.loc[(df3.X>0), ['X','Z']]

Unnamed: 0,X,Z
C,0.740122,-0.589001
E,1.978757,0.683509


In [1127]:
df3.loc[((df3.W>1) | (df3.Y<1)), ['Y','Z']]

Unnamed: 0,Y,Z
A,0.0,0.0
B,0.0,0.0
C,0.528813,-0.589001
D,0.0,0.0


In [1128]:
df3.loc[((df3.W>1) | (df3.Y<1)), ['Y','Z']] = 1

In [1129]:
df3

Unnamed: 0,W,X,Y,Z,T
A,0.0,0.0,1.0,1.0,0
B,0.0,0.0,1.0,1.0,0
C,-2.018168,0.740122,1.0,1.0,3
D,0.0,0.0,1.0,1.0,0
E,0.190794,1.978757,2.605967,0.683509,5


## More Index Details

Let's discuss some more features of indexing, including resetting the index or setting it something else. We'll also talk about index hierarchy!

In [1130]:
df3

Unnamed: 0,W,X,Y,Z,T
A,0.0,0.0,1.0,1.0,0
B,0.0,0.0,1.0,1.0,0
C,-2.018168,0.740122,1.0,1.0,3
D,0.0,0.0,1.0,1.0,0
E,0.190794,1.978757,2.605967,0.683509,5


In [1131]:
# Reset to default 0,1...n index
df3.reset_index()

Unnamed: 0,index,W,X,Y,Z,T
0,A,0.0,0.0,1.0,1.0,0
1,B,0.0,0.0,1.0,1.0,0
2,C,-2.018168,0.740122,1.0,1.0,3
3,D,0.0,0.0,1.0,1.0,0
4,E,0.190794,1.978757,2.605967,0.683509,5


In [1132]:
df3

Unnamed: 0,W,X,Y,Z,T
A,0.0,0.0,1.0,1.0,0
B,0.0,0.0,1.0,1.0,0
C,-2.018168,0.740122,1.0,1.0,3
D,0.0,0.0,1.0,1.0,0
E,0.190794,1.978757,2.605967,0.683509,5


In [1133]:
df3.reset_index(drop=True, inplace=True)

In [1134]:
df3

Unnamed: 0,W,X,Y,Z,T
0,0.0,0.0,1.0,1.0,0
1,0.0,0.0,1.0,1.0,0
2,-2.018168,0.740122,1.0,1.0,3
3,0.0,0.0,1.0,1.0,0
4,0.190794,1.978757,2.605967,0.683509,5


In [1135]:
newindx='CA NY WY OR CO'.split()
newindx

['CA', 'NY', 'WY', 'OR', 'CO']

In [1136]:
df3['newidx']=newindx

In [1137]:
df3

Unnamed: 0,W,X,Y,Z,T,newidx
0,0.0,0.0,1.0,1.0,0,CA
1,0.0,0.0,1.0,1.0,0,NY
2,-2.018168,0.740122,1.0,1.0,3,WY
3,0.0,0.0,1.0,1.0,0,OR
4,0.190794,1.978757,2.605967,0.683509,5,CO


In [1138]:
df3.set_index('newidx')

Unnamed: 0_level_0,W,X,Y,Z,T
newidx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CA,0.0,0.0,1.0,1.0,0
NY,0.0,0.0,1.0,1.0,0
WY,-2.018168,0.740122,1.0,1.0,3
OR,0.0,0.0,1.0,1.0,0
CO,0.190794,1.978757,2.605967,0.683509,5


In [1139]:
df3

Unnamed: 0,W,X,Y,Z,T,newidx
0,0.0,0.0,1.0,1.0,0,CA
1,0.0,0.0,1.0,1.0,0,NY
2,-2.018168,0.740122,1.0,1.0,3,WY
3,0.0,0.0,1.0,1.0,0,OR
4,0.190794,1.978757,2.605967,0.683509,5,CO


In [1140]:
df3.set_index('newidx',inplace=True)

In [1141]:
df3

Unnamed: 0_level_0,W,X,Y,Z,T
newidx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CA,0.0,0.0,1.0,1.0,0
NY,0.0,0.0,1.0,1.0,0
WY,-2.018168,0.740122,1.0,1.0,3
OR,0.0,0.0,1.0,1.0,0
CO,0.190794,1.978757,2.605967,0.683509,5


## Multi-Index and Index Hierarchy

Let us go over how to work with Multi-Index, first we'll create a quick example of what a Multi-Indexed DataFrame would look like:

In [1142]:
# Index Levels
outside = ['M1', 'M1', 'M1', 'M2', 'M2', 'M2','M3', 'M3', 'M3']
inside = [1, 2, 3, 1, 2, 3, 5, 6, 7]
multi_index = list(zip(outside, inside))
multi_index

[('M1', 1),
 ('M1', 2),
 ('M1', 3),
 ('M2', 1),
 ('M2', 2),
 ('M2', 3),
 ('M3', 5),
 ('M3', 6),
 ('M3', 7)]

In [1143]:
hier_index=pd.MultiIndex.from_tuples(multi_index)

In [1144]:
hier_index

MultiIndex([('M1', 1),
            ('M1', 2),
            ('M1', 3),
            ('M2', 1),
            ('M2', 2),
            ('M2', 3),
            ('M3', 5),
            ('M3', 6),
            ('M3', 7)],
           )

In [1145]:
df=pd.DataFrame(np.random.randn(9,4), index = hier_index, columns=['A','B','C','D'])
df

Unnamed: 0,Unnamed: 1,A,B,C,D
M1,1,-0.758436,-0.454696,1.297617,-0.825378
M1,2,0.251915,0.518763,0.587968,-0.148194
M1,3,-0.876702,0.79275,0.539118,0.669774
M2,1,-1.270484,-0.446181,0.779475,0.4799
M2,2,-0.960697,-2.002399,-1.263599,-0.696232
M2,3,-1.14822,1.607435,-1.22687,1.405532
M3,5,-1.137201,-0.535478,2.142717,1.691452
M3,6,0.275225,-0.852057,0.298659,-0.56537
M3,7,0.358325,0.699676,0.417366,-0.238049


Now let's show how to index this! For index hierarchy we use ``df.loc[]``, if this was on the columns axis, you would just use normal bracket notation ``df[]``. Calling one level of the index returns the sub-dataframe:

In [1146]:
df.loc['M1']

Unnamed: 0,A,B,C,D
1,-0.758436,-0.454696,1.297617,-0.825378
2,0.251915,0.518763,0.587968,-0.148194
3,-0.876702,0.79275,0.539118,0.669774


In [1147]:
df.loc['M1'].loc[2]

A    0.251915
B    0.518763
C    0.587968
D   -0.148194
Name: 2, dtype: float64

In [1148]:
df.loc['M1'].loc[[2]]

Unnamed: 0,A,B,C,D
2,0.251915,0.518763,0.587968,-0.148194


In [1149]:
df.index.names

FrozenList([None, None])

In [1150]:
df.index.names = ['Group','Num']

In [1151]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1,1,-0.758436,-0.454696,1.297617,-0.825378
M1,2,0.251915,0.518763,0.587968,-0.148194
M1,3,-0.876702,0.79275,0.539118,0.669774
M2,1,-1.270484,-0.446181,0.779475,0.4799
M2,2,-0.960697,-2.002399,-1.263599,-0.696232
M2,3,-1.14822,1.607435,-1.22687,1.405532
M3,5,-1.137201,-0.535478,2.142717,1.691452
M3,6,0.275225,-0.852057,0.298659,-0.56537
M3,7,0.358325,0.699676,0.417366,-0.238049


### let's take a quick look at the [``.xs()``](http://localhost:8888/notebooks/pythonic/DAwPythonSessions/w3resource-pandas-dataframe-xs.ipynb)

In [1152]:
df.xs('M1')

Unnamed: 0_level_0,A,B,C,D
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,-0.758436,-0.454696,1.297617,-0.825378
2,0.251915,0.518763,0.587968,-0.148194
3,-0.876702,0.79275,0.539118,0.669774


In [1153]:
df.loc['M1']

Unnamed: 0_level_0,A,B,C,D
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,-0.758436,-0.454696,1.297617,-0.825378
2,0.251915,0.518763,0.587968,-0.148194
3,-0.876702,0.79275,0.539118,0.669774


In [1154]:
df.xs(['M1',2])

TypeError: list keys are not supported in xs, pass a tuple instead

In [1155]:
df.xs(('M1',2))

A    0.251915
B    0.518763
C    0.587968
D   -0.148194
Name: (M1, 2), dtype: float64

In [1156]:
df.xs(('M1',2), level=[0,1])

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1,2,0.251915,0.518763,0.587968,-0.148194


In [1157]:
df.xs(('M1',2), level=["Group","Num"])

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1,2,0.251915,0.518763,0.587968,-0.148194


In [1158]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1,1,-0.758436,-0.454696,1.297617,-0.825378
M1,2,0.251915,0.518763,0.587968,-0.148194
M1,3,-0.876702,0.79275,0.539118,0.669774
M2,1,-1.270484,-0.446181,0.779475,0.4799
M2,2,-0.960697,-2.002399,-1.263599,-0.696232
M2,3,-1.14822,1.607435,-1.22687,1.405532
M3,5,-1.137201,-0.535478,2.142717,1.691452
M3,6,0.275225,-0.852057,0.298659,-0.56537
M3,7,0.358325,0.699676,0.417366,-0.238049


In [1159]:
#df.loc[2] #gives an error
#df.xs(2) #gives an error
df.xs(2, level = 'Num')

Unnamed: 0_level_0,A,B,C,D
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1,0.251915,0.518763,0.587968,-0.148194
M2,-0.960697,-2.002399,-1.263599,-0.696232


In [1160]:
df.xs(5, level = 'Num')

Unnamed: 0_level_0,A,B,C,D
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3,-1.137201,-0.535478,2.142717,1.691452


In [1161]:
df.xs(5, level = 1)

Unnamed: 0_level_0,A,B,C,D
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3,-1.137201,-0.535478,2.142717,1.691452


In [1162]:
df.xs('C', axis=1)

Group  Num
M1     1      1.297617
       2      0.587968
       3      0.539118
M2     1      0.779475
       2     -1.263599
       3     -1.226870
M3     5      2.142717
       6      0.298659
       7      0.417366
Name: C, dtype: float64

### Let's learn new functions/attributes/methods on "iris dataset" 

In [1163]:
import seaborn as sns

In [1164]:
df=sns.load_dataset('iris')
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [1165]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [1166]:
df.shape

(150, 5)

In [1167]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [1168]:
df.sample(4)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
2,4.7,3.2,1.3,0.2,setosa
100,6.3,3.3,6.0,2.5,virginica
89,5.5,2.5,4.0,1.3,versicolor
125,7.2,3.2,6.0,1.8,virginica


In [1169]:
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [1170]:
# df.describe().T
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sepal_length,150.0,5.843333,0.828066,4.3,5.1,5.8,6.4,7.9
sepal_width,150.0,3.057333,0.435866,2.0,2.8,3.0,3.3,4.4
petal_length,150.0,3.758,1.765298,1.0,1.6,4.35,5.1,6.9
petal_width,150.0,1.199333,0.762238,0.1,0.3,1.3,1.8,2.5


In [1171]:
df.describe(include="all") # "number" and "object" can be used as include/exclude parameter

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
count,150.0,150.0,150.0,150.0,150
unique,,,,,3
top,,,,,setosa
freq,,,,,50
mean,5.843333,3.057333,3.758,1.199333,
std,0.828066,0.435866,1.765298,0.762238,
min,4.3,2.0,1.0,0.1,
25%,5.1,2.8,1.6,0.3,
50%,5.8,3.0,4.35,1.3,
75%,6.4,3.3,5.1,1.8,


In [1509]:
df.describe(include="object")

SyntaxError: invalid syntax (159412137.py, line 1)

In [1510]:
df.corr()

ValueError: could not convert string to float: 'setosa'

In [1511]:

df.corr()[["sepal_length"]]

ValueError: could not convert string to float: 'setosa'

In [1512]:
df['petal_length'].corr(df["petal_width"])

0.9628654314027961

In [1513]:
df.species.value_counts(dropna=False)

species
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64

In [1177]:
df['species'].value_counts(dropna = False, normalize = True)

species
setosa        0.333333
versicolor    0.333333
virginica     0.333333
Name: proportion, dtype: float64

In [1178]:
df.mean()

TypeError: Could not convert ['setosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosaversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorvirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginica'] to numeric

In [1179]:
df.sum(axis=0)

sepal_length                                                876.5
sepal_width                                                 458.6
petal_length                                                563.7
petal_width                                                 179.9
species         setosasetosasetosasetosasetosasetosasetosaseto...
dtype: object

In [1180]:
df.sum(axis=1)

TypeError: unsupported operand type(s) for +: 'float' and 'str'

In [1181]:
df.sepal_length.sum()

876.5

In [1182]:
df.species.unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [1183]:
df.species.nunique()

3

In [1184]:
df.loc[df["species"] == "setosa", "sepal_length"]

Flushing oldest 200 entries.
  warn('Output cache limit (currently {sz} entries) hit.\n'


0     5.1
1     4.9
2     4.7
3     4.6
4     5.0
5     5.4
6     4.6
7     5.0
8     4.4
9     4.9
10    5.4
11    4.8
12    4.8
13    4.3
14    5.8
15    5.7
16    5.4
17    5.1
18    5.7
19    5.1
20    5.4
21    5.1
22    4.6
23    5.1
24    4.8
25    5.0
26    5.0
27    5.2
28    5.2
29    4.7
30    4.8
31    5.4
32    5.2
33    5.5
34    4.9
35    5.0
36    5.5
37    4.9
38    4.4
39    5.1
40    5.0
41    4.5
42    4.4
43    5.0
44    5.1
45    4.8
46    5.1
47    4.6
48    5.3
49    5.0
Name: sepal_length, dtype: float64

In [1185]:
df[(df.sepal_length>4) & (df.sepal_length<5)]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
6,4.6,3.4,1.4,0.3,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa
11,4.8,3.4,1.6,0.2,setosa
12,4.8,3.0,1.4,0.1,setosa
13,4.3,3.0,1.1,0.1,setosa
22,4.6,3.6,1.0,0.2,setosa


In [1186]:
df[(df.species == "virginica") & (df.sepal_length>4)  & (df.sepal_length<5)]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
106,4.9,2.5,4.5,1.7,virginica


In [1187]:
df.sort_values(by = 'sepal_length', ascending = True)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
13,4.3,3.0,1.1,0.1,setosa
42,4.4,3.2,1.3,0.2,setosa
38,4.4,3.0,1.3,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
41,4.5,2.3,1.3,0.3,setosa
...,...,...,...,...,...
122,7.7,2.8,6.7,2.0,virginica
118,7.7,2.6,6.9,2.3,virginica
117,7.7,3.8,6.7,2.2,virginica
135,7.7,3.0,6.1,2.3,virginica


In [1188]:
df.species.value_counts(dropna=False)

species
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64

# Basic aggregation methods:

* ``count()``
* ``mean()``
* ``median()``
* ``min()``
* ``max()``
* ``std()``
* ``var()``
* ``sum()``
* ``idxmin()``
* ``idxmax()``
* ``corr()``

In [1189]:
import numpy as np
import pandas as pd
import seaborn as sns

In [1190]:
np.random.seed(42)
df = pd.DataFrame(np.random.randint(0,100,size=(7,5)),
                  columns=["x1","x2","x3","x4","x5"])
df

Unnamed: 0,x1,x2,x3,x4,x5
0,51,92,14,71,60
1,20,82,86,74,74
2,87,99,23,2,21
3,52,1,87,29,37
4,1,63,59,20,32
5,75,57,21,88,48
6,90,58,41,91,59


In [1191]:
df.count()

x1    7
x2    7
x3    7
x4    7
x5    7
dtype: int64

In [1192]:
df.x1.count()

7

In [1193]:
df.mean()

x1    53.714286
x2    64.571429
x3    47.285714
x4    53.571429
x5    47.285714
dtype: float64

In [1194]:
df.x2.mean()

64.57142857142857

In [1195]:
df.median()

x1    52.0
x2    63.0
x3    41.0
x4    71.0
x5    48.0
dtype: float64

In [1196]:
df.x3.median()

41.0

In [1197]:
df.min()

x1     1
x2     1
x3    14
x4     2
x5    21
dtype: int64

In [1198]:
df.x4.min()

2

In [1199]:
df.idxmin()
#df.argmin() # it gives an error

x1    4
x2    3
x3    0
x4    2
x5    2
dtype: int64

In [1200]:
df.x5.idxmin()
#df.x5.argmin()

2

In [1201]:
df.std()

x1    33.673502
x2    32.623392
x3    30.663302
x4    35.818325
x5    18.454577
dtype: float64

In [1202]:
df[["x1", "x2"]].std()

x1    33.673502
x2    32.623392
dtype: float64

In [1203]:
df.var()

x1    1133.904762
x2    1064.285714
x3     940.238095
x4    1282.952381
x5     340.571429
dtype: float64

In [1204]:
df[["x1", "x2"]].var()

x1    1133.904762
x2    1064.285714
dtype: float64

In [1205]:
df.sum(axis=0)

x1    376
x2    452
x3    331
x4    375
x5    331
dtype: int64

In [1206]:
df.sum(axis=1)

0    288
1    336
2    232
3    206
4    175
5    289
6    339
dtype: int64

In [1207]:
df.x1.sum()

376

In [1208]:
df.describe()

Unnamed: 0,x1,x2,x3,x4,x5
count,7.0,7.0,7.0,7.0,7.0
mean,53.714286,64.571429,47.285714,53.571429,47.285714
std,33.673502,32.623392,30.663302,35.818325,18.454577
min,1.0,1.0,14.0,2.0,21.0
25%,35.5,57.5,22.0,24.5,34.5
50%,52.0,63.0,41.0,71.0,48.0
75%,81.0,87.0,72.5,81.0,59.5
max,90.0,99.0,87.0,91.0,74.0


In [1209]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
x1,7.0,53.714286,33.673502,1.0,35.5,52.0,81.0,90.0
x2,7.0,64.571429,32.623392,1.0,57.5,63.0,87.0,99.0
x3,7.0,47.285714,30.663302,14.0,22.0,41.0,72.5,87.0
x4,7.0,53.571429,35.818325,2.0,24.5,71.0,81.0,91.0
x5,7.0,47.285714,18.454577,21.0,34.5,48.0,59.5,74.0


# Groupby  & Aggregation

## DataFrame.groupby()

- The **groupby** method allows you to group rows of data together and call aggregate functions

In [1210]:
df=sns.load_dataset('iris')
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [1211]:
df.groupby('species')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x16b721e20>

In [1212]:
df.groupby("species").mean()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.006,3.428,1.462,0.246
versicolor,5.936,2.77,4.26,1.326
virginica,6.588,2.974,5.552,2.026


In [1213]:
df.groupby("species").describe()

Unnamed: 0_level_0,sepal_length,sepal_length,sepal_length,sepal_length,sepal_length,sepal_length,sepal_length,sepal_length,sepal_width,sepal_width,...,petal_length,petal_length,petal_width,petal_width,petal_width,petal_width,petal_width,petal_width,petal_width,petal_width
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
setosa,50.0,5.006,0.35249,4.3,4.8,5.0,5.2,5.8,50.0,3.428,...,1.575,1.9,50.0,0.246,0.105386,0.1,0.2,0.2,0.3,0.6
versicolor,50.0,5.936,0.516171,4.9,5.6,5.9,6.3,7.0,50.0,2.77,...,4.6,5.1,50.0,1.326,0.197753,1.0,1.2,1.3,1.5,1.8
virginica,50.0,6.588,0.63588,4.9,6.225,6.5,6.9,7.9,50.0,2.974,...,5.875,6.9,50.0,2.026,0.27465,1.4,1.8,2.0,2.3,2.5


In [1214]:
df.groupby("species").describe().T

Unnamed: 0,species,setosa,versicolor,virginica
sepal_length,count,50.0,50.0,50.0
sepal_length,mean,5.006,5.936,6.588
sepal_length,std,0.35249,0.516171,0.63588
sepal_length,min,4.3,4.9,4.9
sepal_length,25%,4.8,5.6,6.225
sepal_length,50%,5.0,5.9,6.5
sepal_length,75%,5.2,6.3,6.9
sepal_length,max,5.8,7.0,7.9
sepal_width,count,50.0,50.0,50.0
sepal_width,mean,3.428,2.77,2.974


In [1215]:
df.groupby('species')['sepal_length'].sum()

species
setosa        250.3
versicolor    296.8
virginica     329.4
Name: sepal_length, dtype: float64

In [1216]:
df.groupby('species')[['sepal_length']].sum()

Unnamed: 0_level_0,sepal_length
species,Unnamed: 1_level_1
setosa,250.3
versicolor,296.8
virginica,329.4


In [1217]:
df.groupby('species')[['sepal_length', "sepal_width"]].sum()

Unnamed: 0_level_0,sepal_length,sepal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1
setosa,250.3,171.4
versicolor,296.8,138.5
virginica,329.4,148.7


In [1218]:
data = {'Company':['GOOG', 'GOOG', 'MSFT', 'MSFT', 'GOOG', 'MSFT', 'GOOG', 'MSFT'],
        'Department':['HR', 'IT', 'IT', 'HR', 'HR', 'IT', 'IT', 'HR'],
        'Person':['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah', 'Tom', 'Terry'],
        'Age':[30, 28, 35, 40, 42, 25, 32, 48],
        'Sales':[200, 120, 340, 124, 243, 350, 180, 220]}

In [1219]:
df1 = pd.DataFrame(data)
df1

Unnamed: 0,Company,Department,Person,Age,Sales
0,GOOG,HR,Sam,30,200
1,GOOG,IT,Charlie,28,120
2,MSFT,IT,Amy,35,340
3,MSFT,HR,Vanessa,40,124
4,GOOG,HR,Carl,42,243
5,MSFT,IT,Sarah,25,350
6,GOOG,IT,Tom,32,180
7,MSFT,HR,Terry,48,220


In [1220]:
df1.groupby('Company')[['Age','Sales']].mean()

Unnamed: 0_level_0,Age,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
GOOG,33.0,185.75
MSFT,37.0,258.5


In [1221]:
df1.groupby('Company')["Sales"].mean()

Company
GOOG    185.75
MSFT    258.50
Name: Sales, dtype: float64

In [1222]:
df1.groupby('Company')[["Sales"]].mean()

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
GOOG,185.75
MSFT,258.5


In [1223]:
df1.groupby(['Company', "Department"]).mean()

TypeError: Could not convert SamCarl to numeric

In [None]:
df1.groupby(['Company', "Department"])[["Sales"]].mean()

In [1224]:
df1.groupby(['Company', "Department"])[["Sales"]].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
Company,Department,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
GOOG,HR,2.0,221.5,30.405592,200.0,210.75,221.5,232.25,243.0
GOOG,IT,2.0,150.0,42.426407,120.0,135.0,150.0,165.0,180.0
MSFT,HR,2.0,172.0,67.882251,124.0,148.0,172.0,196.0,220.0
MSFT,IT,2.0,345.0,7.071068,340.0,342.5,345.0,347.5,350.0


You can save this object as a new variable:

In [1225]:
df1.groupby('Company')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x16b8189a0>

In [1226]:
by_comp=df1.groupby('Company')

And then call aggregate methods off the object:

In [1227]:
by_comp.mean()

TypeError: Could not convert HRITHRIT to numeric

# DataFrame/Series Operations

- ### `.aggregate()/agg()`
- ### `.filter()`
- ### `.transform()`
- ### `.apply()`
- ### `.applymap()`
- ### `.map()`
- ### `.pivot() & .pivot_table()`
- ### `.stack() & .unstack()`

## ``.aggregate()/agg()``

### DataFrame.agg()

In [1231]:
df2 = pd.DataFrame({'groups': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C'],
                   'var1': [10, 23, 33, 22, 11, 99, 76, 84, 45],
                   'var2': [100, 253, 333, 262, 111, 969, 405, 578, 760]})
df2

Unnamed: 0,groups,var1,var2
0,A,10,100
1,B,23,253
2,C,33,333
3,A,22,262
4,B,11,111
5,C,99,969
6,A,76,405
7,B,84,578
8,C,45,760


In [1232]:
df2.agg([sum, min])

Unnamed: 0,groups,var1,var2
sum,ABCABCABC,403,3771
min,A,10,100


In [1233]:
df2[["var1", "var2"]].agg([sum, min])

Unnamed: 0,var1,var2
sum,403,3771
min,10,100


In [1234]:
df2.agg({"var1":sum, "var2":min})

var1    403
var2    100
dtype: int64

In [1235]:
df2.agg({"var1":[sum], "var2":[min]})

Unnamed: 0,var1,var2
sum,403.0,
min,,100.0


In [1236]:
df2.agg({"var1":[sum, np.mean], "var2":[min, max]})

Unnamed: 0,var1,var2
sum,403.0,
mean,44.777778,
min,,100.0
max,,969.0


### DataFrame.groupby().agg()

In [1255]:
df2.groupby('groups').aggregate([np.min,np.median,np.max])

Unnamed: 0_level_0,var1,var1,var1,var2,var2,var2
Unnamed: 0_level_1,amin,median,amax,amin,median,amax
groups,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,10,22.0,76,100,262.0,405
B,11,23.0,84,111,253.0,578
C,33,45.0,99,333,760.0,969


In [1260]:
df2.groupby('groups').agg([np.min,"median",max])

Unnamed: 0_level_0,var1,var1,var1,var2,var2,var2
Unnamed: 0_level_1,amin,median,max,amin,median,max
groups,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,10,22.0,76,100,262.0,405
B,11,23.0,84,111,253.0,578
C,33,45.0,99,333,760.0,969


In [1261]:
df2.groupby('groups').agg({'var1':[min,'max'], 'var2':'median'})

Unnamed: 0_level_0,var1,var1,var2
Unnamed: 0_level_1,min,max,median
groups,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
A,10,76,262.0
B,11,84,253.0
C,33,99,760.0


## .filter()

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.filter.html

https://www.sharpsightlabs.com/blog/pandas-filter/

### DataFrame.filter()

- DataFrame.filter(items=None, like=None, regex=None, axis=None)
- Subset the dataframe rows or columns according to the specified index labels.
- Note that this routine does not filter a dataframe on its contents. The filter is applied to the labels of the index.

In [1262]:
df2

Unnamed: 0,groups,var1,var2
0,A,10,100
1,B,23,253
2,C,33,333
3,A,22,262
4,B,11,111
5,C,99,969
6,A,76,405
7,B,84,578
8,C,45,760


In [1263]:
df2.filter(["groups", "var1"])
#df2[["groups", "var1"]]

Unnamed: 0,groups,var1
0,A,10
1,B,23
2,C,33
3,A,22
4,B,11
5,C,99
6,A,76
7,B,84
8,C,45


In [1264]:
df2.filter(regex = "^var", axis=1)

Unnamed: 0,var1,var2
0,10,100
1,23,253
2,33,333
3,22,262
4,11,111
5,99,969
6,76,405
7,84,578
8,45,760


In [1265]:
# The ‘like‘ parameter enables you to identify items that contain a certain string.

df2.filter(like="var", axis=1)

Unnamed: 0,var1,var2
0,10,100
1,23,253
2,33,333
3,22,262
4,11,111
5,99,969
6,76,405
7,84,578
8,45,760


In [None]:
df2.filter(like="1", axis=0)

Unnamed: 0,groups,var1,var2
1,B,23,253


### DataFrame.groupby().filter()

- DataFrameGroupBy.filter(func, dropna=True, *args, **kwargs)
- Return a copy of a DataFrame excluding filtered elements.
- Elements from groups are filtered if they do not satisfy the boolean criterion specified by func.

In [1266]:
df2.groups.unique()

array(['A', 'B', 'C'], dtype=object)

In [1267]:
df2.groupby("groups").mean()


Unnamed: 0_level_0,var1,var2
groups,Unnamed: 1_level_1,Unnamed: 2_level_1
A,36.0,255.666667
B,39.333333,314.0
C,59.0,687.333333


In [1268]:
def filter_func(x):
    return x["var1"].mean() > 39

In [1269]:
# returns the rows that meet the condition, as a DataFrame
df2.groupby('groups').filter(filter_func)

Unnamed: 0,groups,var1,var2
1,B,23,253
2,C,33,333
4,B,11,111
5,C,99,969
7,B,84,578
8,C,45,760


In [1271]:
df2.groupby("groups").sum()

Unnamed: 0_level_0,var1,var2
groups,Unnamed: 1_level_1,Unnamed: 2_level_1
A,108,767
B,118,942
C,177,2062


In [1272]:
df2.groupby('groups').filter(lambda x : x['var2'].sum()<800)

Unnamed: 0,groups,var1,var2
0,A,10,100
3,A,22,262
6,A,76,405


## .transform()

- Python’s **Transform** function returns a self-produced dataframe with transformed values after applying the function specified in its parameter.
- This dataframe has the same length as the passed dataframe.

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.transform.html

https://www.analyticsvidhya.com/blog/2020/03/understanding-transform-function-python/

### DataFrame.transform()

- DataFrame.transform(func, axis=0, *args, **kwargs)
- Call func on self producing a DataFrame with the same axis shape as self.

In [1273]:
df_num = df2.iloc[:,1:3]
df_num

Unnamed: 0,var1,var2
0,10,100
1,23,253
2,33,333
3,22,262
4,11,111
5,99,969
6,76,405
7,84,578
8,45,760


In [None]:
df_num.transform(lambda x : x+10)
#df_num+10

Unnamed: 0,var1,var2
0,20,110
1,33,263
2,43,343
3,32,272
4,21,121
5,109,979
6,86,415
7,94,588
8,55,770


In [1275]:
# returns A DataFrame that must have the same length as self. Standardization(mean 0 std 1) Normalization (0-1)
df_num.transform(lambda x : (x-x.mean())/x.std())
#(df_num-df_num.mean())/df_num.std()

Unnamed: 0,var1,var2
0,-1.040605,-1.078044
1,-0.651625,-0.560989
2,-0.352409,-0.290633
3,-0.681547,-0.530573
4,-1.010684,-1.04087
5,1.622413,1.858697
6,0.934218,-0.047312
7,1.17359,0.537332
8,0.006649,1.152392


In [1276]:
df_num.var1.transform(lambda x : (x-x.mean())/x.std())

0   -1.040605
1   -0.651625
2   -0.352409
3   -0.681547
4   -1.010684
5    1.622413
6    0.934218
7    1.173590
8    0.006649
Name: var1, dtype: float64

In [1281]:
df_num.var1.transform(np.sqrt)

0    3.162278
1    4.795832
2    5.744563
3    4.690416
4    3.316625
5    9.949874
6    8.717798
7    9.165151
8    6.708204
Name: var1, dtype: float64

In [1282]:
df_num.var1.transform([np.sqrt, np.exp])

Unnamed: 0,sqrt,exp
0,3.162278,22026.47
1,4.795832,9744803000.0
2,5.744563,214643600000000.0
3,4.690416,3584913000.0
4,3.316625,59874.14
5,9.949874,9.889030000000001e+42
6,8.717798,1.0148e+33
7,9.165151,3.025077e+36
8,6.708204,3.493427e+19


### DataFrame.groupby().transform()

- DataFrameGroupBy.transform(func, *args, engine=None, engine_kwargs=None, **kwargs)
- Call function producing a like-indexed DataFrame on each group and return a DataFrame having the same indexes as the original object filled with the transformed values.

In [1283]:
df2.groupby("groups")["var1"].mean()

groups
A    36.000000
B    39.333333
C    59.000000
Name: var1, dtype: float64

In [1284]:
df2.groupby("groups")["var1"].transform("mean")

0    36.000000
1    39.333333
2    59.000000
3    36.000000
4    39.333333
5    59.000000
6    36.000000
7    39.333333
8    59.000000
Name: var1, dtype: float64

In [1285]:
df2["var1_mean_transform"] = df2.groupby("groups")["var1"].transform("mean")
df2

Unnamed: 0,groups,var1,var2,var1_mean_transform
0,A,10,100,36.0
1,B,23,253,39.333333
2,C,33,333,59.0
3,A,22,262,36.0
4,B,11,111,39.333333
5,C,99,969,59.0
6,A,76,405,36.0
7,B,84,578,39.333333
8,C,45,760,59.0


In [1287]:
df2["var2_median_transform"] = df2.groupby("groups")["var2"].transform("median")
df2

Unnamed: 0,groups,var1,var2,var1_mean_transform,var2_median_transform
0,A,10,100,36.0,262.0
1,B,23,253,39.333333,253.0
2,C,33,333,59.0,760.0
3,A,22,262,36.0,262.0
4,B,11,111,39.333333,253.0
5,C,99,969,59.0,760.0
6,A,76,405,36.0,262.0
7,B,84,578,39.333333,253.0
8,C,45,760,59.0,760.0


## .apply()

### Series.apply() - df["col"].apply()

- Series.apply(func, convert_dtype=True, args=(), **kwargs)
- Invoke function on values of Series.
- Can be ufunc (a NumPy function that applies to the entire Series) or a Python function that only works on single values.

In [1290]:
df3 = pd.DataFrame({'col1':[1,2,3,4],'col2':[444,555,666,444],'col3':['abc','def','ghi','xyz']})

df3

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [1291]:
def squared(x):
    return x**2

In [1292]:
df3['col1'].apply(squared)

0     1
1     4
2     9
3    16
Name: col1, dtype: int64

In [1293]:
df3['col2'].apply(np.log)

0    6.095825
1    6.318968
2    6.501290
3    6.095825
Name: col2, dtype: float64

In [1294]:
df3['col3'].apply(len)

0    3
1    3
2    3
3    3
Name: col3, dtype: int64

In [1295]:
df3['col3'].apply(lambda x: x[0] * 3)

0    aaa
1    ddd
2    ggg
3    xxx
Name: col3, dtype: object

In [1296]:
df3['col2'].apply(lambda x: "low" if x > 500 else "high")

0    high
1     low
2     low
3    high
Name: col2, dtype: object

### DataFrame.apply()

- DataFrame.apply(func, axis=0, raw=False, result_type=None, args=(), **kwargs)
- Apply a function along an axis of the DataFrame.

In [1324]:
df2

Unnamed: 0,groups,var1,var2
0,A,10,100
1,B,23,253
2,C,33,333
3,A,22,262
4,B,11,111
5,C,99,969
6,A,76,405
7,B,84,578
8,C,45,760


In [1325]:
df2.drop(["var1_mean_transform", "var2_median_transform"], axis=1, inplace=True)
df2

KeyError: "['var1_mean_transform', 'var2_median_transform'] not found in axis"

In [1326]:
df2.apply(np.sum)

groups    ABCABCABC
var1            403
var2           3771
dtype: object

In [1327]:
# It gives an error due to the non-numeric column : "groups"

# df2.apply(np.sum, axis=1)

In [1328]:
df_num

Unnamed: 0,var1,var2
0,10,100
1,23,253
2,33,333
3,22,262
4,11,111
5,99,969
6,76,405
7,84,578
8,45,760


In [1329]:
df_num.apply(np.sum)
#df_num.sum()

var1     403
var2    3771
dtype: int64

In [1330]:
df_num.apply(np.sum, axis=1)
#df_num.sum(axis=1)

0     110
1     276
2     366
3     284
4     122
5    1068
6     481
7     662
8     805
dtype: int64

In [1331]:
df_num.apply(lambda x : x + 10)

Unnamed: 0,var1,var2
0,20,110
1,33,263
2,43,343
3,32,272
4,21,121
5,109,979
6,86,415
7,94,588
8,55,770


In [1332]:
df2.groupby('groups').apply(np.mean)

groups
A    145.833333
B    176.666667
C    373.166667
dtype: float64

In [1333]:
df2.groupby("groups").mean()

Unnamed: 0_level_0,var1,var2
groups,Unnamed: 1_level_1,Unnamed: 2_level_1
A,36.0,255.666667
B,39.333333,314.0
C,59.0,687.333333


### DataFrame.applymap()

- DataFrame.applymap(func, na_action=None, **kwargs)
- Apply a function to a Dataframe elementwise.
- This method applies a function that accepts and returns a scalar to every element of a DataFrame.

In [1321]:
df_num

Unnamed: 0,var1,var2
0,10,100
1,23,253
2,33,333
3,22,262
4,11,111
5,99,969
6,76,405
7,84,578
8,45,760


In [1336]:
df_num.applymap(lambda x:x*5)
#df_num.apply(lambda x: x*5)
#df_num*5

Unnamed: 0,var1,var2
0,50,500
1,115,1265
2,165,1665
3,110,1310
4,55,555
5,495,4845
6,380,2025
7,420,2890
8,225,3800


In [1337]:
df_num.applymap(lambda x: len(str(x*5)))

Unnamed: 0,var1,var2
0,2,3
1,3,4
2,3,4
3,3,4
4,2,3
5,3,4
6,3,4
7,3,4
8,3,4


In [1338]:
df_num.apply(lambda x: len(str(x*5)))

var1    105
var2    114
dtype: int64

- **applymap()** is only available in DataFrame and used for element-wise operation across the whole DataFrame. It has been optimized and some cases work **much faster than apply()**

- **applymap()** method only works on a pandas dataframe where function is applied on every element individually. **apply()** method can be applied both to series and dataframes where function can be applied both series and individual elements based on the type of function provided.

## .map()
### Series.map() - df["col"].map()

- Python's **map()** is a built-in function that allows you to process and transform all the items in an iterable without using an explicit for loop, a technique commonly known as mapping.
- **map()** is useful when you need to apply a transformation function to each item in an iterable and transform them into a new iterable.

https://realpython.com/python-map-function/#:~:text=Python's%20map()%20is%20a,them%20into%20a%20new%20iterable.

- The **map()** function is used to map values of Series according to input correspondence. Used for substituting each value in a Series with another value, that may be derived from a function, a dict or a Series.
- **map()** accepts a dict or a Series. Values that are not found in the dict are converted to NaN, unless the dict has a default value.

https://www.w3resource.com/pandas/series/series-map.php

In [1345]:
df3

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [1346]:
df3.col1.map({1:"A", 2:"B"})

0      A
1      B
2    NaN
3    NaN
Name: col1, dtype: object

In [1347]:
s = pd.Series(['fox', 'cow', np.nan, 'dog'])
s

0    fox
1    cow
2    NaN
3    dog
dtype: object

In [1348]:
s.map('I am a {}'.format)

0    I am a fox
1    I am a cow
2    I am a nan
3    I am a dog
dtype: object

In [1349]:
s.map('I am a {}'.format, na_action='ignore')

0    I am a fox
1    I am a cow
2           NaN
3    I am a dog
dtype: object

- **apply()** is used to apply a function along an axis of the DataFrame or on values of Series.
- **applymap()** is used to apply a function to a DataFrame elementwise.
- **map()** is used to substitute each value in a Series with another value.

https://towardsdatascience.com/introduction-to-pandas-apply-applymap-and-map-5d3e044e93ff#:~:text=apply()%20is%20used%20to,a%20Series%20with%20another%20value.

https://stackoverflow.com/questions/19798153/difference-between-map-applymap-and-apply-methods-in-pandas

## df.transform() vs df.apply()

**Similarties**
- Both apply() and transform() can be used to manipulate the entire DataFrame.
- Both apply() and transform() support lambda expression.
- Both apply() and transform() can be used for manipulating a single column.

In [1350]:
df5 = pd.DataFrame({'A': [1,2,3], 'B': [10,20,30] })
df5

Unnamed: 0,A,B
0,1,10
1,2,20
2,3,30


In [1351]:
df5.apply(lambda x: x+10)

Unnamed: 0,A,B
0,11,20
1,12,30
2,13,40


In [1352]:
df5.transform(lambda x: x+10)

Unnamed: 0,A,B
0,11,20
1,12,30
2,13,40


In [1353]:
df5['B_ap'] = df5['B'].apply(lambda x: x+10)
df5['B_tr'] = df5['B'].transform(lambda x: x+10)
df5

Unnamed: 0,A,B,B_ap,B_tr
0,1,10,20,20
1,2,20,30,30
2,3,30,40,40


### Differences between .apply() and .transform() when manupulating data
- **transform()** works with function, a string function, a list of functions, and a dict. However, **apply()** is only allowed with function.
- **transform()** cannot produce aggregated results.
- **apply()** works with multiple Series at a time. But, **transform()** is only allowed to work with a single Series at a time.

In [1355]:
df5 = df5[["A","B"]]
df5

Unnamed: 0,A,B
0,1,10
1,2,20
2,3,30


In [1357]:
df5.transform('sqrt')
#df5.apply('sqrt') # gives an error

Unnamed: 0,A,B
0,1.0,3.162278
1,1.414214,4.472136
2,1.732051,5.477226


In [1358]:
df5.transform(np.sqrt)
#df5.apply(np.sqrt)

Unnamed: 0,A,B
0,1.0,3.162278
1,1.414214,4.472136
2,1.732051,5.477226


In [1359]:
df5.transform([np.sqrt, np.exp])
#df5.apply([np.sqrt, np.exp])

Unnamed: 0_level_0,A,A,B,B
Unnamed: 0_level_1,sqrt,exp,sqrt,exp
0,1.0,2.718282,3.162278,22026.47
1,1.414214,7.389056,4.472136,485165200.0
2,1.732051,20.085537,5.477226,10686470000000.0


In [1360]:
df5.transform({'A': np.sqrt, 'B': np.exp})
#df5.apply({'A': np.sqrt, 'B': np.exp})

Unnamed: 0,A,B
0,1.0,22026.47
1,1.414214,485165200.0
2,1.732051,10686470000000.0


In [1363]:
df5.apply(lambda x: x.sum())
# df5.transform(lambda x:x.sum()) # gives an error

A     6
B    60
dtype: int64

In [1364]:
df5.apply(lambda x: x["B"]-x["A"], axis=1)
# df5.transform(lambda x: x["B"]-x["A"], axis=1) # gives an error

0     9
1    18
2    27
dtype: int64

### Differences Between .apply() and .transform() when using them in conjunction with groupby()
- **transform()** returns a DataFrame that has the same length as the input
- **apply()** works with multiple Series at a time. But, **transform()** is only allowed to work with a single Series at a time.

In [1367]:
df6 = pd.DataFrame({'key': ['a','b','c'] * 3,
                    'A': np.arange(9),
                    'B': [1,2,3] * 3})
df6

Unnamed: 0,key,A,B
0,a,0,1
1,b,1,2
2,c,2,3
3,a,3,1
4,b,4,2
5,c,5,3
6,a,6,1
7,b,7,2
8,c,8,3


In [1368]:
df6.groupby('key')['A'].sum()

key
a     9
b    12
c    15
Name: A, dtype: int64

In [1369]:
df6.groupby('key')['A'].apply(lambda x: x.sum())

key
a     9
b    12
c    15
Name: A, dtype: int64

In [1370]:
df6.groupby('key')['A'].transform(lambda x: x.sum())

0     9
1    12
2    15
3     9
4    12
5    15
6     9
7    12
8    15
Name: A, dtype: int64

In [1372]:
df6.groupby('key').apply(lambda x: x["B"]-x["A"])
# df6.groupby('key').transform(lambda x: x["B"]-x["A"]) # gives an error

key   
a    0    1
     3   -2
     6   -5
b    1    1
     4   -2
     7   -5
c    2    1
     5   -2
     8   -5
dtype: int64

# pivot() vs pivot_table()

**pivot_table**
- pandas.pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All', observed=False, sort=True).
- Create a spreadsheet-style pivot table as a DataFrame.
- The levels in the pivot table will be stored in MultiIndex objects (hierarchical indexes) on the index and columns of the result DataFrame.

**pivot**
- DataFrame.pivot(index=None, columns=None, values=None).
- Return reshaped DataFrame organized by given index / column values.
- Reshape data (produce a “pivot” table) based on column values. Uses unique values from specified index / columns to form axes of the resulting DataFrame.
- This function does not support data aggregation, multiple values will result in a MultiIndex in the columns.

**Differences**
- **Pivot_table** is a generalization of pivot that can handle duplicate values for one pivoted index/column pair.
- **Pivot_table** will only allow numeric types as "values=", whereas **pivot** will take string types as "values=".

In [1400]:
data = {'gender':['male', 'female', 'female', 'male', 'female', 'male'],
        'sport':['tennis', 'tennis', 'basketball', 'football', 'voleyball', 'basketball'],
        'status':["professional","professional","professional","amateur","amateur","amateur"],
        'age':[20, 24, 26, 23, 22, 21],
        'height':[185, 172, 175, 178, 182, 196],
        'weight':[83, 58, 62, 80, 65, 90]}

df7 = pd.DataFrame(data)

df7

Unnamed: 0,gender,sport,status,age,height,weight
0,male,tennis,professional,20,185,83
1,female,tennis,professional,24,172,58
2,female,basketball,professional,26,175,62
3,male,football,amateur,23,178,80
4,female,voleyball,amateur,22,182,65
5,male,basketball,amateur,21,196,90


In [1401]:
df7.pivot_table(index='gender',
                columns='sport',
                values=['age'],
                aggfunc='mean')

Unnamed: 0_level_0,age,age,age,age
sport,basketball,football,tennis,voleyball
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
female,26.0,,24.0,22.0
male,21.0,23.0,20.0,


In [1402]:
df7.pivot_table(index='gender',
                columns='sport',
                values=['age','height','weight'],
                aggfunc='mean')

Flushing oldest 200 entries.
  warn('Output cache limit (currently {sz} entries) hit.\n'


Unnamed: 0_level_0,age,age,age,age,height,height,height,height,weight,weight,weight,weight
sport,basketball,football,tennis,voleyball,basketball,football,tennis,voleyball,basketball,football,tennis,voleyball
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
female,26.0,,24.0,22.0,175.0,,172.0,182.0,62.0,,58.0,65.0
male,21.0,23.0,20.0,,196.0,178.0,185.0,,90.0,80.0,83.0,


In [1403]:
df7.pivot(index='gender',
          columns='sport',
          values=['age','height','weight'])

Unnamed: 0_level_0,age,age,age,age,height,height,height,height,weight,weight,weight,weight
sport,basketball,football,tennis,voleyball,basketball,football,tennis,voleyball,basketball,football,tennis,voleyball
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
female,26.0,,24.0,22.0,175.0,,172.0,182.0,62.0,,58.0,65.0
male,21.0,23.0,20.0,,196.0,178.0,185.0,,90.0,80.0,83.0,


In [1404]:
df7.pivot(index='gender',
          columns='sport',
          values='status')

sport,basketball,football,tennis,voleyball
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,professional,,professional,amateur
male,amateur,amateur,professional,


In [1405]:
#gives an error because no numeric types to aggregate
#df7.pivot_table(index='gender',
#                columns='sport',
#                values='status')

In [1406]:
df7

Unnamed: 0,gender,sport,status,age,height,weight
0,male,tennis,professional,20,185,83
1,female,tennis,professional,24,172,58
2,female,basketball,professional,26,175,62
3,male,football,amateur,23,178,80
4,female,voleyball,amateur,22,182,65
5,male,basketball,amateur,21,196,90


In [1407]:
df7.loc[2,"sport"] = "tennis"
df7

Unnamed: 0,gender,sport,status,age,height,weight
0,male,tennis,professional,20,185,83
1,female,tennis,professional,24,172,58
2,female,tennis,professional,26,175,62
3,male,football,amateur,23,178,80
4,female,voleyball,amateur,22,182,65
5,male,basketball,amateur,21,196,90


In [1412]:
df7.pivot_table(index='gender',
                columns='sport',
                values=['age','height','weight'],
                aggfunc='mean')

Unnamed: 0_level_0,age,age,age,age,height,height,height,height,weight,weight,weight,weight
sport,basketball,football,tennis,voleyball,basketball,football,tennis,voleyball,basketball,football,tennis,voleyball
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
female,,,25.0,22.0,,,173.5,182.0,,,60.0,65.0
male,21.0,23.0,20.0,,196.0,178.0,185.0,,90.0,80.0,83.0,


In [1413]:
#gives an error because of the dublicate values
#df7.pivot(index='gender',
#          columns='sport',
#          values=['age','height','weight'])

In [1414]:
df7.pivot(columns='sport',
          values=['age','height','weight'])

Unnamed: 0_level_0,age,age,age,age,height,height,height,height,weight,weight,weight,weight
sport,basketball,football,tennis,voleyball,basketball,football,tennis,voleyball,basketball,football,tennis,voleyball
0,,,20.0,,,,185.0,,,,83.0,
1,,,24.0,,,,172.0,,,,58.0,
2,,,26.0,,,,175.0,,,,62.0,
3,,23.0,,,,178.0,,,,80.0,,
4,,,,22.0,,,,182.0,,,,65.0
5,21.0,,,,196.0,,,,90.0,,,


In [1415]:
df7.pivot_table(index=['gender', 'sport'],
                values=['age','height','weight'],
                aggfunc='mean')

Unnamed: 0_level_0,Unnamed: 1_level_0,age,height,weight
gender,sport,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,tennis,25,173.5,60
female,voleyball,22,182.0,65
male,basketball,21,196.0,90
male,football,23,178.0,80
male,tennis,20,185.0,83


In [1418]:
#gives an error because of the lack of "columns" parameter
#df7.pivot(index=['gender', 'sport'],
#          values=['age','height','weight'])

In [1419]:
df7.groupby(["gender","sport"]).mean()

TypeError: Could not convert professionalprofessional to numeric

In [1420]:
df7.pivot_table(index=['gender', 'sport'],
                columns = "status",
                values=['age','height','weight'],
                aggfunc='mean')

Unnamed: 0_level_0,Unnamed: 1_level_0,age,age,height,height,weight,weight
Unnamed: 0_level_1,status,amateur,professional,amateur,professional,amateur,professional
gender,sport,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
female,tennis,,25.0,,173.5,,60.0
female,voleyball,22.0,,182.0,,65.0,
male,basketball,21.0,,196.0,,90.0,
male,football,23.0,,178.0,,80.0,
male,tennis,,20.0,,185.0,,83.0


In [1421]:
#gives an error because of the dublicate values
#df7.pivot(index=['gender', 'sport'],
#          columns = "status",
#          values=['age','height','weight'])

In [1422]:
df7.pivot(columns = "status",
          values=['age','height','weight'])

Unnamed: 0_level_0,age,age,height,height,weight,weight
status,amateur,professional,amateur,professional,amateur,professional
0,,20.0,,185.0,,83.0
1,,24.0,,172.0,,58.0
2,,26.0,,175.0,,62.0
3,23.0,,178.0,,80.0,
4,22.0,,182.0,,65.0,
5,21.0,,196.0,,90.0,


# .stack() & .unstack()

**stack()**
- DataFrame.stack(level=- 1, dropna=True)
- Stack the prescribed level(s) from columns to index.
- Return a reshaped DataFrame or Series having a multi-level index with one or more new inner-most levels compared to the current DataFrame.
- The new inner-most levels are created by pivoting the columns of the current dataframe:
    - if the columns have a single level, the output is a Series;
    - if the columns have multiple levels, the new index level(s) is (are) taken from the prescribed level(s) and the output is a DataFrame.

In [1426]:
df7

Unnamed: 0,gender,sport,status,age,height,weight
0,male,tennis,professional,20,185,83
1,female,tennis,professional,24,172,58
2,female,tennis,professional,26,175,62
3,male,football,amateur,23,178,80
4,female,voleyball,amateur,22,182,65
5,male,basketball,amateur,21,196,90


In [1427]:
df7["level"] = ["high", "high", "low", "high", "low", "low"]
df7

Unnamed: 0,gender,sport,status,age,height,weight,level
0,male,tennis,professional,20,185,83,high
1,female,tennis,professional,24,172,58,high
2,female,tennis,professional,26,175,62,low
3,male,football,amateur,23,178,80,high
4,female,voleyball,amateur,22,182,65,low
5,male,basketball,amateur,21,196,90,low


In [1428]:
df8 = df7.pivot_table(index=['gender','sport'],
                columns=["status","level"], 
                values=['age','height','weight'],
                aggfunc='mean')
df8

Unnamed: 0_level_0,Unnamed: 1_level_0,age,age,age,age,height,height,height,height,weight,weight,weight,weight
Unnamed: 0_level_1,status,amateur,amateur,professional,professional,amateur,amateur,professional,professional,amateur,amateur,professional,professional
Unnamed: 0_level_2,level,high,low,high,low,high,low,high,low,high,low,high,low
gender,sport,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3
female,tennis,,,24.0,26.0,,,172.0,175.0,,,58.0,62.0
female,voleyball,,22.0,,,,182.0,,,,65.0,,
male,basketball,,21.0,,,,196.0,,,,90.0,,
male,football,23.0,,,,178.0,,,,80.0,,,
male,tennis,,,20.0,,,,185.0,,,,83.0,


In [1429]:
df8.stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,age,age,height,height,weight,weight
Unnamed: 0_level_1,Unnamed: 1_level_1,status,amateur,professional,amateur,professional,amateur,professional
gender,sport,level,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
female,tennis,high,,24.0,,172.0,,58.0
female,tennis,low,,26.0,,175.0,,62.0
female,voleyball,low,22.0,,182.0,,65.0,
male,basketball,low,21.0,,196.0,,90.0,
male,football,high,23.0,,178.0,,80.0,
male,tennis,high,,20.0,,185.0,,83.0


In [1437]:
# try "-1,-2,0,1,2" as level parameter
df8.stack(level=-1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,age,age,height,height,weight,weight
Unnamed: 0_level_1,Unnamed: 1_level_1,status,amateur,professional,amateur,professional,amateur,professional
gender,sport,level,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
female,tennis,high,,24.0,,172.0,,58.0
female,tennis,low,,26.0,,175.0,,62.0
female,voleyball,low,22.0,,182.0,,65.0,
male,basketball,low,21.0,,196.0,,90.0,
male,football,high,23.0,,178.0,,80.0,
male,tennis,high,,20.0,,185.0,,83.0


In [1438]:
df8.stack(level=-1, dropna=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,age,age,height,height,weight,weight
Unnamed: 0_level_1,Unnamed: 1_level_1,status,amateur,professional,amateur,professional,amateur,professional
gender,sport,level,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
female,tennis,high,,24.0,,172.0,,58.0
female,tennis,low,,26.0,,175.0,,62.0
female,voleyball,high,,,,,,
female,voleyball,low,22.0,,182.0,,65.0,
male,basketball,high,,,,,,
male,basketball,low,21.0,,196.0,,90.0,
male,football,high,23.0,,178.0,,80.0,
male,football,low,,,,,,
male,tennis,high,,20.0,,185.0,,83.0
male,tennis,low,,,,,,


In [1439]:
df8.stack().stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,age,height,weight
gender,sport,level,status,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
female,tennis,high,professional,24.0,172.0,58.0
female,tennis,low,professional,26.0,175.0,62.0
female,voleyball,low,amateur,22.0,182.0,65.0
male,basketball,low,amateur,21.0,196.0,90.0
male,football,high,amateur,23.0,178.0,80.0
male,tennis,high,professional,20.0,185.0,83.0


In [1440]:
df8.stack().stack().stack()

gender  sport       level  status              
female  tennis      high   professional  age        24.0
                                         height    172.0
                                         weight     58.0
                    low    professional  age        26.0
                                         height    175.0
                                         weight     62.0
        voleyball   low    amateur       age        22.0
                                         height    182.0
                                         weight     65.0
male    basketball  low    amateur       age        21.0
                                         height    196.0
                                         weight     90.0
        football    high   amateur       age        23.0
                                         height    178.0
                                         weight     80.0
        tennis      high   professional  age        20.0
                                        

**unstack()**
- DataFrame.unstack(level=- 1, fill_value=None)
- Pivot a level of the (necessarily hierarchical) index labels.
- Returns a DataFrame having a new level of column labels whose inner-most level consists of the pivoted index labels.
- If the index is not a MultiIndex, the output will be a Series.

In [1443]:
df8

Unnamed: 0_level_0,Unnamed: 1_level_0,age,age,age,age,height,height,height,height,weight,weight,weight,weight
Unnamed: 0_level_1,status,amateur,amateur,professional,professional,amateur,amateur,professional,professional,amateur,amateur,professional,professional
Unnamed: 0_level_2,level,high,low,high,low,high,low,high,low,high,low,high,low
gender,sport,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3
female,tennis,,,24.0,26.0,,,172.0,175.0,,,58.0,62.0
female,voleyball,,22.0,,,,182.0,,,,65.0,,
male,basketball,,21.0,,,,196.0,,,,90.0,,
male,football,23.0,,,,178.0,,,,80.0,,,
male,tennis,,,20.0,,,,185.0,,,,83.0,


In [1444]:
df8.unstack()

Unnamed: 0_level_0,age,age,age,age,age,age,age,age,age,age,...,weight,weight,weight,weight,weight,weight,weight,weight,weight,weight
status,amateur,amateur,amateur,amateur,amateur,amateur,amateur,amateur,professional,professional,...,amateur,amateur,professional,professional,professional,professional,professional,professional,professional,professional
level,high,high,high,high,low,low,low,low,high,high,...,low,low,high,high,high,high,low,low,low,low
sport,basketball,football,tennis,voleyball,basketball,football,tennis,voleyball,basketball,football,...,tennis,voleyball,basketball,football,tennis,voleyball,basketball,football,tennis,voleyball
gender,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4
female,,,,,,,,22.0,,,...,,65.0,,,58.0,,,,62.0,
male,,23.0,,,21.0,,,,,,...,,,,,83.0,,,,,


In [None]:
# try "-1,0,1" as level parameter
df8.unstack(level=-1)

Unnamed: 0_level_0,age,age,age,age,age,age,age,age,age,age,...,weight,weight,weight,weight,weight,weight,weight,weight,weight,weight
status,amateur,amateur,amateur,amateur,amateur,amateur,amateur,amateur,professional,professional,...,amateur,amateur,professional,professional,professional,professional,professional,professional,professional,professional
level,high,high,high,high,low,low,low,low,high,high,...,low,low,high,high,high,high,low,low,low,low
sport,basketball,football,tennis,voleyball,basketball,football,tennis,voleyball,basketball,football,...,tennis,voleyball,basketball,football,tennis,voleyball,basketball,football,tennis,voleyball
gender,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4
female,,,,,,,,22.0,,,...,,65.0,,,58.0,,,,62.0,
male,,23.0,,,21.0,,,,,,...,,,,,83.0,,,,,


In [None]:
df8.unstack(level=-1, fill_value="-")

Unnamed: 0_level_0,age,age,age,age,age,age,age,age,age,age,...,weight,weight,weight,weight,weight,weight,weight,weight,weight,weight
status,amateur,amateur,amateur,amateur,amateur,amateur,amateur,amateur,professional,professional,...,amateur,amateur,professional,professional,professional,professional,professional,professional,professional,professional
level,high,high,high,high,low,low,low,low,high,high,...,low,low,high,high,high,high,low,low,low,low
sport,basketball,football,tennis,voleyball,basketball,football,tennis,voleyball,basketball,football,...,tennis,voleyball,basketball,football,tennis,voleyball,basketball,football,tennis,voleyball
gender,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4
female,-,-,,,-,-,,22.0,-,-,...,,65.0,-,-,58.0,,-,-,62.0,
male,,23.0,,-,21.0,,,-,,,...,,-,,,83.0,-,,,,-


In [None]:
df8.unstack().unstack()

        status        level  sport       gender
age     amateur       high   basketball  female     NaN
                                         male       NaN
                             football    female     NaN
                                         male      23.0
                             tennis      female     NaN
                                                   ... 
weight  professional  low    football    male       NaN
                             tennis      female    62.0
                                         male       NaN
                             voleyball   female     NaN
                                         male       NaN
Length: 96, dtype: float64

In [None]:
pd.DataFrame(df8.unstack().unstack())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,0
Unnamed: 0_level_1,status,level,sport,gender,Unnamed: 5_level_1
age,amateur,high,basketball,female,
age,amateur,high,basketball,male,
age,amateur,high,football,female,
age,amateur,high,football,male,23.0
age,amateur,high,tennis,female,
...,...,...,...,...,...
weight,professional,low,football,male,
weight,professional,low,tennis,female,62.0
weight,professional,low,tennis,male,
weight,professional,low,voleyball,female,
